# for emacs: -*- mode: sh; -*- # Caenorhabditis elegans # Washington University School of Medicine GSC and Sanger Institute WS204 # $Id: ce8.txt,v 1.3 2009/09/24 20:54:24 hiram Exp $ ######################################################################### # DOWNLOAD SEQUENCE (DONE - 2009-07-22 - Hiram) mkdir /hive/data/genomes/ce8 cd /hive/data/genomes/ce8 mkdir ws204 cd ws204 TOP=/hive/data/genomes/ce8/ws204 export TOP for D in annotation genome_feature_tables/GFF2 \ genome_feature_tables/SUPPLEMENTARY_GFF sequences/dna \ sequences/protein sequences/rna do mkdir -p ${D} cd ${D} wget --timestamping \ ftp://ftp.sanger.ac.uk/pub2/wormbase/WS204/genomes/c_elegans/${D}/*.* cd ${TOP} done # about 1h24m ######################################################################### # NORMALIZE SEQUENCE NAMES TO BEGIN WITH chr (DONE - 2009-07-24 - Hiram) mkdir /hive/data/genomes/ce8/sanger cd /hive/data/genomes/ce8/sanger # Fix fasta names: zcat ../ws204/sequences/dna/CHR*[XVAI].dna.gz \ | sed -e '/^$/ d; s/^>CHROMOSOME_MtDNA/>chrM/; s/^>CHROMOSOME_/>chr/;' \ | gzip -c > UCSC.fa.gz faSize -detailed UCSC.fa.gz # chrI 15072421 # chrII 15279323 # chrIII 13783685 # chrIV 17493784 # chrM 13794 # chrV 20924143 # chrX 17718854 # chrII(+1) and chrII(+3) are slightly different than WS200 # Make sure we get the same sizes from this command: zcat ../ws204/sequences/dna/CHR*[XVAI].dna.gz | sed -e '/^$/ d;' \ | faSize -detailed stdin faCount UCSC.fa.gz #seq len A C G T N cpg # chrI 15072421 4835939 2695879 2692150 4848453 0 503521 # chrII 15279323 4878195 2769216 2762198 4869714 0 492149 # chrIII 13783685 4444652 2449141 2466322 4423570 0 459669 # chrIV 17493784 5711040 3034767 3017008 5730969 0 522372 # chrM 13794 4335 1225 2055 6179 0 110 # chrV 20924143 6750393 3712058 3701397 6760295 0 638983 # chrX 17718854 5747199 3119702 3117868 5734085 0 514715 # total 100286004 32371753 17781988 17758998 323732650 3131519 # WS200: # chrII 15279324 4878196 2769216 2762198 4869714 0 492149 # chrIII 13783682 4444652 2449139 2466321 4423570 0 459669 # total 100286002 32371754 17781986 17758997 323732650 3131519 # Fix AGP names: sed -e 's/^/chr/' ../ws204/sequences/dna/CHR*.agp > UCSC.agp # And add a fake mitochondrial AGP entry for the sake of downstream # tools (make sure the GenBank sequence is identical to given): echo -e "chrM\t1\t13794\t1\tF\tNC_001328.1\t1\t13794\t+" >> UCSC.agp ######################################################################### # run the makeGenomeDb procedure to create the db and unmasked sequence # (DONE - 2009-07-22 - Hiram) cd /hive/data/genomes/ce8 cat << '_EOF_' > ce8.config.ra # Config parameters for makeGenomeDb.pl: db ce8 clade worm genomeCladePriority 10 scientificName Caenorhabditis elegans commonName C. elegans assemblyDate Jun 2009 assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS204 orderKey 824 mitoAcc none fastaFiles /hive/data/genomes/ce8/sanger/UCSC.fa.gz agpFiles /hive/data/genomes/ce8/sanger/UCSC.agp # qualFiles /dev/null dbDbSpeciesDir worm taxId 6239 '_EOF_' # << emacs mkdir jkStuff # run just to AGP to make sure things are sane first nice -n +19 makeGenomeDb.pl ce8.config.ra -stop=agp \ > jkStuff/makeGenomeDb.agp.log 2>&1 # now, continuing to make the Db and all time nice -n +19 makeGenomeDb.pl ce8.config.ra -continue=db \ > jkStuff/makeGenomeDb.db.log 2>&1 # real 1m33.036s # take the trackDb business there and check it into the source tree # fixup the description, gap and gold html page descriptions ######################################################################### # REPEATMASKER (DONE - 2009-07-24 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce8/bed/repeatMasker cd /hive/data/genomes/ce8/bed/repeatMasker time nice -n +19 doRepeatMasker.pl -bigClusterHub=swarm \ -buildDir=`pwd` ce8 > do.log 2>&1 & # real 35m58.812s # from the do.log: # June 4 2009 (open-3-2-8) version of RepeatMasker # CC RELEASE 20090604; cat faSize.rmsk.txt # 100286004 bases (0 N's 100286004 real 87035623 upper 13250381 lower) # in 7 sequences in 1 files # %13.21 masked total, %13.21 masked real ######################################################################### # SIMPLE REPEATS (DONE - 2009-07-24 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce8/bed/simpleRepeat cd /hive/data/genomes/ce8/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -smallClusterHub=encodek \ -buildDir=`pwd` ce8 > do.log 2>&1 & # real 18m30.323s cat fb.simpleRepeat # 4331076 bases of 100286004 (4.319%) in intersection ######################################################################### # MASK SEQUENCE WITH RM+TRF (DONE - 2009-07-24 - Hiram) # Since both doRepeatMasker.pl and doSimpleRepeats.pl have completed, # now it's time to combine the masking into the final ce8.2bit, # following the instructions at the end of doSimpleRepeat's output. cd /hive/data/genomes/ce8 twoBitMask ce8.rmsk.2bit -add bed/simpleRepeat/trfMask.bed ce8.2bit # You can safely ignore the warning about extra BED columns twoBitToFa ce8.2bit stdout | faSize stdin > faSize.ce8.2bit.txt cat faSize.ce8.2bit.txt # 100286004 bases (0 N's 100286004 real 86863769 upper 13422235 lower) # in 7 sequences in 1 files # %13.38 masked total, %13.38 masked real # set the symlink on hgwdev to /gbdb/ce8 rm -f /gbdb/ce8/ce8.2bit ln -s `pwd`/ce8.2bit /gbdb/ce8/ce8.2bit ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2009-07-22 - Hiram) # Use -repMatch=100 (based on size -- for human we use 1024, and # worm size is ~3.4% of human judging by gapless ce4 vs. hg18 genome # size from featureBits. So we would use 34, but that yields a very # high number of tiles to ignore, especially for a small more compact # genome. Bump that up a bit to be more conservative. cd /hive/data/genomes/ce8 blat ce8.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/ce8.11.ooc -repMatch=100 # Wrote 8514 overused 11-mers to jkStuff/ce8.11.ooc # copy all of this stuff to the klusters: mkdir /hive/data/staging/data/ce8 cp -p jkStuff/ce8.11.ooc chrom.sizes ce8.2bit /hive/data/staging/data/ce8 # request push of that data to kluster nodes /scratch/data/ce8/ ######################################################################### ## LASTZ caePb2 (DONE - 2009-07-28 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce8/bed/lastzCaePb2.2009-07-28 cd /hive/data/genomes/ce8/bed/lastzCaePb2.2009-07-28 cat << '_EOF_' > DEF # ce8 vs caePb2 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce8 SEQ1_DIR=/scratch/data/ce8/ce8.2bit SEQ1_LEN=/scratch/data/ce8/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. PB2801 caePb2 SEQ2_DIR=/scratch/data/caePb2/caePb2.2bit SEQ2_LEN=/scratch/data/caePb2/chrom.sizes SEQ2_CTGDIR=/scratch/data/caePb2/caePb2.supercontigs.2bit SEQ2_CTGLEN=/scratch/data/caePb2/caePb2.supercontigs.sizes SEQ2_LIFT=/scratch/data/caePb2/caePb2.supercontigs.lift SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/ce8/bed/lastzCaePb2.2009-07-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF -verbose=2 -bigClusterHub=swarm -workhorse=hgwdev \ -qRepeats=windowmaskerSdust -noLoadChainSplit -smallClusterHub=memk \ > do.log 2>&1 & # about 50 minutes cat fb.ce8.chainCaePb2Link.txt # 40793017 bases of 100286004 (40.677%) in intersection # swap, this is also in caePb2.txt mkdir /hive/data/genomes/caePb2/bed/blastz.ce8.swap cd /hive/data/genomes/caePb2/bed/blastz.ce8.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -workhorse=hgwdev -qRepeats=windowmaskerSdust \ /hive/data/genomes/ce8/bed/lastzCaePb2.2009-07-28/DEF \ -bigClusterHub=swarm -smallClusterHub=encodek -swap > swap.log 2>&1 & # real 3m16.709s cat fb.caePb2.chainCe8Link.txt # 55084580 bases of 170473138 (32.313%) in intersection ######################################################################### ## BLASTZ caeJap2 (DONE - 2009-07-28 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce8/bed/lastzCaeJap2.2009-07-28 cd /hive/data/genomes/ce8/bed/lastzCaeJap2.2009-07-28 cat << '_EOF_' > DEF # ce8 vs caeJap2 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce8 SEQ1_DIR=/scratch/data/ce8/ce8.2bit SEQ1_LEN=/scratch/data/ce8/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. japonica caeJap2 SEQ2_DIR=/scratch/data/caeJap2/caeJap2.2bit SEQ2_LEN=/scratch/data/caeJap2/chrom.sizes SEQ2_CTGDIR=/scratch/data/caeJap2/caeJap2.supers.2bit SEQ2_CTGLEN=/scratch/data/caeJap2/caeJap2.supers.sizes SEQ2_LIFT=/scratch/data/caeJap2/caeJap2.chrUn.lift SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/ce8/bed/lastzCaeJap2.2009-07-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm -noLoadChainSplit -qRepeats=windowmaskerSdust \ -workhorse=hgwdev -smallClusterHub=memk > do.log 2>&1 & # about 42 minutes cat fb.ce8.chainCaeJap2Link.txt # 27270052 bases of 100286004 (27.192%) in intersection # swap, this is also in caeJap2.txt mkdir /hive/data/genomes/caeJap2/bed/blastz.ce8.swap cd /hive/data/genomes/caeJap2/bed/blastz.ce8.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -qRepeats=windowmaskerSdust -bigClusterHub=swarm -noLoadChainSplit \ /hive/data/genomes/ce8/bed/lastzCaeJap2.2009-07-28/DEF \ -smallClusterHub=encodek -swap > swap.log 2>&1 & # real 3m44.671s cat fb.caeJap2.chainCe8Link.txt # 26440993 bases of 129295754 (20.450%) in intersection ############################################################################ ## BLASTZ cb3 (DONE - 2009-07-28 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce8/bed/lastzCb3.2009-07-28 cd /hive/data/genomes/ce8/bed/lastzCb3.2009-07-28 cat << '_EOF_' > DEF # ce8 vs cb3 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce8 SEQ1_DIR=/scratch/data/ce8/ce8.2bit SEQ1_LEN=/scratch/data/ce8/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. briggsae cb3 SEQ2_DIR=/hive/data/genomes/cb3/cb3.rmskTrf.2bit SEQ2_LEN=/hive/data/genomes/cb3/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LAP=0 BASE=/hive/data/genomes/ce8/bed/lastzCb3.2009-07-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -bigClusterHub=swarm -noLoadChainSplit \ -smallClusterHub=memk > do.log 2>&1 & # about 40 minutes cat fb.ce8.chainCb3Link.txt # 42421395 bases of 100286004 (42.300%) in intersection # swap, this is also in cb3.txt mkdir /hive/data/genomes/cb3/bed/blastz.ce8.swap cd /hive/data/genomes/cb3/bed/blastz.ce8.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce8/bed/lastzCb3.2009-07-28/DEF \ -workhorse=hgwdev -bigClusterHub=swarm -noLoadChainSplit \ -smallClusterHub=encodek -swap > swap.log 2>&1 & # real 3m46.700s cat fb.cb3.chainCe8Link.txt # 43115929 bases of 108433446 (39.763%) in intersection ############################################################################ ## BLASTZ caeRem3 (DONE - 2009-07-28,09 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce8/bed/lastzCaeRem3.2009-07-28 cd /hive/data/genomes/ce8/bed/lastzCaeRem3.2009-07-28 cat << '_EOF_' > DEF # ce8 vs caeRem3 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce8 SEQ1_DIR=/scratch/data/ce8/ce8.2bit SEQ1_LEN=/scratch/data/ce8/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. remanei caeRem3 SEQ2_DIR=/scratch/data/caeRem3/caeRem3.2bit SEQ2_LEN=/scratch/data/caeRem3/chrom.sizes SEQ2_CTGDIR=/scratch/data/caeRem3/caeRem3.supercontigs.2bit SEQ2_CTGLEN=/scratch/data/caeRem3/caeRem3.supercontigs.sizes SEQ2_LIFT=/scratch/data/caeRem3/caeRem3.chrUn.lift SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/ce8/bed/lastzCaeRem3.2009-07-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -bigClusterHub=swarm -noLoadChainSplit \ -qRepeats=windowmaskerSdust -smallClusterHub=memk > do.log 2>&1 & XXX - running Tue Jul 28 11:14:53 PDT 2009 # real 28m14.168s cat fb.ce8.chainCaeRem3Link.txt # 41841184 bases of 100286004 (41.722%) in intersection # swap, this is also in caeRem3.txt mkdir /hive/data/genomes/caeRem3/bed/blastz.ce8.swap cd /hive/data/genomes/caeRem3/bed/blastz.ce8.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -qRepeats=windowmaskerSdust \ -workhorse=hgwdev -noLoadChainSplit \ /hive/data/genomes/ce8/bed/lastzCaeRem3.2009-07-28/DEF \ -bigClusterHub=swarm -smallClusterHub=encodek -swap > swap.log 2>&1 & # real 3m10.033s cat fb.caeRem3.chainCe8Link.txt # 46320775 bases of 138406388 (33.467%) in intersection ############################################################################ ## 5-Way multiple alignment (DONE - 2009-07-28 - Hiram) mkdir /hive/data/genomes/ce8/bed/multiz5way cd /hive/data/genomes/ce8/bed/multiz5way # See notes in ce6.txt for 6-way alignment. This is the tree from # there. cat << '_EOF_' > 5way.nh ((C._elegans_ce8:0.003000, (C._brenneri_caePb2:0.013000, (C._remanei_caeRem3:0.003000,C._briggsae_cb3:0.005000):0.004000) :0.002000):0.001000, C._japonica_caeJap2:0.023000); '_EOF_' # << happy emacs /cluster/bin/phast/x86_64/all_dists 5way.nh > 5way.distances.txt grep -i ce8 5way.distances.txt | sort -k3,3n # Use this output for reference, and use the calculated # distances in the table below to order the organisms and check # the button order on the browser. # And if you can fill in the table below entirely, you have # succeeded in finishing all the alignments required. # # featureBits chainLink measures # chaince8Link chain linearGap # distance on Ce8 on other minScore # 1 0.0120 - remanei_caeRem3 (% 41.722) (% 33.467) 1000 loose # 2 0.0140 - briggsae_cb3 (% 42.300) (% 39.763) 1000 loose # 3 0.0180 - brenneri_caePb2 (% 40.677) (% 32.313) 1000 loose # 3 0.0270 - japonica_caeJap2 (% 27.192) (% 20.450) 1000 loose cd /hive/data/genomes/ce8/bed/multiz5way # bash shell syntax here ... export H=/hive/data/genomes/ce8/bed mkdir mafLinks for G in caeRem3 cb3 caePb2 caeJap2 do mkdir mafLinks/$G if [ ! -d ${H}/lastz.${G}/mafNet ]; then echo "missing directory lastz.${G}/mafNet" fi ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G done # these are x86_64 binaries mkdir penn cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn # the autoMultiz cluster run ssh memk cd /hive/data/genomes/ce8/bed/multiz5way/ # create species list and stripped down tree for autoMZ sed -e \ 's/[a-z][a-z0-9]*_//ig; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d; s/C._//g' \ 5way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list mkdir maf run cd run # NOTE: set the db and pairs directories in this script cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = ce8 set c = $1 set result = $2 set run = `pwd` set tmp = $run/tmp/$db/multiz.$c set nway = /hive/data/genomes/ce8/bed/multiz5way set pairs = $nway/mafLinks /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p $nway/tree.nh $nway/species.list $tmp pushd $tmp foreach s (`sed -e "s/ $db//" species.list`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($nway/penn $path); rehash $nway/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd /bin/rm -f $result /bin/cp -p $tmp/$c.maf $result /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db /bin/rmdir --ignore-fail-on-non-empty $run/tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/ce8/bed/multiz5way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs awk '{print $1}' /hive/data/genomes/ce8/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList para -maxNode=1 push para check ... push ... etc ... # Completed: 7 of 7 jobs # CPU time in finished jobs: 1517s 25.28m 0.42h 0.02d 0.000 y # IO & Wait Time: 124s 2.07m 0.03h 0.00d 0.000 y # Average job time: 234s 3.91m 0.07h 0.00d # Longest finished job: 334s 5.57m 0.09h 0.00d # Submission to last job: 348s 5.80m 0.10h 0.00d cd /hive/data/genomes/ce8/bed/multiz5way time nice -n +19 catDir maf > multiz5way.maf # a few seconds to produce: # -rw-rw-r-- 1 321149265 Jul 28 14:53 multiz5way.maf # before getting to the annotation, load this up so we can get # a first view of the track. This will be replaced with the annotated # mafs ssh hgwdev cd /hive/data/genomes/ce8/bed/multiz5way mkdir /gbdb/ce8/multiz5way ln -s /hive/data/genomes/ce8/bed/multiz5way/multiz5way.maf \ /gbdb/ce8/multiz5way # this load creates a large file, do that on local disk: cd /scratch/tmp time nice -n +19 hgLoadMaf ce8 multiz5way # a few seconds to load: # Loaded 327164 mafs in 1 files from /gbdb/ce8/multiz5way time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \ -maxSize=50000 ce8 multiz5waySummary /gbdb/ce8/multiz5way/multiz5way.maf # a few seconds to load: # Created 111894 summary blocks from 850001 components and 327164 mafs # from /gbdb/ce8/multiz5way/multiz5way.maf # remove the temporary .tab files: rm multiz5*.tab ############################################################################ # reset default position, same as ce4 on the ZC101 / unc-52 locus ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrII:14646344-14667746" where name="ce8";' hgcentraltest ############################################################################ ## SANGER GENE TRACK (WORKING - 2009-07-29 - Hiram) ## There is a tremendous amount of extraneous notations in the gff ## files. Filter them down to a manageable set, change the chrom ## names, eliminate duplicates, select only what will work in ## ldHgGene mkdir /hive/data/genomes/ce8/bed/sangerGene cd /hive/data/genomes/ce8/bed/sangerGene for C in I II III IV V X do echo -n "${C} " cat ../../ws204/genome_feature_tables/GFF2/CHROMOSOME_${C}.gff | \ sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \ s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \ s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \ s/CHROMOSOME_M/chrM/g;" \ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \ -e 's/CDS "//' -e 's/"//' \ > chr${C}.gff done C=M echo -n "${C} " cat ../../ws204/genome_feature_tables/GFF2/CHROMOSOME_MtDNA.gff | \ sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \ s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \ s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \ s/CHROMOSOME_M/chrM/g; s/chrMtDNA/chrM/g;" \ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \ -e 's/CDS "//' -e 's/"//' \ > chr${C}.gff for C in I II III IV V X M do echo "chr${C}.gff -> filtered.chr${C}.gff" grep -v "^#" chr${C}.gff | awk -F'\t' ' BEGIN { IGNORECASE=1 } { if (match($2,"curated|Coding_transcript")) { if (match($3,"intron|coding_exon|exon|cds|three_prime_UTR|five_prime_UTR")) { gsub("coding_exon","CDS",$3) gsub("Transcript ","",$9) gsub(" .*","",$9) gsub("three_prime_UTR","3utr",$3) gsub("five_prime_UTR","5utr",$3) for (i = 1; i < 9; ++i) { printf "%s\t", $i } printf "%s\n", $9 } } } ' | sort -u | sort -k4n > filtered.chr${C}.gff done ssh hgwdev cd /hive/data/genomes/ce8/bed/sangerGene nice -n +19 ldHgGene ce8 sangerGene filtered.*.gff nice -n +19 ldHgGene -out=filteredGenePred.tab ce8 sangerGene filtered.*.gff # Read 55287 transcripts in 1027094 lines in 7 files # 55287 groups 7 seqs 3 sources 5 feature types # 31064 gene predictions ###############################################################################