#!/bin/bash usage='gencodeStats db ver Genrate statstics on a gencode for a release to include in ENCODE reports. ver is in the form "V7" ' set -beEu -o pipefail if [ $# != 2 ] ; then echo "wrong number of args: ${usage}" >&2 exit 1 fi db="$1" ver="$2" # generate transcript counts for a given class reportTransCounts() { local transcriptClass="$1" local baseSel="select count(distinct attrs.transcriptId) from wgEncodeGencodeAttrs${ver} attrs, wgEncodeGencodeTranscriptSource${ver} src where attrs.transcriptClass = \"${transcriptClass}\" and src.transcriptId = attrs.transcriptId" local baseSelPar="select count(distinct attrs.transcriptId) from wgEncodeGencodeAttrs${ver} attrs, wgEncodeGencodeTag${ver} tag, wgEncodeGencodeTranscriptSource${ver} src where attrs.transcriptClass = \"${transcriptClass}\" and tag.transcriptId = attrs.transcriptId and src.transcriptId = attrs.transcriptId and tag.tag = \"PAR\"" local cnt local par cnt=$(hgsql -Ne "${baseSel} and src.source like \"ensembl_havana%\"" ${db}) par=$(hgsql -Ne "${baseSelPar} and src.source like \"ensembl_havana%\"" ${db}) cnt=$(( $cnt+$par )) echo "${transcriptClass} both manual and automatic $cnt" cnt=$(hgsql -Ne "${baseSel} and src.source like \"havana%\"" ${db}) par=$(hgsql -Ne "${baseSelPar} and src.source like \"havana%\"" ${db}) cnt=$(( $cnt+$par )) echo "${transcriptClass} manual only $cnt" cnt=$(hgsql -Ne "${baseSel} and src.source not like \"%havana%\"" ${db}) par=$(hgsql -Ne "${baseSelPar} and src.source not like \"%havana%\"" ${db}) cnt=$(( $cnt+$par )) echo "${transcriptClass} automatic only $cnt" } echo "Transcript report" echo "transcriptClass source count" for transcriptClass in $(hgsql -Ne "select distinct(transcriptClass) from wgEncodeGencodeAttrs${ver}" ${db}) ; do reportTransCounts ${transcriptClass} done cnt=$(hgsql -Ne "select count(distinct attrs.transcriptId) from wgEncodeGencodeAttrs${ver} attrs" ${db}) par=$(hgsql -Ne "select count(distinct transcriptId) from wgEncodeGencodeTag${ver} where tag = \"PAR\"" ${db}) cnt=$(( $cnt+$par )) echo "All All $cnt" # generate gene stats reportGeneCounts() { local baseSel="select count(distinct attrs.geneId) from wgEncodeGencodeAttrs${ver} attrs, wgEncodeGencodeGeneSource${ver} src where src.geneId = attrs.geneId" local baseSelPar="select count(distinct attrs.geneId) from wgEncodeGencodeAttrs${ver} attrs, wgEncodeGencodeTag${ver} tag, wgEncodeGencodeGeneSource${ver} src where tag.transcriptId = attrs.transcriptId and src.geneId = attrs.geneId and tag.tag = \"PAR\"" local cnt local par cnt=$(hgsql -Ne "${baseSel} and src.source like \"ensembl_havana%\"" ${db}) par=$(hgsql -Ne "${baseSelPar} and src.source like \"ensembl_havana%\"" ${db}) cnt=$(( $cnt+$par )) echo "both manual and automatic $cnt" cnt=$(hgsql -Ne "${baseSel} and src.source like \"havana%\"" ${db}) par=$(hgsql -Ne "${baseSelPar} and src.source like \"havana%\"" ${db}) cnt=$(( $cnt+$par )) echo "manual only $cnt" cnt=$(hgsql -Ne "${baseSel} and src.source not like \"%havana%\"" ${db}) par=$(hgsql -Ne "${baseSelPar} and src.source not like \"%havana%\"" ${db}) cnt=$(( $cnt+$par )) echo "automatic only $cnt" type="PseudoGene" cnt=$(hgsql -Ne "${baseSel} and attrs.geneType like \"%pseudogene\" and src.source like \"ensembl_havana%\"" ${db}) par=$(hgsql -Ne "${baseSelPar} and attrs.geneType like \"%pseudogene\"and src.source like \"ensembl_havana%\"" ${db}) cnt=$(( $cnt+$par )) echo "pseudo both manual and automatic $cnt" cnt=$(hgsql -Ne "${baseSel} and geneType like \"%pseudogene\" and src.source like \"havana%\"" ${db}) par=$(hgsql -Ne "${baseSelPar} and attrs.geneType like \"%pseudogene\" and src.source like \"havana%\"" ${db}) cnt=$(( $cnt+$par )) echo "pseudo manual only $cnt" cnt=$(hgsql -Ne "${baseSel} and geneType like \"%pseudogene\" and src.source not like \"%havana%\"" ${db}) par=$(hgsql -Ne "${baseSelPar} and attrs.geneType like \"%pseudogene\" and src.source not like \"%havana%\"" ${db}) cnt=$(( $cnt+$par )) echo "pseudo automatic only $cnt" } echo "" echo "Gene report" echo "source count" reportGeneCounts cnt=$(hgsql -Ne "select count(distinct attrs.geneId) from wgEncodeGencodeAttrs${ver} attrs" ${db}) par=$(hgsql -Ne "select count(distinct attrs.geneId) from wgEncodeGencodeAttrs${ver} attrs, wgEncodeGencodeTag${ver} tag where attrs.transcriptId = tag.transcriptId and tag.tag = \"PAR\"" ${db}) cnt=$(( $cnt+$par )) echo "All $cnt" cnt=$(hgsql -Ne "select count(distinct name) from wgEncodeGencode2wayConsPseudo${ver};" ${db}) echo "2-way pseudogenes $cnt"