#!/bin/sh
#
# txGenbankData - fetch GenBank and RefSeq alignments, metadata and sequences
# for one assembly database and derive mgcStatus.tab from the GenBank mRNA
# metadata.  All output files are written to the current directory.
#
# Usage: txGenbankData db

# Abort on the first failing command.  Set here rather than on the #! line so
# that it still applies when the script is started as "sh txGenbankData".
set -e

usage="txGenbankData db

Get GenBank and RefSeq alignments and metadata, and RefSeq peptide sequences
for input to the UCSC Genes pipeline.  Outputs the following files
in the current directory:
   mrna.psl
   mrna.ra
   mrna.fa
   refSeq.psl
   refSeq.ra
   refSeq.fa
   refPep.fa
   mgcStatus.tab
"

# Exactly one argument (the database name) is required.
if [ "$#" -ne 1 ] ; then
    printf '%s\n' "Wrong # args: $usage" >&2
    exit 1
fi
db="$1"

gbRoot=/cluster/data/genbank
# The gbGetSeqs binary is looked up in a per-architecture directory.
gbGetSeqs="${gbRoot}/bin/$(uname -m)/gbGetSeqs"

# Run one gbGetSeqs extraction, echoing the command to stderr via set -x.
# The subshell keeps the tracing limited to this single command.
#   $1 - value for -get= (psl, ra or seq)
#   $2 - source database (refseq or genbank)
#   $3 - sequence type (mrna or pep)
#   $4 - output file
gbGet() {
    (set -x ; "$gbGetSeqs" -inclVersion "-gbRoot=$gbRoot" "-get=$1" "-db=$db" -native "$2" "$3" "$4")
}

# function to get alignments and metadata.
#   $1 - source database (refseq or genbank)
#   $2 - output file prefix; writes $2.psl, $2.ra and $2.fa
doGet() {
    # "local" is not in POSIX but is provided by dash, ash and bash.
    local srcDb="$1" outPre="$2"
    gbGet psl "$srcDb" mrna "$outPre.psl"
    gbGet ra "$srcDb" mrna "$outPre.ra"
    gbGet seq "$srcDb" mrna "$outPre.fa"
}

doGet refseq refSeq
gbGet seq refseq pep refPep.fa
doGet genbank mrna

# create mgcStatus.tab, in the format of the retired mgcFullStatus table, from
# the mrna.ra file.  This has two columns, the acc and a status of either
# fullLength or fullLengthSynthetic.  The table is written to a temporary
# name and renamed afterwards so that a failed run does not leave a partial
# mgcStatus.tab behind.
awk '
BEGIN { OFS = "\t" }

# Print the pending record if it was flagged as MGC, then clear the flags so
# that repeated blank lines or the END rule cannot print it a second time.
function emit() {
    if (isMgc) {
        print acc, (isSyn ? "fullLengthSynthetic" : "fullLength");
    }
    isMgc = isSyn = 0;
}

# An acc line starts a new record.
$1 == "acc" {
    acc = $2;
    isMgc = isSyn = 0;
}
# Keyword list containing mgc, either as the last entry or followed by more.
/^key .*mgc(\.$|;.*)/ {
    isMgc = 1;
}
/^org synthetic/ {
    isSyn = 1;
}
# A blank line ends a record.
/^$/ {
    emit();
}
# Flush a final record that is not followed by a blank line.
END {
    emit();
}
' mrna.ra >mgcStatus.tab.tmp
mv -f mgcStatus.tab.tmp mgcStatus.tab