#!/usr/bin/awk -f
# ensGeneToGenePred [options] [exons] > genePred
#
# Converts a mysql dump of ensembl exons to a genePred file (optionally an
# extended genePred file).  If the gene is single exon, set the frame to 0
# to override bad data from ENSEMBL.
#
# options:
#   -genePredExt - create extended genePred columns, including frames.
#   -ignore filterFile - drop ENSEMBL transcript ids that are in this file.
#    Empty lines and those starting with # are ignored.  NOTE: -ignore was
#    used rather than -f, because awk wanted to interpret -f as an option to
#    awk instead of putting it in ARGV.
#
# Input is tab-separated, one exon per row.  Fields read by the rules below:
#   $1  -> chr        $3  -> cdsStart    $4  -> cdsEnd
#   $5  -> exon start $6  -> exon end    $8  -> strand
#   $9  -> frame ("." is written out as -1)
#   $10 -> name2      $11 -> name (the id matched against the -ignore file)
# Rows are grouped into one output record per run of consecutive rows that
# share the same $11.
#

# Parse options, blank out consumed ARGV slots, and load the ignore list.
BEGIN {
    FS = OFS = "\t"
    filterFile = ""
    genePredExt = 0
    exonCnt = 0

    # Consume leading arguments that start with "-".  Every consumed slot is
    # set to "" so that awk skips it when it walks ARGV for input files.
    iArg = 1
    while ((iArg < ARGC) && (ARGV[iArg] ~ /^-/)) {
        if (ARGV[iArg] == "-ignore") {
            if (iArg + 1 >= ARGC) {
                print "Error: -ignore requires an argument" > "/dev/stderr"
                exit(1)
            }
            filterFile = ARGV[iArg + 1]
            ARGV[iArg++] = ""   # blank the option; its value is blanked below
        } else if (ARGV[iArg] == "-genePredExt") {
            genePredExt = 1
        } else {
            print "Error: invalid option:", ARGV[iArg] > "/dev/stderr"
            exit(1)
        }
        ARGV[iArg++] = ""
    }

    # Compare against "" rather than testing truthiness, so that a file
    # whose name looks like the number 0 is still honoured.
    if (filterFile != "") {
        loadIgnoreFile(filterFile)
    }
}

# Reads transcript ids from `path` into the global ignore[] array.
# All whitespace is stripped from each line; lines that are then empty or
# start with "#" are skipped.  Exits with status 1 if the file cannot be
# read, instead of silently running with no filter.
#   path         - name of the ignore file
#   line, status - locals
function loadIgnoreFile(path,    line, status) {
    while ((status = (getline line < path)) > 0) {
        gsub(/[[:space:]]+/, "", line)
        if (line !~ /^(#|$)/) {
            ignore[line] = 1
        }
    }
    if (status < 0) {
        # getline returns -1 when the file cannot be opened or read
        print "Error: can't read -ignore file: " path > "/dev/stderr"
        exit(1)
    }
    close(path)
}

# Prints the gene currently being assembled, using the globals filled in by
# the rules below.  With -genePredExt the record gets five extra columns:
# a constant 0, name2, two constant "cmpl" fields, and the frames list.
function prGene() {
    if ((exonCnt == 1) && (cdsStart < cdsEnd)) {
        frames = "0,"   # fix bogus frame on single-exon genes
    }
    if (genePredExt) {
        print name, chr, strand, txStart, txEnd, cdsStart, cdsEnd, exonCnt, eStarts, eEnds, 0, name2, "cmpl", "cmpl", frames
    } else {
        print name, chr, strand, txStart, txEnd, cdsStart, cdsEnd, exonCnt, eStarts, eEnds
    }
}

# Skip rows of ignored genes.  `in` is used instead of ignore[$11] because
# merely referencing an element would add every id seen to the array.
($11 in ignore) {
    next
}

# Output the gene being assembled if we hit a new gene.
($11 != name) && (name != "") {
    prGene()
}

# New gene: capture the per-gene values from its first row and reset the
# per-exon accumulators.
($11 != name) {
    name = $11
    chr = $1
    strand = $8
    txStart = $5
    cdsStart = $3
    cdsEnd = $4
    name2 = $10
    eStarts = eEnds = frames = ""
    exonCnt = 0
}

# Accumulate this exon into the current gene.  txEnd is overwritten on every
# row, so the value printed is the exon end of the gene's last row.
{
    txEnd = $6
    eStarts = eStarts $5 ","
    eEnds = eEnds $6 ","
    if ($9 != ".") {
        frames = frames $9 ","
    } else {
        frames = frames "-1,"
    }
    exonCnt++
}

# Flush the final gene, if any rows were accumulated.
END {
    if (name != "") {
        prGene()
    }
}