#!/usr/bin/awk -f # aarToAxt [aar] # # Convert Aligned Ancient Repeat (AAR) format file (which is AXT-like), to an # AXT file. # # Format: # - aar file: a series of multi-line records, each record seperated by # a blank line # - record: an AXT alignment, followed by one or more AAR alignments. # - AXT alignment: 3 lines: # - header # - two alignment lines # - AAR alignment: 4 lines: # - repeat info # - target and query coordinates # - two alignment lines # # BEGIN { inAarAlign = 0; aarCount = 0; } # Record seperator /^$/ { inAarAlign = 0; next; # don't check any more states } # AXT alignment part of record !inAarAlign { # First AXT line, save sequence names tName = $2; qName = $5; qStrand = $8; # skip two alignment lines getline; getline; # Move on two aar getline; inAarAlign = 1; } # AAR part of record inAarAlign { aarCount++; # Don't care about 1st line # 2nd line has coords getline; tStart = $1; tEnd = $2; qStart = $3; qEnd = $4; # 3rd and 4th are alignment getline; tAlign = $0; getline; qAlign = $0; # Validate that the alignment sequences are the same length if (length(tAlign) != length(qAlign)) { printf "Error: alignment sequences are different lengths for %s:%d-%d\n", tName, tStart, tEnd > "/dev/stderr"; exit(1); } # Check if target sequence length matches range tLen = (tEnd - tStart) + 1; tSeq = tAlign; gsub("-+", "", tSeq); if (length(tSeq) != tLen) { printf "Error: target sequence length does not match target range length %s:%d-%d\n", tName, tStart, tEnd > "/dev/stderr"; exit(1); } # Check if query sequence length matches range # There is a bug in the AAR program that causes qStart to be # incorrect. This can be detected and corrected by checking # the length of the sequence. Once this is fixed, change this # to an error check. qLen = (qEnd - qStart) + 1; qSeq = qAlign; gsub("-+", "", qSeq); if (length(qSeq) != qLen) { printf "Warning: query sequence length does not match query range length for\n" > "/dev/stderr"; printf " %s:%d-%d, adjusting size to compensate for known bug\n", qName, qStart, qEnd > "/dev/stderr"; qStart = qStart+(qLen-length(qSeq)); } # output the AXT record print aarCount,tName,tStart,tEnd,qName,qStart,qEnd,qStrand,0; print tAlign; print qAlign; print ""; # end-of-record }