#!/usr/bin/env python #Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com) # #Released under the MIT license, see LICENSE.txt #!/usr/bin/env python #nknguyen@soe.ucsc.edu #09/24/2010 # #Get the reference sequence from maf output of cactus_MAFGenerator (with #options --orderByReference and --includeReferenceSequence specified) #Input: cactus.maf Output refseq.fa #Given that blocks are in order of the 'reference' in cactus.maf, and 'reference' #is always positive. import os import sys import re from optparse import OptionParser def getRefSeq(): seq = '' #count = 0 #totalbases = 0 #go through input file (stdin) and get the sequences for line in sys.stdin.readlines(): p = re.compile('^s\treference') if p.match(line): #0 1 2 3 4 5 6 #s, name, start, length, strand, totalLength, sequence list = line.rstrip().split('\t') if len(list) != 7: sys.stderr.write("Line %s has wrong format\n" %line) continue else: seq += list[6] #count += 1 #totalbases += int(list[3]) sys.stdout.write(">reference\n") #sys.stderr.write("Count: %d; totalBase: %d; Total bases in the ref: %d\n" %(count, totalbases, len(seq))) for i in range(0, len(seq), 50): sys.stdout.write(seq[i:i+50] + '\n') if __name__ == "__main__" : usage = "usage: %prog [options] < input maf file > output fasta file" parser = OptionParser(usage=usage) (options, args) = parser.parse_args() getRefSeq()