#!/usr/bin/env python import sys, getopt, re, os, traceback, math, random if not sys.version.startswith("2.") or int(sys.version[2]) not in [6,7]: print >> sys.stderr,"Stampy requires Python version 2.6 or 2.7" sys.exit(1) if sys.maxint == 2147483647: print >> sys.stderr,"Stampy requires a 64-bit Python install to run; 32-bit installations are not supported" sys.exit(1) try: import maptools except ImportError: print >> sys.stderr,"Could not import the maptools module -- does the maptools.so library exist?" raise try: from Stampy import hashtable, genome, reader, formatter, mapper, singleendedmapper, recalibrate, mapstats, fastqformatter, utils, multithread, filez except ImportError: print >> sys.stderr,"Could not import main python modules -- please check that the Python interpreter matches the installed Stampy version" import plugins from ext import makeconstants short_opts = "?!g:G:d:v:H:h:f:i:s:R:M:T:S:P:A:o:f:t:" string_opts = ["genome=","logfile=","hash=","outputformat=","output=","inputformat=","stats=","keep-recalibrated=", "map=","comment=","recalibrate=","qualitybase=","recaldatasuffix=","testmap=","recaldatadir=","remote=", "readgroup=","processpart=","parse=","species=","assembly=","referenceuri=","pg=","labelfilter=","separator1=", "separator2=","bamsortprefix=","adapter-strip="] multi_opts = [] float_opts = ["recalfraction=","substitutionrate="] int_opts = ["verbosity=","bits=","maxcount=","maxscore=","minposterior=","numrecords=","lowqthreshold=","seed=", "insertsize=","insertsd=","maxfingerprintvariants=","linearalignmentband=","simulate-minindellen=", "simulate-maxindellen=","tryvariants=","fastaqual=","simulate-numsubstitutions=","gapopen=","gapextend=", "recalscoreprefix=","svprior=","longindelprior=","baseentropy=","banding=","xa-max=","xa-max-discordant=", "insertsize2=","insertsd2=","padding=","maxpairseeds=","paircandlikethres=","maxbasequal=","threads=", "bamsortbuflen=","bamsortmemory=","unclipq="] bool_opts = ["logit","sensitive","fast","simulate-duplications","solexa","solexaold","sanger","noautosense","norefoutput", "stripns","keeplabel","rightmost","alignquals","keepreforder","alphabetical","noparseNCBI","mdtag","strip-adapter", "ignore-improper-pairs","baq","overwrite","casava8","gatkcigarworkaround","bamkeepgoodreads","xcigar","cigarx"] long_opts = ["help","build-genome=","build-hash=","recalibrate=","map=","simulate="] + string_opts + int_opts + float_opts + bool_opts oldhelp = """ -R FILE, --recalibrate=FILE[,FILE] Compute recalibration data from fastq file(s) (not usually recommended) --recalfraction=F Fraction of reads to use for recalibration [0.01] --recaldatadir=DIR Directory for recalibration data [same as fastq file] --recalscoreprefix=N Read prefix length over which likelihood is calculated for recalibration [20] """ help = """%s\n\nUsage: %s [options] [files] Command options (one required) -G PREFIX, --build-genome=PREFIX Build genome index PREFIX.stidx from fasta file(s) on command line -H PREFIX, --build-hash=PREFIX Build hash PREFIX.sthash -M FILE, --map=FILE[,FILE] Map fastq/fasta/BAM file(s) -S FILE, --simulate=FILE Simulate reads following empirical qualities from read FILE -P FILE, --parse=FILE Parse simulated SAM file, and report statistics -A FILE Convert qualities; strip adapters Mapping options: -g PREFIX, --genome=PREFIX Use genome index file PREFIX.stidx -h PREFIX, --hash=PREFIX Use hash file PREFIX.sthash -t N, --threads=N Number of threads to use [1] --numrecords=N Number of records to process [all] --processpart=A/B Process the Ath read (pair) in every block of B read (pairs) [1/1] --minposterior=N Minimum read mapping phred posterior [no filtering] --maxscore=N Maximum read likelihood phred score [no filtering] --qualitybase=C Character or ASCII for quality score 0 in input [33 or !] --logit Qualities are logit (log p/1-p) rather than phred (log p) scores [phred] --solexa, --solexaold, --sanger Short for: --qualitybase=@; --qualitybase=@ --logit; default --substitutionrate=F Set substitution rate for mapping and simulation [0.001] --gapopen=N Gap open penalty (phred score) [40] --gapextend=N Gap extension penalty (phred score) [3] --insertsize=N (Initial) mean insert size for paired-end reads [250] --insertsd=N (Initial) standard deviation for insert size [60] --insertsize2=N (Initial) mean insert size for mate pairs [-2000] --insertsd2=N (Initial) standard deviation for mate pairs [-1 = deactivated] --maxpairseeds=N Number of ambiguous single mappings to take forward for paired realignment [25] --noautosense Do not auto-sense insert size distribution(s) [False] --sensitive More sensitive; about 25-50%%%% slower --fast Less sensitive; about 50%%%% (paired-end) to 100%%%% (single-ended) faster %s Output options: -o FILE, --output=FILE Write mapping output to FILE [stdout] -f FMT, --outputformat=FMT Select mapping output format (sam,maqtxt,maqmap,maqmapN) [sam] --rightmost Flush gaps to rightmost position in reference coordinates [leftmost] --xa-max Number of alternative hits to output for single/concordant paired reads [0, max 20] --xa-max-discordant Number of alternative hits to output for discordant paired reads [0] --overwrite Allow overwriting of output files --baq (SAM format) Compute base-alignment quality (BAQ; BQ tag) --alignquals (SAM format) Compute posterior alignment probabilities (YQ tag) --xcigar (SAM format) Compute the extended CIGAR string (XC tag) --readgroup=ID:id,tag:value,... (SAM format) Set readgroup tags (ID,SM,LB,DS,PU,PI,CN,DT,PL); ,, quotes , --norefoutput (SAM format) Use = signs at read positions that match the reference --comment=comment1%%%%comment2%%%%... (SAM format) Add @COmments. %%%% separate lines; commas are converted into tabs --comment=@F (SAM format) Add @COmments from file F Simulation options (use with -S): --seed=N Set random number seed [1] --simulate-minindellen=N Minimum indel length to simulate; negative is deletion [0] --simulate-maxindellen=N Maximum indel length to simulate; negative is deletion [0] --simulate-duplications For insertions, simulate duplications rather than (default) random sequence insertion --simulate-numsubstitutions=N Insert N substitutions in read (pairs) (default = Poisson distribution) %s Advanced options: --maxcount=N (-H) Maximum multiplicity of repeat words [200] --assembly=S (-G) Set assembly identifier (appears in @SQ field (AS tag) in SAM output) --species=S (-G) Set species identifier (appears in @SQ field (SP tag) in SAM output) --referenceuri=S (-G) Set URI for the reference (appears in @SQ field (UR tag) in SAM output) --noparseNCBI (-G) Don't parse but just copy gi|nnn|ref|xxx| labels in NCBI fasta files --strip-adapter (-A) Strip adapters from fastq files --inputformat=FMT Read input format (fasta, fastq, sam, bam) [fastq] --banding=N Band size for banded alignment; -1=none [60] --fastaqual=N Base quality for fasta input [30] --maxbasequal=N Maximum accepted base quality [50] --svprior=N Prior probability of read pair bridging a SV (phred score) [55] --longindelprior=N Prior probability of read pair bridging a long indel (phred score) [40] --baseentropy=N Entropy of inserted nucleotides (phred score) [5] --remote=U Command to get input reads from URL %%%%s; shortcuts are 'wget' and 'scp' --keepreforder (-M) (default) Use reference order in .fa/.stidx file, rather than alphabetical --alphabetical (-M) Order reference identifiers alphabetically --labelfilter=R (-M) Only map reads / read pairs whose label match the regular expression R --casava8 (-M) Parse Casava v1.8 FastQ headers; include but do not map filtered reads --separator1=c (-M) Use 'c' as separator for Casava data columns [:] --separator2=c (-M) use 'c' as separator between first 7 columns, and the auxiliary columns [ ] --gatkcigarworkaround (-M) Remove adjacent I/D CIGAR operators (valid in SAM spec, but trips up GATK) --unclipq (-M X.bam / -M --bamoptions) Undo soft clipping over low Q bases (-1=disable) [10] --bamsortprefix=S (-M X.bam) Temporary storage for sorting [--output argument, or /tmp/bamsort] --bamsortbuflen=N (-M X.bam) Entries in memory sort buffer [10000] --bamsortmemory=N (-M X.bam) Value for option -m to samtools sort [200000000] --bamkeepgoodreads (-M X.bam) If set, do not re-map already well-mapped reads from BAM [False] %s General options: -?, --help Display this help page -v N, --verbosity=N Set verbosity level to N [2] --logfile=FILE Send logging information to FILE [stderr] %s """ shorthelp = """%s\n\nUsage: %s [options] [.fa files] Option summary (--help for all): Command options -G PREFIX file1.fa [...] Build genome index PREFIX.stidx from fasta file(s) on command line -H PREFIX Build hash PREFIX.sthash -M FILE[,FILE] Map fastq/fasta/BAM file(s) -A FILE Convert qualities; strip adapters Mapping/output options -g PREFIX Use genome index file PREFIX.stidx -h PREFIX Use hash file PREFIX.sthash -o FILE Write mapping output to FILE [stdout] --readgroup=ID:id,tag:value,... Set read-group tags (ID,SM,LB,DS,PU,PI,CN,DT,PL) (SAM format) --solexa, --solexaold, --sanger Solexa read qualities (@-based); pre-v1.3 Solexa; and Sanger (!-based, default) --substitutionrate=F Set substitution rate for mapping and simulation [0.001] --gapopen=N Gap open penalty (phred score) [40] --gapextend=N Gap extension penalty (phred score) [3] %s General options --help Full help -v N Set verbosity level (0-3) [2] %s """ default_options = """ --svprior=55 --longindelprior=40 --baseentropy=5 --logfile=stderr --verbosity=2 --bits=-1 --maxcount=200 --outputformat=sam --inputformat=none --output=- --stats=mapstats.cache --qualitybase=! --numrecords=-1 --recalscoreprefix=20 --recalfraction=0.01 --recaldatasuffix=.recaldata --maxscore=99999 --minposterior=-99999 --lowqthreshold=10 --seed=1 --insertsize=250 --insertsd=80 --insertsize2=-2000 --insertsd2=-1 --maxfingerprintvariants=3 --linearalignmentband=3 --simulate-minindellen=0 --simulate-maxindellen=0 --simulate-numsubstitutions=0 --substitutionrate=0.001 --tryvariants=-1 --gapopen=40 --gapextend=3 --fastaqual=30 --maxbasequal=50 --banding=60 --xa-max=0 --xa-max-discordant=0 --padding=160 --maxpairseeds=25 --paircandlikethres=100 --separator1=: --threads=1 --bamsortbuflen=10000 --bamsortmemory=200000000 --unclipq=10 %s """ outputformats = ["sam","maqtxt","maqmap","maqmapShort","maqmapShortN"] inputformats = ["fastq","fasta","sam","bam","none"] def install_plugins(): global help, shorthelp global default_options global short_opts, string_opts, float_opts, int_opts, bool_opts, long_opts, multi_opts mapping_opts=plugins.hook_add("get_mapping_opts","") simulation_opts = plugins.hook_add("get_simulation_opts","") advanced_opts = plugins.hook_add("get_advanced_opts","") general_opts = plugins.hook_add("get_general_opts","") default_opts = plugins.hook_add("get_default_opts","") new_short_opts = plugins.hook_add("get_new_short_opts","") new_string_opts = plugins.hook_add("get_new_string_opts",[]) new_multi_opts = plugins.hook_add("get_new_multi_opts",[]) new_float_opts = plugins.hook_add("get_new_float_opts",[]) new_int_opts = plugins.hook_add("get_new_int_opts",[]) new_bool_opts = plugins.hook_add("get_new_bool_opts",[]) new_long_opts = plugins.hook_add("get_newlong_opts",[]) help = help % ("%s", "%s", mapping_opts, simulation_opts, advanced_opts, general_opts) shorthelp = shorthelp % ("%s", "%s", mapping_opts, general_opts) default_options += default_opts short_opts += new_short_opts string_opts += new_string_opts multi_opts += new_multi_opts float_opts += new_float_opts int_opts += new_int_opts bool_opts += new_bool_opts long_opts += new_long_opts + new_string_opts + new_int_opts + new_float_opts + new_bool_opts def validate( settings ): if settings.verbosity < 0 or settings.verbosity > 3: raise ValueError("--verbosity out of range (must be between 0 and 3)") if settings.bits not in range(16,30) + [-1]: raise ValueError("--bits out of range (must be between 16 and 29)") if settings.maxcount < 1: raise ValueError("--maxcount must be >= 1") if settings.outputformat not in outputformats: raise ValueError("--outputformat not one of " + str(outputformats)) if settings.inputformat not in inputformats: raise ValueError("--inputformat not one of " + str(inputformats)) if len(settings.qualitybase)>1: raise ValueError("--qualitybase must be a single character or integer") if settings.insertsize < 1: raise ValueError("--insertsize must be a positive integer") if settings.insertsd < 1: raise ValueError("--insertsd must be a positive integer") if not -1 <= settings.maxfingerprintvariants <= 4: raise ValueError("--maxfingerprintvariants must be an integer between 1 and 4, or -1 for no filtering") if not 0 <= settings.substitutionrate <= 0.5: raise ValueError("--substitutionrate must be between 0 and 0.5") if not -1 <= settings.tryvariants <= 3: raise ValueError("--tryvariants must be -1 (autoselect), 0 (none), 1 (all) or 2, 3 (half/third of locations)") if settings.keepreforder and settings.alphabetical: raise ValueError("Must give at most one of (--keepreforder, --alphabetical)") if len(settings.separator1) != 1: raise ValueError("Separator1 must be a single character") if len(settings.separator2) != 1: raise ValueError("Separator2 must be a single character") if settings.threads < 1: raise ValueError("--threads must be a positive integer") if not settings.bamsortprefix: settings.bamsortprefix = [settings.output, "/tmp/bamsort"][settings.output == "-"] settings.keepreforder = not settings.alphabetical if settings.readgroup: tags = ("ID","SM","LB","DS","PU","PI","CN","DT","PL") tagvalues = [ v.replace('#$&*',',').split(':') for v in settings.readgroup.replace(',,','#$&*').split(',') ] for tv in tagvalues: if len(tv) < 2 or len(tv[0]) != 2: raise ValueError("--readgroup: I don't understand '%s' (expect two-letter-TAG:VALUE pair)" % ':'.join(tv)) tv = [tv[0], ":".join(tv[1:])] if tv[0] not in tags: raise ValueError("--readgroup: tag '%s' unknown, must be one of %s" % (tv[0],','.join(tags))) try: tv[0] == "PI" and int(tv[1]) except ValueError: raise ValueError("--readgroup PI tag must be numeric (found: '%s')" % tv[1]) if len( set(["ID"]).intersection( set( tv[0] for tv in tagvalues ) ) ) != 1: # removed the check for SM tag, since --readgroup can be used to select a readgroup from a BAM file raise ValueError("--readgroup must set at least ID tag") settings.readgroupid = [ tv[1] for tv in tagvalues if tv[0] == "ID" ][0] try: settings.readgroupsm = [ tv[1] for tv in tagvalues if tv[0] == "SM" ][0] except: settings.readgroupsm = "(none)" if settings.comment: if settings.comment[0] == "@": try: f = open(settings.comment[1:],'r') lines = [ line.strip() for line in f if len(line)>1 ] except: raise ValueError("--comment: Could not open file '%s' for reading" % settings.comment[1:]) settings.comment = lines else: settings.comment = [ c.replace(",","\t") for c in settings.comment.split('%') ] if settings.processpart: try: pq = map(int, settings.processpart.split('/')) if len(pq) != 2 or not (1 <= pq[0] <= pq[1]): raise ValueError("") except: raise ValueError("--processpart: argument should be of the form 'p/q' with 1 <= p <= q, q > 0") settings.processpartpq = pq def parse_options( options, settings, fail_duplicates = False ): actions = [] for option, value in options: if option == "-v": option = "--verbosity" if option == "-d": option = "--defaults" if option == "-g": option = "--genome" if option == "-h": option = "--hash" if option == "-f": option = "--outputformat" if option == "-i": option = "--inputformat" if option == "-s": option = "--stats" if option == "-o": option = "--output" if option == "-t": option = "--threads" if (len(value)>0 and value[0] == "-" and option[min(2,len(option)):]+"=" not in int_opts and not option.startswith('--')): sys.stderr.write("Warning -- argument to option '%s' starts with '-'\n" % option) optkey = option[2:].replace('-','_') if option == "-?" or option == "-!" or option == "--help": if option == "-!": helpstr = shorthelp else: helpstr = help print helpstr % (settings.program + " v" + settings.version + ", <%s>" % settings.author, sys.argv[0]) sys.exit(0) elif option == "-G" or option == "--build-genome": actions.append( ("-G", value) ) elif option == "-H" or option == "--build-hash": actions.append( ("-H", value) ) elif option == "-R" or option == "--recalibrate": actions.append( ("-R", value) ) elif option == "-M" or option == "--map": actions.append( ("-M", value) ) elif option == "-T" or option == "--testmap": actions.append( ("-T", value) ) elif option == "-S" or option == "--simulate": actions.append( ("-S", value) ) elif option == "-P" or option == "--parse": actions.append( ("-P", value) ) elif option == "-A": actions.append( ("-A", value) ) elif option in ["--solexa","--solexaold","--sanger"]: if option[:8] == "--solexa": settings.qualitybase = "@" if option == "--solexaold": settings.logit = True elif option[2:] in bool_opts: settings.__dict__[optkey] = True elif option[2:]+"=" in string_opts: if settings.__dict__[optkey] == None: settings.__dict__[optkey] = value elif value and option[2:]+"=" in multi_opts: settings.__dict__[optkey] = settings.__dict__[optkey] + " " + value elif option[2:]+"=" in float_opts: if settings.__dict__[optkey] == None: try: settings.__dict__[optkey] = float(value) except: raise ValueError("Option %s requires a float argment, got '%s'" % (option, value)) elif option[2:]+"=" in int_opts: if settings.__dict__[optkey] == None: try: settings.__dict__[optkey] = int(value) except: raise ValueError("Option %s requires an integer argument, got '%s'" % (option, value)) else: raise ValueError("Internal error: unrecognized option " + option) return actions def build_genome( settings, logger, genomefile, arguments ): if len(arguments) == 0: raise ValueError("Please provide .fa[.gz] files to process") genomefile += ".stidx" if not settings.overwrite: utils.test_write( genomefile ) logger.log("Building genome...",1) logger.log("Input files: " + str( [a.split(':')[-1] for a in arguments] ), 2) # chop off .gz/.dz extension, if it exists for i in range(len(arguments)): if arguments[i].lower().endswith(".gz"): arguments[i] = arguments[i][:-3] g = genome.genome( genomefile, parseNCBI = not settings.noparseNCBI, write=True, padding = settings.padding ) g.openFromFa( arguments, assembly=settings.assembly, species=settings.species, uri=settings.referenceuri ) def build_hash( settings, logger, hashfile, arguments ): if len(arguments)>0: raise ValueError("Unexpected arguments on command line") hashfile += ".sthash" utils.open_genome( settings, logger ) if not settings.overwrite: utils.test_write( hashfile ) hashtable.buildhash3( hashfile, settings, logger ) def stripadapter( settings, logger, readfiles, arguments ): input_filenames = readfiles.split(',') + arguments if len(input_filenames) != 1: raise ValueError("Please provide one .fa[.gz] file to process") input_format = reader.sniff( input_filenames[0], logger ) if input_format != "fastq": raise ValueError("Input file %s does not appear to be a fastq file" % input_filenames[0] ) if settings.output == "-": output = sys.stdout else: if not settings.overwrite: utils.test_write(settings.output) output = open(settings.output,'w') logger.log("Converting fastq file...",1) maptools.stripadapter( input_filenames[0], output, settings, logger, filez ) def recalibratereads( settings, logger, readfiles, arguments ): input_filenames = readfiles.split(',') + arguments if len(input_filenames) == 1: input_format = reader.sniff( input_filenames[0], logger ) else: input_format = "fastq" recaldata_filenames = [ utils.make_recaldatafilename(settings,name) for name in input_filenames ] if not settings.overwrite: for f in recaldata_filenames: if not settings.overwrite: utils.test_write( f, filetype="Recalibration data file" ) utils.open_genome( settings, logger ) if input_format == "fastq": utils.open_hash( settings, logger ) datareaders = [ reader.make( f, logger, settings, process_fraction=settings.recalfraction ) for f in input_filenames ] mapinstance = mapper.mapper( settings.hashmap, settings.genomemap, settings, lowqthreshold=settings.lowqthreshold, maxfingerprintvariants = settings.maxfingerprintvariants, phred_gapopen = settings.gapopen, phred_gapextend = settings.gapextend, tryvariants = settings.tryvariants, baseentropy = settings.baseentropy) if len(input_filenames) == 1: semapper = singleendedmapper.SingleEndMapper( settings ) outputgenerator = semapper.generator( datareaders[0], mapinstance, genome ) else: pairedreader = reader.PairedReader( datareaders[0], datareaders[1], logger ) pemapper = singleendedmapper.PairedEndMapper( settings ) outputgenerator = pemapper.generator( pairedreader, mapinstance, genome ) else: outputgenerator = reader.SamReader( input_filenames[0], settings, process_fraction=settings.recalfraction, logger=logger ).generator() logger.log("Recalibrating...",1) recalibrate.recalibrator( recaldata_filenames, outputgenerator, settings, maxreadlength=1000, maxq=50, pseudocount = 1 ) def mapreads_background( settings, logger, queue, pipe, process_idx ): """ maps reads from the queue and puts results in the pipe """ logger.setprocess( process_idx ) mapinstance = mapper.mapper( settings.hashmap, settings.genomemap, settings, lowqthreshold=settings.lowqthreshold, maxfingerprintvariants = settings.maxfingerprintvariants, phred_gapopen = settings.gapopen, phred_gapextend = settings.gapextend, tryvariants = settings.tryvariants, baseentropy = settings.baseentropy) readmapper = singleendedmapper.PairedEndMapper( settings ) datareader = reader.QueueReader( queue, readmapper, settings ) outputgenerator = readmapper.generator( datareader, mapinstance, settings.genomemap ) for data in outputgenerator: update = readmapper.get_pair_stats_updates() pipe.send( (data, update) ) pipe.send( None ) logger.log("Child exiting",3) def mapreads( settings, logger, readfiles, arguments ): input_filenames = readfiles.split(',') + arguments bams = sum( ("1234"+name.upper())[-4:] in [".BAM",".SAM"] for name in input_filenames ) == len(input_filenames) if not bams and len(input_filenames) not in [1,2]: raise ValueError("Map: please provide one or two input files") utils.open_genome( settings, logger ) utils.open_hash( settings, logger ) recaldata_filenames = [ utils.make_recaldatafilename(settings,name) for name in input_filenames ] mapinstance = mapper.mapper( settings.hashmap, settings.genomemap, settings, lowqthreshold=settings.lowqthreshold, maxfingerprintvariants = settings.maxfingerprintvariants, phred_gapopen = settings.gapopen, phred_gapextend = settings.gapextend, tryvariants = settings.tryvariants, baseentropy = settings.baseentropy) formatinstance = formatter.make( settings, logger ) if len(input_filenames) == 1 or bams: # single-ended mapping if os.path.exists(recaldata_filenames[0]): recalhandle = open(recaldata_filenames[0],'r') recaldata = recalibrate.get_single_read_calibration( recalibrate.read_calibration_file( recalhandle ), 0 ) recalhandle.close() else: recaldata = [None] if bams: datareader = reader.make( input_filenames, logger, settings, recal_data = None, dep_data = [None] ) else: datareader = reader.make( input_filenames[0], logger, settings, recal_data = recaldata[0], dep_data = recaldata ) else: # paired-end mapping recaldata = [[None], [None]] for idx in [0,1]: if os.path.exists(recaldata_filenames[idx]): recalhandle = open(recaldata_filenames[idx],'r') recaldata[idx] = recalibrate.get_single_read_calibration( recalibrate.read_calibration_file( recalhandle ), idx ) recalhandle.close() datareaders = ( reader.make( input_filenames[0], logger, settings, recal_data = recaldata[0][0], dep_data=recaldata[0] ), reader.make( input_filenames[1], logger, settings, recal_data = recaldata[1][0], dep_data=recaldata[1] ) ) datareader = reader.PairedReader( datareaders[0], datareaders[1], logger ) readmapper = singleendedmapper.PairedEndMapper( settings ) readmapper = multithread.MultiMapper( settings, readmapper, logger ) outputgenerator = plugins.hook( 'bwa', settings, logger, readmapper, datareader, mapinstance, singleendedmapper ) if outputgenerator == None: outputgenerator = readmapper.generator( datareader, mapinstance, settings.genomemap ) logger.log("Mapping...",1) formatgenerator = formatinstance.formatter( outputgenerator ) for output in formatgenerator: pass formatinstance.dumpbuffer() if len(input_filenames) == 2: readmapper.dump_pair_stats( logger ) def testmap( settings, logger, readfiles, arguments ): input_filenames = readfiles.split(',') + arguments if len(input_filenames) not in [1,2]: raise ValueError("TestMap: please provide one or two input files") utils.open_genome( settings, logger ) utils.open_hash( settings, logger ) recaldata_filenames = [ utils.make_recaldatafilename(settings,name) for name in input_filenames ] mapinstance = mapper.mapper( settings.hashmap, settings.genomemap, settings, lowqthreshold=settings.lowqthreshold, maxfingerprintvariants = settings.maxfingerprintvariants, phred_gapopen = settings.gapopen, phred_gapextend = settings.gapextend, tryvariants = settings.tryvariants, baseentropy =settings.baseentropy) mapinstance.make_scoretable() formatinstance_output = formatter.make( settings, logger ) formatinstance = formatter.TestDataCollector( formatinstance_output ) recaldata = [[None] for f in recaldata_filenames] for idx in range(len(recaldata)): if os.path.exists(recaldata_filenames[idx]): recalhandle = open(recaldata_filenames[idx],'r') recaldata[idx] = recalibrate.get_single_read_calibration( recalibrate.read_calibration_file( recalhandle ), idx ) recalhandle.close() logger.log("Testing...",1) if len(input_filenames) == 1: # single-ended mapping datareader = reader.make( input_filenames[0], logger, settings, recal_data = recaldata[0][0], dep_data = recaldata[0] ) randomreader = reader.RandomReader( datareader, settings ) themapper = singleendedmapper.SingleEndMapper( settings ) else: # paired-end mapping datareaders = ( reader.make( input_filenames[0], logger, settings, recal_data = recaldata[0][0], dep_data = recaldata[0] ), reader.make( input_filenames[1], logger, settings, recal_data = recaldata[1][0], dep_data = recaldata[1] ) ) pairedreader = reader.PairedReader( datareaders[0], datareaders[1], logger ) randomreader = reader.RandomReader( pairedreader, settings ) themapper = singleendedmapper.PairedEndMapper( settings ) outputgenerator = themapper.generator( randomreader, mapinstance, settings.genomemap ) formatgenerator = formatinstance.formatter( outputgenerator ) for output in formatgenerator: pass formatinstance.dumpbuffer() formatinstance.report( sys.stderr ) def simulatedata( settings, logger, readfiles, arguments ): input_filenames = readfiles.split(',') + arguments if len(input_filenames) not in [1,2]: raise ValueError("SimulateData: please provide one or two input files") recaldata_filenames = [ utils.make_recaldatafilename(settings,name) for name in input_filenames ] recaldata = [[None] for f in recaldata_filenames] for idx in range(len(recaldata)): if os.path.exists(recaldata_filenames[idx]): recalhandle = open(recaldata_filenames[idx],'r') logger.log("Using recalibration data from %s" % recaldata_filenames[idx], 3) else: #This is a hack. A better solution is to read all files in the recaldatadir, read the readgroup #from the SAM file, select the right one (allow multiple, or enforce a single readgroup per sam?) if not settings.recaldatadir: settings.recaldatadir = '' recalfile = os.path.join( settings.recaldatadir, os.path.basename( recaldata_filenames[idx] ) ) if os.path.exists( recalfile ): recalhandle = open(recalfile,'r') logger.log("Using recalibration data from %s" % recalfile, 3) else: recalhandle = None logger.log("Not using recalibration data", 3) if recalhandle != None: recaldata[idx] = recalibrate.get_single_read_calibration( recalibrate.read_calibration_file( recalhandle ), idx ) recalhandle.close() utils.open_genome( settings, logger ) if len(input_filenames) == 1: # single-ended mapping datareader = reader.make( input_filenames[0], logger, settings, recal_data = None, dep_data = recaldata[0] ) randomreader = reader.RandomReader( datareader, settings ) else: # paired-end mapping datareaders = ( reader.make( input_filenames[0], logger, settings, recal_data = None, dep_data = recaldata[0] ), reader.make( input_filenames[1], logger, settings, recal_data = None, dep_data = recaldata[1] ) ) pairedreader = reader.PairedReader( datareaders[0], datareaders[1], logger ) randomreader = reader.RandomReader( pairedreader, settings ) theformatter = fastqformatter.FastqFormatter( input_filenames, settings ) logger.log("Simulating...",1) theformatter.format( randomreader ) def parsetest( settings, logger, readfiles, arguments ): input_filenames = readfiles.split(',') + arguments if len(input_filenames) != 1: raise ValueError("Parse: please provide one input file") utils.open_genome( settings, logger ) formatinstance_output = formatter.make( settings, logger ) formatinstance = formatter.TestDataCollector( formatinstance_output ) logger.log("Parsing...",1) samreader = reader.SamReader( input_filenames[0], settings, logger=logger ) formatgenerator = formatinstance.formatter( samreader.generator() ) for output in formatgenerator: pass formatinstance.dumpbuffer() formatinstance.report( sys.stderr ) ################################################################################## # # top-level code # ################################################################################## def main(): global logger, settings options, arguments = getopt.getopt( sys.argv[1:], short_opts, long_opts ) actions = parse_options( options, settings, fail_duplicates = True ) defaultopts, defaultargs = [], [] for line in default_options.split("\n"): o, a = getopt.gnu_getopt( re.split("\s*",line), short_opts, long_opts ) defaultopts += o defaultactions = parse_options( defaultopts, settings ) if not settings.separator2: settings.separator2 = " " if len(settings.qualitybase)>1: try: settings.qualitybase = chr( int( settings.qualitybase ) ) except: pass # caught by validate() if settings.sensitive and settings.fast: raise ValueError("Can only set one of --sensitive and --fast") if settings.sensitive: settings.linearalignmentband = 4 # no effect when SSE available settings.maxfingerprintvariants = -1 settings.tryvariants = 1 settings.maxpairseeds = max(settings.maxpairseeds, 45) if settings.fast: settings.paircandlikethres = 50 settings.maxfingerprintvariants = 2 settings.linearalignmentband = 2 # no effect when SSE available settings.tryvariants = 3 if len(actions) == 0: if len(options) == 0: parse_options([("-!","")], settings) else: if len(arguments)>0: print "Nothing to do, with unused arguments on command line" print "First unused argument: '%s'" % arguments[0] else: print "Nothing to do...\n" elif len(actions) > 1: raise ValueError("Please provide exactly one of -G, -H, -M, ...") else: settings.cmdline = ' '.join( sys.argv[1:] ) p1, p2 = settings.stats, os.path.join( os.path.dirname(os.path.abspath(sys.argv[0])), settings.stats ) if os.path.exists(p1): statspath = p1 elif os.path.exists(p2): statspath = p2 else: statspath = p1 settings.mapstats = mapstats.MapStats( statspath ) logger.setfilename( settings.logfile ) logger.log( "Starting Stampy with the following options:\n" + str( settings ), 3 ) validate( settings ) random.seed( settings.seed ) actiondict = dict( actions ) while len( actiondict ) > 0: if "-G" in actiondict: build_genome( settings, logger, actiondict['-G'], arguments ) del actiondict['-G'] elif "-H" in actiondict: logger.log("Building hash table...",1) build_hash( settings, logger, actiondict['-H'], arguments ) del actiondict['-H'] elif "-A" in actiondict: stripadapter( settings, logger, actiondict['-A'], arguments ) del actiondict['-A'] elif "-R" in actiondict: recalibratereads( settings, logger, actiondict['-R'], arguments ) del actiondict['-R'] elif "-M" in actiondict: mapreads( settings, logger, actiondict['-M'], arguments ) del actiondict['-M'] elif "-T" in actiondict: testmap( settings, logger, actiondict['-T'], arguments ) del actiondict['-T'] elif "-S" in actiondict: simulatedata( settings, logger, actiondict['-S'], arguments ) del actiondict['-S'] elif "-P" in actiondict: parsetest( settings, logger, actiondict['-P'], arguments ) del actiondict['-P'] else: raise ValueError("Internal error: unrecognized actions: " + str(actiondict)) if len(actions)>0: logger.log("Done",1) if __name__ == '__main__': try: install_plugins() settings = utils.Settings( string_opts, int_opts, float_opts, bool_opts ) logger = utils.Logger( settings ) makeconstants.bind_all( sys.modules['Stampy.formatter'], verbose=False ) makeconstants.bind_all( sys.modules['Stampy.singleendedmapper'], verbose=False ) makeconstants.bind_all( sys.modules['Stampy.genome'], verbose=False ) makeconstants.bind_all( sys.modules['Stampy.mapstats'], verbose=False ) main() except: errtype, value, tb = sys.exc_info() if errtype == SystemExit: raise if (errtype != KeyboardInterrupt) or True: tbstr = "Traceback:\n" + ''.join( traceback.format_tb( tb )) logger.log( tbstr[:-1], 3 ) if settings.verbosity < 3: tbstr = "Internal problem: " + (["(no info)"]+traceback.format_tb( tb ))[-1][:-1].replace('\n',' : ') else: tbstr = None if errtype in (ValueError,getopt.GetoptError,IOError): logger.log("\nError: "+str(value),0) if errtype == getopt.GetoptError: logger.log("Use -? for help\n",0) elif errtype == KeyboardInterrupt: logger.log("\nInterrupted by user",0) else: if tbstr: logger.log(tbstr,0) logger.log("Internal problem: " + repr(value),0) logger.log("Please report this problem to <%s> mentioning the version number (%s) and command line - thanks!\n" % (settings.author,settings.version),0) sys.exit(1)