#! /usr/bin/env python import os import IO #functions that check if the area near a breakpoint is genotyped and returns the number of validating SNPs on each side, as well as the total number of SNPs def bp_genotypes(bp, vcf, chr_snps, contig_max_size = 1000): print(bp) founders = ['bur-0', 'can-0', 'col-0','ct-1','edi-0','hi-0','kn-0','ler-0','mt-0','no-0','oy-0','po-0','rsch-4','sf-2','tsu-0','wil-2','ws-0','wu-0','zu-0'] chrom, start, end, left_founder, right_founder = 'Chr' + str(bp[1]), int(bp[2]), int(bp[3]), bp[4], bp[5] left_genotypes = [[row[0], int(row[1]), row[3], row[4]] for row in vcf if row[0] == chrom and int(row[1]) >= start - contig_max_size and int(row[1]) <= start and int(row[1]) in chr_snps.keys() and chr_snps[int(row[1])][0][founders.index(left_founder)] != chr_snps[int(row[1])][0][founders.index(right_founder)] and int(row[1]) >= int(bp[2]) - int(bp[6])] right_genotypes = [[row[0], int(row[1]), row[3], row[4]] for row in vcf if row[0] == chrom and int(row[1]) >= end and int(row[1]) <= end + contig_max_size and int(row[1]) in chr_snps.keys() and chr_snps[int(row[1])][0][founders.index(left_founder)] != chr_snps[int(row[1])][0][founders.index(right_founder)] and int(row[1]) <= int(bp[3]) + int(bp[7])] print(left_genotypes) print(right_genotypes) if left_genotypes and right_genotypes: print('yes') for gen in left_genotypes: print(chr_snps[gen[1]]) print(gen) conf_left = [gen for gen in left_genotypes if (gen[2] == chr_snps[gen[1]][0][founders.index(left_founder)] and gen[3] == '.') or gen[3] == chr_snps[gen[1]][0][founders.index(left_founder)]] conf_right = [gen for gen in right_genotypes if (gen[2] == chr_snps[gen[1]][0][founders.index(right_founder)] and gen[3] == '.') or gen[3] == chr_snps[gen[1]][0][founders.index(right_founder)]] return([len(conf_left), len(conf_right), len(left_genotypes) + len(right_genotypes)]) return [] def vcf_genotypes(breaks, vcf, chr_snps): bps_c_g = [] for bp in breaks: bp_gen = bp_genotypes(bp, vcf, chr_snps['Chr' + str(bp[1])]) if bp_gen: bp_copy = bp[:] bp_copy.extend(bp_gen) print(bp_copy) bps_c_g.append(bp_copy) return bps_c_g if __name__ == '__main__': breakpoints_file = '/home/martha/DPhil/Recombination/Results2/Mosaics/MaskedTransposons/breaks10.highcov.infcoords.txt' breakpoints = IO.readtable(breakpoints_file, sep = '\t', ignoreheader = True) breaks_dict = {bp[0]:[] for bp in breakpoints} for bp in breakpoints: breaks_dict[bp[0]].append(bp) vcf_dir = '/data/martha/Recombination/DeNovoContigs/' write_file = vcf_dir + 'breaks_contig_genotypes.txt' vcf = {'.'.join(f.split('.')[0:2]):IO.readvcf(vcf_dir + f, onlySNPs = True, onlypos = False, pos_and_af = False) for f in os.listdir(vcf_dir) if f.startswith('MAGIC') and f.endswith('vcf')} variants_dir = '/data/www/19genomes/variants.tables/' transposons_file = '/data/martha/Recombination/data/transposons.gff' chr_snps = {} for chrom in ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5']: varfile = ''.join([variants_dir, chrom.lower(), '.alleles.txt']) chr_snps[chrom] = {int(row[2]) : [row[5:]] for row in IO.read_variants(varfile, only_int_pos = True, onlySNPs = True, return_sites_only = False, only_biallelic = True, maskfile = transposons_file)} breaks_contig_genotypes = [] for line in breaks_dict.keys(): print(line) breaks_contig_genotypes.extend(vcf_genotypes(breaks_dict[line], vcf[line], chr_snps)) IO.writetable(write_file, breaks_contig_genotypes)