#include #include // for testing #include // for testing static const int MAX_UNIT_LENGTH = 12; static const int MIN_PARTIAL_MATCH = 5; //================================================================================================= unsigned char* twobit(char* sequence, int length, int offset) { // Helper -- converts ACGT sequence into two-bit-per-nucleotide version, // with an offset, and pad to 64-nucleotide (128 bit) words // size in bytes, rounded up to whole number of 128-bit words, and make // sure that the result ends with an empty 128-bit word int buflen = (129 + (((length + MAX_UNIT_LENGTH) * 2 ) | 127)) / 8; unsigned char* buffer = malloc( (size_t)buflen ); int current_nuc_index = offset; int i,j; // Loop over bytes for (i=0; i length) { size = length - displacement - pos; } // convert size to length of repetitive region size += displacement; // only accept true tandems and sufficiently long partial tandems if (size < displacement + min(MIN_PARTIAL_MATCH, displacement)) { return; } if (approximate_indel_rate_inline(sizes[pos], displacements[pos]) < approximate_indel_rate_inline(size, displacement)) { sizes[pos] = size; displacements[pos] = displacement; if (markfull) { int i = pos+1; for (; i= length) { break; } // get target unsigned long long target0 = *((long long*) & seqs[displacement % 4][(pos+displacement)/4]); // calculate match lengths target0 ^= original0; int size0 = 64; int size1 = 64; int size2 = 64; int size3 = 64; int nucscanleft = (__builtin_ffsll( target0 ) + 1)/2; // 0 for no mismatches; 1 for nuc 0 mismatch, etc. if (nucscanleft == 1) { size0 = 0; // found result for position 0 target0 &= -4LL; // remove blocking mismatch nucscanleft = (__builtin_ffsll( target0 ) + 1)/2; // recompute } if (nucscanleft == 2) { size0 = min(size0,1); size1 = 0; target0 &= -16LL; nucscanleft = (__builtin_ffsll( target0 ) + 1)/2; } if (nucscanleft == 3) { size0 = min(size0, 2); size1 = min(size1, 1); size2 = 0; target0 &= -64LL; nucscanleft = (__builtin_ffsll( target0 ) + 1)/2; } if (nucscanleft == 0) { if (pos + displacement + 32 < length) { unsigned long long target1 = *((long long*) & seqs[displacement % 4][(pos+displacement+32)/4]); target1 ^= original1; nucscanleft = (__builtin_ffsll( target1 ) +1)/2; if (nucscanleft == 0) { nucscanleft = 32; } else { nucscanleft--; } } else { nucscanleft = 0; } size0 = min(size0, nucscanleft+32); size1 = min(size1, nucscanleft+31); size2 = min(size2, nucscanleft+30); size3 = nucscanleft + 29; } else { size0 = min(size0, nucscanleft-1); size1 = min(size1, nucscanleft-2); size2 = min(size2, nucscanleft-3); size3 = nucscanleft - 4; } foundmatch(sizes, displacements, pos, size0, displacement, original_length); foundmatch(sizes, displacements, pos+1, size1, displacement, original_length); foundmatch(sizes, displacements, pos+2, size2, displacement, original_length); foundmatch(sizes, displacements, pos+3, size3, displacement, original_length); } } free(seqs[0]); free(seqs[1]); free(seqs[2]); free(seqs[3]); } //================================================================================================= int main() { char* seq1 = "TATTTGCATGCGCTTTCGAGCTGTTGAAGAGACGTGTATTGGAATAAGTAATCACATAAGTGTTAGTAACTTATTTAAATACGTATAGAGTCGCCTATTTGCCTAGCCTTTTGGTTCTCAGATTTTTTAATTATTACATTGCTATAAGGGTGTAACTGTGTGATAGCCAAAATTTTAAGCTGCAAATGGTTTGTAAATATGATATATTACAAGCTTCATGAAAATCGGTTTATGACTGATCCGCGATTACGTTGAAAGGCGACTGGCAGAGATACTTTTGTTCAGATGTTTTTTCAGGTAGCGATTCCAATGAATAGGTAAAATACCTTGCAAGTTTTGTTGTTGTCGTTGGAGGAAATGTGGATGTGGTTGTTATTGTTGA"; char* sizes = (char*)malloc( strlen(seq1)+1 ); char* displacements = (char*)malloc( strlen(seq1)+1 ); annotate( seq1, sizes, displacements, -strlen(seq1) ); int i; for (i=0; i