#!/usr/bin/perl # This script was originally written by Yontao Lu and was copied from # /cluster/store4/ratMarker/code/ into the kent/src tree in order # to get it under CVS control; added warnings & strict and incorporated a # fix for getN from /cluster/store5/mouseMarker/code/luConvertPrimerToFa. # Also, I parse size ranges and compute the average, instead of feeding # the ranges directly to the calculation for nSize which seems to result # in the lower number being used. # $Id: luConvertPrimerToFa,v 1.2 2007/08/03 16:45:40 hiram Exp $ use warnings; use strict; ################################################## ##Yontao Lu ##date: 12/23/02 ##Des: Take a bed file and output a fasta file. ##1. Just know Forwards and reverse put N in it according to the size. ##If size don't know put default size which in average expected-size ##2. If know clone sequence, put clone sequence as another sequence entry ##with "|clone" ########################################################################### my $n25s = "nnnnnnnnnnnnnnnnnnnnnnnnn"; sub getN { my ($nSize, @rest) = @_; my $lines = int($nSize / 25); my $remainder = $nSize % 25; my $ns = ""; for(my $j = 0; $j < $lines - 1; $j++) { $ns .= "$n25s\n"; } $ns .="$n25s"; my $leftSeq = substr($n25s, 0, $remainder); $ns .= "\n$leftSeq" if ($leftSeq ne ""); return $ns; } sub reverse { my (@primer) = @_; my @newS = (); #my $errors = 0; ## For testing purpose. for(my $k = 0; $k < @primer; $k++) { $primer[$k] =~ s/\s+//g; my @chars = split(//, $primer[$k]); my @newChar = (); for(my $j = $#chars; $j >= 0; $j--) { if($chars[$j] eq 'a' || $chars[$j] eq 'A') { push(@newChar, "T"); } elsif($chars[$j] eq 't'|| $chars[$j] eq 'T') { push(@newChar, "A"); } elsif($chars[$j] eq 'c' || $chars[$j] eq 'C') { push(@newChar, "G"); } elsif($chars[$j] eq 'g' || $chars[$j] eq 'G') { push(@newChar, "C"); } else { #$errors = 1; push(@newChar, "N"); } } my $newStr = join ("",@newChar); push(@newS, $newStr); } #return ($errors, @newS); ##just for testing purpose return (@newS); } #set default parameter my $header = 0; my $aveSize = 150; my $ucscId = 1; my @file = (); while(my $arg = shift) { if($arg eq "-h") { $header = 1; } elsif($arg eq "-a") { $aveSize = shift; } else { push(@file, $arg); } } if(@file < 4) { print STDERR "Usage: luConvertPrimerToFa [-a avgSize] [-h] stsInfo primer.fa clone.fa primer.info -a: average size. -h: has header. default no header.\n"; exit(1); } #open file open(INFO, "<$file[0]") || die "Can't open marker file. $file[0] $!"; open(FA, ">$file[1]") || die "Can't open primer file. $file[1] $!"; open(CLON, ">$file[2]") || die "Can't open clone file. $file[2] $!"; open(PRI, ">$file[3]") || die "Can't open primer file. $file[3] $!"; while(my $line = ) { chomp($line); my (@eles ) = split(/\t/, $line); # size field may specify KB, BP, ~, or a comma separated list my $kbMultiplier = 1; if($eles[9] =~ m/KB/i) { $kbMultiplier = 1000; } $eles[9] =~ s/\~|\s*KB\s*|BP//gi; # remove known garbage my @sizeList = split(',\s*|-',$eles[9]); my $sizeSum = 0; my $val = $aveSize; if (scalar(@sizeList) > 0) { for (my $j = 0; $j < scalar(@sizeList); ++$j) { if($sizeList[$j] !~ /\d/o || ($sizeList[$j] =~ /^\d+$/o && $sizeList[$j] == 0)) { $sizeSum += $aveSize; } else { $sizeSum += $sizeList[$j] * $kbMultiplier; } } $val = int($sizeSum / scalar(@sizeList)); } my $priInfo = $val; if($eles[7] ne "" && $eles[7] ne "N/A") { my ($ends, @rest) = &reverse($eles[8]); my @forwards = split(//,$eles[7]); my @rewards = split(//,$ends); my $ns = getN(($val-$#forwards-$#rewards-2)); print FA ">$eles[0]_$eles[1] $eles[7] ${ns} $ends\n"; print PRI "$eles[0]\t$eles[7]\t$eles[8]\t$priInfo\t$eles[1]\n"; } if($eles[24] && $eles[24] ne "N/A" && $eles[24] =~ /\w/o) { #my (@seqs) = split("", $eles[24]); print CLON ">$eles[0]_$eles[1]_clone\n"; my $len = length($eles[24]); for(my $i=0; $i < $len; $i=$i+50) { print CLON substr($eles[24], $i, 50),"\n"; #$eles[24]\n"; } } } close(INFO); close(FA); close(CLON); close(PRI);