#!/usr/bin/perl -W
# Takes dbSnp XML files as stdin and prints condensed information
# to stdout in one line per SNP location in tab delimited format.
#
# Output columns, in order:
#   chrom, chromStart, chromEnd, rs<id>, score, strand, alleles,
#   source, class, validation list, het, hetSE, function list
#
# NOTE: the script is deliberately disabled by the unconditional die()
# below until the chromosome handling described in its message is fixed.

use strict;
use warnings;

# Per-SNP / per-location state, reset as each closing tag is seen.
my ($chrom, $strand, $alleles, $class, $func, $valid) = ("") x 6;
my ($chromEnd, $rsId, $score, $het, $hetSE) = (0) x 5;
my $contig = "";        # initialised so the /b/ test below never sees undef
my $source = "dbSnp";
my %loc;                # chromEnd => "strand func-list" for the current SNP

die "\nFix the chromosome issue - like the pseudoautosomal errors. \nExtend this to the whole genome by getting the chromosome \nfrom within NSE-rs_contig-mapset rather than from the header\n\n";

# Header: look for the query line that names the chromosome.  The original
# loop terminated on an empty pattern (//), which in Perl reuses the last
# successful regex or matches anything, so it always stopped after a single
# line.  Scan until the query line is found or input is exhausted instead.
my $foundChrom = 0;
while (defined(my $header = <>)) {
    if ($header =~ /<(NSE-ExchangeSet_query)>select distinct snp_id from vwSNP_hs_chr where chr = '(.*)'<\/\1>/) {
        $chrom      = "chr$2";
        $foundChrom = 1;
        last;
    }
}
warn "no chromosome query line found in header\n" unless $foundChrom;

# Body: one XML element per line is assumed; each recognised element updates
# the state above, and closing tags flush it.
while (defined(my $line = <>)) {
    if ($line =~ /<(NSE-rsMaploc_physmap-int)>(\S+)<\/\1>/) {
        $chromEnd = $2;
    }
    elsif ($line =~ /<(NSE-rsContigHit_contig-id)>(\S+)<\/\1>/) {
        $contig = $2;
    }
    elsif ($line =~ /<(NSE-rs_refsnp-id)>(\S+)<\/\1>/) {
        $rsId = $2;
    }
    elsif ($line =~ /<(NSE-rsMaploc_orient value=\")(\S+)\"\/>/) {
        $strand = $2;
    }
    elsif ($line =~ /<(NSE-rs_observed)>(\S+)<\/\1>/) {
        $alleles = $2;
    }
    elsif ($line =~ /<(NSE-rs_snp-class value=\")(.*)\"\/>/) {
        $class = $2;
    }
    elsif ($line =~ /<(NSE-rs_het)>(\S+)<\/\1>/) {
        $het = $2;
    }
    elsif ($line =~ /<(NSE-rs_het-SE)>(\S+)<\/\1>/ && ($2 ne "NaN")) {
        $hetSE = $2;
    }
    elsif ($line =~ /<(NSE-FxnSet_fxn-class-contig value=\")(\S+)\"\/>/) {
        # Collect each function class once (substring test, as before).
        $func .= "$2," if index($func, $2) == -1;
    }
    elsif ($line =~ /<(NSE-rs_validated-)(\S+) value=\"true\"\/>/) {
        # Collect each validation method once.
        $valid .= "$2," if index($valid, $2) == -1;
    }
    elsif ($line =~ m{</NSE-rsMaploc>}) {
        # End of current location: store pos/strand/fxn and reinit.
        if ($contig =~ /b/ || $chromEnd == 0) {
            $chromEnd = 1;
        }
        $func = "unknown," if $func eq "";

        if ($strand eq "forward") {
            $strand = "+";
        }
        elsif ($strand eq "reverse") {
            $strand = "-";
        }

        # Strip spaces so the space-joined record below splits cleanly.
        # (The original tr/ // lacked /d and therefore removed nothing.)
        $func =~ tr/ //d;
        $func =~ s/,\z//;            # drop the trailing separator

        $loc{$chromEnd} = "$strand $func";
        $strand   = $func = "";
        $chromEnd = 0;
    }
    elsif ($line =~ m{</NSE-rs>}) {
        # End of current SNP: derive the score, then emit one row per
        # stored location.
        # NOTE(review): the 1/2/4/8 values with += look like bit flags, yet
        # the elsif chain only ever adds the first match -- confirm whether
        # independent ifs were intended.  Behaviour kept as in the original.
        if (index($valid, "other-pop") != -1) {
            $score += 1;
        }
        elsif (index($valid, "by-frequency") != -1) {
            $score += 2;
        }
        elsif (index($valid, "by-cluster") != -1) {
            $score += 4;
        }
        elsif (index($valid, "by-2hit-2allele") != -1) {
            $score += 8;
        }

        $valid = "no-information," if $valid eq "";
        $valid =~ s/,\z//;           # drop the trailing separator

        $alleles =~ tr{/}{,};
        $alleles = "?" if $alleles eq "";

        # Positions are numbers: sort numerically, not as strings.
        for my $end (sort { $a <=> $b } keys %loc) {
            my ($locStrand, $locFunc) = split / /, $loc{$end};
            my $chromStart = $end - 1;
            print "$chrom\t$chromStart\t$end\trs$rsId\t$score\t$locStrand\t";
            print "$alleles\t$source\t$class\t$valid\t";
            printf "%.4f\t%.4f\t%s\n", $het, $hetSE, $locFunc;
        }

        # Reinitialise per-SNP state.
        %loc = ();
        $strand = $alleles = $class = $func = $valid = "";
        $rsId = $score = $het = $hetSE = 0;
    }
}