#!/usr/bin/perl -W
# Takes dbSnp XML files on stdin (or as file arguments) and prints condensed
# information to stdout, one tab-delimited line per SNP, as the first step of
# parsing the dbSnp data for loading into the fixed browser table
# hgFixed.dbSnpRS (supplemental details).
#
# Output columns, in order:
#   rs<id>  het  hetSE  validation-list  allele1  allele2
#   last-20-bases-of-5'-flank  first-20-bases-of-3'-flank  function-list
#
# Missing values are reported as "n" (flanks), "?" (alleles),
# "no-information" (validation) and "X" (function class).

use strict;
use warnings;

# Returns a reference to a fresh, empty accumulator for one SNP record.
sub new_snp {
    return {
        rs    => "",
        a1    => "",
        a2    => "",
        seq5  => "",
        seq3  => "",
        het   => 0,
        hetSE => 0,
        valid => [],    # validation methods, in order of first appearance
        func  => [],    # function classes, in order of first appearance
    };
}

# Appends $value to the array referenced by $list unless an identical entry is
# already present.  Exact comparison is used on purpose: a substring test
# would drop a value such as "coding" once "coding-synon" had been recorded.
sub add_unique {
    my ($list, $value) = @_;
    push @{$list}, $value unless grep { $_ eq $value } @{$list};
    return;
}

# Formats one accumulated SNP record as a newline-terminated, tab-delimited
# output line, substituting placeholder values for anything that was not seen.
sub snp_line {
    my ($snp) = @_;

    my ($a1, $a2) = ($snp->{a1}, $snp->{a2});

    # The observed-alleles pattern is greedy, so for more than two alleles
    # everything before the last "/" lands in $a1.  Report the full allele
    # string in the second column and flag the first one.
    if ($a1 =~ m{/}) {
        $a2 = "$a1/$a2";
        $a1 = "(not_biallelic)";
    }
    $a1 = "?" if $a1 eq "";
    $a2 = "?" if $a2 eq "";

    my $seq5  = $snp->{seq5} eq "" ? "n" : $snp->{seq5};
    my $seq3  = $snp->{seq3} eq "" ? "n" : $snp->{seq3};
    my $valid = @{ $snp->{valid} } ? join(",", @{ $snp->{valid} }) : "no-information";
    my $func  = @{ $snp->{func} }  ? join(",", @{ $snp->{func} })  : "X";

    return join("\t",
        "rs$snp->{rs}", $snp->{het}, $snp->{hetSE}, $valid, $a1, $a2,
        substr($seq5, -20),      # 20 bases immediately 5' of the SNP
        substr($seq3, 0, 20),    # 20 bases immediately 3' of the SNP
        $func,
    ) . "\n";
}

my $snp     = new_snp();
my $started = 0;    # becomes true once the first <NSE-rs> has been seen

while (my $line = <>) {
    if (!$started) {
        # Match the opening tag explicitly.  An empty pattern here would
        # match any line (or silently reuse the last successful regex).
        $started = 1 if $line =~ m{<NSE-rs>};
        next;
    }

    if ($line =~ m{</NSE-rs>}) {    # end of current SNP
        print snp_line($snp);
        $snp = new_snp();
    }
    elsif ($line =~ m{<NSE-rs_refsnp-id>(\S+)</NSE-rs_refsnp-id>}) {
        $snp->{rs} = $1;
    }
    elsif ($line =~ m{<NSE-rs_observed>(\S+)/(\S+)</NSE-rs_observed>}) {
        $snp->{a1} = $1;
        $snp->{a2} = $2;
    }
    elsif ($line =~ m{<NSE-rs_seq-5_E>(\S+)</NSE-rs_seq-5_E>}) {
        $snp->{seq5} .= $1;    # flank may be split over several elements
    }
    elsif ($line =~ m{<NSE-rs_seq-3_E>(\S+)</NSE-rs_seq-3_E>}) {
        $snp->{seq3} .= $1;
    }
    elsif ($line =~ m{<NSE-rs_het>(\S+)</NSE-rs_het>}) {
        $snp->{het} += $1;
    }
    elsif ($line =~ m{<NSE-rs_het-SE>(\S+)</NSE-rs_het-SE>} && $1 ne "NaN") {
        $snp->{hetSE} += $1;
    }
    elsif ($line =~ m{<NSE-rs_validated-(\S+) value="true"/>}) {
        add_unique($snp->{valid}, $1);
    }
    elsif ($line =~ m{<NSE-FxnSet_fxn-class-contig value="(\S+)"/>}) {
        add_unique($snp->{func}, $1);
    }
}