#!/usr/bin/perl -W $usage = " parseDbSnpXML takes dbSnp XML files and writes condensed a file of information that can be lifted and loaded directly into snp table. It also creates files of flanking sequences, structural information, and locus info.\n usage: parseDbSnpXML parseDbSnpXML ds_chY.xml.gz ds_chY where is a gzipped XML file, is the directory for output files, and specifies the root for the output files: .det: details for snp table .seq: flanking sequences .str: structural info .loc: locus link info\n\n"; if ($#ARGV != 2) {die $usage;} $inputFile = $ARGV[0]; $outDir = $ARGV[1]; $outRoot = $ARGV[2]; $tmpDetail = "/tmp/$outRoot.det"; $tmpSeq = "/tmp/$outRoot.seq"; $tmpStruct = "/tmp/$outRoot.str"; $tmpLocus = "/tmp/$outRoot.loc"; unlink( " /tmp/$outRoot.* $outDir/$outRoot.* " ); open( IN, " zcat $inputFile | " ) or die "can't open INPUT file: $inputFile; $!"; open( DET, " > $tmpDetail " ) or die "can't open OUTPUT file: $tmpDetail; $!"; open( SEQ, " > $tmpSeq " ) or die "can't open OUTPUT file: $tmpSeq; $!"; open( STR, " > $tmpStruct " ) or die "can't open OUTPUT file: $tmpStruct; $!"; open( LOC, " > $tmpLocus " ) or die "can't open OUTPUT file: $tmpLocus; $!"; $accession=$strand=$observed=$molType=$class=$status=$locType=$valid="."; $protAcc=$frame=$allele=$residue=$aaPos=$func="."; $version=$contigStart=$contigEnd=$rsId=$avHet=$avHetSE=0; %mapLoc=(); %funcHash=(); %validHash=(); $printHeaders="FALSE"; if ($printHeaders eq "TRUE") { print DET "chr/acc\tctgStrt\tctgEnd\trsId\tscore\tstrand\tmolType\tclass\tvalid\t"; print DET "avHet\tavHetSE\tfunc\tlocType\thitQual\tmapWt\tchrHits\tctgHits\tseqHits\tsource\n"; print STR "rsId\tprotAcc\tprotGi\tstructGi\tprotLoc\trsRes\tprotRes\tstructRes\tstructLoc\n"; print LOC "rsId\tlocusId\tsymbol\tmrnaAcc\tprotAcc\tfunc\tframe\tallele\tresidue\taaPos\n"; print SEQ "rsId\tflank5\tflank3\tobserved\n"; } while () { chomp; if (/NSE-rsStruct/) { if (/<(NSE-rsStruct_prot-acc)>(\S+)\s?<\/\1>/) { $protAcc = $2; } elsif (/<(NSE-rsStruct_prot-gi)>(\S+)<\/\1>/) { $protGi = $2; } elsif (/<(NSE-rsStruct_prot-loc)>(\S+)<\/\1>/) { $protLoc = $2; } elsif (/<(NSE-rsStruct_prot-res)>(\S+)<\/\1>/) { $protRes = $2; } elsif (/<(NSE-rsStruct_rs-res)>(\S+)<\/\1>/) { $rsRes = $2; } elsif (/<(NSE-rsStruct_struct-gi)>(\S+)<\/\1>/) { $structGi = $2; } elsif (/<(NSE-rsStruct_struct-loc)>(\S+)<\/\1>/) { $structLoc = $2; } elsif (/<(NSE-rsStruct_struct-res)>(\S+)<\/\1>/) { $structRes = $2; } elsif (/<\/NSE-rsStruct>/) { print STR "$rsId\t$protAcc\t$protGi\t$structGi\t$protLoc\t$rsRes\t$protRes\t$structRes\t$structLoc\n"; $protAcc=$protGi=$protLoc=$protRes=$rsRes=$structGi=$structLoc=$structRes="."; } } elsif (/NSE-FxnSet/) { if (/<(NSE-FxnSet_fxn-class-contig value=\")(\S+)\"\/>/) { $func = $2; } elsif (/<(NSE-FxnSet_locusid)>(\S+)<\/\1>/) { $locusId = $2; } elsif (/<(NSE-FxnSet_symbol)>(\S+)<\/\1>/) { $symbol = $2; } elsif (/<(NSE-FxnSet_mrna-acc)>(\S+)<\/\1>/) { $mrnaAcc = $2; } elsif (/<(NSE-FxnSet_prot-acc)>(\S+)<\/\1>/) { $protAcc = $2; } elsif (/<(NSE-FxnSet_reading-frame)>(\S+)<\/\1>/) { $frame = $2; } elsif (/<(NSE-FxnSet_allele)>(\S+)<\/\1>/) { $allele = $2; } elsif (/<(NSE-FxnSet_residue)>(\S+)<\/\1>/) { $residue = $2; } elsif (/<(NSE-FxnSet_aa-position)>(\S+)<\/\1>/) { $aaPos = $2; } elsif (/<\/NSE-FxnSet>/) { print LOC "rs$rsId\t$locusId\t$symbol\t$mrnaAcc\t$protAcc\t$func\t$frame\t$allele\t$residue\t$aaPos\n"; if ($func ne "reference") { $funcHash{$func}= 1; } $locusId=$symbol=$mrnaAcc=$protAcc=$frame=$allele=$residue=$aaPos=$func="."; } } elsif (/NSE-rs_seq/) { if (/<(NSE-rs_seq-5_E)>(\S+)<\/\1>/) { $flank5 .= $2; } if (/<(NSE-rs_seq-3_E)>(\S+)<\/\1>/) { $flank3 .= $2; } } elsif (/NSE-rsMaploc/) { if (/<(NSE-rsMaploc_asn-from)>(\S+)<\/\1>/) { $contigStart = $2; } elsif (/<(NSE-rsMaploc_asn-to)>(\S+)<\/\1>/) { $contigEnd = $2; } elsif (/<(NSE-rsMaploc_loc-type value=\")(\S+)\"\/>/) { $locType = $2; } elsif (/<(NSE-rsMaploc_orient value=\")(\S+)\"\/>/) { $strand = $2; } elsif (/<\/NSE-rsMaploc>/) { $func=""; foreach $key (sort keys %funcHash) { $func .= "$key,"; } if ($func eq "") { $func = "unknown,"; } substr($func, -1) = ""; if ($locType eq "between") # half open { $contigEnd -= 1; } else { $contigStart -= 1; } if ($strand eq "forward") { $strand = "+"; } elsif ($strand eq "reverse") { $strand = "-"; } $key = "$contigStart:$contigEnd:$locType:$strand:$func"; $mapLoc{$key}=1; $contigStart=$contigEnd=$locType=$strand=$func="."; %funcHash=(); } } elsif (/<(NSE-rs_refsnp-id)>(\S+)<\/\1>/) { $rsId = $2; } elsif (/<(NSE-rs_snp-class value=\")(.*)\"\/>/) { $class = $2; } elsif (/<(NSE-rs_moltype value=\")(.*)\"\/>/) { $molType = $2; } elsif (/<(NSE-rs_observed)>(.+)<\/\1>/) { $observed = $2; } elsif (/<(NSE-rs_het)>(\S+)<\/\1>/) { $avHet = $2; } elsif (/<(NSE-rs_het-SE)>(\S+)<\/\1>/ && ($2 ne "NaN")) { $avHetSE = $2; } elsif (/<(NSE-rs_validated-)(\S+) value=\"true\"\/>/) { $validHash{$2} = 1; } elsif (/<(NSE-rs_)(genotype) value=\"true\"\/>/) { $validHash{$2} = 1; } elsif (/<(NSE-rsContigHit_accession)>(\S+)<\/\1>/) { $accession = $2; } elsif (/<(NSE-rsContigHit_chromosome)>(.*)<\/\1>/) { $chrom = $2; } elsif (/<\/NSE-rsContigHit>/) # end of current contig { if ($observed eq ".") { die "Observed: rs$rsId >[$_]<; $!"; } $observed =~ s/ /_/g; if ($valid eq ".") { $valid=""; foreach $key (sort keys %validHash) { $valid .= "$key,"; } if ($valid eq "") { $valid = "unknown,"; } substr($valid, -1) = ""; %validHash=(); } foreach $key (sort keys %mapLoc) { ( $contigStart, $contigEnd, $locType, $strand, $func ) = split(":", $key); print DET "$chrom/$accession\t$contigStart\t$contigEnd\trs$rsId\t0\t$strand\t$observed\t$molType\t$class\t$valid\t"; if ($avHet>0) { printf DET ("%.3f\t%.3f\t$func\t$locType\t2\t0\n",$avHet,$avHetSE); } else { print DET "0\t0\t$func\t$locType\t2\t0\n"; } } %mapLoc=(); } elsif (/<\/NSE-rs>/) # end of current SNP; reinitialize { print SEQ "rs$rsId\t$flank5\t$flank3\t$observed\n"; $accession=$strand=$observed=$molType=$class=$valid=$status=$locType="."; $protAcc=$frame=$allele=$residue=$aaPos=$func="."; $flank3=$flank5=""; $version=$contigStart=$contigEnd=$rsId=$avHet=$avHetSE=0; %mapLoc=(); %funcHash=(); %validHash=(); } } close(DET); close(SEQ); close(STR); close(LOC); system("/bin/gzip $tmpDetail"); system("/bin/gzip $tmpSeq"); system("/bin/gzip $tmpStruct"); system("/bin/gzip $tmpLocus"); system("/bin/mv -f $tmpDetail.gz ./det/"); system("/bin/mv -f $tmpSeq.gz ./seq/"); system("/bin/mv -f $tmpStruct.gz ./str/"); system("/bin/mv -f $tmpLocus.gz ./loc/"); system("/bin/mv -f /tmp/$outRoot.* .");