#!/usr/bin/env perl # File: compileAccInfo # Author: Terry Furey # Date: 4/13/01 # Project: Human # Description: Create a file listing all of the # contig and accession information # $Id: compileAccInfo,v 1.3 2007/08/06 23:26:04 hiram Exp $ use strict; use warnings; use Carp (); local $SIG{__WARN__} = \&Carp::cluck; # Usage message if ($#ARGV < 1) { print STDERR "USAGE: compileAccInfo [-pre -nb -ncbi -mouse -rat] \n"; exit(1); } # global variables my %ntacc; my $randomStart; # Read parameters my $pre = 0; my $nb = 0; my $done = 0; my $ncbi = 0; my $mouse = 0; my $rat = 0; while ($#ARGV > 1) { my $arg = shift(@ARGV); if ($arg eq "-pre") { $pre = 1; my $gp = shift(@ARGV); open(GP, "<$gp") || die("Could not open $gp\n"); } elsif ($arg eq "-ncbi") { $ncbi = 1; } elsif ($arg eq "-mouse") { $mouse = 1; } elsif ($arg eq "-rat") { $rat = 1; } elsif ($arg eq "-nb") { $nb = 1; } } my $dir = shift(@ARGV); my $seqinfo = shift(@ARGV); if (!-e $dir) { die("$dir does not exist\n"); } my @chroms; # Set the chromosomes being processed if ($ncbi) { @chroms = (1..22,qw(X),qw(Y),qw(M),qw(Un)); } elsif ($mouse) { @chroms = (1..19,qw(X),qw(Y),qw(M),qw(Un)); } elsif ($rat) { @chroms = (1..20,qw(X),qw(M),qw(Un)); } else { @chroms = (1..22,qw(X),qw(Y),qw(NA),qw(UL)); } # Read in the phases of the accessions (human) my %phase; if ((!$mouse) && (!$rat)) { open(SEQ, "<$seqinfo") || die("Could not open $seqinfo\n"); while (my $line = ) { chomp($line); my ($thisacc, $gi, $bases, $phase, $draft, $chr, $FISH, $lab, $clone) = split(' ',$line); my ($acc, $vers) = split(/\./, $thisacc); $phase{$acc} = $phase; } close(SEQ); } # Read in golden_pos if creating from WashU map my %accchr; my %allordered; my %allstart; my %allend; if ($pre) { while (my $line = ) { chomp($line); my ($chr, $ord, $acc, $start, $end, $mid, $size) = split("\t", $line); $accchr{$acc} = $chr; $allordered{$acc} = $ord; $allstart{$acc} = $start; $allend{$acc} = $end; } } # Process all of the chromosomes including NA, UL, and random regions open(OUT, ">accession_info.rdb"); print OUT "Chr\tOrd\tStart\tEnd\tMiddle\tContig\tCtgIdx\tBargeIdx\tClone\tAcc\tBridged\tNT\tPhase\n"; print OUT "5S\t2N\t10N\t10N\t10N\t10S\t5N\t5N\t10S\t10S\t5S\t20S\t5N\n"; my $count = 0; my @contig; my @ctgstart; my @ctgend; foreach my $chr (@chroms) { my $border = 0; # for barge reading my @btype; # barge types my @ctgidx; # for barge reading, apparently unused my @bctg; # for barge reading, apparently unused my @bargeidx; # for barge reading my @bacc; # for barge reading, apparently unused my %accbarge; # accessions in barge my %clone; # accession barge info my %ntname; # accession barge info my %accExists; # accession barge info, apparently unused my %accctg; my %accctgidx; my $start; my $thisctg; my $ctglength; my $thischr; my $chrlength; my $ctg; print STDERR "Processing chrom $chr\n"; # First, read in ordering information for the contigs my $numctg = 0; if (!$pre) { if (open(LIFT, "$dir/$chr/lift/ordered.lft")) { print STDERR "reading contig information from $chr/lift/ordered.lft\n"; while (my $line = ) { chomp($line); ($start, $thisctg, $ctglength, $thischr, $chrlength) = split(' ',$line); $ctg = $thisctg; if ($thisctg =~ m#/#) { ($thischr, $ctg) = split(/\//,$thisctg); } if ($ctglength != 0) { $contig[$numctg] = $ctg; $ctgstart[$numctg] = $start; $ctgend[$numctg] = $start + $ctglength; $numctg++; } } } close(LIFT); $randomStart = $numctg; if (open(LIFT, "$dir/$chr/lift/random.lft")) { print STDERR "reading contig information from $chr/lift/random.lft\n"; while (my $line = ) { chomp($line); my ($start, $thisctg, $ctglength, $thischr, $chrlength) = split(' ',$line); $ctg = $thisctg; if ($thisctg =~ m#/#) { ($thischr, $ctg) = split(/\//,$thisctg); } if ($ctglength != 0) { $contig[$numctg] = $ctg; $ctgstart[$numctg] = $start; $ctgend[$numctg] = $start + $ctglength; $numctg++; } } } close(LIFT); } else { @contig = split(' ',`ls -d $dir/$chr/ctg*`); my $numctg = $#contig + 1; for (my $i = 0; $i < $numctg; $i++) { my @subdir = split(/\//, $contig[$i]); $contig[$i] = $subdir[$#subdir]; $contig[$i] =~ s/\///; } } # Read in NT contig information, if necessary if ((!$mouse) && (!$rat)) { for (my $i = 0; $i < $numctg; $i++) { if (open(NT, "<$dir/$chr/$contig[$i]/nt.noNt")) { my $currnt = ""; while (my $line = ) { my ($nt, $ntacc, $start, $direction, $length) = split(' ',$line); if ($nt eq $currnt) { $ntacc{$nt} .= ";$ntacc"; } else { $ntacc{$nt} = "$ntacc"; } $currnt = $nt; } } } } # Next, check if the barge information is available for (my $i = 0; $i <= $border; $i++) { $btype[$border] = ""; } $border = -1; for (my $i = 0; $i < $numctg; $i++) { next if (!defined $contig[$i]); if (open(BARGE, "<$dir/$chr/$contig[$i]/mmBarge")) { print STDERR "Processing $contig[$i]\n"; my $line = ; $line = ; my $firstacc = 1; $border++; my $bidx = 0; while ($line = ) { chomp($line); # Record if barge is bridged or not if ($line =~ /gap/) { if (!($line =~ /weak/)) { if ($line =~ /bridged/) { $btype[$border] = "true"; #print STDERR "Barge $border bridged\n"; } else { $btype[$border] = "false"; #print STDERR "Barge $border open\n"; } $border++; $firstacc = 1; } } else { my ($start, $name, $phase, $clone, $direction, $size, $before, $after, $center, $plc, $mappos) = split(' ',$line); # Record contig info if ($firstacc) { $ctgidx[$border] = $i; $bctg[$border] = $contig[$i]; $bargeidx[$border] = $bidx; $bidx++; $bacc[$border] = ""; $firstacc = 0; } # Create a list of accessions for the barge if ($name =~ /NT/) { $bacc[$border] .= "$ntacc{$name};"; my @accs = split(";",$ntacc{$name}); my $ntcount = 0; for (my $j = 0; $j <= $#accs; $j++) { $accbarge{$accs[$j]} = $border; $clone{$accs[$j]} = $clone; $ntcount++; $ntname{$accs[$j]} = "${name}_$ntcount"; $accExists{$accs[$j]} = 1; # print STDERR "ntname for $accs[$j] is $ntname{$accs[$j]}\n"; } } else { $bacc[$border] .= "$name;"; $accbarge{$name} = $border; $clone{$name} = $clone; $ntname{$name} = ""; $accExists{$name} = 1; } } } close(BARGE); $nb = 0; } else { $nb = 1; } } # Lastly, read in AGP position information my %ordered; my %start; my %end; if (!$pre) { my $agp = "$dir/$chr/chr$chr.agp"; my $ranAgp = "$dir/$chr/chr" . $chr . "_random.agp"; my $tmpAgp = "$$.agp"; if ( -f $ranAgp && -f $agp ) { `cat $agp $ranAgp > $tmpAgp`; } elsif ( -f $ranAgp ) { `cat $ranAgp > $tmpAgp`; } else { `cat $agp > $tmpAgp`; } open(AGP, "$tmpAgp") or die "Can not open created file $tmpAgp:$!"; while (my $line = ) { chomp($line); my ($thischr, $start, $end, $index, $type, $accession) = split(' ',$line); if ($type eq "N") { next; } my ($acc, $version) = split(/\./,$accession); if ($thischr =~ /random/) { $ordered{$acc} = 0; } else { $ordered{$acc} = 1; } if(!exists $start{$acc}) { $start{$acc} = $start; } $end{$acc} = $end; if (!$ordered{$acc}) { $index = $randomStart; } else { $index = 0; } while ($start{$acc} > $ctgend[$index]) { $index++; } $accctg{$acc} = $contig[$index]; $accctgidx{$acc} = $index; $count++; } close(AGP); `rm $tmpAgp`; } else { foreach my $acc (keys %accchr) { if ($accchr{$acc} eq $chr) { $count++; $ordered{$acc} = $allordered{$acc}; $start{$acc} = $allstart{$acc}; $end{$acc} = $allend{$acc}; $accExists{$acc} = 1; } } } # Print it all out for each accession foreach my $acc (keys %start) { my $middle = int(($start{$acc} + $end{$acc})/2); my $bargeidx = 0; my $btype = "false"; if (exists($accbarge{$acc})) { $border = $accbarge{$acc}; if (defined($btype[$border]) && (!$btype[$border])) { $btype[$border] = "false"; } $btype = $btype[$border]; $bargeidx = $bargeidx[$border]; } my $phase = -1; if (exists($phase{$acc}) && ($phase{$acc} =~ /0|1|2|3/)) { $phase = $phase{$acc}; } my $clone = ""; if (exists($clone{$acc})){$clone=$clone{$acc};} my $ntname = ""; if (exists($ntname{$acc})){$ntname=$ntname{$acc};} if (!exists($ordered{$acc})){print STDERR "ordered{$acc} does not exist\n";} if (!exists($accctgidx{$acc})){print STDERR "accctgidx{$acc} does not exist\n";} die "chr not defined" if (!defined($chr)); die "acc not defined" if (!defined($acc)); die "ordered{$acc} not exist" if (!exists($ordered{$acc})); die "start{$acc} not exist" if (!exists($start{$acc})); die "end{$acc} not exist" if (!exists($end{$acc})); die "middle not defined" if (!defined($middle)); die "accctg{$acc} not exist" if (!exists($accctg{$acc})); die "accctgidx{$acc} not exist" if (!exists($accctgidx{$acc})); die "bargeidx not defined" if (!defined($bargeidx)); die "clone not defined" if (!defined($clone)); die "btype not defined" if (!defined($btype)); die "ntname not defined" if (!defined($ntname)); die "phase not defined" if (!defined($phase)); if (!$nb) { print OUT "chr0XX${chr}YY\t1XX$ordered{$acc}YY\t2XX$start{$acc}YY\t3XX$end{$acc}YY\t4XX${middle}YY\t5XX$accctg{$acc}YY\t6XX$accctgidx{$acc}YY\t7XX${bargeidx}YY\t8XX${clone}YY\t9XX${acc}YY\t10XX${btype}YY\t11XX${ntname}YY\t12XX${phase}YY\n"; } else { print OUT "chr$chr\t$ordered{$acc}\t$start{$acc}\t$end{$acc}\t$middle\t$accctg{$acc}\t$accctgidx{$acc}\t0\t$clone\t$acc\tfalse\t$ntname\t$phase\n"; } } } print STDERR "$count processed\n"; close(OUT);