#!/usr/bin/env perl use warnings; use strict; use Getopt::Long; use Cwd; use lib "/cluster/bin/scripts"; use Encode; use RAFile; use HgAutomate; use HgDb; use vars qw/ $opt_configDir $opt_verbose $opt_instance /; sub usage { print STDERR < "name", "Lots" => "lots" # not sure why the term is spelled different need to investigate ); sub mapTermToColumn { my $term = shift; if (exists $termToColumnMapper{$term}) { return $termToColumnMapper{$term}; } else { return $term; } } sub mapColumnToTerm { my $column = shift; for my $k (keys %termToColumnMapper) { if ($column eq $termToColumnMapper{$k}) { return $k; } } return $column; } # return a list of warning sub getDbWarnings { my $dbHandle = shift; my @warnings = (); my $results = $dbHandle->execute("SHOW WARNINGS") or die $dbHandle->errstr; while (my @row = $results->fetchrow_array()) { push @warnings, $row[2]; } return @warnings; } # return a list of the fields in a database table sub getDbFields { my $dbHandle = shift; my $tableName = shift; my @fieldsList = (); my $cmd = "SHOW COLUMNS FROM $tableName"; my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr; if ($results) { while (my @row = $results->fetchrow_array()) { push @fieldsList, $row[0]; } } else { die $dbHandle->errstr; } return sort @fieldsList; } # returns all the possible fields for the given CV terms sub getCvFields { my %records = @_; my %fieldsHash = (); for my $k (keys %records) { my @fields = keys(%{$records{$k}}); for my $f (@fields) { $fieldsHash{$f} = 0; } } return sort keys %fieldsHash; } sub checkUpdateFields { my $dbHandle = shift; my $tableName = shift; my %termInfo = @_; my $name = $termInfo{"term"}; my @columnNames = (); my @cvValues = (); my @dbValues = (); # get all columns that are in the CV for my $k (keys %termInfo) { next if ($k eq "type"); push @columnNames, mapTermToColumn($k); if (defined $termInfo{$k}) { push @cvValues, $termInfo{$k}; } else { push @cvValues, ""; } } # get all columns fron the database my $cmd = "SELECT " . join(",", @columnNames) . " FROM $tableName WHERE name = " . $dbHandle->quote($name); my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr; if ($results) { my @row = $results->fetchrow_array(); for (my $i = 0; $i < scalar(@row); ++$i) { # check if database has empty column if (not defined $row[$i] or $row[$i] eq "") { # check if there is anything in the CV to back fill if ($cvValues[$i] ne "") { HgAutomate::verbose(2, "Backfilling field " . $columnNames[$i] . " with " . $cvValues[$i] . " in $name\n"); my $updateCmd = "UPDATE $tableName" . " SET " . $columnNames[$i] . " = " . $dbHandle->quote($cvValues[$i]) . " WHERE name = " . $dbHandle->quote($name); $dbHandle->execute($updateCmd) or die $dbHandle->errstr; my @warnings = getDbWarnings($dbHandle); if (scalar(@warnings) > 0) { print STDERR "Got a warning when executing $cmd.\n"; for my $w (@warnings) { print STDERR "\t$w\n"; } } } } elsif ($cvValues[$i] ne $row[$i]) { # check to make sure the CV and database tables are equal print STDERR "WARNING: term $name in $tableName contrains different $columnNames[$i], $cvValues[$i] != $row[$i]\n"; } } } else { die $dbHandle->errstr; } } # add a new term to the database table sub addNewTerm { my $dbHandle = shift; my $tableName = shift; my %termInfo = @_; my @columnNames = (); my @columnValues = (); # get the defined column and it's value for my $k (keys %termInfo) { next if ($k eq "type"); push @columnNames, mapTermToColumn($k); if (defined $termInfo{$k}) { push @columnValues, $dbHandle->quote($termInfo{$k}); } else { push @columnValues, $dbHandle->quote(""); } } my $cmd = "INSERT INTO $tableName (" . join(",", @columnNames) . ") VALUES " . "(" . join(",", @columnValues) . ")"; my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr; my $warnCount = $dbHandle->quickQuery("SHOW COUNT(*) WARNINGS"); my @warnings = getDbWarnings($dbHandle); if (scalar(@warnings) > 0) { print STDERR "Got a warning when executing $cmd.\n"; for my $w (@warnings) { print STDERR "\t$w\n"; } } } ############################################################################ # Main ############################################################################ my $now = time(); my $wd = cwd(); my $ok = GetOptions("instance=s", "configDir=s", "verbose=i" ); # parse options usage() if (!$ok); usage() if (scalar(@ARGV) != 2 and scalar(@ARGV) != 3); # get options or set defaults if (not defined $opt_instance) { $opt_instance = "prod"; } my $configPath; if (defined $opt_configDir) { if ($opt_configDir =~ /^\//) { $configPath = $opt_configDir; } else { $configPath = "$wd/$opt_configDir"; } } else { $configPath = "/hive/groups/encode/dcc/pipeline/encpipeline_$opt_instance/config/"; } if(!(-d $configPath)) { die "configPath '$configPath' is invalid; Can't find the config directory\n"; } if (not defined $opt_verbose) { $opt_verbose = 1; } HgAutomate::verbose(4, "Config directory path: \'$configPath\'\n"); my $database = $ARGV[0]; my $tableName = $ARGV[1]; my $termName; if (scalar(@ARGV) == 3) { $termName = $ARGV[2]; } else { $termName = $tableName; } # read the cv.ra file my %cvTerms = Encode::getControlledVocab($configPath); # connect to the database and read the metadata table for the obj my $dbHandle = HgDb->new(DB => $database); # get hashes contraining the fields and term names my %dbFields = map { $_ => 0 } getDbFields($dbHandle, $tableName); my %cvFields = map { $_ => 0 } getCvFields(%{$cvTerms{$termName}}); delete $cvFields{"type"}; # check if there are any fields in the CV that aren't in the table my $fatalError = 0; for my $f (keys %cvFields) { if (defined $dbFields{mapTermToColumn($f)}) { } else { print STDERR "ERROR: Found field $f in CV that is missing from database table.\n"; $fatalError = 1; } } # check if there are any fields in the table that aren't in the CV for my $f (keys %dbFields) { if (defined $cvFields{mapColumnToTerm($f)}) { } else { print STDERR "WARNING: Found field $f in database table that is missing from CV.\n"; } } if ($fatalError) { exit 10; } # read the names present in the database my %dbNames = (); my $results = $dbHandle->execute("SELECT name FROM $tableName") or die $dbHandle->errstr; if ($results) { while(my @row = $results->fetchrow_array()) { my $name = $row[0]; $dbNames{$name} = 0; } } # read the names from the CV my %cvNames = (); my %terms = %{$cvTerms{$termName}}; for my $t (keys %terms) { $cvNames{$t} = 0; # store the name for later diff'ing my %termInfo = %{$terms{$t}}; if (defined $dbNames{$t}) { # if it's already in the table, HgAutomate::verbose(3, "Term $t already in table $tableName.\n"); checkUpdateFields($dbHandle, $tableName, %termInfo); } else { # add it HgAutomate::verbose(2, "Term $t not in table $tableName, adding it.\n"); addNewTerm($dbHandle, $tableName, %termInfo); } } # check for terms in the database that are not in the cv for my $n (sort keys %dbNames) { if (defined $cvNames{$n}) { # if in cv # do nothing } else { # otherwise give a warning print STDERR "Term $n found in table $tableName but not in cv.\n"; } }