#!/bin/env python
"""Generate human readable change notes between two releases of an ENCODE
composite track.

The script compares the download files of the current release directory with
those of the previous one (or with nothing, when the previous release is given
as "-"), looks the matching tables and gbdb files up through ``hgsql`` and
prints a report listing what is untouched, deprecated and new.

The code is written to run unchanged on Python 2.6+ and Python 3.
"""

import sys
import optparse
import os
import re
import subprocess
import hashlib

#### Constants #################################################################

# trailing "V<number>" version marker of a table name, e.g. tableNameV4
_VERSION_RE = re.compile(r"(.*)V([0-9]+)$")

# number of bytes read at a time while checksumming a file
_MD5_BLOCK_SIZE = 2 ** 24

# which download files are considered at all
_FILE_PATTERN = r".*\.(gz|bigWig|bam|tgz)$"

# compression suffixes accepted after the file type (None: not compressed)
_COMPRESSIONS = (None, "gz", "tgz")

# file types backed by a table and a download file
_TABLE_AND_FILE_RE = re.compile(
    r"^(narrowPeaks|narrowPeak|broadPeak|gtf|bedRnaElements|bedRrbs"
    r"|bedGraph\d+|bed\d*|bedLogR|pairedTagAlign|shortFrags|peptideMapping)$")

# file types backed by a table that points at a file in gbdb
_GBDB_RE = re.compile(r"^(wig|tagAlign|bigWig|bam)$")

# file types that only exist as a download file
_FILE_ONLY_RE = re.compile(
    r"^(matrix|pdf|fastq|fasta|rpkm|bowtie|psl|csqual|csfasta|junction|fpkm"
    r"|fpkm1|fpkm2|insDistrib|insDist|junction|txt|tar)$")

# files expected next to the data files of every release
_POSSIBLE_ADDITION_FILES = ["README.txt", "files.txt", "md5sum.txt"]

# locations quoted in the report: (database, composite[, release])
_ALPHA_RELEASE_DIR = \
    "/usr/local/apache/htdocs-hgdownload/goldenPath/%s/encodeDCC/%s/%s/"
_DOWNLOAD_DIR = "/usr/local/apache/htdocs-hgdownload/goldenPath/%s/encodeDCC/%s/"
_RR_DIR = "/usr/local/apache/htdocs/goldenPath/%s/encodeDCC/%s/"

_CATEGORY_LEGEND = """
Categories of tables and files('):
 A) Untouched - are on public browser and should remain
 B) Deprecated - are currently on RR but will no longer be needed and should not be referenced by the public site.
 NOTE: NO FILES SHOULD BE REMOVED from the downloads directory on hgdownloads (RR).
 This list is provided for completeness. Any files marked here as in gbdb may be eliminated.
 C) New - are only currently on test but will need to be pushed to the RR.
 D) Additional items of note
"""

#### Classes ###################################################################


class _Category(object):
    """The tables, download files and gbdb files of one report category."""

    def __init__(self):
        self.tables = []
        self.files = []
        self.gbdbs = []

    def remove_duplicates(self):
        """drop repeated entries from the three lists"""
        self.tables = remove_duplicates(self.tables)
        self.files = remove_duplicates(self.files)
        self.gbdbs = remove_duplicates(self.gbdbs)


#### Functions #################################################################


def get_file_set(path, regexp):
    """return a set of the files in the path matching a regexp

    The regexp is anchored at the start of the file name (re.match).
    """
    expression = re.compile(regexp)
    return set(entry for entry in os.listdir(path) if expression.match(entry))


def _run_hgsql(database, statement, extra_args=()):
    """run one SQL statement through hgsql, return (exit status, stdout)

    The command is passed as an argument list, so no shell ever interprets
    the statement.  When hgsql cannot be started at all the status is 127,
    which is what a shell reports for a command it cannot find; callers
    therefore see a plain failed lookup.
    """
    command = ["hgsql", database] + list(extra_args) + ["-e", statement]
    try:
        proc = subprocess.Popen(command,
                                stdout=subprocess.PIPE,
                                stdin=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                universal_newlines=True)
    except OSError:
        return 127, ""
    output = proc.communicate()[0]
    return proc.returncode, output


def table_exists(database, table):
    """check if a table exists in a database

    Raises ValueError when the table name contains a double quote.
    """
    if "\"" in table:
        raise ValueError("table name contains a \"")
    status_code, _ = _run_hgsql(database, "DESC %s;" % table)
    return status_code == 0


def parse_version(name):
    """parse out the version in a name like tableNameV4

    Returns (stem, version); a name without a version suffix is version 1.
    """
    match = _VERSION_RE.match(name)
    if match is None:
        return name, 1
    return match.group(1), int(match.group(2))


def prev_version(name, version):
    """return the prev version of a name/version pair

    Raises ValueError for version 1, which has no predecessor.
    """
    if version == 1:
        raise ValueError("there is no previous version of table %s" % name)
    return "%sV%d" % (name, version - 1)


def next_version(name, version):
    """return the next version of a name/version pair"""
    return "%sV%d" % (name, version + 1)


def get_gbdb_pathname(database, table_name):
    """extract the gbdb pathname of a gbdb table

    The path is read from the "file" column, or from "fileName" when that
    query fails.  Returns None when both queries fail.
    """
    for column in ("file", "fileName"):
        query = "SELECT %s FROM %s LIMIT 1;" % (column, table_name)
        status_code, output = _run_hgsql(database, "%s;" % query,
                                         ["--skip-column-names"])
        if status_code == 0:
            return output.rstrip()
    return None


def _md5_digest(path):
    """return the MD5 digest of the content of a file"""
    digest = hashlib.md5()
    # binary mode: the files are compressed or binary data
    with open(path, "rb") as handle:
        for block in iter(lambda: handle.read(_MD5_BLOCK_SIZE), b""):
            digest.update(block)
    return digest.digest()


def same_file(x, y, do_md5=True):
    """checks if two files are the same with various definitions of same

    Two paths to the same inode are the same.  Files of different sizes are
    different.  Files of equal size are compared by MD5 checksum, or assumed
    to be the same when do_md5 is false.
    """
    if os.path.samefile(x, y):
        return True  # same inode
    if os.path.getsize(x) != os.path.getsize(y):
        return False  # different sizes
    if not do_md5:
        return True
    return _md5_digest(x) == _md5_digest(y)


def remove_duplicates(list):
    """return the distinct items of a list, in order of first appearance"""
    # the parameter name shadows the builtin but is kept for existing callers
    seen = set()
    unique = []
    for item in list:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique


def _warn(warnings, message):
    """remember a warning for the report and echo it on stderr"""
    warnings.append(message)
    sys.stderr.write(message + "\n")


def _split_release_filename(filename):
    """split name.type or name.type.compression into (name, type)

    Raises ValueError for any other shape and for an unknown compression.
    """
    parts = filename.split(".")
    if len(parts) == 2:
        name, file_type = parts
        compression = None
    elif len(parts) == 3:
        name, file_type, compression = parts
    else:
        raise ValueError("cannot split file name %s into name.type[.compression]"
                         % filename)
    if compression not in _COMPRESSIONS:
        raise ValueError("unknown compression %s of file %s"
                         % (compression, filename))
    return name, file_type


def _record_gbdb(database, name, filename, suffix, gbdbs, warnings,
                 explain_missing_path):
    """add the gbdb file (plus suffix) of a table to gbdbs, or warn

    explain_missing_path selects the warning used when the table yields no
    gbdb path at all: a dedicated message when true, the generic
    "could not find ... file" message otherwise.
    """
    gbdb_path = get_gbdb_pathname(database, name)
    if gbdb_path is None and explain_missing_path:
        _warn(warnings,
              "could not find gbdb path in table %s.%s" % (database, name))
    elif gbdb_path is not None and os.path.exists(gbdb_path + suffix):
        gbdbs.append(gbdb_path + suffix)
    else:
        _warn(warnings, "could not find %s file for %s"
              % (str(gbdb_path) + suffix, filename))


def _classify_release_file(filename, name, file_type, database, path_prefix,
                           category, warnings, explain_missing_path):
    """file one download file, its table and its gbdb files under category

    Raises ValueError when the file type is not known.
    """
    if _GBDB_RE.match(file_type):
        category.files.append(path_prefix + filename)
        if file_type == "bam":
            # a bam file comes with an index, both as download and in gbdb
            index_path = path_prefix + filename + ".bai"
            if os.path.exists(index_path):
                category.files.append(index_path)
            else:
                _warn(warnings,
                      "could not find .bai file for bam file %s" % filename)
            _record_gbdb(database, name, filename, ".bai", category.gbdbs,
                         warnings, explain_missing_path)
        if table_exists(database, name):
            category.tables.append(name)
            _record_gbdb(database, name, filename, "", category.gbdbs,
                         warnings, explain_missing_path)
        else:
            _warn(warnings, "table %s does not exist, from filetype %s"
                  % (name, file_type))
    elif _TABLE_AND_FILE_RE.match(file_type):
        category.files.append(path_prefix + filename)
        if table_exists(database, name):
            category.tables.append(name)
        else:
            _warn(warnings, "table %s does not exist, from filetype %s"
                  % (name, file_type))
    elif _FILE_ONLY_RE.match(file_type):
        category.files.append(path_prefix + filename)
    else:
        raise ValueError("unknown type %s of file %s" % (file_type, filename))


def _print_listing(items):
    """sort a list in place and print one item per line"""
    items.sort()
    for item in items:
        print(item)


#### Main ######################################################################


def main(argv=None):
    """
    Generate a human readable file describing the changes between two
    releases of an ENCODE track.

    argv is the full argument vector including the program name; it defaults
    to sys.argv.  The report is printed on stdout, warnings are echoed on
    stderr.  Exits with status 10 when the positional arguments are wrong and
    raises ValueError for file names or file types it cannot handle.
    """
    # the parsed options stay available as a module level name
    global options

    if argv is None:
        argv = sys.argv

    # parse the args
    parser = optparse.OptionParser(
        usage="%prog [options] database current_release (prev_release|-)",
        version="%prog 0.9")
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                      default=False)
    parser.add_option("-t", "--composite-name", dest="composite_name",
                      help="the name of the composite track by default this is the name of the current directory",
                      metavar="N", default=None)
    parser.add_option("-n", "--track-name", dest="track_name",
                      help="the English name of track, by default this is \"ENCODE [composite-name]\"",
                      metavar="N", default=None)
    parser.add_option("--files", dest="files_path",
                      help="dump list of new files to F", metavar="F")
    parser.add_option("--tables", dest="tables_path",
                      help="dump list of new tables to F", metavar="F")
    parser.add_option("--disable-md5-checks", dest="do_md5_checks",
                      action="store_false",
                      help="disable MD5 checks on files", default=True)
    (options, args) = parser.parse_args(argv[1:])

    # output a usage message
    if len(args) != 3:
        parser.print_help()
        sys.exit(10)

    # default composite name is the current directory name
    if options.composite_name is None:
        options.composite_name = os.path.basename(os.getcwd())

    # default track name is "ENCODE composite_name"
    if options.track_name is None:
        options.track_name = "ENCODE %s" % options.composite_name

    # get the positional args
    database, current_release_dir, prev_release_dir = args

    # every listed download file carries the full path of the current release
    path_prefix = _ALPHA_RELEASE_DIR % (database, options.composite_name,
                                        current_release_dir)

    # generate the list of files
    current_files = get_file_set(current_release_dir, _FILE_PATTERN)
    if prev_release_dir == "-":
        prev_files = set()
        prev_release_dir = ""
    else:
        prev_files = get_file_set(prev_release_dir, _FILE_PATTERN)

    # form the three derived sets
    removed_files = prev_files - current_files
    unchanged_files = current_files & prev_files
    new_files = current_files - prev_files

    warnings = []
    unchanged = _Category()
    removed = _Category()
    new = _Category()
    addition_files = []

    if not options.do_md5_checks:
        warnings.append("Use of MD5 checksums to verify unchanged files has been disabled.")

    # process the list of unchanged files (sorted: stable warning order)
    for f in sorted(unchanged_files):
        name, file_type = _split_release_filename(f)

        # we don't deal with revisions yet
        # NOTE(review): this compares a table name with file names, which
        # carry an extension, so it presumably never triggers - confirm the
        # intent before changing it.
        stem, version = parse_version(name)
        if next_version(stem, version) in new_files:
            raise ValueError("Unimplemented: newer version of %s found, can't deal with this yet" % name)

        # check to make sure the files are really the same
        if not same_file(os.path.join(current_release_dir, f),
                         os.path.join(prev_release_dir, f),
                         options.do_md5_checks):
            _warn(warnings,
                  "file %s in %s and %s don't appear to be the same (type=%s)"
                  % (name, current_release_dir, prev_release_dir, file_type))

        _classify_release_file(f, name, file_type, database, path_prefix,
                               unchanged, warnings, False)

    # process the list of removed files
    for f in sorted(removed_files):
        name, file_type = _split_release_filename(f)
        _classify_release_file(f, name, file_type, database, path_prefix,
                               removed, warnings, False)

    # process the list of new files
    for f in sorted(new_files):
        name, file_type = _split_release_filename(f)
        _classify_release_file(f, name, file_type, database, path_prefix,
                               new, warnings, True)

    # check the list of addition files
    # NOTE(review): existence is checked relative to the working directory
    # while the reported path is inside the current release - confirm.
    for f in _POSSIBLE_ADDITION_FILES:
        if os.path.exists(f):
            addition_files.append(path_prefix + f)
        else:
            _warn(warnings, "addition file %s not found" % f)

    # output some basic stats (counted before duplicates are removed)
    if options.verbose:
        sys.stderr.write("Counts:\n")
        counts = [
            (" unchanged tables: %d", unchanged.tables),
            (" unchanged files: %d", unchanged.files),
            (" unchanged gbdbs: %d", unchanged.gbdbs),
            (" removed tables: %d", removed.tables),
            (" removed files: %d", removed.files),
            (" removed gbdbs: %d", removed.gbdbs),
            (" new tables: %d", new.tables),
            (" new files: %d", new.files),
            (" new gbdbs: %d", new.gbdbs),
            (" additional files: %d", addition_files),
        ]
        for template, items in counts:
            sys.stderr.write(template % len(items) + "\n")

    unchanged.remove_duplicates()
    removed.remove_duplicates()
    new.remove_duplicates()
    addition_files = remove_duplicates(addition_files)

    # if asked, save the list of new files
    if options.files_path:
        new.files.sort()
        new.gbdbs.sort()
        with open(options.files_path, "w") as new_files_file:
            for path in new.files:
                new_files_file.write(path + "\n")
            new_files_file.write("\n")
            for path in new.gbdbs:
                new_files_file.write(path + "\n")

    # if asked, generate list of new tables
    if options.tables_path:
        new.tables.sort()
        with open(options.tables_path, "w") as new_tables_file:
            for table in new.tables:
                new_tables_file.write(table + "\n")
            new_tables_file.write("\n")

    composite = (database, options.composite_name)
    current_location = " " + _ALPHA_RELEASE_DIR % (
        database, options.composite_name, current_release_dir)

    # generate the header
    print("# generated with %s" % parser.get_version())
    print("This is a %s of the \"%s\"" % (current_release_dir,
                                          options.track_name))
    print("The composite track is %s" % options.composite_name)

    if len(warnings) > 0:
        warn_header = "# WARNINGS "
        print(warn_header + "#" * (60 - len(warn_header)))
        for number, warning in enumerate(warnings):
            print("%0d - %s" % (number + 1, warning))
        print("#" * 60)

    print(_CATEGORY_LEGEND)

    # some summary counts of current files, i.e. new + untouched
    print("Summary total counts for %s (new+untouched):" % current_release_dir)
    print(" Tables: %d" % (len(unchanged.tables) + len(new.tables)))
    print(" Files: %d" % (len(unchanged.files) + len(new.files)
                          + len(addition_files)))
    print(" Gbdbs: %d" % (len(unchanged.gbdbs) + len(new.gbdbs)))
    print("")

    # untouched list
    print("A) Untouched Tables (%d):" % len(unchanged.tables))
    _print_listing(unchanged.tables)
    print("")

    print("A') Untouched Files (%d downloadables, %d gbdbs):"
          % (len(unchanged.files), len(unchanged.gbdbs)))
    if prev_release_dir == "":
        # without a previous release nothing can be untouched
        assert len(unchanged.files) == 0
        assert len(unchanged.gbdbs) == 0
    else:
        print(" current location on alpha:")
        print(current_location)
        print(" on RR:")
        print(" " + _DOWNLOAD_DIR % composite)
        print("")
    _print_listing(unchanged.files)
    print("")
    _print_listing(unchanged.gbdbs)
    print("")

    # eliminated list
    print("B) Deprecated tables (%d):" % len(removed.tables))
    _print_listing(removed.tables)
    print("")

    print("B') Deprecated files (%d downloadables, %d gbdbs):"
          % (len(removed.files), len(removed.gbdbs)))
    if prev_release_dir == "":
        # without a previous release nothing can be deprecated
        assert len(removed.files) == 0
        assert len(removed.gbdbs) == 0
    else:
        print(" NOTE: NO FILES SHOULD BE REMOVED from the downloads directory on hgdownloads (RR).")
        print(" This list is provided for completeness. Any files marked here as in gbdb may be eliminated.")
        print(" current location on alpha:")
        print(" " + _ALPHA_RELEASE_DIR % (database, options.composite_name,
                                          prev_release_dir))
        print(" on RR:")
        print(" " + _DOWNLOAD_DIR % composite)
        print("")
    _print_listing(removed.files)
    print("")
    _print_listing(removed.gbdbs)
    print("")

    # new list
    print("C) New tables (%d):" % len(new.tables))
    _print_listing(new.tables)
    print("")

    print("C') New files (%d downloadables, %d gbdbs):"
          % (len(new.files), len(new.gbdbs)))
    print(" current location on alpha:")
    print(current_location)
    print(" NOT on RR but must be placed in:")
    print(" " + _RR_DIR % composite)
    print("")
    _print_listing(new.files)
    print("")
    _print_listing(new.gbdbs)
    print("")

    print("D) Additional items:")
    print(" current location on alpha:")
    print(current_location)
    print(" should be placed on the RR in (overwritting any existing copy):")
    print(" " + _RR_DIR % composite)
    print("")
    for f in addition_files:
        print(f)


#### Module ####################################################################

if __name__ == "__main__":
    sys.exit(main())