#!/usr/bin/env python2.7
#
# cellType.py: parse any to-be-registered cell type from the wiki,
# and download any newly-approved cell type protocol documents.
#
import argparse
import base64
import HTMLParser
import re
import string
import sys
import urllib2

from BeautifulSoup import BeautifulSoup

from ucscGb.gbData.ra.raStanza import RaStanza

# Credentials used when -u / -p are not given on the command line.
defaultUsername = "encode"
defaultPassword = "human"


def stripLeadingTrailingWhitespace(text):
    """Given a string, remove any leading or trailing whitespace"""
    return text.strip(string.whitespace)


def getContents(field):
    """Given an HTML field, return the contents.

    The first child of the field is stripped of surrounding whitespace and
    returned.  The string "missing" is returned instead when the field has
    no children at all, or when the stripped text is empty.
    """
    if len(field.contents) == 0:
        return "missing"
    contents = stripLeadingTrailingWhitespace(field.contents[0])
    if len(contents) == 0:
        contents = "missing"
    return contents


def processOrderUrl(orderInfo):
    """Parse the orderInfo column.

    Return the tuple (vendorName, vendorId, orderUrl).  When the element
    carries no href, the vendor name is the element's contents and the
    other two values are "missing".  Otherwise the last whitespace-separated
    word of the link text is the vendor ID and the preceding words form the
    vendor name.
    """
    if not orderInfo.has_key("href"):
        return (getContents(orderInfo), "missing", "missing")

    orderUrl = orderInfo["href"]
    vendorData = orderInfo.text.split()
    # An anchor with empty text has no words to pop an ID from.
    if vendorData:
        vendorId = vendorData.pop()
    else:
        vendorId = "missing"
    vendorName = ' '.join(vendorData)
    return (vendorName, vendorId, orderUrl)


def processTermId(termInfo):
    """Parse the term ID column.

    Return the tuple (termId, termUrl); termUrl is "missing" when the
    element carries no href.
    """
    termId = termInfo.text
    if termInfo.has_key("href"):
        termUrl = termInfo["href"]
    else:
        termUrl = "missing"
    return (termId, termUrl)


#
# Process a cell type table entry, in which the order of the columns is
# (Cell Type, Description, Lineage, Karyotype, Sex, Tissue, Order URL, Term ID,
# Submitting Lab)
#
def processCellTypeEntry(row, species, downloadsDirectory, noDownload,
                         username, password, wikiBaseUrl):
    """Convert one wiki table row into an RaStanza.

    Besides the columns listed above, the code reads column 9 for a link to
    the protocol document and column 10 for the approval flag (approved when
    it starts with "Y" or "y").

    Returns the tuple (stanza, approved).  Rows whose term contains
    "Example" or is missing yield (None, False).  When the row is approved
    and noDownload is false, the linked protocol document is fetched from
    wikiBaseUrl + href and written into downloadsDirectory.
    """
    cellData = row.findAll("td")
    term = getContents(cellData[0])
    if re.search("(Example)", term) or term == "missing":
        return (None, False)

    # The term with separator characters removed; used for both the tag and
    # the protocol document name.
    strippedTerm = re.sub(r"[-_\(\)]", "", term)

    #
    # Scrape the cells of the wiki row into a new RaStanza object
    stanza = RaStanza()
    stanza["term"] = term
    stanza["tag"] = strippedTerm.upper()
    stanza["type"] = "Cell Line"
    stanza["tier"] = "3"
    stanza["organism"] = species
    stanza["description"] = getContents(cellData[1])
    stanza["lineage"] = getContents(cellData[2])
    stanza["karyotype"] = getContents(cellData[3])
    stanza["sex"] = getContents(cellData[4])
    stanza["tissue"] = getContents(cellData[5])
    # The vendor and term ID fields are only set when the cell has more than
    # one child; the element of interest is then the second child.
    if len(cellData[6]) > 1:
        (stanza["vendorName"], stanza["vendorId"], stanza["orderUrl"]) = \
            processOrderUrl(cellData[6].contents[1])
    if len(cellData[7]) > 1:
        (stanza["termId"], stanza["termUrl"]) = \
            processTermId(cellData[7].contents[1])
    stanza["lab"] = getContents(cellData[8])

    #
    # Assemble the target name of the cell protocol document.  The naming
    # convention is <term>_<lab>_protocol.pdf, with any special characters
    # stripped from the term and all whitespace removed.
    protocolDocument = "%s_%s_protocol.pdf" % (strippedTerm, stanza["lab"])
    # NOTE(review): the tag is already upper-cased above; this reassignment
    # is kept from the original in case RaStanza assignment has side effects.
    stanza["tag"] = stanza["tag"].upper()
    protocolDocument = re.sub(r"(\s)+", "", protocolDocument)
    stanza["protocol"] = "%s:%s" % (stanza["lab"], protocolDocument)

    #
    # Indicate whether or not the document (if any) is approved by the NHGRI.
    # If it's approved, and if the noDownload flag is false,
    # then download it into the target filename.
    approved = False
    if re.search(r"^[Yy]", getContents(cellData[10])):
        approved = True
        if not noDownload:
            urlClauses = cellData[9].findAll("a")
            if len(urlClauses) > 0 and urlClauses[0].has_key("href"):
                url = urlClauses[0]["href"]
                doc = accessWiki(wikiBaseUrl + url, username, password)
                # accessWiki returns None on failure; skip the write then,
                # and also when the document is empty.
                if doc:
                    outputFilename = "%s/%s" % (downloadsDirectory,
                                                protocolDocument)
                    with open(outputFilename, "wb") as newDocFile:
                        newDocFile.write(doc)
    return (stanza, approved)


def accessWiki(url, username, password):
    """Read the indicated URL from the wiki page.

    The request carries an HTTP Basic Authorization header built from
    username and password.  Returns the response body, or None (after
    reporting the failure on stderr) when the request raises IOError.
    """
    # b64encode never inserts line breaks, so long credentials still yield
    # a single-line header value.
    credentials = base64.b64encode('%s:%s' % (username, password))
    req = urllib2.Request(url)
    req.add_header("Authorization", "Basic %s" % credentials)
    try:
        handle = urllib2.urlopen(req)
        try:
            return handle.read()
        finally:
            handle.close()
    except IOError:
        # Report on stderr so the message cannot mix with stanzas on stdout.
        sys.stderr.write("Fail! Bad username or password?\n")
        return None


#
# Main code
#
def main():
    """Parse the command line, scrape the cell type table and print stanzas.

    A stanza is printed when its row is approved, or for every parsed row
    when -f is given.  Exits with status 1 if the wiki page cannot be read.
    """
    parser = argparse.ArgumentParser(description="Parse new cell type registrations from the ENCODE wiki and download protocol documents")
    parser.add_argument("-s", dest="species", default="human", action="store",
                        help="Species (default: human)")
    parser.add_argument("-d", dest="downloadDirectory", default=".", action="store",
                        help="Directory to download any documents into (default: '.'")
    parser.add_argument("-f", dest="forcePrinting", default=False, action="store_true",
                        help="Force printing of all stanzas, whether or not there's NHGRI approval (default: false)")
    parser.add_argument("-n", dest="noDownload", default=False, action="store_true",
                        help="Download no files (default: false)")
    parser.add_argument("-u", dest="username", default=defaultUsername, action="store",
                        help="Username to access the wiki page (default: encode)")
    parser.add_argument("-p", dest="password", default=defaultPassword, action="store",
                        help="Password to access the wiki page (default: human)")
    args = parser.parse_args()

    #
    # Set up access to the wiki page
    #
    wikiBaseUrl = "http://encodewiki.ucsc.edu/"
    cellTypePage = wikiBaseUrl + "EncodeDCC/index.php/Cell_lines"
    thepage = accessWiki(cellTypePage, args.username, args.password)
    if thepage is None:
        # accessWiki has already reported the failure.
        sys.exit(1)
    soup = BeautifulSoup(thepage)

    #
    # Look for the Tier 3 cell types table and process each of its rows
    # after the header row.
    # NOTE(review): the original comment called this "the third table on
    # the page", but index 1 selects the second table -- confirm which is
    # intended.
    #
    cellTypeTable = soup.findAll("table")[1]
    for entry in cellTypeTable.findAll("tr")[1:]:
        (stanza, approved) = processCellTypeEntry(entry, args.species,
                                                  args.downloadDirectory,
                                                  args.noDownload,
                                                  args.username, args.password,
                                                  wikiBaseUrl)
        if stanza is not None and (approved or args.forcePrinting):
            print(stanza)


if __name__ == "__main__":
    main()