Source code for pubchem

"""Queries the PubChem database using a compound name (i.e. 1,3,5-hexatriene)
   to obtain a molecule string that can be passed to Molecule. ::

      results = getPubChemObj("1,3,5-hexatriene")

      Results is an array of results from PubChem matches to your query.
        for entry in results:
           entry["CID"]         => PubChem compound identifer
           entry["IUPAC"]       => IUPAC name for the resulting compound
           entry["PubChemObj"]  => instance of PubChemObj for this compound

           entry["PubChemObj"].getMoleculeString()   => returns a string compatible
                                                        with PSI4's Molecule creation

"""
import urllib2
import re


[docs]class PubChemObj(object): def __init__(self, cid, mf, iupac): self.url = 'http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi' self.cid = cid self.mf = mf self.iupac = iupac self.natom = 0 self.dataSDF = '' def __str__(self): return "%17d %s\n" % (self.cid, self.iupac)
[docs] def getSDF(self): """Function to return the SDF (structure-data file) of the PubChem object.""" if (len(self.dataSDF) == 0): # When completed uncomment the following: url = self.url + '?cid=' + urllib2.quote(str(self.cid)) + '&disopt=3DDisplaySDF' try: location = urllib2.urlopen(url) except urllib2.URLError, e: msg = "\tPubchemError\n%s\n\treceived when trying to open\n\t%s\n" % (str(e), url) msg += "\tCheck your internet connection, and the above URL, and try again.\n" raise Exception(msg) print "\tRetrieved entry for chemical ID %d\n" % self.cid self.dataSDF = location.read() #f = open("TEST", "w") #f.write(self.dataSDF) return self.dataSDF
[docs] def name(self): """Function to return the IUPAC name of the PubChem object.""" return self.iupac
[docs] def getCartesian(self): """Function to return a string of the atom symbol and XYZ coordinates of the PubChem object. """ try: sdfText = self.getSDF() except Exception as e: raise e # Find # NA NB CONSTANT # 14 13 0 0 0 0 0 0 0999 V2000 m = re.search(r'^\s*(\d+)\s+(?:\d+\s+){8}V2000$', sdfText, re.MULTILINE) self.natom = 0 if (m): self.natom = int(m.group(1)) if (self.natom == 0): raise Exception("PubchemError\n Cannot find the number of atoms. 3D data doesn't appear\n" + "to be available for %s.\n" % self.iupac) lines = re.split('\n', sdfText) # 3.7320 -0.2500 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 NUMBER = "((?:[-+]?\\d*\\.\\d+(?:[DdEe][-+]?\\d+)?)|(?:[-+]?\\d+\\.\\d*(?:[DdEe][-+]?\\d+)?))" atom_re = re.compile(r'^\s*' + NUMBER + r'\s+' + NUMBER + r'\s+' + NUMBER + r'\s*(\w+)(?:\s+\d+){12}') molecule_string = "PubchemInput\n" atom_count = 0 for line in lines: if (not line or line.isspace()): continue atom_match = atom_re.match(line) if atom_match: x = float(atom_match.group(1)) y = float(atom_match.group(2)) z = float(atom_match.group(3)) sym = atom_match.group(4) atom_count = atom_count + 1 molecule_string += "%s %10.6f %10.6f %10.6f\n" % (sym, x, y, z) if (atom_count == self.natom): break return molecule_string
[docs] def getXYZFile(self): """Function to obtain preferentially a molecule string through getCartesian() or a query string otherwise. """ try: temp = self.getCartesian() except Exception as e: raise molstr = "%d\n%s\n%s" % (self.natom, self.iupac, temp) return molstr
[docs] def getMoleculeString(self): """Function to obtain a molecule string through getCartesian() or fail. """ try: return self.getCartesian() except Exception as e: return e.message
[docs]def getPubChemResults(name): """Function to query the PubChem database for molecules matching the input string. Builds a PubChem object if found. """ url = 'http://www.ncbi.nlm.nih.gov/sites/entrez?db=pccompound&term=%s&format=text' % (urllib2.quote(name)) print "\tSearching PubChem database for %s" % (name) try: loc = urllib2.urlopen(url) except urllib2.URLError as e: msg = "\tPubchemError\n%s\n\treceived when trying to open\n\t%s\n" % (str(e), url) msg += "\tCheck your internet connection, and the above URL, and try again.\n" raise Exception(msg) data = loc.read() ans = [] l = data.find("<pre>") l = data.find("\n", l) for i in range(1, 21): l = data.find("%s. " % (i), l) if l == -1: break l = data.find("MF: ", l) + 4 mf = data[l:data.find("\n", l)] l = data.find("IUPAC: ", l) + 7 iupac = data[l:data.find("\n", l)] l = data.find("CID: ", l) + 5 #if l == 4: # break cid = int(data[l:data.find("\n", l)]) l = data.find("\t", l) + 1 pubobj = PubChemObj(cid, mf, iupac) ans.append(pubobj) print "\tFound %d results" % (len(ans)) return ans
if __name__ == "__main__": try: obj = getPubChemResults("1-methoxy-4-[(E)-prop-1-enyl]benzene") #obj = getPubChemResults("sodium benzenesulfonate") except Exception as e: print e.message for r in obj: print r print r.getMoleculeString()