Source code for pubchem
"""Queries the PubChem database using a compound name (i.e. 1,3,5-hexatriene)
to obtain a molecule string that can be passed to Molecule. ::
results = getPubChemResults("1,3,5-hexatriene")
results is a list of PubChemObj instances, one per PubChem match to your query.
for entry in results:
entry.cid => PubChem compound identifier
entry.mf => molecular formula (PubChem's MF field) for the resulting compound
entry.iupac => IUPAC name for the resulting compound
entry.getMoleculeString() => returns a string compatible
with PSI4's Molecule creation
"""
import urllib2
import re
class PubChemObj(object):
    """A single compound record found by a PubChem search.

    Holds the compound id (``cid``), the molecular-formula field (``mf``)
    and the IUPAC name (``iupac``) it was constructed with, and lazily
    downloads the compound's SDF (structure-data file) text the first time
    coordinates are requested.  The downloaded text is cached in
    ``dataSDF``; ``natom`` is filled in by :meth:`getCartesian`.
    """

    # Counts line of the SDF connection table; its first integer is the
    # number of atoms, e.g.
    #  14 13  0  0  0  0  0  0  0999 V2000
    _COUNTS_RE = re.compile(r'^\s*(\d+)\s+(?:\d+\s+){8}V2000$', re.MULTILINE)

    # A decimal number with a mandatory '.', optionally followed by an
    # exponent introduced by D/d/E/e.
    _NUMBER = (r'((?:[-+]?\d*\.\d+(?:[DdEe][-+]?\d+)?)'
               r'|(?:[-+]?\d+\.\d*(?:[DdEe][-+]?\d+)?))')

    # Atom line of the SDF atom block: x, y, z, symbol, then 12 integers, e.g.
    #  3.7320 -0.2500 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
    _ATOM_RE = re.compile(r'^\s*' + _NUMBER + r'\s+' + _NUMBER + r'\s+' +
                          _NUMBER + r'\s*(\w+)(?:\s+\d+){12}')

    def __init__(self, cid, mf, iupac):
        self.url = 'http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi'
        self.cid = cid
        self.mf = mf
        self.iupac = iupac
        self.natom = 0
        self.dataSDF = ''

    def __str__(self):
        return "%17d %s\n" % (self.cid, self.iupac)

    @staticmethod
    def _to_float(text):
        """Convert a matched coordinate to float.

        The coordinate pattern accepts Fortran-style ``D`` exponents, which
        ``float()`` does not understand, so they are rewritten to ``E`` first.
        """
        return float(text.replace('D', 'E').replace('d', 'e'))

    def getSDF(self):
        """Function to return the SDF (structure-data file) of the PubChem object.

        The text is downloaded on first use and cached in ``dataSDF``;
        later calls return the cached text without touching the network.

        Raises ``Exception`` with a descriptive message if the URL cannot
        be opened.
        """
        if not self.dataSDF:
            url = self.url + '?cid=' + urllib2.quote(str(self.cid)) + '&disopt=3DDisplaySDF'
            try:
                location = urllib2.urlopen(url)
            except urllib2.URLError as e:
                msg = "\tPubchemError\n%s\n\treceived when trying to open\n\t%s\n" % (str(e), url)
                msg += "\tCheck your internet connection, and the above URL, and try again.\n"
                raise Exception(msg)
            print("\tRetrieved entry for chemical ID %d\n" % self.cid)
            # Always release the connection, even if reading fails part-way.
            try:
                self.dataSDF = location.read()
            finally:
                location.close()
        return self.dataSDF

    def name(self):
        """Function to return the IUPAC name of the PubChem object."""
        return self.iupac

    def getCartesian(self):
        """Function to return a string of the atom symbol and XYZ
        coordinates of the PubChem object.

        The result starts with a ``PubchemInput`` line followed by one
        ``symbol x y z`` line per atom.  ``natom`` is set to the atom count
        read from the SDF counts line.

        Raises ``Exception`` if the SDF cannot be retrieved or if no counts
        line (and hence no atom count) can be found in it.
        """
        sdfText = self.getSDF()

        counts = self._COUNTS_RE.search(sdfText)
        self.natom = int(counts.group(1)) if counts else 0
        if self.natom == 0:
            raise Exception("PubchemError\n Cannot find the number of atoms. 3D data doesn't appear\n" +
                            "to be available for %s.\n" % self.iupac)

        pieces = ["PubchemInput\n"]
        atom_count = 0
        for line in sdfText.split('\n'):
            atom_match = self._ATOM_RE.match(line)
            if not atom_match:
                continue
            x, y, z = [self._to_float(atom_match.group(k)) for k in (1, 2, 3)]
            sym = atom_match.group(4)
            pieces.append("%s %10.6f %10.6f %10.6f\n" % (sym, x, y, z))
            atom_count += 1
            # Stop once the advertised number of atoms has been read so that
            # nothing after the atom block can be mistaken for an atom.
            if atom_count == self.natom:
                break
        return "".join(pieces)

    def getXYZFile(self):
        """Function to obtain preferentially a molecule string
        through getCartesian() or a query string otherwise.

        Returns the atom count, the IUPAC name and the getCartesian()
        text, each separated by a newline.  Any exception raised by
        getCartesian() propagates unchanged.
        """
        temp = self.getCartesian()
        return "%d\n%s\n%s" % (self.natom, self.iupac, temp)

    def getMoleculeString(self):
        """Function to obtain a molecule string through
        getCartesian() or fail.

        On failure the error text is returned instead of being raised.
        """
        try:
            return self.getCartesian()
        except Exception as e:
            # str(e) replaces the deprecated e.message attribute.
            return str(e)
def _labelled_value(data, label, start):
    """Return ``(value, value_start)`` for the first *label* at or after *start*.

    The value is the text between the label and the end of its line (or the
    end of *data* when there is no trailing newline).  Returns ``None`` when
    the label does not occur, so callers never slice from a bogus offset.
    """
    found = data.find(label, start)
    if found == -1:
        return None
    value_start = found + len(label)
    value_end = data.find("\n", value_start)
    if value_end == -1:
        value_end = len(data)
    return data[value_start:value_end], value_start


def getPubChemResults(name):
    """Function to query the PubChem database for molecules matching the
    input string. Builds a PubChem object if found.

    Returns a list of PubChemObj instances, one per numbered entry parsed
    from the response (at most 20).  Parsing stops at the first entry that
    is missing its ``MF:``, ``IUPAC:`` or ``CID:`` field.

    Raises ``Exception`` with a descriptive message if the search URL
    cannot be opened.
    """
    url = 'http://www.ncbi.nlm.nih.gov/sites/entrez?db=pccompound&term=%s&format=text' % (urllib2.quote(name))
    print("\tSearching PubChem database for %s" % (name,))
    try:
        loc = urllib2.urlopen(url)
    except urllib2.URLError as e:
        msg = "\tPubchemError\n%s\n\treceived when trying to open\n\t%s\n" % (str(e), url)
        msg += "\tCheck your internet connection, and the above URL, and try again.\n"
        raise Exception(msg)
    # Always release the connection, even if reading fails part-way.
    try:
        data = loc.read()
    finally:
        loc.close()

    ans = []
    # The numbered entries are looked for after the line holding "<pre>".
    pos = data.find("<pre>")
    pos = data.find("\n", pos)
    for i in range(1, 21):
        # Each entry is introduced by its ordinal, e.g. "1. ".
        pos = data.find("%s. " % (i), pos)
        if pos == -1:
            break

        mf_field = _labelled_value(data, "MF: ", pos)
        if mf_field is None:
            break
        mf, pos = mf_field

        iupac_field = _labelled_value(data, "IUPAC: ", pos)
        if iupac_field is None:
            break
        iupac, pos = iupac_field

        cid_field = _labelled_value(data, "CID: ", pos)
        if cid_field is None:
            break
        cid_text, pos = cid_field
        cid = int(cid_text)

        # Resume the search for the next entry just past the next tab.
        pos = data.find("\t", pos) + 1
        ans.append(PubChemObj(cid, mf, iupac))

    print("\tFound %d results" % (len(ans)))
    return ans
if __name__ == "__main__":
    # Small manual smoke test: search for one compound and print each match
    # followed by its molecule string.
    # Alternative query: "sodium benzenesulfonate"
    try:
        obj = getPubChemResults("1-methoxy-4-[(E)-prop-1-enyl]benzene")
    except Exception as e:
        # Report the failure; there are no results to iterate over.
        print(e)
    else:
        for r in obj:
            print(r)
            print(r.getMoleculeString())