Source code for pubchem

#
# @BEGIN LICENSE
#
# Psi4: an open-source quantum chemistry software package
#
# Copyright (c) 2007-2016 The Psi4 Developers.
#
# The copyrights for code used from other parties are included in
# the corresponding files.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
# @END LICENSE
#

from __future__ import print_function
"""Queries the PubChem database using a compound name (i.e. 1,3,5-hexatriene)
   to obtain a molecule string that can be passed to Molecule. ::

      results = getPubChemObj("1,3,5-hexatriene")

      Results is an array of results from PubChem matches to your query.
        for entry in results:
           entry["CID"]         => PubChem compound identifer
           entry["IUPAC"]       => IUPAC name for the resulting compound
           entry["PubChemObj"]  => instance of PubChemObj for this compound

           entry["PubChemObj"].getMoleculeString()   => returns a string compatible
                                                        with Psi4's Molecule creation

"""
from __future__ import absolute_import
try:
    # Python 2 syntax
    from urllib2 import urlopen
    from urllib2 import quote
    from urllib2 import URLError
except ImportError:
    # Python 3 syntax
    from urllib.request import urlopen
    from urllib.parse import quote
    from urllib.error import URLError
import xml.etree.ElementTree as ET
import time
import gzip
import re
import sys
import os
from p4util.exceptions import *


[docs]class PubChemObj(object): def __init__(self, cid, mf, iupac): self.url = 'http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi' self.cid = cid self.mf = mf self.iupac = iupac self.natom = 0 self.dataSDF = '' def __str__(self): return "%17d %s\n" % (self.cid, self.iupac)
[docs] def getSDF(self): """Function to return the SDF (structure-data file) of the PubChem object.""" if (len(self.dataSDF) == 0): def extract_xml_keyval(xml, key): """ A useful helper function for parsing a single key from XML. """ try: # Python 2.6 (ElementTree 1.2 API) matches = xml.getiterator(key) except: # Python 2.7 (ElementTree 1.3 API) matches = list(xml.iter(key)) if len(matches) == 0: return None elif len(matches) == 1: return matches[0].text else: print(matches) raise ValidationError("""PubChem: too many matches found %d""" % (len(matches))) url = "http://pubchem.ncbi.nlm.nih.gov/pug/pug.cgi" initial_request = """ <PCT-Data> <PCT-Data_input> <PCT-InputData> <PCT-InputData_download> <PCT-Download> <PCT-Download_uids> <PCT-QueryUids> <PCT-QueryUids_ids> <PCT-ID-List> <PCT-ID-List_db>pccompound</PCT-ID-List_db> <PCT-ID-List_uids> <PCT-ID-List_uids_E>%d</PCT-ID-List_uids_E> </PCT-ID-List_uids> </PCT-ID-List> </PCT-QueryUids_ids> </PCT-QueryUids> </PCT-Download_uids> <PCT-Download_format value="sdf"/> <PCT-Download_use-3d value="true"/> <PCT-Download_n-3d-conformers>%d</PCT-Download_n-3d-conformers> </PCT-Download> </PCT-InputData_download> </PCT-InputData> </PCT-Data_input> </PCT-Data> """ % (self.cid, 1) print("\tPosting PubChem request for CID %d" % self.cid) server_response = urlopen(url, initial_request).read() xml = ET.fromstring(server_response) for attempt in range(4): if attempt == 3: raise ValidationError("""PubChem: timed out""") # Look for a download location in the XML response download_url = extract_xml_keyval(xml, 'PCT-Download-URL_url') if download_url: print("\tDownloading from PubChem...") # We have a download location; gogetit file_name = '_psi4_pubchem_temp_download_.tgz' response = urlopen(download_url) data = response.read() fh = open(file_name, 'wb') fh.write(data) fh.close() unzipped = gzip.open(file_name) self.dataSDF = unzipped.read() unzipped.close() try: os.remove(file_name) except: pass print("\tDone!") break # We didn't find a download location yet. if attempt == 0: # If this is the first try, take a ticket ticket = extract_xml_keyval(xml, 'PCT-Waiting_reqid') #print("ticket = " + ticket) statusrequest = """<PCT-Data> <PCT-Data_input> <PCT-InputData> <PCT-InputData_request> <PCT-Request> <PCT-Request_reqid>%d</PCT-Request_reqid> <PCT-Request_type value="status"/> </PCT-Request> </PCT-InputData_request> </PCT-InputData> </PCT-Data_input> </PCT-Data> """ % int(ticket) if ticket: # Wait 10 seconds... print("\tPubChem result not available yet, will try again in 10 seconds...") time.sleep(10) # ...and ask for an update on the progress server_response = urlopen(url, statusrequest).read() xml = ET.fromstring(server_response) #print(server_response) else: # We can't find a ticket number, or a download location. Bail. raise ValidationError("""PubChem: download error""") return self.dataSDF
[docs] def name(self): """Function to return the IUPAC name of the PubChem object.""" return self.iupac
[docs] def getCartesian(self): """Function to return a string of the atom symbol and XYZ coordinates of the PubChem object. """ try: sdfText = self.getSDF() except Exception as e: raise e # Find # NA NB CONSTANT # 14 13 0 0 0 0 0 0 0999 V2000 m = re.search(r'^\s*(\d+)\s+(?:\d+\s+){8}V2000$', sdfText, re.MULTILINE) self.natom = 0 if (m): self.natom = int(m.group(1)) if self.natom == 0: raise ValidationError("PubChem: Cannot find the number of atoms. 3D data doesn't appear\n" + "to be available for %s.\n" % self.iupac) lines = re.split('\n', sdfText) # 3.7320 -0.2500 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 NUMBER = "((?:[-+]?\\d*\\.\\d+(?:[DdEe][-+]?\\d+)?)|(?:[-+]?\\d+\\.\\d*(?:[DdEe][-+]?\\d+)?))" atom_re = re.compile(r'^\s*' + NUMBER + r'\s+' + NUMBER + r'\s+' + NUMBER + r'\s*(\w+)(?:\s+\d+){12}') molecule_string = "PubchemInput\n" atom_count = 0 for line in lines: if (not line or line.isspace()): continue atom_match = atom_re.match(line) if atom_match: x = float(atom_match.group(1)) y = float(atom_match.group(2)) z = float(atom_match.group(3)) sym = atom_match.group(4) atom_count = atom_count + 1 molecule_string += "%s %10.6f %10.6f %10.6f\n" % (sym, x, y, z) if (atom_count == self.natom): break return molecule_string
[docs] def getXYZFile(self): """Function to obtain preferentially a molecule string through getCartesian() or a query string otherwise. """ try: temp = self.getCartesian() except Exception as e: raise molstr = "%d\n%s\n%s" % (self.natom, self.iupac, temp) return molstr
[docs] def getMoleculeString(self): """Function to obtain a molecule string through getCartesian() or fail. """ try: return self.getCartesian() except Exception as e: return e.message
[docs]def getPubChemResults(name): """Function to query the PubChem database for molecules matching the input string. Builds a PubChem object if found. """ url = 'http://www.ncbi.nlm.nih.gov/sites/entrez?db=pccompound&term=%s&format=text' % quote(name) print("\tSearching PubChem database for %s" % (name)) try: loc = urlopen(url) except URLError as e: msg = "\tPubchemError\n%s\n\treceived when trying to open\n\t%s\n" % (str(e), url) msg += "\tCheck your internet connection, and the above URL, and try again.\n" raise ValidationError(msg) data = loc.read() ans = [] l = data.find(b"<pre>") l = data.find(b"\n", l) i = 1 while(True): l = data.find(str("%d. " % i).encode(sys.getdefaultencoding()), l) if l == -1: break tag = b"MF: " l = data.find(tag, l) + len(tag) mf = data[l:data.find(b'\n', l)].decode(sys.getdefaultencoding()) tag = b"IUPAC name: " l = data.find(tag, l) + len(tag) iupac = data[l:data.find(b'\n', l)].decode(sys.getdefaultencoding()) tag = b"CID:" l = data.find(tag, l) + len(tag) #if l == 4: # break cid = int(data[l:data.find(b"\n", l)]) l = data.find(b'\t', l) + 1 pubobj = PubChemObj(cid, mf, iupac) ans.append(pubobj) i += 1 print("\tFound %d results" % (len(ans))) return ans
if __name__ == "__main__": try: #obj = getPubChemResults("1-methoxy-4-[(E)-prop-1-enyl]benzene") #obj = getPubChemResults("sodium benzenesulfonate") obj = getPubChemResults("4-[bis(4-hydroxyphenyl)methyl]phenol") except Exception as e: print(e.message) for r in obj: print(r) print(r.getMoleculeString())