Source code for pubchem

#
# @BEGIN LICENSE
#
# Psi4: an open-source quantum chemistry software package
#
# Copyright (c) 2007-2016 The Psi4 Developers.
#
# The copyrights for code used from other parties are included in
# the corresponding files.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
# @END LICENSE
#

from __future__ import print_function
"""Queries the PubChem database using a compound name (i.e. 1,3,5-hexatriene)
   to obtain a molecule string that can be passed to Molecule. ::

      results = getPubChemObj("1,3,5-hexatriene")

      Results is an array of results from PubChem matches to your query.
        for entry in results:
           entry["CID"]         => PubChem compound identifer
           entry["IUPAC"]       => IUPAC name for the resulting compound
           entry["PubChemObj"]  => instance of PubChemObj for this compound

           entry["PubChemObj"].getMoleculeString()   => returns a string compatible
                                                        with Psi4's Molecule creation

"""
from __future__ import absolute_import
try:
    # Python 2 syntax
    from urllib2 import urlopen
    from urllib2 import quote
    from urllib2 import URLError
except ImportError:
    # Python 3 syntax
    from urllib.request import urlopen
    from urllib.parse import quote
    from urllib.error import URLError
import xml.etree.ElementTree as ET
import time
import gzip
import re
import sys
import os
from p4util.exceptions import *


[docs]class PubChemObj(object):

    def __init__(self, cid, mf, iupac):
        self.url = 'http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi'
        self.cid = cid
        self.mf = mf
        self.iupac = iupac
        self.natom = 0
        self.dataSDF = ''

    def __str__(self):
        return "%17d   %s\n" % (self.cid, self.iupac)

[docs]    def getSDF(self):
        """Function to return the SDF (structure-data file) of the PubChem object."""
        if (len(self.dataSDF) == 0):
            def extract_xml_keyval(xml, key):
                """ A useful helper function for parsing a single key from XML. """
                try:
                    # Python 2.6 (ElementTree 1.2 API)
                    matches = xml.getiterator(key)
                except:
                    # Python 2.7 (ElementTree 1.3 API)
                    matches = list(xml.iter(key))
                if len(matches) == 0:
                    return None
                elif len(matches) == 1:
                    return matches[0].text
                else:
                    print(matches)
                    raise ValidationError("""PubChem: too many matches found %d""" % (len(matches)))

            url = "http://pubchem.ncbi.nlm.nih.gov/pug/pug.cgi"
            initial_request = """
            <PCT-Data>
              <PCT-Data_input>
                <PCT-InputData>
                  <PCT-InputData_download>
                    <PCT-Download>
                      <PCT-Download_uids>
                        <PCT-QueryUids>
                          <PCT-QueryUids_ids>
                            <PCT-ID-List>
                              <PCT-ID-List_db>pccompound</PCT-ID-List_db>
                              <PCT-ID-List_uids>
                                <PCT-ID-List_uids_E>%d</PCT-ID-List_uids_E>
                              </PCT-ID-List_uids>
                            </PCT-ID-List>
                          </PCT-QueryUids_ids>
                        </PCT-QueryUids>
                      </PCT-Download_uids>
                      <PCT-Download_format value="sdf"/>
                      <PCT-Download_use-3d value="true"/>
                      <PCT-Download_n-3d-conformers>%d</PCT-Download_n-3d-conformers>
                    </PCT-Download>
                  </PCT-InputData_download>
                </PCT-InputData>
              </PCT-Data_input>
            </PCT-Data>
            """ % (self.cid, 1)

            print("\tPosting PubChem request for CID %d" % self.cid)
            server_response = urlopen(url, initial_request).read()
            xml = ET.fromstring(server_response)
            for attempt in range(4):
                if attempt == 3:
                    raise ValidationError("""PubChem: timed out""")
                # Look for a download location in the XML response
                download_url = extract_xml_keyval(xml, 'PCT-Download-URL_url')
                if download_url:
                    print("\tDownloading from PubChem...")
                    # We have a download location; gogetit
                    file_name = '_psi4_pubchem_temp_download_.tgz'
                    response = urlopen(download_url)
                    data = response.read()
                    fh = open(file_name, 'wb')
                    fh.write(data)
                    fh.close()
                    unzipped = gzip.open(file_name)
                    self.dataSDF = unzipped.read()
                    unzipped.close()
                    try:
                        os.remove(file_name)
                    except:
                        pass
                    print("\tDone!")
                    break
                # We didn't find a download location yet.
                if attempt == 0:
                    # If this is the first try, take a ticket
                    ticket = extract_xml_keyval(xml, 'PCT-Waiting_reqid')
                    #print("ticket = " + ticket)
                    statusrequest = """<PCT-Data>
                      <PCT-Data_input>
                        <PCT-InputData>
                          <PCT-InputData_request>
                            <PCT-Request>
                              <PCT-Request_reqid>%d</PCT-Request_reqid>
                              <PCT-Request_type value="status"/>
                            </PCT-Request>
                          </PCT-InputData_request>
                        </PCT-InputData>
                      </PCT-Data_input>
                    </PCT-Data>
                    """ % int(ticket)
                if ticket:
                    # Wait 10 seconds...
                    print("\tPubChem result not available yet, will try again in 10 seconds...")
                    time.sleep(10)
                    # ...and ask for an update on the progress
                    server_response = urlopen(url, statusrequest).read()
                    xml = ET.fromstring(server_response)
                    #print(server_response)
                else:
                    # We can't find a ticket number, or a download location. Bail.
                    raise ValidationError("""PubChem: download error""")
        return self.dataSDF

[docs]    def name(self):
        """Function to return the IUPAC name of the PubChem object."""
        return self.iupac

[docs]    def getCartesian(self):
        """Function to return a string of the atom symbol and XYZ
        coordinates of the PubChem object.

        """
        try:
            sdfText = self.getSDF()
        except Exception as e:
            raise e

        # Find
        # NA NB                        CONSTANT
        # 14 13  0     0  0  0  0  0  0999 V2000
        m = re.search(r'^\s*(\d+)\s+(?:\d+\s+){8}V2000$', sdfText, re.MULTILINE)
        self.natom = 0
        if (m):
            self.natom = int(m.group(1))

        if self.natom == 0:
            raise ValidationError("PubChem: Cannot find the number of atoms.  3D data doesn't appear\n" +
                            "to be available for %s.\n" % self.iupac)

        lines = re.split('\n', sdfText)

        #  3.7320   -0.2500    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
        NUMBER = "((?:[-+]?\\d*\\.\\d+(?:[DdEe][-+]?\\d+)?)|(?:[-+]?\\d+\\.\\d*(?:[DdEe][-+]?\\d+)?))"
        atom_re = re.compile(r'^\s*' + NUMBER + r'\s+' + NUMBER + r'\s+' + NUMBER + r'\s*(\w+)(?:\s+\d+){12}')

        molecule_string = "PubchemInput\n"

        atom_count = 0
        for line in lines:
            if (not line or line.isspace()):
                continue

            atom_match = atom_re.match(line)
            if atom_match:
                x = float(atom_match.group(1))
                y = float(atom_match.group(2))
                z = float(atom_match.group(3))
                sym = atom_match.group(4)

                atom_count = atom_count + 1

                molecule_string += "%s %10.6f %10.6f %10.6f\n" % (sym, x, y, z)

                if (atom_count == self.natom):
                    break

        return molecule_string

[docs]    def getXYZFile(self):
        """Function to obtain preferentially a molecule string
        through getCartesian() or a query string otherwise.

        """
        try:
            temp = self.getCartesian()
        except Exception as e:
            raise
        molstr = "%d\n%s\n%s" % (self.natom, self.iupac, temp)
        return molstr

[docs]    def getMoleculeString(self):
        """Function to obtain a molecule string through
        getCartesian() or fail.
        """
        try:
            return self.getCartesian()
        except Exception as e:
            return e.message


[docs]def getPubChemResults(name):
    """Function to query the PubChem database for molecules matching the
    input string. Builds a PubChem object if found.

    """
    url = 'http://www.ncbi.nlm.nih.gov/sites/entrez?db=pccompound&term=%s&format=text' % quote(name)
    print("\tSearching PubChem database for %s" % (name))
    try:
        loc = urlopen(url)
    except URLError as e:
        msg = "\tPubchemError\n%s\n\treceived when trying to open\n\t%s\n" % (str(e), url)
        msg += "\tCheck your internet connection, and the above URL, and try again.\n"
        raise ValidationError(msg)
    data = loc.read()

    ans = []
    l = data.find(b"<pre>")
    l = data.find(b"\n", l)
    i = 1
    while(True):
        l = data.find(str("%d. " % i).encode(sys.getdefaultencoding()), l)
        if l == -1:
            break
        tag = b"MF: "
        l = data.find(tag, l) + len(tag)
        mf = data[l:data.find(b'\n', l)].decode(sys.getdefaultencoding())
        tag = b"IUPAC name: "
        l = data.find(tag, l) + len(tag)
        iupac = data[l:data.find(b'\n', l)].decode(sys.getdefaultencoding())
        tag = b"CID:"
        l = data.find(tag, l) + len(tag)
        #if l == 4:
        #    break
        cid = int(data[l:data.find(b"\n", l)])
        l = data.find(b'\t', l) + 1

        pubobj = PubChemObj(cid, mf, iupac)
        ans.append(pubobj)
        i += 1

    print("\tFound %d results" % (len(ans)))
    return ans

if __name__ == "__main__":
    try:
        #obj = getPubChemResults("1-methoxy-4-[(E)-prop-1-enyl]benzene")
        #obj = getPubChemResults("sodium benzenesulfonate")
        obj = getPubChemResults("4-[bis(4-hydroxyphenyl)methyl]phenol")
    except Exception as e:
        print(e.message)

    for r in obj:
        print(r)
        print(r.getMoleculeString())