Source code for shepherd_score.protonation.chemaxon_utils

"""
Script to tautomerize and protonate molecules using ChemAxon's package.
REQUIRES ChemAxon license.

`tautomerize_chemaxon` functions are adapted from https://github.com/jenna-fromer/build_3d_py/
"""

import subprocess
import pandas as pd
import io
import os
from typing import List

from rdkit import Chem
from shepherd_score.protonation.protonate import remove_bad_protomers, neutralize_atoms


[docs] def tautomerize_chemaxon( smiles: str, cxcalc_exe: str, molconvert_exe: str, pH: float = 7.4, cutoff: float = 10, tautomer_limit: float = 20, protomer_limit: float = 20, neutralize: bool = True, verbose: bool = False, chemaxon_license_path: str = None, ) -> List[str]: """ Tautomerize/protonate a molecule using ChemAxon's package. Defaults are from the ZINC22 build pipeline. Arguments --------- smiles : str SMILES string of the molecule to tautomerize/protonate. cxcalc_exe : str Path to the cxcalc executable. molconvert_exe : str Path to the molconvert executable. pH : float (default: 7.4) pH value to use for the protonation. cutoff : float (default: 10) Cutoff value to use for the tautomerization/protonation. tautomer_limit : float (default: 20) Limit for the tautomerization/protonation. protomer_limit : float (default: 20) Limit for the protomerization. neutralize : bool (default: True) Whether to neutralize the molecule before tautomerization/protonation. verbose : bool (default: False) Whether to print verbose output. chemaxon_license_path : str | None (default: None) Path to the chemaxon license file. If ``None``, the ``CHEMAXON_LICENSE_URL`` environment variable is used. Returns ------- list[str] List of SMILES strings of the tautomers/protomers with bad charges removed. """ if chemaxon_license_path is not None: os.environ['CHEMAXON_LICENSE_URL'] = chemaxon_license_path if os.environ.get('CHEMAXON_LICENSE_URL') is None: raise ValueError('CHEMAXON_LICENSE_URL is not set') # Suppress noisy ChemAxon Java logging unless in verbose mode _stderr = None if verbose else subprocess.DEVNULL if neutralize: smiles = Chem.MolToSmiles(neutralize_atoms(Chem.MolFromSmiles(smiles))) cmd1 = f'{cxcalc_exe} -g dominanttautomerdistribution -H {pH} -C false -t tautomer-dist' output1 = subprocess.check_output(cmd1, shell=True, input=smiles.encode(), stderr=_stderr) cmd2 = f'{molconvert_exe} sdf -g -c "tautomer-dist>={tautomer_limit}" ' output2 = subprocess.check_output(cmd2, shell=True, input=output1, stderr=_stderr) cmd3 = f'{cxcalc_exe} -g microspeciesdistribution -H {pH} -t protomer-dist' output3 = subprocess.check_output(cmd3, shell=True, input=output2, stderr=_stderr) cmd4 = f'{molconvert_exe} smiles -g -c "protomer-dist>={protomer_limit}" -T name:tautomer-dist:protomer-dist' output4 = subprocess.check_output(cmd4, shell=True, input=output3, stderr=_stderr) table = pd.read_csv(io.BytesIO(output4), sep='\t') if len(table) == 1: protomers = list(table['#SMILES']) else: # remove redundant SMILES, sort by score (highest first) table['score'] = table['tautomer-dist']*table['protomer-dist']/100 prots = {} for smi, score in zip(table['#SMILES'], table['score']): if smi in prots: prots[smi] = max(score, prots[smi]) elif score > cutoff: prots[smi] = score protomers = sorted(prots, key = lambda x: prots[x], reverse=True) return remove_bad_protomers(protomers)