"""
Generate pharmacophores from a RDKit conformer.
Parts of code adapted from Francois Berenger / Tsuda Lab and RDKit.
References:
- Tsuda Lab: https://github.com/tsudalab/ACP4/blob/master/bin/acp4_ph4.py
(From https://doi.org/10.1021/acs.jcim.2c01623)
- RDKit: https://github.com/rdkit/rdkit/blob/master/rdkit/Chem/Features/FeatDirUtilsRD.py
- RDKit: https://github.com/rdkit/rdkit/blob/master/rdkit/Chem/Features/ShowFeats.py
"""
from __future__ import annotations
import os
from copy import deepcopy
import math
from typing import List, Tuple, Dict, Union
import numpy as np
from scipy.spatial import distance, Delaunay
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
# pharmacophores
from shepherd_score.pharm_utils.pharmvec import GetDonorFeatVects, GetAcceptorFeatVects, GetAromaticFeatVects, GetHalogenFeatVects
from shepherd_score.score.constants import P_TYPES
PT = Chem.GetPeriodicTable()
feature_colors = {
'Donor': (0, 1, 1),
'Acceptor': (1, 0, 1),
'NegIonizable': (1, 0, 0),
'Anion': (1,0,0),
'PosIonizable': (0, 0, 1),
'Cation': (0,0,1),
'ZnBinder': (1, .5, .5),
'Zn': (1, .5, .5),
'Aromatic': (1, .8, .2),
'LumpedHydrophobe': (.5, .25, 0),
'Hydrophobe': (.5, .25, 0),
'Halogen': (.13, .55, .13),
'Dummy': (0., .4, .55)
}
# Below is used to get hydrophobic groups
#### From https://github.com/tsudalab/ACP4/blob/master/bin/acp4_ph4.py ####
#### Credit to Francois Berenger and Tsuda Lab ####
#### https://doi.org/10.1021/acs.jcim.2c01623 ####
# These are the same as Pharmer / Pmapper
__hydrophobic_smarts = [
"a1aaaaa1",
"a1aaaa1",
# branched terminals as one point
"[$([CH3X4,CH2X3,CH1X2,F,Cl,Br,I])&!$(**[CH3X4,CH2X3,CH1X2,F,Cl,Br,I])]",
"[$(*([CH3X4,CH2X3,CH1X2,F,Cl,Br,I])[CH3X4,CH2X3,CH1X2,F,Cl,Br,I])&!$(*([CH3X4,CH2X3,CH1X2,F,Cl,Br,I])([CH3X4,CH2X3,CH1X2,F,Cl,Br,I])[CH3X4,CH2X3,CH1X2,F,Cl,Br,I])]([CH3X4,CH2X3,CH1X2,F,Cl,Br,I])[CH3X4,CH2X3,CH1X2,F,Cl,Br,I]",
"*([CH3X4,CH2X3,CH1X2,F,Cl,Br,I])([CH3X4,CH2X3,CH1X2,F,Cl,Br,I])[CH3X4,CH2X3,CH1X2,F,Cl,Br,I]",
# simple rings only; need to combine points to get good results for 3d structures
"[C&r3]1~[C&r3]~[C&r3]1",
"[C&r4]1~[C&r4]~[C&r4]~[C&r4]1",
"[C&r5]1~[C&r5]~[C&r5]~[C&r5]~[C&r5]1",
"[C&r6]1~[C&r6]~[C&r6]~[C&r6]~[C&r6]~[C&r6]1",
"[C&r7]1~[C&r7]~[C&r7]~[C&r7]~[C&r7]~[C&r7]~[C&r7]1",
"[C&r8]1~[C&r8]~[C&r8]~[C&r8]~[C&r8]~[C&r8]~[C&r8]~[C&r8]1",
# aliphatic chains
"[CH2X4,CH1X3,CH0X2]~[CH3X4,CH2X3,CH1X2,F,Cl,Br,I]",
"[$([CH2X4,CH1X3,CH0X2]~[$([!#1]);!$([CH2X4,CH1X3,CH0X2])])]~[CH2X4,CH1X3,CH0X2]~[CH2X4,CH1X3,CH0X2]",
"[$([CH2X4,CH1X3,CH0X2]~[CH2X4,CH1X3,CH0X2]~[$([CH2X4,CH1X3,CH0X2]~[$([!#1]);!$([CH2X4,CH1X3,CH0X2])])])]~[CH2X4,CH1X3,CH0X2]~[CH2X4,CH1X3,CH0X2]~[CH2X4,CH1X3,CH0X2]",
# sulfur (apparently)
"[$([S]~[#6])&!$(S~[!#6])]"
]
[docs]
def pattern_of_smarts(s):
return Chem.MolFromSmarts(s)
__hydrophobic_patterns = list(map(pattern_of_smarts, __hydrophobic_smarts))
# geometric center of a matched pattern
def __average_match(mol, matched_pattern):
avg_x = 0.0
avg_y = 0.0
avg_z = 0.0
count = float(len(matched_pattern))
conf0 = mol.GetConformer()
for i in matched_pattern:
xyz = conf0.GetAtomPosition(i)
avg_x += xyz.x
avg_y += xyz.y
avg_z += xyz.z
center = (avg_x / count,
avg_y / count,
avg_z / count)
return center
def __find_matches(mol, patterns):
res = []
for pat in patterns:
# get all matches for that pattern
matched = mol.GetSubstructMatches(pat)
for m in matched:
# get the center of each matched group
avg = __average_match(mol, m)
res.append(avg)
return res
def __euclid(xyz0, xyz1):
x0, y0, z0 = xyz0
x1, y1, z1 = xyz1
dx = x0 - x1
dy = y0 - y1
dz = z0 - z1
return math.sqrt(dx*dx + dy*dy + dz*dz)
def __average(vecs):
sum_x = 0.0
sum_y = 0.0
sum_z = 0.0
n = float(len(vecs))
for (x, y, z) in vecs:
sum_x += x
sum_y += y
sum_z += z
return (sum_x / n,
sum_y / n,
sum_z / n)
def _rdkit_point3d_to_tuple(point: Chem.Geometry.Point3D):
"""
Convert an rdkit Point3D to a tuple.
For reasons I can not explain, it's 1000x faster to convert this way instead of
calling tuple(point)
def pt_to_tuple(pt):
return (pt.x, pt.y, pt.z)
%timeit tuple(pt)
%timeit pt_to_tuple(pt)
Gives:
527 μs ± 16.1 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
252 ns ± 1.77 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
"""
return (point.x, point.y, point.z)
[docs]
def find_hydrophobes(mol: rdkit.Chem.rdchem.Mol,
cluster_hydrophobic: bool = True):
"""
Find hydrophobes and cluster them.
Arguments
---------
mol : rdkit Mol object with a conformer.
cluster_hydrophobic : bool (default=True) to cluster hydrophobic atoms if they fall within 2A.
Returns
-------
list of tuples containing coordinates for the locations for each hydrophobe.
"""
all_hydrophobes = __find_matches(mol, __hydrophobic_patterns)
if not cluster_hydrophobic:
return all_hydrophobes
else:
# regroup all hydrophobic features within 2.0A
grouped_hydrophobes = []
n = len(all_hydrophobes)
idx2cluster = list(range(n))
for i in range(n):
h_i = all_hydrophobes[i]
cluster_id = idx2cluster[i]
for j in range(i+1, n):
h_j = all_hydrophobes[j]
if __euclid(h_i, h_j) <= 2.0: # Angstrom
# same cluster
idx2cluster[j] = cluster_id
cluster_ids = set(idx2cluster)
for cid in cluster_ids:
group = []
for i, h in enumerate(all_hydrophobes):
if idx2cluster[i] == cid:
group.append(h)
grouped_hydrophobes.append(__average(group))
return grouped_hydrophobes
### End Tsuda Lab code
def _get_points_fibonacci(num_samples):
"""
Generate points on unit sphere using fibonacci approach.
Adapted from Morfeus:
https://github.com/digital-chemistry-laboratory/morfeus/blob/main/morfeus/geometry.py
Parameters
----------
num_samples : int
Number of points to sample from the surface of a sphere
Returns
-------
np.ndarray (num_samples,3)
Coordinates of the sampled points.
"""
offset = 2.0 / num_samples
increment = np.pi * (3.0 - np.sqrt(5.0))
i = np.arange(num_samples)
y = ((i * offset) - 1) + (offset / 2)
r = np.sqrt(1 - np.square(y))
phi = np.mod((i + 1), num_samples) * increment
x = np.cos(phi) * r
z = np.sin(phi) * r
points = np.column_stack((x, y, z))
return points
def __outside_hull(sample_points: np.ndarray,
hull: Delaunay
) -> np.ndarray:
"""
Test if points in `sample_points` are outside of the convex hull formed by the atoms.
Arguments
---------
sample_points : (N,3) np.ndarray of the points to check if outside the "interior" of the molecule.
hull : scipy.spatial.Delaunay object initialized by the positions of the atoms of the molecule.
Returns
-------
(N,) np.ndarray of booleans describing if sample_points are outside of the convex hull
"""
return hull.find_simplex(sample_points) < 0
def __is_accessible(interaction_sphere, atom_pos, radii, mask_atom_idx):
"""
Check if at least 2% of sampled points fall within a surface-accessible volume of the molecule.
This is 2% of the original 200 points (4 points).
Currently using SAS with a probe radius of 0.8A rather than vdW volume. vdW volume will fail to
exclude buried pharmacophores. Also experimented with checking if the interaction points fell
within a convex hull and buried volume with Morpheus which both had limited improvements.
Arguments
---------
interaction_sphere : np.ndarray (M, 3) of points to check accessibility of a potentially
interacting atom. M <= 200
atom_pos : np.ndarray (N, 3) Positions of atoms in molecule.
radii : np.ndarray (N,) vdW radii for each corresponding atom.
mask_atom_idx : np.ndarray of bool (N,) contains atom indices to ignore if the interaction
points are within their SA volumes. For example, the acceptor atom or the
donating hydrogens.
Returns
-------
bool
"""
# compute distances from each sampled point to all atoms (except excluded)
dist_matrix = distance.cdist(interaction_sphere, atom_pos[mask_atom_idx])
mask = np.all(dist_matrix >= radii + 0.8, axis=1) # mask for points within vdW + probe radius
interaction_sphere = interaction_sphere[mask]
# if hull is not None:
# # If you actually want to include this, then only compute Delaunay ONCE per molecule (outside this func).
# hull = Delaunay(mol.GetConformer().GetPositions())
# sas_mask = np.all(dist_matrix[mask] >= radii + 0.8, axis=1) # points within SAS defined volume
# hull_mask = __outside_hull(interaction_sphere, hull).astype(bool) # points within hull
# interaction_sphere = interaction_sphere[hull_mask | sas_mask]
num_accessible = len(interaction_sphere) # number of non-colliding points
if num_accessible > 4: # at least 2% accessible from initial total 200 points
return True
else:
return False
def _is_donator_accessible(mol: rdkit.Chem.rdchem.Mol,
hydrogens: Union[List[rdkit.Chem.rdchem.Atom], None],
pharm_pos: Tuple,
unit_vec: Tuple,
) -> bool:
"""
Check accessbility of donator atoms inspired by protocol of Pharao.
DOI: 10.1016/j.jmgm.2008.04.003
Check whether at least 2% of the points sampled on a sphere of 1.8A radius is accessible.
i.e., beyond the SAS
Arguments
---------
mol : rdkit Mol with conformer
pharm_pos : tuple holding coords of anchor point
unit_vec : tuple holding coords of releative unit vector
num_nbrs : int of the number of neighbors to the acceptor (heavy + hydr)
Returns
-------
bool
"""
if hydrogens is None:
hyd_atom_ids = []
else:
hyd_atom_ids = [h.GetIdx() for h in hydrogens]
radii = np.array([PT.GetRvdw(atom.GetAtomicNum()) for i, atom in enumerate(mol.GetAtoms()) if i not in hyd_atom_ids])
# Pharmacophore position is about 1.2A in direction of vector
pharm_pos = np.array(pharm_pos) + 1.2*np.array(unit_vec)
# unit sphere
interaction_sphere = _get_points_fibonacci(200)
interaction_radius = 1.8 # angstroms
interaction_sphere *= interaction_radius
interaction_sphere += pharm_pos # move to position of pharmacophore
atom_pos = mol.GetConformer().GetPositions()
# don't include the hydrogens themselves
mask_atom_idx = np.isin(np.arange(len(atom_pos)), hyd_atom_ids, invert=True)
return __is_accessible(interaction_sphere, atom_pos, radii, mask_atom_idx)
def _is_acceptor_accessible(mol: rdkit.Chem.rdchem.Mol,
acceptor_atom: rdkit.Chem.rdchem.Atom,
pharm_pos: Tuple,
unit_vec: Tuple,
num_nbrs: int,
) -> bool:
"""
Check accessbility of acceptor atoms inspired by protocol of Pharao.
DOI: 10.1016/j.jmgm.2008.04.003
Check whether at least 2% of the points sampled on a sphere of 1.8A radius is accessible.
i.e., beyond the SAS
Arguments
---------
mol : rdkit Mol with conformer
acceptor_atom : rdkit Atom from mol that is the acceptor
pharm_pos : tuple holding coords of anchor point
unit_vec : tuple holding coords of releative unit vector
num_nbrs : int of the number of neighbors to the acceptor (heavy + hydr)
Returns
-------
bool
"""
acceptor_atom_id = acceptor_atom.GetIdx()
radii = np.array([PT.GetRvdw(atom.GetAtomicNum()) for i, atom in enumerate(mol.GetAtoms()) if i != acceptor_atom_id])
pharm_pos = np.array(pharm_pos)
# unit sphere
interaction_sphere = _get_points_fibonacci(200)
# mask out irrelevant parts of the sphere
if num_nbrs >= 3:
# hemisphere
vec = np.array(unit_vec)
inds = np.where(np.dot(vec, interaction_sphere.T) > 0)[0]
interaction_sphere = interaction_sphere[inds]
elif num_nbrs == 2:
# Little more than a hemisphere, sqrt(2)/2 = -0.7071 -> 180+45 deg
vec = np.array(unit_vec)
inds = np.where(np.dot(vec, interaction_sphere.T) > -0.7071)[0]
interaction_sphere = interaction_sphere[inds]
# otherwise full sphere
interaction_radius = 1.8 # angstroms
interaction_sphere *= interaction_radius
interaction_sphere += pharm_pos # move to position of pharmacophore
atom_pos = mol.GetConformer().GetPositions()
# don't include the atom itself
mask_atom_idx = np.where(np.arange(len(atom_pos)) != acceptor_atom_id)[0]
return __is_accessible(interaction_sphere, atom_pos, radii, mask_atom_idx)
### From rdkit:
# https://github.com/rdkit/rdkit/blob/master/rdkit/Chem/Features/FeatDirUtilsRD.py
# https://github.com/rdkit/rdkit/blob/master/rdkit/Chem/Features/ShowFeats.py
def _average_vectors(vectors: List):
"""
Arguments
---------
vectors : List of rdkit geometry point3d objects. These should be unit vectors.
Returns
-------
rdkit.Geometry.rdGeometry.Point3D object that is an average of the provided vectors.
"""
avg_vec = 0
for v in vectors:
if avg_vec == 0:
avg_vec = deepcopy(v)
else:
avg_vec += v
avg_vec.Normalize()
return avg_vec
# Lazily create and cache the feature factory
_cached_factory: rdkit.Chem.rdMolChemicalFeatures.MolChemicalFeatureFactory | None = (
None
)
[docs]
def get_pharmacophores_dict(mol: rdkit.Chem.rdchem.Mol,
multi_vector: bool = True,
exclude: List[int] = [],
check_access: bool = False,
scale: float = 1.0
) -> Dict:
"""
Get the positions of pharmacophore anchors and their associated unit vectors.
Returns a dictionary. Adapted from rdkit.Chem.Features.ShowFeats.ShowMolFeats.
Parameters
----------
mol : rdkit.Chem.Mol
RDKit Mol object with a conformer.
multi_vector : bool, optional
Whether to represent pharmacophores with multiple vectors. Default is ``True``.
exclude : list, optional
List of atom indices to not include as a HBD. Default is [].
check_access : bool, optional
Check if HBD/HBA are accessible to the molecular surface. Default is ``False``.
scale : float, optional
Length of the vector in Angstroms. Default is 1.0.
Returns
-------
dict
Dictionary with format ``{'FeatureName': {'P': [(anchor coord), ...],
'V': [(rel. vec), ...]}}``.
"""
global _cached_factory
pharmacophores = {}
if _cached_factory is None:
dirname = os.path.dirname(__file__)
fdef_file = os.path.join(dirname, "smarts_features.fdef")
_cached_factory = AllChem.BuildFeatureFactory(fdef_file)
mol_feats = _cached_factory.GetFeaturesForMol(mol)
# Filter only these for rdkit processing, we will compute hydrophobes later
keep = ('Aromatic', 'ZnBinder', 'Donor', 'Acceptor', 'Cation', 'Anion', 'Halogen')
# Non-hydrophobe pharmacophore processing
for feat in mol_feats:
family = feat.GetFamily() # type of pharmacophore
if family not in keep:
continue
if family not in pharmacophores:
pharmacophores[family] = {}
pharmacophores[family]['P'] = []
pharmacophores[family]['V'] = []
pos = feat.GetPos() # positions of pharmacophore anchor
if family.lower() == 'aromatic':
anchor, vec = GetAromaticFeatVects(conf = mol.GetConformer(),
featAtoms = feat.GetAtomIds(),
featLoc = pos,
return_both = multi_vector,
scale = scale)
if not multi_vector:
anchor = anchor[0]
vec = vec[0]
elif family.lower() == 'donor':
aids = feat.GetAtomIds()
if len(aids) == 1:
featAtom = mol.GetAtomWithIdx(aids[0])
# Multivector by default
anchor, vec, hydrogen_list = GetDonorFeatVects(conf = mol.GetConformer(),
featAtoms = aids,
scale = scale,
exclude = exclude)
if vec is not None and len(vec) > 1:
avg_vec = _average_vectors(vec)
else:
if vec is None:
avg_vec = None
else:
avg_vec = deepcopy(vec[0])
if check_access:
if anchor is None or avg_vec is None:
continue
elif not _is_donator_accessible(mol = mol,
hydrogens = hydrogen_list,
pharm_pos = anchor if not isinstance(anchor, list) else anchor[0],
unit_vec = avg_vec
):
continue # don't keep this pharmacophore
# If only one vector per pharmacophore
if not multi_vector and anchor is not None:
anchor = anchor[0]
vec = deepcopy(avg_vec)
elif family.lower() == 'acceptor':
aids = feat.GetAtomIds()
if len(aids) == 1:
featAtom = mol.GetAtomWithIdx(aids[0])
# Multivector by default
anchor, vec = GetAcceptorFeatVects(conf = mol.GetConformer(),
featAtoms = aids,
scale = scale)
if vec is not None and len(vec) > 1:
avg_vec = _average_vectors(vec)
else:
if vec is None:
avg_vec = None
else:
avg_vec = deepcopy(vec[0])
if check_access:
if anchor is None or avg_vec is None:
continue
numNbrs = len(featAtom.GetNeighbors())
if not _is_acceptor_accessible(mol = mol,
acceptor_atom = featAtom,
pharm_pos = anchor if not isinstance(anchor, list) else anchor[0],
unit_vec = avg_vec,
num_nbrs = numNbrs):
continue # don't keep this pharmacophore
# If only one vector per pharmacophore
if not multi_vector and anchor is not None:
anchor = anchor[0]
vec = deepcopy(avg_vec)
elif family.lower() == 'halogen':
aids = feat.GetAtomIds()
if len(aids) == 1:
featAtom = mol.GetAtomWithIdx(aids[0])
anchor, vec = GetHalogenFeatVects(conf = mol.GetConformer(),
featAtoms = aids,
scale = scale)
anchor = anchor[0]
vec = vec[0]
else:
anchor = pos
vec = Chem.rdGeometry.Point3D(0,0,0)
if anchor is not None and vec is not None:
pass
if isinstance(anchor, list):
pharmacophores[family]['P'].extend(_rdkit_point3d_to_tuple(x) for x in anchor)
pharmacophores[family]['V'].extend(_rdkit_point3d_to_tuple(x) for x in vec)
else:
pharmacophores[family]['P'].append(_rdkit_point3d_to_tuple(anchor))
pharmacophores[family]['V'].append(_rdkit_point3d_to_tuple(vec))
# Hydrophobe processing
hydrophobes = find_hydrophobes(mol=mol, cluster_hydrophobic=True)
pharmacophores['Hydrophobe'] = {}
pharmacophores['Hydrophobe']['P'] = hydrophobes
pharmacophores['Hydrophobe']['V'] = [(0,0,0) for _ in range(len(hydrophobes))]
return pharmacophores
[docs]
def get_pharmacophores(mol: rdkit.Chem.rdchem.Mol,
multi_vector: bool = True,
exclude: List[int] = [],
check_access: bool = False,
scale: float = 1.0
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Get the identity, anchor positions, and relative unit vectors for each pharmacophore.
Pharmacophore ordering for indexing:
('Acceptor', 'Donor', 'Aromatic', 'Hydrophobe', 'Cation', 'Anion', 'ZnBinder')
Notes
-----
The ``check_access`` parameter is currently based on whether interaction points sampled
from a sphere's surface with a radius of 1.8A from the acceptor/donor atom falls outside
the solvent accessible surface defined by the vdW radius + 0.8A of the neighboring atoms.
This works for buried acceptors/donors, but may be prone to false positives. For example,
CN(C)C would have its sole HBA rejected. Other approaches such as buried volume should
be considered in the future.
Parameters
----------
mol : rdkit.Chem.Mol
RDKit Mol object with conformer.
multi_vector : bool, optional
Whether to represent pharmacophores with multiple vectors. Default is ``True``.
exclude : list, optional
List of hydrogen indices to not include as a HBD. Default is [].
check_access : bool, optional
Check if HBD/HBA are accessible to the molecular surface. Default is ``False``.
scale : float, optional
Length of a pharmacophore vector in Angstroms. Default is 1.0.
Returns
-------
X : np.ndarray
Identity of pharmacophore corresponding to the indexing order, shape (N,).
P : np.ndarray
Anchor positions of each pharmacophore, shape (N, 3).
V : np.ndarray
Unit vectors in a relative position to the anchor positions, shape (N, 3).
Adding P and V results in the position of the vector's extended point.
"""
pharmacophores_dict = get_pharmacophores_dict(mol=mol,
multi_vector=multi_vector,
check_access=check_access,
scale=scale,
exclude=exclude)
N = sum(len(pharmacophores_dict[family]['P']) for family in pharmacophores_dict)
X = np.empty((N,), dtype=np.int64)
P = np.empty((N, 3), dtype=np.float64)
V = np.empty((N, 3), dtype=np.float64)
start_idx = 0
for family in pharmacophores_dict:
this_len = len(pharmacophores_dict[family]['P'])
if this_len == 0:
continue
end_idx = start_idx + this_len
X[start_idx:end_idx] = P_TYPES.index(family)
P[start_idx:end_idx, :] = pharmacophores_dict[family]['P']
V[start_idx:end_idx, :] = pharmacophores_dict[family]['V']
start_idx = end_idx
return X, P, V