Source code for shepherd_score.conformer_generation

"""
Handles anything related to generating conformers with xTB or MMFF94.

Requires xtb installation with command-line access.
See https://xtb-docs.readthedocs.io/en/latest/setup.html for installation instructions.
"""
import os
from copy import deepcopy
import re
import shutil
import subprocess
import time
from pathlib import Path
from tqdm import tqdm
import uuid
from typing import Optional, List, Union
import contextlib
import multiprocessing

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Geometry import Point3D
from rdkit.Chem import rdMolAlign
from rdkit.ML.Cluster import Butina # type: ignore

TMPDIR = os.environ.get('TMPDIR', '')
if TMPDIR == '' and Path('/tmp').is_dir():
    TMPDIR = '/tmp'
TMPDIR = Path(TMPDIR)



[docs]
@contextlib.contextmanager
def set_thread_limits(num_threads: int):
    """Temporarily set threading environment variables."""
    env_vars = [
        'OMP_NUM_THREADS',
        'OPENBLAS_NUM_THREADS',
        'MKL_NUM_THREADS',
        'NUMEXPR_NUM_THREADS',
        'VECLIB_MAXIMUM_THREADS',
    ]
    old_env = {var: os.environ.get(var) for var in env_vars}
    try:
        for var in env_vars:
            os.environ[var] = str(num_threads)
        yield
    finally:
        for var, val in old_env.items():
            if val is None:
                os.environ.pop(var, None)
            else:
                os.environ[var] = val




[docs]
def update_mol_coordinates(mol: Chem.Mol, coordinates) -> Chem.Mol:
    """
    Update the coordinates of a 3D RDKit mol object with a new set of coordinates.

    Parameters
    ----------
    mol : Chem.Mol
        RDKit mol object with 3D coordinates to be replaced.
    coordinates : list or array-like
        List/array of new [x, y, z] coordinates.

    Returns
    -------
    Chem.Mol
        RDKit mol object with updated 3D coordinates.
    """
    mol_new = deepcopy(mol)
    conf = mol_new.GetConformer()
    for i in range(mol_new.GetNumAtoms()):
        x,y,z = coordinates[i]
        conf.SetAtomPosition(i, Point3D(x,y,z))
    return mol_new



[docs]
def read_multi_xyz_file(file_dir: str):
    """
    Read an xyz file that potentially contains multiple structures.

    Parameters
    ----------
    file_dir : str
        Path to .xyz file.

    Returns
    -------
    all_coordinates : list
        List of lists containing the coordinates of each structure in the xyz file.
    all_elements : list
        List of lists containing the element types of each atom in each structure.
    """
    atom_types = [Chem.GetPeriodicTable().GetElementSymbol(i) for i in range(1, 119)]
    with open(file_dir, 'r') as file:
        lines = file.readlines()
        all_coordinates = []
        all_elements = []

        begin_coord = False
        coordinates = []
        elements = []
        for line in lines:
            stripped_line = (' '.join(line.split()))
            stripped_line_split = stripped_line.split(' ')
            numbers = re.findall(r"[-+]?(?:\d*\.\d+|\d+)", stripped_line)
            if len(stripped_line) == 0:
                continue

            if (len(numbers) == 3) & (stripped_line_split[0] in atom_types):
                begin_coord = True
                x,y,z = numbers
                coordinates.append([float(x),float(y),float(z)])
                elements.append(stripped_line_split[0])
            else:
                if begin_coord:
                    all_coordinates.append(coordinates)
                    all_elements.append(elements)
                    elements = []
                    coordinates = []
                begin_coord = False

    if begin_coord:
        all_coordinates.append(coordinates)
        all_elements.append(elements)

    return all_coordinates, all_elements




[docs]
def embed_conformer(mol: Chem.Mol, attempts: int=50, MMFF_optimize: bool=False, random_seed=-1):
    """
    Embed a mol object into a 3D RDKit mol object with ETKDG (and optional MMFF94).

    Parameters
    ----------
    mol : Chem.Mol
        RDKit Mol object.
    attempts : int, optional
        Number of embedding attempts. Default is 50.
    MMFF_optimize : bool, optional
        Whether to optimize embedded conformer with MMFF94. Default is ``False``.
    random_seed: int, optional
        Seed for RDKit's EmbedMolecule. -1 means no seed, otherwise must be positive.

    Returns
    -------
    Chem.Mol or None
        RDKit mol object with 3D coordinates, or ``None`` if embedding fails.
    """
    try:
        mol = Chem.AddHs(mol)
        AllChem.EmbedMolecule(mol, maxAttempts = attempts, randomSeed = random_seed)
        if MMFF_optimize:
            AllChem.MMFFOptimizeMolecule(mol)

        mol.GetConformer() # test whether conformer generation succeeded
    except Exception:
        return None

    return mol




[docs]
def embed_conformer_from_smiles(smiles: str, attempts: int = 50, MMFF_optimize: bool = False, random_seed: int = -1):
    """
    Embed a SMILES into a 3D RDKit mol object with ETKDG (and optionally MMFF94).

    Parameters
    ----------
    smiles : str
        SMILES string of molecule.
    attempts : int, optional
        Number of embedding attempts. Default is 50.
    MMFF_optimize : bool, optional
        Whether to optimize embedded conformer with MMFF94. Default is ``False``.
    random_seed: int, optional
        Seed for RDKit's EmbedMolecule. -1 means no seed, otherwise must be positive.

    Returns
    -------
    Chem.Mol or None
        RDKit mol object with 3D coordinates, or ``None`` if embedding fails.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception as e:
        print('Error in SMILES string when embedding molecule:', e)
        return None
    mol = embed_conformer(mol, attempts, MMFF_optimize, random_seed)
    return mol




[docs]
def conf_to_mol(mol, conf_id):
    """
    Convert a conformer of a RDKit mol object into its own RDKit mol object.

    Parameters
    ----------
    mol : Chem.Mol
        RDKit mol object with multiple conformers.
    conf_id : int
        ID of conformer to be converted into its own mol object.

    Returns
    -------
    Chem.Mol
        Mol object with only 1 conformer (the selected conformer).
    """
    conf = mol.GetConformer(conf_id)
    new_mol = Chem.Mol(mol)
    new_mol.RemoveAllConformers()
    new_mol.AddConformer(Chem.Conformer(conf))
    return new_mol




[docs]
def generate_conformer_ensemble(mol_3d: Chem.Mol,
                                num_confs: int=100,
                                num_threads: int = 4,
                                threshold: float = 0.25,
                                num_opt_steps: int = 200):
    """
    Use ETKDG algorithm to embed multiple conformers from a given 3D conformer template.

    Optionally optimizes each embedded conformer with MMFF94.

    Parameters
    ----------
    mol_3d : Chem.Mol
        RDKit mol object with 3D coordinates.
    num_confs : int, optional
        Maximum number of conformers to be embedded with ETKDG. Default is 100.
    num_threads : int, optional
        Number of processors to be used in parallel when embedding conformers. Default is 4.
    threshold : float, optional
        RMSD threshold used to eliminate redundant conformers after ETKDG embedding.
        Default is 0.25.
    num_opt_steps : int, optional
        Number of MMFF94 optimization steps. Default is 200.

    Returns
    -------
    list
        List of mol objects, each containing 1 (unique) conformer.
    """
    mol_3d = deepcopy(mol_3d)

    cids = AllChem.EmbedMultipleConfs(
        mol_3d,
        clearConfs=True,
        numConfs=num_confs,
        pruneRmsThresh = threshold,
        maxAttempts = 50,
        numThreads = num_threads,
    )

    if num_opt_steps > 0:
        for cid in cids:
            AllChem.MMFFOptimizeMolecule(
                mol_3d,
                confId=cid,
                mmffVariant='MMFF94',
                maxIters = num_opt_steps,

            )
    mols = [conf_to_mol(mol_3d, c) for c in cids]

    return mols




[docs]
def cluster_conformers_butina(conformers, threshold: float = 0.2, num_max_conformers: int = 100):
    """
    Cluster a list of conformers by their pairwise RMSD with Butina Clustering algorithm.

    Parameters
    ----------
    conformers : list
        List of rdkit mol objects containing conformers of a common molecule to be clustered.
    threshold : float, optional
        Initial RMSD threshold for clustering. Default is 0.2.
    num_max_conformers : int, optional
        Maximum number of conformers in the final clustered ensemble. Default is 100.

    Returns
    -------
    list
        List of int indices of the centroids of each cluster, to be indexed into conformers.
    """
    dists = []
    for i, conformer_i in enumerate(conformers):
        for j in range(i):
            mol_i = Chem.RemoveHs(deepcopy(conformer_i))
            mol_j = Chem.RemoveHs(deepcopy(conformers[j]))
            dists.append(rdMolAlign.GetBestRMS(mol_i, mol_j))

    clusts = Butina.ClusterData(dists, len(conformers), threshold, isDistData=True, reordering=True)

    if isinstance(num_max_conformers, int):
        while len(clusts) > num_max_conformers:
            threshold += 0.1
            clusts = Butina.ClusterData(dists, len(conformers),
                                        threshold, isDistData=True, reordering=True)

    select_confs = [c[0] for c in clusts] # picking centroids of each cluster

    return select_confs




[docs]
def optimize_conformer_with_xtb(conformer: Chem.Mol,
                                solvent: Optional[str] = None,
                                num_cores: int = 1,
                                charge: int = 0,
                                temp_dir: Union[str, Path] = TMPDIR,
                                timeout: Optional[float] = None):
    """
    Use external calls to GFN2-XTB (command line) to optimize a conformer geometry.

    Parameters
    ----------
    conformer : Chem.Mol
        RDKit mol object containing 3D coordinates.
    solvent : str, optional
        Implicit solvent to be used during optimization. Must be a solvent supported
        by XTB (https://xtb-docs.readthedocs.io/en/latest/gbsa.html). Default is ``None``.
    num_cores : int, optional
        Number of CPU cores to be used in the xTB geometry optimization. Default is 1.
    charge : int, optional
        Molecular charge. Default is 0.
    temp_dir : str or Path, optional
        Temporary directory for I/O. Default is the system temporary directory.
    timeout : float, optional
        Maximum number of seconds to allow the xtb subprocess to run before it is
        killed and ``subprocess.TimeoutExpired`` is raised. Default is ``None`` (no timeout).

    Returns
    -------
    tuple
        (xtb_mol, energy, charges) - tuple of optimized RDKit mol object,
        xTB energy (in Hartrees), and partial charges (in e-).
    """
    mol = deepcopy(conformer)

    with set_thread_limits(num_cores):
        # rand = str((os.getpid() * int(time.time())) % 123456789)
        rand = str(uuid.uuid4()) + ''.join(str(time.time()).split('.')[1])
        out_dir = Path(f'temp_xtb_opt_{rand}/')
        try:
            if temp_dir != '':
                if isinstance(temp_dir, str):
                    temp_dir = Path(temp_dir)
                out_dir = temp_dir / out_dir
            out_dir.mkdir(exist_ok=True)

            input_file = 'input_mol.xyz'
            Chem.rdmolfiles.MolToXYZFile(mol, str(out_dir/input_file))

            if solvent is not None:
                subprocess.check_call(
                    ['xtb', input_file, '--opt', '--alpb', solvent, '--parallel', str(num_cores), '--chrg', str(charge)],
                    cwd = out_dir,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT,
                    timeout=timeout,
                )
            else:
                subprocess.check_call(
                    ['xtb', input_file, '--opt', '--parallel', str(num_cores), '--chrg', str(charge)],
                    cwd = out_dir,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT,
                    timeout=timeout,
                )

            xtb_coords_list, xtb_elements_list = read_multi_xyz_file(out_dir/'xtbopt.xyz')
            xtb_coords = xtb_coords_list[0]
            xtb_mol = update_mol_coordinates(mol, xtb_coords)

            with open(out_dir/'xtbopt.xyz', 'r') as file:
                lines = file.readlines()
                for line in lines:
                    if 'energy' in line:
                        numbers = re.findall(r"[-+]?(?:\d*\.\d+|\d+)", line)
                        energy = float(numbers[0])
                        break

            with open(out_dir/'charges', 'r') as file:
                lines = file.readlines()
                charges = [0]*len(lines)
                for i, line in enumerate(lines):
                    charges[i] = float(line.split()[0])
                    xtb_mol.GetAtomWithIdx(i).SetProp('charge', str(charges[i]))

            xtb_mol.SetProp("energy", str(energy))

        finally:
            if out_dir.exists():
                shutil.rmtree(out_dir)

        return (xtb_mol, energy, charges)




[docs]
def optimize_conformer_with_xtb_from_xyz_block(xyz_block: str,
                                               solvent: Optional[str] = None,
                                               num_cores: int = 1,
                                               charge: int = 0,
                                               temp_dir: Union[str, Path] = TMPDIR,
                                               timeout: Optional[float] = None):
    """
    Use external calls to GFN2-XTB (command line) to optimize coordinates from an xyz block.

    Parameters
    ----------
    xyz_block : str
        String of an xyz block.
    solvent : str, optional
        Implicit solvent to be used during optimization. Must be a solvent supported
        by XTB (https://xtb-docs.readthedocs.io/en/latest/gbsa.html). Default is ``None``.
    num_cores : int, optional
        Number of CPU cores to be used in the xTB geometry optimization. Default is 1.
    charge : int, optional
        Molecular charge. Default is 0.
    temp_dir : str or Path, optional
        Temporary directory for I/O. Default is the system temporary directory.
    timeout : float, optional
        Maximum number of seconds to allow the xtb subprocess to run before it is
        killed and ``subprocess.TimeoutExpired`` is raised. Default is ``None`` (no timeout).

    Returns
    -------
    tuple
        (xtb_xyz_block, energy, charges) - tuple of optimized xyz block string,
        xTB energy (in Hartrees), and partial charges (in e-).
    """
    with set_thread_limits(num_cores):
        # rand = str((os.getpid() * int(time.time())) % 123456789)
        rand = str(uuid.uuid4()) + ''.join(str(time.time()).split('.')[1])
        out_dir = Path(f'temp_xtb_opt_{rand}/')
        try:
            if temp_dir != '':
                if isinstance(temp_dir, str):
                    temp_dir = Path(temp_dir)
                out_dir = temp_dir / out_dir
            out_dir.mkdir(exist_ok=True)

            input_file = 'input_mol.xyz'
            with open(out_dir/input_file, 'w') as f:
                f.write(xyz_block)

            if solvent is not None:
                subprocess.check_call(
                    ['xtb', input_file, '--opt', '--alpb', solvent, '--parallel', str(num_cores), '--chrg', str(charge)],
                    cwd = out_dir,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT,
                    timeout=timeout,
                )
            else:
                subprocess.check_call(
                    ['xtb', input_file, '--opt', '--parallel', str(num_cores), '--chrg', str(charge)],
                    cwd = out_dir,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT,
                    timeout=timeout,
                )

            opt_xyz_path = out_dir/'xtbopt.xyz'
            if opt_xyz_path.is_file():
                with open(out_dir/'xtbopt.xyz', 'r') as file:
                    xtb_xyz_block = file.readlines()
                    for line in xtb_xyz_block:
                        if 'energy' in line:
                            numbers = re.findall(r"[-+]?(?:\d*\.\d+|\d+)", line)
                            energy = float(numbers[0])
                            break

                if 'energy' in xtb_xyz_block[1]:
                    xtb_xyz_block[1] = '\n' # Replace energy line with blank
                xtb_xyz_block = ''.join(xtb_xyz_block)
            else:
                xtb_xyz_block = None
                energy = None

            with open(out_dir/'charges', 'r') as file:
                lines = file.readlines()
                charges = [0]*len(lines)
                for i, line in enumerate(lines):
                    charges[i] = float(line.split()[0])

        finally:
            if out_dir.exists():
                shutil.rmtree(out_dir)

        return (xtb_xyz_block, energy, charges)




[docs]
def charges_from_single_point_conformer_with_xtb(conformer: Chem.Mol,
                                                 solvent: Optional[str] = None,
                                                 num_cores: int = 1,
                                                 charge: int = 0,
                                                 temp_dir: Union[str, Path] = TMPDIR,
                                                 timeout: Optional[float] = None
                                                 ):
    """
    Compute atomic partial charges from a single point xTB calculation of a provided conformer.

    Uses external calls to GFN2-XTB (command line).

    Parameters
    ----------
    conformer : Chem.Mol
        RDKit mol object containing 3D coordinates.
    solvent : str, optional
        Implicit solvent to be used during calculation. Must be a solvent supported
        by XTB (https://xtb-docs.readthedocs.io/en/latest/gbsa.html). Default is ``None``.
    num_cores : int, optional
        Number of CPU cores to be used in the xTB calculation. Default is 1.
    charge : int, optional
        Molecular charge. Default is 0.
    temp_dir : str or Path, optional
        Temporary directory for I/O. Default is the system temporary directory.
    timeout : float, optional
        Maximum number of seconds to allow the xtb subprocess to run before it is
        killed and ``subprocess.TimeoutExpired`` is raised. Default is ``None`` (no timeout).

    Returns
    -------
    list
        List of partial charges for each atom (in e-).
    """

    mol = deepcopy(conformer)

    with set_thread_limits(num_cores):
        # rand = str((os.getpid() * int(time.time())) % 123456789)
        rand = str(uuid.uuid4()) + ''.join(str(time.time()).split('.')[1])
        out_dir = Path(f'temp_xtb_opt_{rand}/')
        try:
            if temp_dir != '':
                if isinstance(temp_dir, str):
                    temp_dir = Path(temp_dir)
                out_dir = temp_dir / out_dir
            out_dir.mkdir(exist_ok=True)

            input_file = 'input_mol.xyz'
            Chem.rdmolfiles.MolToXYZFile(mol, str(out_dir/input_file))

            if solvent is not None:
                subprocess.check_call(
                    ['xtb', input_file, '--scc', '--alpb', solvent, '--parallel', str(num_cores), '--chrg', str(charge)],
                    cwd = out_dir,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT,
                    timeout=timeout,
                )
            else:
                subprocess.check_call(
                    ['xtb', input_file, '--scc', '--parallel', str(num_cores), '--chrg', str(charge)],
                    cwd = out_dir,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT,
                    timeout=timeout,
                )

            with open(out_dir/'charges', 'r') as file:
                lines = file.readlines()
                charges = [0]*len(lines)
                for i, line in enumerate(lines):
                    charges[i] = float(line.split()[0])

        finally:
            if out_dir.exists():
                shutil.rmtree(out_dir)

        return charges




[docs]
def single_point_xtb_from_xyz(xyz_block: str,
                              solvent: Optional[str] = None,
                              num_cores: int = 1,
                              charge: int = 0,
                              temp_dir: Union[str, Path] = TMPDIR,
                              timeout: Optional[float] = None):
    """
    Compute energy and atomic partial charges from a single point xTB calculation.

    Uses external calls to GFN2-XTB (command line).

    Parameters
    ----------
    xyz_block : str
        String of xyz block representing a molecule.
    solvent : str, optional
        Implicit solvent to be used during calculation. Must be a solvent supported
        by XTB (https://xtb-docs.readthedocs.io/en/latest/gbsa.html). Default is ``None``.
    num_cores : int, optional
        Number of CPU cores to be used in the xTB calculation. Default is 1.
    charge : int, optional
        Molecular charge. Default is 0.
    temp_dir : str or Path, optional
        Temporary directory for I/O. Default is the system temporary directory.
    timeout : float, optional
        Maximum number of seconds to allow the xtb subprocess to run before it is
        killed and ``subprocess.TimeoutExpired`` is raised. Default is ``None`` (no timeout).

    Returns
    -------
    energy : float
        xTB energy in Hartrees.
    charges : list
        List of partial charges for each atom (in e-).
    """

    with set_thread_limits(num_cores):
        # rand = str((os.getpid() * int(time.time())) % 123456789)
        rand = str(uuid.uuid4()) + ''.join(str(time.time()).split('.')[1])
        out_dir = Path(f'temp_xtb_opt_{rand}/')
        try:
            if temp_dir != '':
                if isinstance(temp_dir, str):
                    temp_dir = Path(temp_dir)
                out_dir = temp_dir / out_dir
            out_dir.mkdir(exist_ok=True)

            input_file = 'input_mol.xyz'
            with open(out_dir/input_file, 'w') as f:
                f.write(xyz_block)

            if solvent is not None:
                output = subprocess.check_output(
                    ['xtb', input_file, '--scc', '--alpb', solvent, '--parallel', str(num_cores), '--chrg', str(charge)],
                    cwd = out_dir,
                    text=True,
                    encoding="utf-8",
                    errors="replace",
                    stderr=subprocess.STDOUT,
                    timeout=timeout,
                )
            else:
                output = subprocess.check_output(
                    ['xtb', input_file, '--scc', '--parallel', str(num_cores), '--chrg', str(charge)],
                    cwd = out_dir,
                    text=True,
                    encoding="utf-8",
                    errors="replace",
                    stderr=subprocess.STDOUT,
                    timeout=timeout,
                )

            output = output.split('\n')
            for line in output[::-1]:
                if 'TOTAL ENERGY' in line:
                    numbers = re.findall(r"[-+]?(?:\d*\.\d+|\d+)", line)
                    energy = float(numbers[0])
                    break

            with open(out_dir/'charges', 'r') as file:
                lines = file.readlines()
                charges = [0]*len(lines)
                for i, line in enumerate(lines):
                    charges[i] = float(line.split()[0])
        finally:
            if out_dir.exists():
                shutil.rmtree(out_dir)

        return energy, charges



def _optimize_conformer_worker(params):
    """Worker function for multiprocessing - must be at module level for pickling"""
    return optimize_conformer_with_xtb(*params)


[docs]
def optimize_conformer_ensemble_with_xtb(conformers: List[Chem.Mol],
                                       solvent: Optional[str] = None,
                                       num_processes: int = 1,
                                       num_workers: int = 1,
                                       charge: int = 0,
                                       temp_dir: Union[str, Path] = TMPDIR,
                                       verbose: bool = False):
    """
    GFN2-XTB geometry optimization for a list of conformers.

    Parameters
    ----------
    conformers : list
        List of RDKit Mol objects (with 3D coordinates) to be optimized.
    solvent : str, optional
        Implicit solvent supported by XTB
        (https://xtb-docs.readthedocs.io/en/latest/gbsa.html). Default is ``None``.
    num_processes : int, optional
        Number of CPU cores used per xTB optimization. Default is 1.
    num_workers : int, optional
        Number of parallel workers (processes) to distribute conformers across.
        Ensure num_workers * num_processes <= available CPUs to avoid oversubscription.
        Default is 1.
    charge : int, optional
        Molecular charge. RDKit will be used to compute the formal charge if
        len(conformers) > 1. Default is 0.
    temp_dir : str or Path, optional
        Temporary directory for I/O. Default is the system temporary directory.
    verbose : bool, optional
        Show a simple progress bar in single-process mode. Default is ``False``.

    Returns
    -------
    tuple
        (conformers_opt, energies_opt, charges_opt) - tuple of lists containing
        optimized conformers, their energies, and partial charges.
    """

    if not conformers:
        return [], [], []

    available_cpus = os.cpu_count() or 1
    total_requested_cpus = max(1, num_workers) * max(1, num_processes)
    if total_requested_cpus > available_cpus:
        raise ValueError(
            f"Requested num_workers * num_processes = {total_requested_cpus} exceeds available CPUs ({available_cpus})."
        )

    # Serial path (single conformer or single worker)
    if len(conformers) == 1 or num_workers <= 1:
        conformers_opt = []
        energies_opt = []
        charges_opt = []

        iterator = tqdm(conformers, total=len(conformers), desc='XTB opt') if verbose else conformers
        for conf in iterator:
            opt_conf, opt_energy, opt_charges = optimize_conformer_with_xtb(
                conformer=conf,
                solvent=solvent,
                num_cores=num_processes,
                charge=charge,
                temp_dir=temp_dir,
            )
            conformers_opt.append(opt_conf)
            energies_opt.append(opt_energy)
            charges_opt.append(opt_charges)

        return conformers_opt, energies_opt, charges_opt

    # Parallel
    ctx = multiprocessing.get_context('spawn')

    args = [
        (conf, solvent, num_processes, Chem.GetFormalCharge(conf), temp_dir) for conf in conformers
    ]

    with ctx.Pool(processes=num_workers) as pool:
        iterator = pool.imap(_optimize_conformer_worker, args)
        if verbose:
            iterator = tqdm(iterator, total=len(args), desc='XTB opt')
        results = list(iterator)

    conformers_opt, energies_opt, charges_opt = zip(*results)
    return list(conformers_opt), list(energies_opt), list(charges_opt)




[docs]
def generate_opt_conformers_xtb(smiles: str,
                                charge: int = 0,
                                solvent: Optional[str]=None,
                                MMFF_optimize: bool = True,
                                num_processes: int = 1,
                                num_workers: int = 1,
                                temp_dir: Union[str, Path] = TMPDIR,
                                verbose: bool = False,
                                num_confs: int = 1000):
    """
    Generate conformer ensemble with RDKit then relax with xTB.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    charge : int, optional
        Molecular charge. Default is 0.
    solvent : str, optional
        Implicit solvent to be used during optimization. Must be a solvent supported
        by XTB (https://xtb-docs.readthedocs.io/en/latest/gbsa.html). Default is ``None``.
    MMFF_optimize : bool, optional
        Optimize RDKit embedded molecules with MMFF94. Default is ``True``.
    num_processes : int, optional
        Number of CPU cores to be used in the xTB geometry optimization. Default is 1.
    num_workers : int, optional
        Number of parallel workers (processes) to distribute conformers across.
        Ensure num_workers * num_processes <= available CPUs to avoid oversubscription.
        Default is 1.
    temp_dir : str or Path, optional
        Temporary directory for I/O. Default is the system temporary directory.
    verbose : bool, optional
        Toggle tqdm progress bar. Default is ``False``.
    num_confs : int, optional
        Number of conformers to initially generate. Default is 1000.

    Returns
    -------
    clustered_conformers_xtb : list
        List of rdkit conformers after xTB relaxation and clustering.
    clustered_energies_xtb : list
        List of energies for associated conformers.
    clustered_charges_xtb : list
        List of partial charges for associated conformers.
    """
    available_cpus = os.cpu_count() or 1
    total_requested_cpus = max(1, num_workers) * max(1, num_processes)
    if total_requested_cpus > available_cpus:
        raise ValueError(
            f"Requested num_workers * num_processes = {total_requested_cpus} exceeds available CPUs ({available_cpus})."
        )
    mol_3d = embed_conformer_from_smiles(smiles, attempts = 50, MMFF_optimize = MMFF_optimize)

    if mol_3d is None:
        return None, None, None

    conformer_ensemble = generate_conformer_ensemble(
        mol_3d,
        num_confs=num_confs,
        num_threads = 4,
        threshold = 0.05,
        num_opt_steps = 50,
    )
    if verbose:
        print(f'Num conformers generated: \t{len(conformer_ensemble)}')

    clustered_conformers_index = cluster_conformers_butina(
        conformer_ensemble,
        threshold = 0.1,
        num_max_conformers = None,
    )
    clustered_conformers = [conformer_ensemble[c] for c in clustered_conformers_index]
    if verbose:
        print(f'Num conformers remaining: \t{len(clustered_conformers)}')

    # further optimizing each conformer with GFN2-XTB
    optimized_conformers_xtb, optimized_energies_xtb, optimized_charges_xtb = optimize_conformer_ensemble_with_xtb(
        clustered_conformers,
        solvent = solvent,
        num_processes = num_processes,
        num_workers = num_workers,
        charge=charge,
        temp_dir = temp_dir,
        verbose=verbose
    )
    if verbose:
        print(f'Num optimized confomers with xtb: \t{len(optimized_conformers_xtb)}')

    clustered_conformers_xtb_index = cluster_conformers_butina(
        optimized_conformers_xtb,
        threshold = 0.1,
        num_max_conformers = None,
    )
    clustered_conformers_xtb = [optimized_conformers_xtb[c] for c in clustered_conformers_xtb_index]
    clustered_charges_xtb = [optimized_charges_xtb[c] for c in clustered_conformers_xtb_index]
    clustered_energies_xtb = [optimized_energies_xtb[c] for c in clustered_conformers_xtb_index]

    if verbose:
        print(f'Num conformers remaining after xtb opt and cluster: \t{len(clustered_conformers_xtb)}')

    return clustered_conformers_xtb, clustered_energies_xtb, clustered_charges_xtb




[docs]
def generate_opt_conformers(smiles: str,
                            MMFF_optimize: bool = True,
                            verbose: bool = False,
                            num_confs: int = 1000):
    """
    Generate optimal conformers with RDKit (MMFF94).

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    MMFF_optimize : bool, optional
        Optimize RDKit embedded molecules with MMFF94. Default is ``True``.
    verbose : bool, optional
        Toggle tqdm progress bar. Default is ``False``.
    num_confs : int, optional
        Number of conformers to initially generate. Default is 1000.

    Returns
    -------
    list
        List of clustered rdkit conformers after RDKit embedding and optional MMFF relaxation.
    """
    mol_3d = embed_conformer_from_smiles(smiles, attempts = 50, MMFF_optimize = MMFF_optimize)

    if mol_3d is None:
        return None, None, None

    conformer_ensemble = generate_conformer_ensemble(
        mol_3d,
        num_confs=num_confs,
        num_threads = 4,
        threshold = 0.05,
        num_opt_steps = 50,
    )
    if verbose:
        print(f'Num conformers generated: \t{len(conformer_ensemble)}')

    clustered_conformers_index = cluster_conformers_butina(
        conformer_ensemble,
        threshold = 0.1,
        num_max_conformers = None,
    )
    clustered_conformers = [conformer_ensemble[c] for c in clustered_conformers_index]
    if verbose:
        print(f'Num conformers remaining: \t{len(clustered_conformers)}')

    return clustered_conformers