Source code for chembee.preparation.processing

import pandas as pd
from mordred import Calculator, descriptors
import numpy as np

from rdkit import Chem
from rdkit.Chem import (
    Descriptors,
    Descriptors3D,
    rdMolDescriptors,
    PandasTools,
    RDKFingerprint,
)
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect

# Utilities
import os
from tqdm import tqdm


[docs]def calculate_mordred_descriptors(mols: list) -> pd.DataFrame: """ The calculate_mordred_descriptors function calculates Mordred descriptors for a set of molecules. The function takes as input a list of RDKit molecule objects and returns the calculated descriptors in a pandas dataframe. :param mols: Used to specify the molecules for which descriptors are to be calculated. :return: A pandas dataframe containing the calculated descriptors. :doc-author: Julian M. Kleber """ calc = Calculator(descriptors, ignore_3D=False) data = calc.pandas(mols) return data
[docs]def calculate_lipinski_desc(data_set: pd.DataFrame, mols: pd.Series) -> pd.DataFrame: """ The calculate_lipinski_desc function calculates the molecular descriptors for each molecule in a data set. The function takes two arguments: 1) data_set - A pandas DataFrame containing the SMILES strings of each molecule in a data set. 2) mols - A pandas Series containing the RDKit Mol object of each molecule in a data set. The function returns one value: 1) The original DataFrame with additional columns for each descriptor, which contain floats. :param data_set:pd.DataFrame: Used to Store the calculated descriptors. :param mols:pd.Series: Used to Pass a series of molecules to the function. :return: A dataframe with the descriptors for each molecule in the mols series. :doc-author: Trelent """ for i, mol in tqdm(enumerate(mols)): Chem.SanitizeMol(mol) data_set.loc[i, "Molecule"] = mol data_set.loc[i, "MolWt"] = Descriptors.MolWt(mol) data_set.loc[i, "LogP"] = Descriptors.MolLogP(mol) data_set.loc[i, "NumHAcceptors"] = Descriptors.NumHAcceptors(mol) data_set.loc[i, "NumHDonors"] = Descriptors.NumHDonors(mol) data_set.loc[i, "NumHeteroatoms"] = Descriptors.NumHeteroatoms(mol) data_set.loc[i, "NumRotatableBonds"] = Descriptors.NumRotatableBonds( mol) data_set.loc[i, "NumHeavyAtoms"] = Descriptors.HeavyAtomCount(mol) data_set.loc[ i, "NumAliphaticCarbocycles" ] = Descriptors.NumAliphaticCarbocycles(mol) data_set.loc[ i, "NumAliphaticHeterocycles" ] = Descriptors.NumAliphaticHeterocycles(mol) data_set.loc[i, "NumAliphaticRings"] = Descriptors.NumAliphaticRings( mol) data_set.loc[i, "NumAromaticCarbocycles"] = Descriptors.NumAromaticCarbocycles( mol ) data_set.loc[ i, "NumAromaticHeterocycles" ] = Descriptors.NumAromaticHeterocycles(mol) data_set.loc[i, "NumAromaticRings"] = Descriptors.NumAromaticRings(mol) data_set.loc[i, "RingCount"] = Descriptors.RingCount(mol) data_set.loc[i, "FractionCSP3"] = Descriptors.FractionCSP3(mol) data_set.loc[i, "TPSA"] = Descriptors.TPSA(mol) data_set.loc[i, "NPR1"] = rdMolDescriptors.CalcNPR1(mol) data_set.loc[i, "NPR2"] = rdMolDescriptors.CalcNPR2(mol) data_set.loc[i, "InertialShapeFactor"] = Descriptors3D.InertialShapeFactor( mol) data_set.loc[i, "RadiusOfGyration"] = Descriptors3D.RadiusOfGyration( mol) return data_set
[docs]def convert_mols_to_rdk_fp(mols, return_bit=False): """ The convert_mols_to_rdkfingerprint function converts a list of RDKit molecules into a list of RDKit fingerprints. :param mols: Used to pass the molecules that are to be converted into fingerprints. :return: A list of rdkit fingerprints. :doc-author: Julian M. Kleber """ if return_bit: fps = [np.array(RDKFingerprint(x)).tolist() for x in mols] else: fps = [RDKFingerprint(x) for x in mols] return fps
[docs]def convert_mols_to_morgan_fp(mols, radius=3, n_bits=2048, return_bit=False): """ The convert_mols_to_morgan_fp function converts a list of molecules to their Morgan fingerprints. :param mols: Used to specify the molecules that are to be converted into Morgan fingerprints. :param radius=3: Used to determine the number of bits that will be used to represent each molecule. :param n_bits=2048: Used to determine the number of bits in the Morgan fingerprint. :return: A list of fingerprints. :doc-author: Julian M. Kleber """ import numpy as np if return_bit: fps = [ GetMorganFingerprintAsBitVect( np.array(x).tolist(), useChirality=True, useFeatures=True, radius=radius, nBits=n_bits, ) for x in mols ] else: fps = [ GetMorganFingerprintAsBitVect( x, useChirality=True, useFeatures=True, radius=radius, nBits=n_bits ) for x in mols ] return fps
[docs]def convert_mol_to_inchi(mols): """ The convert_mol_to_inchi function converts a molecule into an InChI string. :param mol: Used to pass the molecule to the function. :return: A tuple of the inchi and inchikey. :doc-author: Julian M. Kleber """ from rdkit import Chem inchis = [] inchikeys = [] for mol in mols: inchis.append(str(Chem.MolToInchi(mol))) inchikeys.append(str(Chem.MolToInchiKey(mol))) return inchis, inchikeys
[docs]def get_mols_from_supplier(indices, supplier): """ The get_mols_from_supplier function takes a list of indices and a supplier object, and returns the molecules corresponding to those indices. :param indices: Used to specify which molecules to retrieve from the supplier. :param supplier: Used to specify the supplier of the molecules. :return: A list of molecules. :doc-author: Julian M. Kleber """ mols = [] for indx in indices: mol = supplier[indx] mols.append(mol) return mols
[docs]def load_data(file_path: str): """ The load_data function loads the data from a file and returns it as a Pandas DataFrame. :param file_path:str: Used to specify the location of the. :return: A dataframe with the columns specified by the sdf data entries :doc-author: Julian M. Kleber """ sdfFile = os.path.join(file_path) mols = Chem.SDMolSupplier(sdfFile) frame = PandasTools.LoadSDF( sdfFile, smilesName="SMILES", molColName="Molecule", includeFingerprints=True, removeHs=False, strictParsing=True, ) frame = calculate_lipinski_desc(data_set=frame, mols=mols) return frame
if __name__ == "__main__": from utils import save_csv data = load_data("tests/data/Biodeg.sdf") save_csv(data, "converted_data.csv")