# Source code for chembee.actions.benchmark_algorithms

import logging
import os
import sys
from typing import Any, Callable

import numpy as np

from chembee.config.benchmark.algorithms import algorithms
from chembee.plotting.graphics import plotting_map_comparison
from chembee.utils.file_utils import save_json_to_file, make_full_filename

# Put the parent of this file's directory at the front of the import search
# path.
# NOTE(review): this statement runs after the chembee imports at the top of
# the file, so it cannot influence them -- confirm whether it is still needed.
sys.path.insert(0, os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..")))


# Configure the root logger as an import-time side effect: records of level
# INFO and above are written to "chembee.log" (a relative path, so it resolves
# against the current working directory), prefixed with the level name and a
# 12-hour month/day/year timestamp.
logging.basicConfig(
    format="%(levelname)s:%(asctime)s %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
    level=logging.INFO,
    filename="chembee.log",
)


def benchmark_standard(
    X,
    y,
    feature_names=["Feature 1", "Feature 2"],
    file_name="benchmark",
    prefix="plots/benchmarks",
    algorithms=algorithms,
    to_fit=True,
):
    """
    Benchmark every algorithm group and save the collected metrics as JSON.

    Each entry of ``algorithms`` is passed to ``benchmark_algorithm_standard``.
    The plot function is looked up in ``plotting_map_comparison`` by the number
    of estimators in the group (``len(alg.algorithms)``) and the plot file is
    named ``"benchmark_" + alg.name``. The value returned for a group is stored
    under the group's name; the resulting dictionary is written with
    ``save_json_to_file`` to the path built by
    ``make_full_filename(file_name=file_name, prefix=prefix)``.

    :param X: Feature data, forwarded unchanged to benchmark_algorithm_standard.
    :param y: Target data, forwarded unchanged to benchmark_algorithm_standard.
    :param feature_names: Feature names forwarded to the plotting step.
    :param file_name: Name of the metrics file, before make_full_filename is applied.
    :param prefix: Forwarded to the plotting step and to make_full_filename.
    :param algorithms: Iterable of algorithm groups. Each group must provide the
        attributes ``name``, ``algorithms`` and ``_response_method``. ``None``
        entries are skipped.
    :param to_fit: Forwarded to benchmark_algorithm_standard.
    :return: Dictionary mapping each group's name to the metrics returned for it.
    :raises RuntimeError: If benchmarking a group fails. The failure is logged
        first and the original exception is chained as the cause.
    """
    metrics = {}
    for alg in algorithms:
        if alg is None:
            # The original code checked for None but then dereferenced the
            # entry anyway; skip it explicitly instead.
            logging.info("Skipping empty entry in the algorithm list")
            continue

        response_method = alg._response_method
        try:
            metric = benchmark_algorithm_standard(
                alg,
                X,
                y,
                plot_function=plotting_map_comparison[len(alg.algorithms)],
                file_name="benchmark_" + alg.name,
                prefix=prefix,
                feature_names=feature_names,
                response_method=response_method,
                to_fit=to_fit,
            )
        except Exception as e:
            # Fail loudly as before, but with a real exception instead of a
            # debugging ``assert`` (which is stripped under ``python -O``), and
            # write the log entry before aborting so it is actually emitted.
            logging.exception(
                "Could not fit classifier %s (response method: %s)",
                alg.name,
                response_method,
            )
            raise RuntimeError(
                "Benchmark failed for "
                + str(alg.name)
                + " (response method: "
                + str(response_method)
                + "): "
                + str(e)
            ) from e

        # Previously the result was computed and then dropped, so the saved
        # and returned dictionary was always empty.
        metrics[alg.name] = metric

    file_name = make_full_filename(file_name=file_name, prefix=prefix)
    save_json_to_file(metrics, file_name=file_name)
    return metrics


def benchmark_cv_algorithms(
    algorithms: list,
    names: list,
    X: np.ndarray,
    y: np.ndarray,
    plot_function=plotting_map_comparison[1],
    file_name: str = "after_cv_benchmark",
    prefix: str = "plots/benchmarks/",
    feature_names: list = ["Feature 1", "Feature 2"],
    response_method: str = "predict",
    to_fit=True,
):
    """
    Fit, query and plot each estimator individually on a best-effort basis.

    For every estimator the function optionally fits it, calls the method
    selected by ``response_method`` and then calls ``plot_function`` once for
    that estimator. A failure in any of these steps is logged with its
    traceback and the function carries on with the next step, so one failing
    estimator does not abort the whole run.

    :param algorithms: Estimators to benchmark.
    :param names: One name per estimator, in the same order as ``algorithms``.
        The name is used in log messages and as prefix of the plot file name.
    :param X: Data passed to ``fit``, to the response method and to ``plot_function``.
    :param y: Targets passed to ``fit``, ``fit_predict`` and ``plot_function``.
    :param plot_function: Callable invoked per estimator with the keyword
        arguments ``clf``, ``X``, ``y``, ``file_name``, ``prefix``,
        ``feature_names`` and ``response_method``.
    :param file_name: Suffix of each plot file name; the plot for an estimator
        is named ``name + file_name``.
    :param prefix: Forwarded to ``plot_function``.
    :param feature_names: Forwarded to ``plot_function``.
    :param response_method: One of ``"predict"``, ``"fit_predict"`` or
        ``"transform"``; any other value is logged and no response is computed.
    :param to_fit: If true, ``fit(X, y)`` is called on each estimator first.
    :return: Dictionary of metrics keyed by name. No metrics are computed at
        the moment, so the dictionary is currently always empty.
    :raises IndexError: If ``names`` has fewer entries than ``algorithms``.

    :doc-author: Julian M. Kleber
    """
    if len(names) < len(algorithms):
        # Fail before any work is done rather than part-way through the loop.
        raise IndexError("names must provide one entry per algorithm")

    # NOTE: nothing is appended here yet, so the returned mapping stays empty.
    metrics_class = []

    for clf, name in zip(algorithms, names):
        # Build the per-estimator name in a fresh variable. Re-assigning
        # ``file_name`` itself made every later plot name accumulate the
        # names of all previous estimators.
        plot_file_name = name + file_name

        if to_fit:
            try:
                clf.fit(X, y)
            except Exception:
                logging.exception("Could not fit clf %s", name)
            else:
                logging.info("Fitted model: %s", name)

        # The response is computed only to verify that the estimator can be
        # queried; the result itself is not used further.
        try:
            if response_method == "predict":
                clf.predict(X)
            elif response_method == "fit_predict":
                clf.fit_predict(X, y)
            elif response_method == "transform":
                clf.transform(X)
            else:
                logging.info("No valid response method: %s", response_method)
        except Exception:
            logging.exception(
                "Could not %s results for %s", response_method, name
            )

        try:
            plot_function(
                clf=clf,
                X=X,
                y=y,
                file_name=plot_file_name,
                prefix=prefix,
                feature_names=feature_names,
                response_method=response_method,
            )
        except Exception:
            logging.exception("Could not plot name %s", name)

    return dict(zip(names, metrics_class))


def benchmark_algorithm_standard(
    algorithm: Any,
    X: np.ndarray,
    y: np.ndarray,
    plot_function: Callable,
    file_name: str = "benchmark",
    prefix: str = "benchmarks/",
    feature_names: list = ["Feature 1", "Feature 2"],
    response_method: str = "predict",
    to_fit=True,
) -> dict:
    """
    Fit and query every estimator of one algorithm group, then plot them together.

    The group's estimators are processed in the order of ``algorithm.titles``.
    Each one is optionally fitted, the method selected by ``response_method``
    is called on it, and finally ``plot_function`` is called once with all
    models of the group. Exceptions raised by an estimator or by
    ``plot_function`` are not caught here.

    :param algorithm: Algorithm group providing the attributes ``name``,
        ``algorithms`` (the estimators) and ``titles`` (one title per estimator).
    :param X: Data passed to ``fit``, to the response method and to ``plot_function``.
    :param y: Targets passed to ``fit``, ``fit_predict`` and ``plot_function``.
    :param plot_function: Callable invoked once with the keyword arguments
        ``models``, ``titles``, ``X``, ``y``, ``file_name``, ``prefix``,
        ``feature_names`` and ``response_method``.
    :param file_name: Name of the plot file. When left at its default
        ``"benchmark"`` the name ``"benchmark_" + algorithm.name`` is used.
    :param prefix: Forwarded to ``plot_function``.
    :param feature_names: Forwarded to ``plot_function``.
    :param response_method: One of ``"predict"``, ``"fit_predict"`` or
        ``"transform"``; any other value is logged and no response is computed.
    :param to_fit: If true, each estimator is replaced by the return value of
        its ``fit(X, y)`` call before it is queried.
    :return: Dictionary of metrics keyed by title. No metrics are computed at
        the moment, so the dictionary is currently always empty.

    :doc-author: Julian M. Kleber"""

    name = algorithm.name
    estimators = algorithm.algorithms
    titles = algorithm.titles

    # An explicitly chosen file name used to be silently overwritten. Honour
    # it now, and keep the historical per-group name for the default value so
    # existing callers get the same output files as before.
    if file_name == "benchmark":
        file_name = "benchmark_" + name

    # NOTE: nothing is appended here yet, so the returned mapping stays empty.
    metrics_class = []
    models = []
    for i, title in enumerate(titles):
        clf = estimators[i]
        if to_fit:
            clf = clf.fit(X, y)
        models.append(clf)

        # The response is computed only to verify that the estimator can be
        # queried; the result itself is not used further.
        if response_method == "predict":
            clf.predict(X)
        elif response_method == "fit_predict":
            clf.fit_predict(X, y)
        elif response_method == "transform":
            clf.transform(X)
        else:
            logging.info("No valid response method: %s", response_method)
        logging.info("Fitted model: %s", title)

    plot_function(
        models=models,
        titles=titles,
        X=X,
        y=y,
        file_name=file_name,
        prefix=prefix,
        feature_names=feature_names,
        response_method=response_method,
    )

    return dict(zip(titles, metrics_class))


# should refactor the above functions
def benchmark_algorithm(
    algorithms: list,
    X: np.ndarray,
    y: np.ndarray,
    plot_function: Callable,
    file_name: str = "benchmark",
    prefix: str = "benchmarks/",
    feature_names: list = ["Feature 1", "Feature 2"],
    response_method: str = "predict",
    to_fit=True,
) -> dict:
    """
    Draft of a shared benchmark core: fit and query each estimator in a list.

    This function is not used in production at the moment and serves as a
    draft on how to abstract the other benchmark functions of this module.
    Every estimator is optionally fitted and the method selected by
    ``response_method`` is called on it. Exceptions raised by an estimator are
    not caught here.

    :param algorithms: Estimators to benchmark.
    :param X: Data passed to ``fit`` and to the response method.
    :param y: Targets passed to ``fit`` and ``fit_predict``.
    :param plot_function: Accepted for signature parity with the other
        benchmark functions; not called by this draft.
    :param file_name: Accepted for signature parity; currently unused.
    :param prefix: Accepted for signature parity; currently unused.
    :param feature_names: Accepted for signature parity; currently unused.
    :param response_method: One of ``"predict"``, ``"fit_predict"`` or
        ``"transform"``; any other value is logged and no response is computed.
    :param to_fit: If true, each estimator is replaced by the return value of
        its ``fit(X, y)`` call before it is queried.
    :return: Dictionary of metrics keyed by the estimators' class names. No
        metrics are computed at the moment, so the dictionary is currently
        always empty.

    :doc-author: Julian M. Kleber
    """

    # The draft body referenced the undefined names ``i``, ``algorithm`` and
    # ``metrics_class`` and therefore raised NameError on every call; iterate
    # over the given estimators instead.
    # NOTE: nothing is appended to metrics_class yet.
    metrics_class = []
    titles = []
    for clf in algorithms:
        # No titles are passed in, so the class name identifies the model.
        title = type(clf).__name__
        titles.append(title)

        if to_fit:
            clf = clf.fit(X, y)

        # The response is computed only to verify that the estimator can be
        # queried; the result itself is not used further.
        if response_method == "predict":
            clf.predict(X)
        elif response_method == "fit_predict":
            clf.fit_predict(X, y)
        elif response_method == "transform":
            clf.transform(X)
        else:
            logging.info("No valid response method")
        logging.info("Fitted model: %s", title)

    return dict(zip(titles, metrics_class))