# Source code for chembee.actions.benchmark_algorithms

from chembee.config.benchmark.algorithms import algorithms
import logging
from chembee.utils.file_utils import save_json_to_file, make_full_filename
from chembee.plotting.graphics import plotting_map_comparison
import sys
import os
import numpy as np

# Put the parent directory of this file's directory at the front of the
# module search path. This runs at import time, after the imports at the top
# of the file have already been resolved.
sys.path.insert(
    0, os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
)


# Configure the root logger at import time: INFO and above are written to
# "chembee.log", one record per line as "<LEVEL>:<timestamp> <message>".
logging.basicConfig(
    filename="chembee.log",
    level=logging.INFO,
    format="%(levelname)s:%(asctime)s %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
)


def benchmark_standard(
    X,
    y,
    feature_names=["Feature 1", "Feature 2"],
    file_name="benchmark",
    prefix="plots/benchmarks",
    algorithms=algorithms,
    to_fit=True,
):
    """
    Run ``benchmark_algorithm_standard`` for every configured algorithm group
    and store the collected results as JSON.

    For each entry of ``algorithms`` the plot function is looked up in
    ``plotting_map_comparison`` by the number of models in the group
    (``len(alg.algorithms)``), and the group's own ``_response_method`` is
    forwarded to ``benchmark_algorithm_standard``. ``None`` entries are
    skipped.

    :param X: Feature data forwarded unchanged to the benchmark helper.
    :param y: Target data forwarded unchanged to the benchmark helper.
    :param feature_names: Feature labels forwarded to the plot function.
    :param file_name: Base name of the JSON result file.
    :param prefix: Output location; used for the plots and, via
        ``make_full_filename``, for the JSON result file.
    :param algorithms: Iterable of algorithm groups. Each group must expose
        ``name``, ``algorithms`` and ``_response_method``.
    :param to_fit: Forwarded to the benchmark helper; whether models are
        fitted before the response method is called.
    :return: Dictionary mapping each group's ``name`` to the value returned
        by ``benchmark_algorithm_standard`` for that group.
    :raises AssertionError: If benchmarking a group fails. The original
        exception is logged and chained as ``__cause__``.
    """
    metrics = {}
    for alg in algorithms:
        if alg is None:
            # A None placeholder has no name, models or response method, so
            # there is nothing to benchmark.
            logging.info("Skipping empty (None) algorithm entry")
            continue
        response_method = alg._response_method
        try:
            metric = benchmark_algorithm_standard(
                alg,
                X,
                y,
                plot_function=plotting_map_comparison[len(alg.algorithms)],
                file_name="benchmark_" + alg.name,
                prefix=prefix,
                feature_names=feature_names,
                response_method=response_method,
                to_fit=to_fit,
            )
        except Exception as e:
            logging.info("Could not fit classifier " + str(alg.name))
            logging.info(str(e))
            # The previous implementation aborted here through a debugging
            # ``assert``. AssertionError is kept so existing callers see the
            # same exception type, but it is raised explicitly so that the
            # behaviour no longer depends on ``python -O``.
            raise AssertionError(
                str(e) + str(alg.name) + str(response_method)
            ) from e
        # Record the result; previously it was computed and then discarded,
        # so the saved JSON was always empty.
        metrics[alg.name] = metric
    file_name = make_full_filename(file_name=file_name, prefix=prefix)
    save_json_to_file(metrics, file_name=file_name)
    return metrics
def benchmark_cv_algorithms(
    algorithms: list,
    names: list,
    X: np.ndarray,
    y: np.ndarray,
    plot_function=plotting_map_comparison[1],
    file_name: str = "after_cv_benchmark",
    prefix: str = "plots/benchmarks/",
    feature_names: list = ["Feature 1", "Feature 2"],
    response_method: str = "predict",
    to_fit=True,
):
    """
    Fit, run and plot each model of ``algorithms`` individually.

    Every step (fit, response method, plot) is best effort: a failure is
    logged together with the error message and processing continues with the
    next step or model.

    :param algorithms: Models to benchmark. Each must provide ``fit`` and the
        method named by ``response_method``.
    :param names: Display names, one per model, in the same order as
        ``algorithms``.
    :param X: Feature data passed to the models and the plot function.
    :param y: Target data passed to the models and the plot function.
    :param plot_function: Callable invoked once per model with the keyword
        arguments ``clf``, ``X``, ``y``, ``file_name``, ``prefix``,
        ``feature_names`` and ``response_method``.
    :param file_name: Base file name. The file name of each plot is the
        model's name followed by this base name.
    :param prefix: Output location forwarded to the plot function.
    :param feature_names: Feature labels forwarded to the plot function.
    :param response_method: One of ``"predict"``, ``"fit_predict"`` or
        ``"transform"``. Any other value skips the response step.
    :param to_fit: Whether ``fit(X, y)`` is called before the response method.
    :return: Dictionary of metrics keyed by model name. No metrics are
        computed at the moment, so the dictionary is currently always empty.
    :doc-author: Julian M. Kleber
    """
    # Placeholder for per-model metrics; nothing is appended yet.
    metrics_class = []
    for i, clf in enumerate(algorithms):
        name = names[i]
        # Derive the file name from the unchanged base name. Re-assigning
        # ``file_name`` itself made every later model inherit the names of
        # all models processed before it.
        model_file_name = name + file_name
        if to_fit:
            try:
                clf.fit(X, y)
            except Exception as exc:
                logging.info("Could not fit clf %s: %s", name, exc)
        # The response is not evaluated yet; the call is kept for its side
        # effects and to surface models that cannot produce a response.
        try:
            if response_method == "predict":
                clf.predict(X)
            elif response_method == "fit_predict":
                clf.fit_predict(X, y)
            elif response_method == "transform":
                clf.transform(X)
        except Exception as exc:
            logging.info(
                "Could not %s results for %s: %s", response_method, name, exc
            )
        logging.info("Fitted model: " + str(name))
        try:
            plot_function(
                clf=clf,
                X=X,
                y=y,
                file_name=model_file_name,
                prefix=prefix,
                feature_names=feature_names,
                response_method=response_method,
            )
        except Exception as exc:
            logging.info("Could not plot name %s: %s", name, exc)
    return dict(zip(names, metrics_class))
def benchmark_algorithm_standard(
    algorithm: object,
    X: np.ndarray,
    y: np.ndarray,
    plot_function: object,
    file_name: str = "benchmark",
    prefix: str = "benchmarks/",
    feature_names: list = ["Feature 1", "Feature 2"],
    response_method: str = "predict",
    to_fit=True,
) -> dict:
    """
    Fit all models of one algorithm group and draw them in a single plot.

    The function is meant for plain models, not for cross-validation
    objects; for those use ``benchmark_cv_algorithms``.

    :param algorithm: Algorithm group exposing ``name`` (str), ``algorithms``
        (sequence of models) and ``titles`` (one title per model).
    :param X: Feature data passed to the models and the plot function.
    :param y: Target data passed to the models and the plot function.
    :param plot_function: Callable invoked once, after all models were
        processed, with the keyword arguments ``models``, ``titles``, ``X``,
        ``y``, ``file_name``, ``prefix``, ``feature_names`` and
        ``response_method``.
    :param file_name: File name forwarded to the plot function. When left at
        its default ``"benchmark"``, ``"benchmark_" + algorithm.name`` is
        used instead.
    :param prefix: Output location forwarded to the plot function.
    :param feature_names: Feature labels forwarded to the plot function.
    :param response_method: One of ``"predict"``, ``"fit_predict"`` or
        ``"transform"``. Any other value skips the response step.
    :param to_fit: Whether each model is replaced by the return value of its
        ``fit(X, y)`` before the response method is called.
    :return: Dictionary of metrics keyed by title. No metrics are computed
        at the moment, so the dictionary is currently always empty.
    :doc-author: Julian M. Kleber
    """
    name = algorithm.name
    estimators = algorithm.algorithms
    titles = algorithm.titles
    # Honour an explicitly chosen file name. Only the default keeps the
    # historical "benchmark_<name>" result, so existing callers that relied
    # on the default (or passed exactly that name) see no change.
    if file_name == "benchmark":
        file_name = "benchmark_" + name
    # Placeholder for per-model metrics; nothing is appended yet.
    metrics_class = []
    models = []
    for i, title in enumerate(titles):
        clf = estimators[i]
        if to_fit:
            clf = clf.fit(X, y)
        models.append(clf)
        # The response is not evaluated yet; the call is kept for its side
        # effects and so that a model without the method fails here.
        if response_method == "predict":
            clf.predict(X)
        elif response_method == "fit_predict":
            clf.fit_predict(X, y)
        elif response_method == "transform":
            clf.transform(X)
        logging.info("Fitted model: " + str(title))
    plot_function(
        models=models,
        titles=titles,
        X=X,
        y=y,
        file_name=file_name,
        prefix=prefix,
        feature_names=feature_names,
        response_method=response_method,
    )
    return dict(zip(titles, metrics_class))
# TODO: refactor the benchmark functions above; they repeat the same fit / response-method logic.
def benchmark_algorithm(
    algorithms: list,
    X: np.ndarray,
    y: np.ndarray,
    plot_function: object,
    file_name: str = "benchmark",
    prefix: str = "benchmarks/",
    feature_names: list = ["Feature 1", "Feature 2"],
    response_method: str = "predict",
    to_fit=True,
) -> dict:
    """
    Draft of a shared fit/response step for the benchmark functions.

    This function is not used in production at the moment. It serves as a
    sketch of how the other benchmark functions of this module could be
    abstracted. Each model in ``algorithms`` is optionally fitted and then
    asked for its response.

    ``plot_function``, ``file_name``, ``prefix`` and ``feature_names`` are
    accepted for signature parity with the other benchmark functions but are
    not used yet.

    :param algorithms: Models to process. Each must provide ``fit`` and the
        method named by ``response_method``.
    :param X: Feature data passed to the models.
    :param y: Target data passed to ``fit`` and ``fit_predict``.
    :param plot_function: Currently unused.
    :param file_name: Currently unused.
    :param prefix: Currently unused.
    :param feature_names: Currently unused.
    :param response_method: One of ``"predict"``, ``"fit_predict"`` or
        ``"transform"``. Any other value is logged and the response step is
        skipped.
    :param to_fit: Whether each model is replaced by the return value of its
        ``fit(X, y)`` before the response method is called.
    :return: Dictionary of metrics keyed by model class name. No metrics are
        computed at the moment, so the dictionary is currently always empty.
    :doc-author: Julian M. Kleber
    """
    # The earlier draft referenced the undefined names ``i``, ``algorithm``
    # and ``metrics_class`` and therefore raised NameError on every call.
    metrics_class = []
    titles = []
    for clf in algorithms:
        # No titles are passed in, so the class name identifies the model.
        title = type(clf).__name__
        if to_fit:
            clf = clf.fit(X, y)
        if response_method == "predict":
            clf.predict(X)
        elif response_method == "fit_predict":
            clf.fit_predict(X, y)
        elif response_method == "transform":
            clf.transform(X)
        else:
            logging.info("No valid response method")
        logging.info("Fitted model: " + title)
        titles.append(title)
    return dict(zip(titles, metrics_class))