# Source code for chembee.actions.cross_validation

# TODO: add a ROC-based check per fold, see: https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py
# Reference for writing a custom scoring function: https://scikit-learn.org/stable/modules/cross_validation.html
# The functions below use stratified k-fold cross-validation.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import accuracy_score

# algorithm

# cross validation parameters
import logging


# Configure the root logger when this module is imported: messages at INFO
# level and above are written to the file "actions.log" (a relative path, so
# it is resolved against the current working directory).
# NOTE: logging.basicConfig does nothing if the root logger already has
# handlers, so an application that configures logging first keeps its setup.
logging.basicConfig(
    # e.g. "INFO:01/31/2024 04:05:06 PM message"
    format="%(levelname)s:%(asctime)s %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",  # 12-hour clock with AM/PM marker
    level=logging.INFO,
    filename="actions.log",
)








def stratified_n_fold(clf, X_data, y_data, n=5, cut_off_filter=None) -> dict:
    """
    Evaluate a classifier with stratified n-fold cross-validation.

    For every fold the classifier is refitted on the training split and its
    accuracy is measured on both the held-out split and the training split.
    The per-fold scores are returned as they are; no averaging is performed.

    :param clf: Classifier exposing ``fit(X, y)`` and ``predict(X)``. It is
        refitted in place on every fold, so after the call it holds the fit
        of the last fold.
    :param X_data: Feature matrix. Must support indexing with an integer
        index array (``X_data[indices]``).
    :param y_data: Array-like of class labels, one per row of ``X_data``.
        The labels are cast to ``np.int32`` before use.
    :param n: Number of folds handed to ``StratifiedKFold``.
    :param cut_off_filter: Unused by this function; accepted only so the
        signature matches ``stratified_n_fold_filter``.
    :return: A dictionary with the keys ``"accuracies_train"`` and
        ``"accuracies_test"``, each a list holding one float per fold.
    :doc-author: Julian M. Kleber
    """
    # Cast the labels once instead of on every use inside the loop; going
    # through np.asarray makes the indexing below strictly positional.
    labels = np.asarray(y_data).astype(np.int32)
    cv = StratifiedKFold(n_splits=n)

    accuracies_train = []
    accuracies_test = []
    for train, test in cv.split(X_data, labels):
        clf.fit(X_data[train], labels[train])

        # accuracy_score takes (y_true, y_pred); accuracy is symmetric, so
        # the value is the same either way, but this follows the documented
        # argument order.
        test_score = accuracy_score(labels[test], clf.predict(X_data[test]))
        accuracies_test.append(float(test_score))

        train_score = accuracy_score(labels[train], clf.predict(X_data[train]))
        accuracies_train.append(float(train_score))

    return {
        "accuracies_train": accuracies_train,
        "accuracies_test": accuracies_test,
    }


def stratified_n_fold_filter(clf, X_data, y_data, n=5, cut_off_filter=None) -> dict:
    """
    Run stratified n-fold cross-validation and flag low-scoring folds.

    For every fold the classifier is refitted on the training split and its
    accuracy is measured on both the held-out split and the training split.
    When ``cut_off_filter`` is given, the test indices of each fold whose
    *training* accuracy is below the cut-off are collected as well.

    Note: To be SOLID and to avoid convoluted code and performance issues,
    ``stratified_n_fold`` is deliberately implemented twice: once without
    and once (this function) with the filter.

    :param clf: Classifier exposing ``fit(X, y)`` and ``predict(X)``. It is
        refitted in place on every fold.
    :param X_data: Feature matrix. Must support indexing with an integer
        index array (``X_data[indices]``).
    :param y_data: Array-like of class labels, one per row of ``X_data``.
        The labels are cast to ``np.int32`` before use.
    :param n: Number of folds handed to ``StratifiedKFold``.
    :param cut_off_filter: Accuracy threshold. ``None`` disables filtering.
    :return: A dictionary with the keys ``"accuracies_train"`` and
        ``"accuracies_test"`` (one float per fold each) and
        ``"filtered_indices"``: a list with the test-index array of every
        flagged fold, or ``None`` when no fold was flagged.
    :doc-author: Julian M. Kleber
    """
    # Cast the labels once instead of on every use inside the loop; going
    # through np.asarray makes the indexing below strictly positional.
    labels = np.asarray(y_data).astype(np.int32)
    cv = StratifiedKFold(n_splits=n)

    accuracies_train = []
    accuracies_test = []
    filtered_indices = []
    for train, test in cv.split(X_data, labels):
        clf.fit(X_data[train], labels[train])

        # accuracy_score takes (y_true, y_pred); accuracy is symmetric, so
        # the value is the same either way.
        test_score = accuracy_score(labels[test], clf.predict(X_data[test]))
        accuracies_test.append(float(test_score))

        train_score = accuracy_score(labels[train], clf.predict(X_data[train]))
        accuracies_train.append(float(train_score))

        # NOTE(review): the cut-off is compared against the TRAINING accuracy
        # of the fold, while the indices recorded are the TEST indices. This
        # mirrors the existing behaviour (the score variable had been
        # overwritten by the training score at this point); confirm whether
        # the test accuracy was intended before changing it.
        if cut_off_filter is not None and train_score < cut_off_filter:
            filtered_indices.append(test)

    # Callers receive None rather than an empty list when nothing was flagged.
    if not filtered_indices:
        filtered_indices = None

    return {
        "accuracies_train": accuracies_train,
        "accuracies_test": accuracies_test,
        "filtered_indices": filtered_indices,
    }