Source code for chembee.datasets.BreastCancer

from chembee.datasets.DataSet import DataSet
from sklearn import datasets
from sklearn.model_selection import train_test_split

from chembee.utils.file_utils import prepare_file_name_saving


class BreastCancerDataset(DataSet):
    """Scikit-learn breast cancer data set with a ready-made train/test split.

    On construction the data set is loaded and split once. The raw data set
    is kept in ``data`` and the split arrays are exposed as ``X_train``,
    ``X_test``, ``y_train`` and ``y_test``.
    """

    # Identifier of this data set.
    name = "breast-cancer"

    def __init__(self, split_ratio):
        """
        Load the breast cancer data and split it into train and test parts.

        :param split_ratio: Fraction of the samples used for training.
        """
        self.data = self.load_data_set()
        self.split_ratio = split_ratio
        (
            self.X_train,
            self.X_test,
            self.y_train,
            self.y_test,
        ) = self.make_train_test_split(self.data, self.split_ratio)

    def load_data_set(self):
        """
        Load the breast cancer data set bundled with scikit-learn.

        No filtering or cleaning is applied; the object returned by
        ``sklearn.datasets.load_breast_cancer()`` is passed through as is.

        :return: The data set object, exposing the feature matrix as
            ``.data`` and the labels as ``.target``.
        """
        return datasets.load_breast_cancer()

    def make_train_test_split(
        self, data, split_ratio, shuffle=True, *, n_features=2, random_state=None
    ):
        """
        Split a data set into training and test parts.

        Only the first ``n_features`` feature columns are kept. The default
        of 2 matches the historical behaviour of this method.

        :param data: Object exposing a two-dimensional ``.data`` feature
            array and a ``.target`` label array.
        :param split_ratio: Fraction of the samples used for training. The
            training size is ``int(split_ratio * n_samples)``; the remaining
            samples form the test set.
        :param shuffle: Whether to shuffle the samples before splitting.
        :param n_features: Number of leading feature columns to keep, or
            ``None`` to keep all of them.
        :param random_state: Seed forwarded to ``train_test_split`` so that
            a shuffled split can be reproduced. ``None`` keeps the split
            non-deterministic, as before.
        :return: Tuple ``(X_train, X_test, y_train, y_test)``.
        :raises ValueError: Raised by scikit-learn when the requested split
            would leave the training or the test set empty.
        """
        features = data.data if n_features is None else data.data[:, :n_features]
        labels = data.target

        n_samples = len(features)
        train_samples = int(split_ratio * n_samples)

        # The test size is passed as an absolute count so that the training
        # set has exactly ``train_samples`` entries regardless of rounding.
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            labels,
            shuffle=shuffle,
            test_size=n_samples - train_samples,
            random_state=random_state,
        )
        return X_train, X_test, y_train, y_test

    def save_data_npy(self, data, file_name, prefix=None):
        """
        Save ``data`` to disk in NumPy ``.npy`` format.

        :param data: Array-like object handed to ``numpy.save``.
        :param file_name: Base file name, passed to
            ``prepare_file_name_saving`` together with the ``.npy`` ending.
        :param prefix: Optional prefix forwarded to
            ``prepare_file_name_saving``.
        """
        # Imported locally, as in the original, so numpy is only required
        # when something is actually saved.
        import numpy as np

        # NOTE(review): presumably the helper joins prefix and file name and
        # ensures the ".npy" ending -- confirm in chembee.utils.file_utils.
        target_path = prepare_file_name_saving(
            prefix=prefix, file_name=file_name, ending=".npy"
        )
        np.save(target_path, data)