# Source code for chembee.datasets.BreastCancer

from chembee.datasets.DataSet import DataSet
from sklearn import datasets
from sklearn.model_selection import train_test_split

from chembee.utils.file_utils import prepare_file_name_saving


class BreastCancerDataset(DataSet):
    """
    Breast-cancer data set with a train/test split computed at construction.

    Creating an instance loads the data through ``load_data_set`` and
    immediately splits it, exposing the result as ``X_train``, ``X_test``,
    ``y_train`` and ``y_test``.
    """

    # Identifier of this data set.
    name = "breast-cancer"

    def __init__(self, split_ratio):
        """
        Load the data and store a train/test split on the instance.

        :param split_ratio: Fraction of the samples assigned to the training
            set; forwarded unchanged to ``make_train_test_split``.
        """
        # NOTE(review): DataSet.__init__ is not called here -- confirm the
        # base class needs no initialisation of its own.
        self.data = self.load_data_set()
        self.split_ratio = split_ratio
        (
            self.X_train,
            self.X_test,
            self.y_train,
            self.y_test,
        ) = self.make_train_test_split(self.data, self.split_ratio)

    def load_data_set(self):
        """
        Load the breast-cancer data set shipped with scikit-learn.

        :return: The object returned by ``sklearn.datasets.load_breast_cancer()``.
            Its ``data`` and ``target`` attributes are what
            ``make_train_test_split`` consumes.
        """
        return datasets.load_breast_cancer()

    def make_train_test_split(self, data, split_ratio, shuffle=True, random_state=None):
        """
        Split the data set into training and test partitions.

        Only the first two feature columns of ``data.data`` are kept.

        :param data: Object exposing a two-dimensional ``data`` array and a
            ``target`` array, such as the return value of ``load_data_set``.
        :param split_ratio: Fraction of the samples used for training. The
            training-set size is ``int(split_ratio * n_samples)``, i.e. it is
            truncated towards zero.
        :param shuffle: Whether the samples are shuffled before splitting.
        :param random_state: Forwarded to ``train_test_split``; pass an int to
            make a shuffled split reproducible. ``None`` keeps the previous,
            unseeded behaviour.
        :return: The tuple ``(X_train, X_test, y_train, y_test)``.
        :raises ValueError: If ``split_ratio`` would leave the training set or
            the test set empty.
        """
        # NOTE(review): restricting to two columns is presumably meant for
        # two-dimensional plotting -- confirm before relying on it.
        features = data.data[:, :2]
        labels = data.target

        n_samples = len(features)
        train_samples = int(split_ratio * n_samples)
        # train_test_split rejects an empty partition anyway; failing here
        # gives a message phrased in terms of split_ratio instead of test_size.
        if not 0 < train_samples < n_samples:
            raise ValueError(
                f"split_ratio={split_ratio!r} yields {train_samples} training "
                f"samples out of {n_samples}; both the training and the test "
                "set must be non-empty"
            )

        X_train, X_test, y_train, y_test = train_test_split(
            features,
            labels,
            shuffle=shuffle,
            test_size=n_samples - train_samples,
            random_state=random_state,
        )
        return X_train, X_test, y_train, y_test

    def save_data_npy(self, data, file_name, prefix=None):
        """
        Save ``data`` to disk with ``numpy.save``.

        The target path is produced by ``prepare_file_name_saving`` from
        ``prefix``, ``file_name`` and the ``".npy"`` ending; how those parts
        are combined is defined by that helper and is not visible here.

        :param data: Data handed to ``numpy.save``.
        :param file_name: Base file name passed to ``prepare_file_name_saving``.
        :param prefix: Optional prefix passed to ``prepare_file_name_saving``.
        :return: None.
        """
        import numpy as np

        target_path = prepare_file_name_saving(
            prefix=prefix, file_name=file_name, ending=".npy"
        )
        np.save(target_path, data)