Source code for chembee.actions.feature_extraction

# want to do the random forest extraction and give out the features as JSON
from sklearn.ensemble import RandomForestClassifier
import numpy as np


[docs]def filter_importance_by_std(result_json, cut_off=0.01):
    """
    The filter_importance_by_std function takes a JSON object containing feature names, indices, importances and standard deviations
    and filters out features with low importance. The cut_off parameter is used to determine the minimum importance level for a feature
    to be included in the output. For example, if cut_off = 0.01 then only features with an importance greater than 1% will be included
    in the output.

    :param result_json: Used to pass the result of a sklearn.
    :param cut_off=0.01: Used to filter out the features with low importance.
    :return: A dictionary with the following keys:.

    :doc-author: Julian M. Kleber
    """
    feature_names = np.array(result_json["feature_names"])
    feature_indices = np.array(result_json["feature_indices"])
    importances = np.array(result_json["importances"])
    std = np.array(result_json["std"])
    indices = std > cut_off
    if len(feature_names) == 0:
        feature_names = 0
    else:
        feature_names = feature_names[indices].tolist()
    result_json = {
        "feature_names": feature_names,
        "feature_indices": feature_indices[indices].tolist(),
        "importances": importances[indices].tolist(),
        "std": std[indices].tolist(),
    }
    return result_json


[docs]def get_feature_importances(
    X_data: np.ndarray, y_data: np.ndarray, feature_names: list
) -> dict:
    """
    The get_feature_importances function accepts three arguments:
        1. X_data - A numpy array of the features in the dataset
        2. y_data - A numpy array of the labels in the dataset
        3. feature_names - An optional list containing names for each feature

    The function returns a dictionary with four keys:

        1) "feature_names" which contains all of the feature names passed to this function, and
        2) "importances", which is another dictionary where each key is a column name from X data,and its value is that column's importance as determined by sklearn's RandomForestClassifier algorithm.
        3) "std", which is the standard deviation of the feature importance for each feature,
        4) "feature_indices", which is a list of the indices of the respective feature names

    :param X_data:np.ndarray: Used to pass the data.
    :param y_data:np.ndarray: Used to pass the target variable.
    :param feature_names:list: Used to get the names of the features.
    :return: A dictionary with the following keys:.

    :doc-author: Julian M. Kleber
    """
    assert type(X_data) == type(np.zeros((1, 1))) and type(y_data) == type(
        np.zeros((1, 1))
    )

    # Split the dataset in two equal parts

    feature_indices = np.linspace(
        1, X_data.shape[1], X_data.shape[1]).astype(np.int32)
    forest = RandomForestClassifier(random_state=0)  # algorithm
    forest.fit(X_data, y_data)
    importances = forest.feature_importances_
    std = np.std(
        [tree.feature_importances_ for tree in forest.estimators_], axis=0)
    result_json = {
        "feature_names": feature_names,
        "feature_indices": feature_indices.tolist(),
        "importances": importances.tolist(),
        "std": std.tolist(),
    }
    return result_json