Source code for chembee.actions.feature_extraction

# Random-forest feature-importance extraction; results are returned as JSON-style dicts.
from sklearn.ensemble import RandomForestClassifier
import numpy as np


def filter_importance_by_std(result_json: dict, cut_off: float = 0.01) -> dict:
    """
    Keep only the features whose standard deviation exceeds ``cut_off``.

    Despite operating on feature-importance results, the selection criterion
    is the ``"std"`` entry, not the ``"importances"`` entry: a feature is kept
    when ``std > cut_off`` (strictly greater). The same boolean mask is applied
    to every per-feature list so the returned lists stay aligned.

    The input mapping is not modified; a new dictionary is returned.

    :param result_json: Mapping with the keys ``"feature_names"``,
        ``"feature_indices"``, ``"importances"`` and ``"std"``, each holding a
        sequence with one entry per feature.
    :param cut_off: Threshold on the standard deviation; features with
        ``std <= cut_off`` are dropped.
    :return: A dictionary with the keys ``"feature_names"``,
        ``"feature_indices"``, ``"importances"`` and ``"std"`` restricted to
        the selected features (as plain Python lists). If the incoming
        ``"feature_names"`` is empty, ``"feature_names"`` is returned as the
        integer ``0`` instead of a list.
    :doc-author: Julian M. Kleber
    """
    feature_names = np.array(result_json["feature_names"])
    feature_indices = np.array(result_json["feature_indices"])
    importances = np.array(result_json["importances"])
    std = np.array(result_json["std"])

    # Boolean mask over features; the criterion is the std, not the importance.
    keep = std > cut_off

    if len(feature_names) == 0:
        # No names supplied: masking an empty array with a non-empty mask
        # would raise, so a placeholder is returned instead.
        # NOTE(review): ``0`` (rather than ``[]``) is kept on purpose for
        # backward compatibility with existing callers.
        selected_names = 0
    else:
        selected_names = feature_names[keep].tolist()

    return {
        "feature_names": selected_names,
        "feature_indices": feature_indices[keep].tolist(),
        "importances": importances[keep].tolist(),
        "std": std[keep].tolist(),
    }


def get_feature_importances(
    X_data: np.ndarray, y_data: np.ndarray, feature_names: list
) -> dict:
    """
    Fit a random forest and report the importance of every feature.

    A ``RandomForestClassifier(random_state=0)`` is fitted on ``X_data`` and
    ``y_data``. The forest's ``feature_importances_`` are returned together
    with the standard deviation of the per-tree ``feature_importances_``
    (computed across ``forest.estimators_``).

    :param X_data: Feature data as a ``numpy.ndarray``; ``X_data.shape[1]`` is
        taken as the number of features.
    :param y_data: Target labels as a ``numpy.ndarray``.
    :param feature_names: Names of the features. Required; the value is placed
        in the result unchanged.
    :return: A dictionary with the keys

        * ``"feature_names"`` - the ``feature_names`` argument as passed in,
        * ``"feature_indices"`` - list of 1-based integer indices, one per
          column of ``X_data``,
        * ``"importances"`` - list with the forest's importance per feature,
        * ``"std"`` - list with the standard deviation of the importance of
          each feature across the individual trees.
    :raises AssertionError: If ``X_data`` or ``y_data`` is not a
        ``numpy.ndarray``.
    :doc-author: Julian M. Kleber
    """
    if not (isinstance(X_data, np.ndarray) and isinstance(y_data, np.ndarray)):
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O``; the exception type is kept as
        # AssertionError so existing callers observe the same error.
        raise AssertionError(
            "X_data and y_data must both be numpy.ndarray instances"
        )

    # 1-based running index for each column of X_data.
    n_features = X_data.shape[1]
    feature_indices = np.arange(1, n_features + 1, dtype=np.int32)

    # Fixed random_state keeps the fitted forest reproducible between calls.
    forest = RandomForestClassifier(random_state=0)
    forest.fit(X_data, y_data)

    importances = forest.feature_importances_
    # Spread of each feature's importance over the individual trees.
    std = np.std(
        [tree.feature_importances_ for tree in forest.estimators_], axis=0
    )

    return {
        "feature_names": feature_names,
        "feature_indices": feature_indices.tolist(),
        "importances": importances.tolist(),
        "std": std.tolist(),
    }