-
Notifications
You must be signed in to change notification settings - Fork 887
Open
Labels
Description
Hi,
I get strange behavior when using feature_groups:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from mlxtend.evaluate import feature_importance_permutation
# ---------------------------------------------------------------------------
# 1. Synthetic data
# ---------------------------------------------------------------------------
# Seed NumPy's global RNG so every run draws identical samples. The order of
# the draws below (feature_1, feature_2, feature_3, noise) must not change,
# otherwise the generated data changes too.
np.random.seed(42)
n_samples = 100
feature_1 = np.random.normal(loc=10, scale=2, size=n_samples)
feature_2 = np.random.uniform(low=0, high=20, size=n_samples)
feature_3 = np.random.normal(loc=5, scale=1, size=n_samples)

# The target depends on feature_1 only (plus unit-variance Gaussian noise);
# feature_2 and feature_3 play no part in it.
noise = np.random.normal(loc=0, scale=1, size=n_samples)
target = 2 * feature_1 + noise

df = pd.DataFrame(
    dict(
        feature_1=feature_1,
        feature_2=feature_2,
        feature_3=feature_3,
        target=target,
    )
)

# Sanity check: report how strongly feature_1 tracks the target.
feature_1_col = df['feature_1']
correlation = feature_1_col.corr(df['target'])
print(f"Correlation between feature_1 and target: {correlation:.2f}")

# ---------------------------------------------------------------------------
# 2. Train/test split and model fit
# ---------------------------------------------------------------------------
y = df['target']
X = df.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# The forest is fitted on plain NumPy arrays rather than pandas objects.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train.values, y_train.values)

# ---------------------------------------------------------------------------
# 3. Permutation feature importance
# ---------------------------------------------------------------------------
# One entry per feature group. Display names are simply the position of each
# group in this list ("0", "1", ...), not the DataFrame column names.
feature_groups_idx = [0,1,2]
feat_names = [str(position) for position in range(len(feature_groups_idx))]

# Importance is computed on the training data with R^2 as the metric.
# Note that y is handed over as a pandas Series here, X as a NumPy array.
mean_importance_vals, all_importance_vals = feature_importance_permutation(
    predict_method=model.predict,
    X=X_train.values,
    y=y_train,
    metric='r2',
    num_rounds=30,
    feature_groups=feature_groups_idx,
    seed=42,
)
# Standard deviation taken along axis 1 of the per-round importance values.
importance_std = np.std(all_importance_vals, axis=1)

# ---------------------------------------------------------------------------
# 4. Tabulate and plot
# ---------------------------------------------------------------------------
# Collect names, mean importances and their spread, most important first.
pfi_df = pd.DataFrame(
    {
        'Feature': feat_names,
        'Importance': mean_importance_vals,
        'Importance_Std': importance_std,
    }
)
pfi_df = pfi_df.sort_values('Importance', ascending=False)
pfi_df['Feature'] = pfi_df['Feature'].astype(str)

# Horizontal bar chart of the three highest-ranked rows, with the standard
# deviation drawn as horizontal error bars.
top_pfi_df = pfi_df.head(3)
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x='Importance',
    y='Feature',
    data=top_pfi_df,
    xerr=top_pfi_df['Importance_Std'],
    ax=ax,
)
ax.set_title('Top Feature Importances')
ax.set_xlabel('Importance Value')
ax.set_ylabel('Features')
fig.tight_layout()
plt.show()
That works well; however, feature_1 is no longer important when I use feature_groups_idx = [0, range(1,3)] in the MWE.
Am I missing something?