This tutorial walks through four different feature selection methods: variance inflation factor (VIF), univariate selection with SelectKBest and f_regression, recursive feature elimination (RFE), and recursive feature elimination with cross-validation (RFECV).
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from sklearn.datasets import load_boston
# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Model Building
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
# Evaluation
from sklearn.metrics import mean_squared_error
boston_data = load_boston()
X = boston_data.data
y = boston_data.target
# Load the features into a dataframe
boston = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)
# Add target to the dataframe
boston['MEDV'] = boston_data.target
boston.head()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])
# fit/train model
pipe.fit(X_train, y_train)
# predict labels
y_pred = pipe.predict(X_test)
# metrics
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse:0.4}')
print(f'R^2 Train: {pipe.score(X_train, y_train):0.4}')
print(f'R^2 Test: {pipe.score(X_test, y_test):0.4}')
# Number of features to select
num = 6
# Import VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
scaler = StandardScaler()
# For each feature in X, compute the VIF and return the results in a dataframe
def vif(X):
    # Scale the features first
    xs = scaler.fit_transform(X)
    # Collect feature names and VIF values in a dataframe
    vif_df = pd.DataFrame()
    vif_df["Features"] = [boston_data.feature_names[i] for i in range(xs.shape[1])]
    vif_df["VIF Factor"] = [variance_inflation_factor(xs, i) for i in range(xs.shape[1])]
    return vif_df
# List VIF scores in ascending order (lower VIF means less multicollinearity)
result = vif(X_train).round(2).sort_values(by=['VIF Factor'])
result
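As a quick sanity check on what these numbers mean: the VIF of a feature is 1 / (1 - R^2), where R^2 comes from regressing that feature on all the other features. The snippet below is a minimal sketch of that idea (manual_vif is a helper name introduced here only for illustration); it should roughly agree with the statsmodels values above.
# Minimal sketch: the VIF of feature i is 1 / (1 - R^2) from regressing feature i on the others
def manual_vif(xs, i):
    others = np.delete(xs, i, axis=1)   # every feature except i
    r2 = LinearRegression().fit(others, xs[:, i]).score(others, xs[:, i])
    return 1.0 / (1.0 - r2)
xs = scaler.fit_transform(X_train)
print(manual_vif(xs, 0), variance_inflation_factor(xs, 0))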
# Select the num features with the lowest VIF and retrain the model
selected1 = result.iloc[:num, 0]
X1 = boston[selected1]
X1
# With the same test_size and random_state, the split matches the original one, so y_train and y_test still line up
X1_train, X1_test = train_test_split(X1, test_size=0.2, random_state=0)
# fit/train model
pipe.fit(X1_train, y_train)
# predict labels
y_pred = pipe.predict(X1_test)
# evaluate
print(f'R^2 Train: {pipe.score(X1_train, y_train):0.4}')
print(f'R^2 Test: {pipe.score(X1_test, y_test):0.4}')
The result above shows serious overfitting: the training R^2 is far higher than the test R^2. Let's try another feature selection method.
# Import f_regression and SelectKBest
from sklearn.feature_selection import f_regression, SelectKBest
# Select the k best features based on univariate F-scores
kbest = SelectKBest(f_regression, k=num)
# Fit the model
kbest.fit(X_train, y_train)
# Indices of the selected features
kbest.get_support(indices=True)
# Show scores
kbest.scores_
# Display the scores in descending order
s, f = zip(*sorted(zip(kbest.scores_.round(2), boston_data.feature_names), reverse=True))
scores = pd.DataFrame()
scores['Feature'] = f
scores['Score'] = s
scores
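For intuition, f_regression is a univariate test: for each feature it computes the Pearson correlation with the target and converts it to an F-statistic, roughly r^2 / (1 - r^2) * (n - 2). The lines below are a rough sketch of that calculation for the first feature and should come close to the corresponding SelectKBest score.
# Rough sketch: recompute the univariate F-score for the first feature by hand
n = X_train.shape[0]
r = np.corrcoef(X_train[:, 0], y_train)[0, 1]  # Pearson correlation with the target
print(r**2 / (1 - r**2) * (n - 2), kbest.scores_[0])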
# Select the num highest-scoring features and retrain the model
selected2 = scores.iloc[:num, 0]
X2 = boston[selected2]
X2
X2_train, X2_test = train_test_split(X2, test_size=0.2, random_state=0)
# fit/train model
pipe.fit(X2_train, y_train)
# predict labels
y_pred = pipe.predict(X2_test)
# evaluate
print(f'R^2 Train: {pipe.score(X2_train, y_train):0.4}')
print(f'R^2 Test: {pipe.score(X2_test, y_test):0.4}')
These results are not great and suggest overfitting as well. Let's try RFE and RFECV.
# Feature Selection using RFE
from sklearn.feature_selection import RFE
rfe = RFE(LinearRegression(), n_features_to_select=num, step=1)
rfe.fit(X_train, y_train)
# Boolean mask of the selected features
rfe.support_
# Get the feature ranking
rfe.ranking_
# Display the sorted ranks
r, f = zip(*sorted(zip(rfe.ranking_, boston_data.feature_names)))
ranks = pd.DataFrame()
ranks['Feature'] = f
ranks['Rank'] = r
ranks
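Conceptually, RFE fits the estimator, drops the feature with the smallest absolute coefficient, and repeats until n_features_to_select remain. The loop below is a simplified sketch of that idea (it ignores the step parameter and any scaling), not the exact scikit-learn implementation.
# Simplified sketch of recursive elimination with a linear model
remaining = list(range(X_train.shape[1]))
while len(remaining) > num:
    lr = LinearRegression().fit(X_train[:, remaining], y_train)
    weakest = int(np.argmin(np.abs(lr.coef_)))  # position of the smallest absolute coefficient
    remaining.pop(weakest)
print([boston_data.feature_names[i] for i in remaining])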
# Select the num top-ranked features and retrain the model
selected3 = ranks.iloc[:num, 0]
X3 = boston[selected3]
X3
X3_train, X3_test = train_test_split(X3, test_size=0.2, random_state=0)
# fit/train model
pipe.fit(X3_train, y_train)
# predict labels
y_pred = pipe.predict(X3_test)
# evaluate
print(f'R^2 Train: {pipe.score(X3_train, y_train):0.4}')
print(f'R^2 Test: {pipe.score(X3_test, y_test):0.4}')
Results are better, but there is still a lot of overfitting. Let's try the final method, RFECV.
# Feature Selection using RFECV
from sklearn.feature_selection import RFECV
rfecv = RFECV(LinearRegression(), cv=10)
rfecv.fit(X_train, y_train)
# Number of features selected by cross-validation
rfecv.n_features_
# Boolean mask of the selected features
rfecv.support_
# Get the feature ranking
rfecv.ranking_
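One quick way to see which feature was dropped is to invert the support mask:
# Feature(s) eliminated by RFECV
boston_data.feature_names[~rfecv.support_]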
Since this selection technique only removed one feature, we will not proceed to test it.
There seems to be no improvement in the score of our linear regression model after feature selection. We can try playing around with the desired number of features, num, or with a different model.
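For example, Ridge regression (already imported above) can be dropped into the same pipeline. A minimal sketch, keeping the default alpha=1.0:
# Minimal sketch: swap Ridge regression into the same scale-then-fit pipeline
ridge_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', Ridge(alpha=1.0))
])
ridge_pipe.fit(X_train, y_train)
print(f'R^2 Train: {ridge_pipe.score(X_train, y_train):0.4}')
print(f'R^2 Test: {ridge_pipe.score(X_test, y_test):0.4}')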