This tutorial shows 4 different methods to select features:

Variable Inflation Factors (VIF)
SelectKBest
Recursive Feature Elimination (RFE)
Recursive Feature Elimination CV (RFECV)

Import Libraries

import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

from sklearn.datasets import load_boston

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Model Building
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

# Evaluation
from sklearn.metrics import mean_squared_error

Load Data

boston_data = load_boston()

X = boston_data.data
y = boston_data.target

# Subsume into a dataframe
boston = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)

# Add target to the dataframe
boston['MEDV'] = boston_data.target

boston.head()

Train Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Train a Linear Regression Model

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# fit/train model
pipe.fit(X_train, y_train)

# predict labels
y_pred = pipe.predict(X_test)

# metrics
rmse = mean_squared_error(y_test, y_pred, squared = False)

print(f'RMSE: {rmse:0.4}')
print(f'R^2 Train: {pipe.score(X_train, y_train):0.4}')
print(f'R^2 Test: {pipe.score(X_test, y_test):0.4}')

RMSE: 5.784
R^2 Train: 0.773
R^2 Test: 0.5892

Feature Selection

num = 6

VIF¶

# Import VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

scaler = StandardScaler()

# For each feature X, calculate VIF and save it in a dataframe
def vif(X):
    
    # perform feature scaling
    xs = scaler.fit_transform(X)
    
    # subsume into a dataframe
    vif = pd.DataFrame()
    vif["Features"] = [boston_data.feature_names[i] for i in range(xs.shape[1])]
    vif["VIF Factor"] = [variance_inflation_factor(xs, i) for i in range(xs.shape[1])]
    
    return vif

# List scores in descending order
result = vif(X_train).round(2).sort_values(by=['VIF Factor'])
result

# Select first num features and retrain model

selected1 = result.iloc[:num, 0]
X1 = boston[selected1]

X1

X1_train, X1_test = train_test_split(X1, test_size=0.2, random_state=0)

# fit/train model
pipe.fit(X1_train, y_train)

# predict labels
y_pred = pipe.predict(X1_test)

# evaluate
print(f'R^2 Train: {pipe.score(X1_train, y_train):0.4}')
print(f'R^2 Test: {pipe.score(X1_test, y_test):0.4}')

R^2 Train: 0.6789
R^2 Test: 0.3846

The result above shows serious overfitting. Let's try another feature selection method.

SelectKBest¶

# Import f_regression and SelectKBest
from sklearn.feature_selection import f_regression, SelectKBest

# SelectKBest
kbest =  SelectKBest(f_regression, k=6) 

# Fit the model
kbest.fit(X_train, y_train)

SelectKBest(k=6, score_func=<function f_regression at 0x000002313FA65430>)

# Show selected features
kbest.get_support(indices=True)

array([ 2,  4,  5,  9, 10, 12], dtype=int64)

# Show scores
kbest.scores_

array([ 81.77212064,  81.1475855 , 153.00493431,  13.09583263,
       107.63575126, 397.33191725,  82.83387891,  34.46704936,
        88.68548609, 142.08609712, 187.10308463,  57.67850087,
       535.13194094])

# Display the sorted scores
s, f = zip(*sorted(zip(kbest.scores_.round(2), boston_data.feature_names), reverse = True))

scores = pd.DataFrame()
scores['Feature'] = f
scores['Score'] = s
scores

# Select first 6 features and retrain model

selected2 = scores.iloc[:num, 0]
X2 = boston[selected2]

X2

X2_train, X2_test = train_test_split(X2, test_size=0.2, random_state=0)

# fit/train model
pipe.fit(X2_train, y_train)

# predict labels
y_pred = pipe.predict(X2_test)

# evaluate
print(f'R^2 Train: {pipe.score(X2_train, y_train):0.4}')
print(f'R^2 Test: {pipe.score(X2_test, y_test):0.4}')

R^2 Train: 0.7267
R^2 Test: 0.4762

These results are not great and suggest overfitting as well. Let's try RFE and RFECV.

RFE¶

# Feature Selection using RFE
from sklearn.feature_selection import RFE

rfe = RFE(LinearRegression(), n_features_to_select=num, step=1)
rfe.fit(X_train,y_train)

RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                               normalize=False),
    n_features_to_select=6, step=1, verbose=0)

# Check the selected position
rfe.support_

array([False, False, False,  True,  True,  True, False,  True, False,
       False,  True, False,  True])

# Get the feature ranking
rfe.ranking_

array([2, 4, 7, 1, 1, 1, 8, 1, 3, 5, 1, 6, 1])

# Display the sorted ranks
r, f = zip(*sorted(zip(rfe.ranking_, boston_data.feature_names)))

ranks = pd.DataFrame()
ranks['Feature'] = f
ranks['Score'] = r
ranks

# Select first num features and retrain model

selected3 = ranks.iloc[:num, 0]
X3 = boston[selected3]

X3

X3_train, X3_test = train_test_split(X3, test_size=0.2, random_state=0)

# fit/train model
pipe.fit(X3_train, y_train)

# predict labels
y_pred = pipe.predict(X3_test)

# evaluate
print(f'R^2 Train: {pipe.score(X3_train, y_train):0.4}')
print(f'R^2 Test: {pipe.score(X3_test, y_test):0.4}')

R^2 Train: 0.7534
R^2 Test: 0.5486

Results are better but there's still a lot of overfitting. Let's try the final method - rfecv.

RFECV¶

# Feature Selection using RFECV
from sklearn.feature_selection import RFECV

rfecv = RFECV(LinearRegression(), cv=10)
rfecv.fit(X_train, y_train)

# Get the selected features with CV
rfecv.n_features_

12

# Check the selected position
rfecv.support_

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True])

# Get the feature ranking
rfecv.ranking_

array([1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1])

Since this selection technique only removed one feature, we will not proceed to test it.

Conclusion¶

There seems to be no improvement in the score of our linear regression model after feature selection. We can try playing around with the desired number of features num or using a different model.

	CRIM	ZN	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	MEDV
0	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1.0	296.0	15.3	396.90	4.98	24.0
1	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2.0	242.0	17.8	396.90	9.14	21.6
2	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2.0	242.0	17.8	392.83	4.03	34.7
3	0.03237	0.0	2.18	0.458	6.998	45.8	6.0622	3.0	222.0	18.7	394.63	2.94	33.4
4	0.06905	0.0	2.18	0.458	7.147	54.2	6.0622	3.0	222.0	18.7	396.90	5.33	36.2

	Features	VIF Factor
3	CHAS	1.06
11	B	1.38
0	CRIM	1.79
10	PTRATIO	1.82
5	RM	2.03
1	ZN	2.34
12	LSTAT	3.15
6	AGE	3.27
2	INDUS	3.96
7	DIS	3.99
4	NOX	4.74
8	RAD	7.43
9	TAX	8.93

	CHAS	B	CRIM	PTRATIO	RM	ZN
0	0.0	396.90	0.00632	15.3	6.575	18.0
1	0.0	396.90	0.02731	17.8	6.421	0.0
2	0.0	392.83	0.02729	17.8	7.185	0.0
3	0.0	394.63	0.03237	18.7	6.998	0.0
4	0.0	396.90	0.06905	18.7	7.147	0.0
...	...	...	...	...	...	...
501	0.0	391.99	0.06263	21.0	6.593	0.0
502	0.0	396.90	0.04527	21.0	6.120	0.0
503	0.0	396.90	0.06076	21.0	6.976	0.0
504	0.0	393.45	0.10959	21.0	6.794	0.0
505	0.0	396.90	0.04741	21.0	6.030	0.0

	Feature	Score
0	LSTAT	535.13
1	RM	397.33
2	PTRATIO	187.10
3	INDUS	153.00
4	TAX	142.09
5	NOX	107.64
6	RAD	88.69
7	AGE	82.83
8	CRIM	81.77
9	ZN	81.15
10	B	57.68
11	DIS	34.47
12	CHAS	13.10

	LSTAT	RM	PTRATIO	INDUS	TAX	NOX
0	4.98	6.575	15.3	2.31	296.0	0.538
1	9.14	6.421	17.8	7.07	242.0	0.469
2	4.03	7.185	17.8	7.07	242.0	0.469
3	2.94	6.998	18.7	2.18	222.0	0.458
4	5.33	7.147	18.7	2.18	222.0	0.458
...	...	...	...	...	...	...
501	9.67	6.593	21.0	11.93	273.0	0.573
502	9.08	6.120	21.0	11.93	273.0	0.573
503	5.64	6.976	21.0	11.93	273.0	0.573
504	6.48	6.794	21.0	11.93	273.0	0.573
505	7.88	6.030	21.0	11.93	273.0	0.573