Import Libraries

In [1]:
# Data Manipulation
import numpy as np
import pandas as pd

# Model Explainability
import shap

# Visualization
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# Get Dataset from Scikit-learn
# Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell requires an older scikit-learn version
from sklearn.datasets import load_boston

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Regressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

# Metrics
from sklearn.metrics import mean_squared_error, r2_score

Load Data into DataFrame

In [2]:
# Load data
boston_data = load_boston()
# Assemble the features into a DataFrame
boston = pd.DataFrame(boston_data['data'], columns=boston_data['feature_names'])
# Add the target to the DataFrame
boston['MEDV'] = boston_data.target

boston
Out[2]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 36.2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 21.0 391.99 9.67 22.4
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 21.0 396.90 9.08 20.6
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 21.0 396.90 5.64 23.9
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 21.0 393.45 6.48 22.0
505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 21.0 396.90 7.88 11.9

506 rows × 14 columns

Split Data into Training and Test Subsets

In [3]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(boston.iloc[:, :-1], boston.iloc[:, -1], test_size=0.2, random_state=0)

# Output the train and test data size
print(f"Train and Test Size {len(X_train)}, {len(X_test)}")
Train and Test Size 404, 102

Linear Regression

In [4]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])
In [5]:
# fit/train model
pipe.fit(X_train, y_train)

# predict labels
y_pred = pipe.predict(X_test)

# metrics
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f'RMSE: {rmse:0.4}')
print(f'R^2 Train: {pipe.score(X_train, y_train):0.4}')
print(f'R^2 Test: {pipe.score(X_test, y_test):0.4}')
RMSE: 5.784
R^2 Train: 0.773
R^2 Test: 0.5892
In [6]:
pipe['regressor'].coef_
Out[6]:
array([-0.97082019,  1.05714873,  0.03831099,  0.59450642, -1.8551476 ,
        2.57321942, -0.08761547, -2.88094259,  2.11224542, -1.87533131,
       -2.29276735,  0.71817947, -3.59245482])
In [7]:
pipe['regressor'].intercept_
Out[7]:
22.611881188118836
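
The raw coefficient array above is hard to interpret on its own. As a small optional sketch (using only the pipe and X_train objects already defined above), the coefficients can be paired with their feature names:

In [ ]:
# Pair each standardized coefficient with its feature name for readability
coef_series = pd.Series(pipe['regressor'].coef_, index=X_train.columns)
coef_series.sort_values()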

LASSO Regression

The Least Absolute Shrinkage and Selection Operator (LASSO) is a variation of linear regression in which an L1 penalty term is added to the Mean Squared Error (MSE).

This penalty term is the sum of the absolute values of the regression weights (i.e. the coefficients), multiplied by a constant known as the regularization strength, denoted as lambda in the formula below (but as alpha in sklearn).

The L1 penalty not only shrinks the regression weights, it also drives some of them exactly to zero, which makes LASSO very useful for feature selection (demonstrated after the fitted coefficients below).

The loss function for LASSO regression is given by:

$$J(\mathbf{w}) = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2 + \lambda\sum_{j=1}^{p}\lvert w_j\rvert$$

(scikit-learn's Lasso scales the error term by $\frac{1}{2n}$ rather than $\frac{1}{n}$, but the behavior is qualitatively the same.)

In [8]:
lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', Lasso(alpha=0.1))
])
In [9]:
# fit lasso model

lasso.fit(X_train, y_train)
print(f'R^2 Train: {lasso.score(X_train, y_train):0.4}')
print(f'R^2 Test: {lasso.score(X_test, y_test):0.4}')
R^2 Train: 0.7677
R^2 Test: 0.5664
In [10]:
lasso['regressor'].coef_
Out[10]:
array([-0.66346811,  0.70152397, -0.13072368,  0.5889339 , -1.35874893,
        2.72275426, -0.        , -2.14093195,  0.64085327, -0.65877899,
       -2.17221721,  0.60266575, -3.61579998])
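
As noted earlier, the L1 penalty can drive coefficients exactly to zero (the -0. entry above). A small sketch, using only the lasso pipeline fitted above, lists which features were kept and which were dropped:

In [ ]:
# Features dropped (coefficient exactly zero) vs. kept by the LASSO model
lasso_coef = pd.Series(lasso['regressor'].coef_, index=X_train.columns)
print('Dropped:', list(lasso_coef[lasso_coef == 0].index))
print('Kept:   ', list(lasso_coef[lasso_coef != 0].index))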

Effect of Lambda on the Model

In [11]:
alpha_range = np.linspace(0.01,10,100)

la_coef = []
la_score = []
la_rmse = []

for i in alpha_range:
    
    lasso = Pipeline([('scaler', StandardScaler()), ('regressor', Lasso(alpha=i))])
    lasso.fit(X_train, y_train)
    y_pred = lasso.predict(X_test)
    
    la_coef.append(lasso['regressor'].coef_)
    la_rmse.append(mean_squared_error(y_test, y_pred, squared = False))
    la_score.append(r2_score(y_test, y_pred))

Coefficients as a function of Lambda

In [12]:
# Plot Coefficients
fig = plt.figure(figsize=(20,8))

ax = plt.axes()
ax.plot(alpha_range, la_coef)

ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis

ax.set_title('Lasso coefficients as a function of lambda')
ax.set_xlabel(r'$\lambda$ in descending order')
ax.set_ylabel(r'$\mathbf{w}$');

Each line above traces one coefficient of the LASSO regression model.

We see that when lambda is large (around 10), all of the coefficients have been shrunk to zero, and when lambda is 1, most of them (all but three) are already close to zero.
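
To make this concrete, a quick sketch (reusing the la_coef list and alpha_range grid collected above) counts how many coefficients remain non-zero at a few representative values of lambda:

In [ ]:
# Count non-zero coefficients at a few representative lambdas from the grid
for target in [0.1, 1, 5, 10]:
    idx = np.argmin(np.abs(alpha_range - target))  # nearest lambda in the grid
    print(f'lambda ~ {alpha_range[idx]:.2f}: {np.count_nonzero(la_coef[idx])} non-zero coefficients')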

RMSE and R2 as a function of lambda

In [13]:
# Plot RMSE
fig, ax = plt.subplots(1,2, figsize=(20,8))
ax[0].plot(alpha_range, la_rmse, 'orange')
ax[0].set_title('Root Mean Square Error')
ax[0].set_xlabel('Alpha')
ax[0].set_ylabel('RMSE')

# Plot R^2
ax[1].plot(alpha_range, la_score, 'r-')
ax[1].set_title('Coefficient of Determination')
ax[1].set_xlabel('Alpha')
ax[1].set_ylabel('R-Squared');

Ridge Regression

In ridge regression, the cost function is altered by adding an L2 penalty equal to the sum of the squared coefficients. Ridge regression shrinks the coefficients and helps to reduce multicollinearity.

Its cost function is given by:

$$J(\mathbf{w}) = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2 + \lambda\sum_{j=1}^{p} w_j^2$$

In [14]:
rid = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', Ridge(alpha=1))
])
In [15]:
# fit ridge model
rid.fit(X_train, y_train)
print(f'R^2 Train: {rid.score(X_train, y_train):0.4}')
print(f'R^2 Test: {rid.score(X_test, y_test):0.4}')
R^2 Train: 0.773
R^2 Test: 0.5881
In [16]:
rid['regressor'].coef_
Out[16]:
array([-0.96225708,  1.04087205,  0.01168009,  0.59871898, -1.82013351,
        2.58378594, -0.09518768, -2.8482632 ,  2.03623116, -1.80609168,
       -2.28319087,  0.71831036, -3.5760731 ])
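
To illustrate the shrinkage, a short sketch compares the overall magnitude of the plain linear regression coefficients with the ridge coefficients (it assumes the pipe and rid pipelines fitted above are still in scope):

In [ ]:
# Compare coefficient magnitudes: plain linear regression vs. ridge (alpha=1)
print(f"L2 norm of linear regression coefficients: {np.linalg.norm(pipe['regressor'].coef_):.4f}")
print(f"L2 norm of ridge coefficients:             {np.linalg.norm(rid['regressor'].coef_):.4f}")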

Effect of Lambda on the Model

In [17]:
alpha_range = 10**np.linspace(6,-2,100)*0.5
rid_score = []
rid_rmse = []
rid_coef = []

for i in alpha_range:
    rid = Pipeline([('scaler', StandardScaler()), ('regressor', Ridge(alpha=i))])
    rid.fit(X_train, y_train)    
    y_pred = rid.predict(X_test)  
          
    rid_coef.append(rid['regressor'].coef_)
    rid_rmse.append(mean_squared_error(y_test, y_pred, squared = False))
    rid_score.append(r2_score(y_test, y_pred))

Coefficients as a function of Lambda

In [18]:
# Plot Coefficients
fig = plt.figure(figsize=(20,8))
ax = plt.axes()
ax.plot(alpha_range, rid_coef)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
ax.set_title('Ridge coefficients as a function of lambda')
ax.set_xlabel(r'$\lambda$ in descending order')
ax.set_ylabel(r'$\mathbf{w}$');

RMSE and R2 as a function of lambda

In [19]:
# Plot RMSE
fig, ax = plt.subplots(1,2, figsize=(20,8))
ax[0].plot(alpha_range, rid_rmse, 'orange')
ax[0].set_title('Root Mean Square Error')
ax[0].set_xlabel(r'$\lambda$')
ax[0].set_ylabel('RMSE')
ax[0].set_xscale('log')

# Plot R^2
ax[1].plot(alpha_range, rid_score, 'r-')
ax[1].set_title('Coefficient of Determination')
ax[1].set_xlabel(r'$\lambda$')
ax[1].set_ylabel('$R^2$')
ax[1].set_xscale('log');

ElasticNet Regression

ElasticNet combines the properties of both LASSO and Ridge regression: it penalizes the model using both the L1 and L2 norms.

In the formula below, Elastic Net is the same as LASSO when alpha = 1 and the same as Ridge when alpha = 0. For intermediate values of alpha, the penalty interpolates between the L1 and L2 penalty terms.

Note: lambda corresponds to alpha and alpha corresponds to l1_ratio in sklearn.

$$J(\mathbf{w}) = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2 + \lambda\left(\alpha\sum_{j=1}^{p}\lvert w_j\rvert + \frac{1-\alpha}{2}\sum_{j=1}^{p} w_j^2\right)$$
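
As a quick sanity check on the note above (a sketch, not part of the original analysis): with l1_ratio=1 the ElasticNet objective reduces to the LASSO, so its coefficients should closely match a Lasso fitted with the same alpha:

In [ ]:
# ElasticNet with l1_ratio=1 should reproduce the LASSO solution
en_as_lasso = Pipeline([('scaler', StandardScaler()),
                        ('regressor', ElasticNet(alpha=0.1, l1_ratio=1.0))]).fit(X_train, y_train)
plain_lasso = Pipeline([('scaler', StandardScaler()),
                        ('regressor', Lasso(alpha=0.1))]).fit(X_train, y_train)

diff = np.max(np.abs(en_as_lasso['regressor'].coef_ - plain_lasso['regressor'].coef_))
print(f'Max coefficient difference: {diff:.2e}')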

In [20]:
elastic = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])
In [21]:
# fit elasticnet model
elastic.fit(X_train, y_train)
print(f'R^2 Train: {elastic.score(X_train, y_train):0.4}')
print(f'R^2 Test: {elastic.score(X_test, y_test):0.4}')
R^2 Train: 0.7671
R^2 Test: 0.5617
In [22]:
elastic['regressor'].coef_
Out[22]:
array([-0.74093903,  0.72243247, -0.26704266,  0.62522263, -1.18854988,
        2.73484113, -0.09746715, -2.04841124,  0.7449307 , -0.78039169,
       -2.10099018,  0.66923402, -3.34109508])

Effect of Lambda on the Model

In [23]:
alpha_range = 10**np.linspace(2,-2,100)*0.5
elastic_score = []
elastic_rmse = []
elastic_coef = []

for i in alpha_range:
    elastic = Pipeline([('scaler', StandardScaler()), ('regressor', ElasticNet(alpha=i))])
    elastic.fit(X_train, y_train)
    y_pred = elastic.predict(X_test)
    
    elastic_rmse.append(mean_squared_error(y_test, y_pred, squared = False))
    elastic_coef.append(elastic['regressor'].coef_)
    elastic_score.append(r2_score(y_test, y_pred))

Coefficients as a function of Lambda

In [24]:
# Plot Coefficients
fig = plt.figure(figsize=(20,8))
ax = plt.axes()
ax.plot(alpha_range, elastic_coef)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
ax.set_title('ElasticNet coefficients as a function of the regularization')
ax.set_xlabel(r'$\lambda$ in descending order')
ax.set_ylabel(r'$\mathbf{w}$');

RMSE and R2 as a function of lambda

In [25]:
# Plot RMSE
fig, ax = plt.subplots(1,2, figsize=(20,8))
ax[0].plot(alpha_range, elastic_rmse, 'orange')
ax[0].set_title('Root Mean Square Error')
ax[0].set_xlabel('Alpha')
ax[0].set_ylabel('RMSE')

# Plot R^2
ax[1].plot(alpha_range, elastic_score, 'r-')
ax[1].set_title('Coefficient of Determination')
ax[1].set_xlabel('Alpha')
ax[1].set_ylabel('R-Squared');