Import Libraries

In [1]:
# Data Manipulation
import numpy as np
import pandas as pd

# Model Explainability
import shap

# Visualization
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# Get Dataset from Scikit-learn
# Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell requires an older scikit-learn version
from sklearn.datasets import load_boston

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Regressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

# Metrics
from sklearn.metrics import mean_squared_error, r2_score

Load Data into DataFrame

In [2]:
# Load data
boston_data = load_boston()
# Assemble the features into a DataFrame
boston = pd.DataFrame(boston_data['data'], columns=boston_data['feature_names'])
# Add the target to the DataFrame
boston['MEDV'] = boston_data.target

boston
Out[2]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 36.2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 21.0 391.99 9.67 22.4
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 21.0 396.90 9.08 20.6
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 21.0 396.90 5.64 23.9
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 21.0 393.45 6.48 22.0
505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 21.0 396.90 7.88 11.9

506 rows × 14 columns

Split Data into Training and Test Subsets

In [3]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(boston.iloc[:, :-1], boston.iloc[:, -1], test_size=0.2, random_state=0)

# Output the train and test data size
print(f"Train and Test Size {len(X_train)}, {len(X_test)}")
Train and Test Size 404, 102

Linear Regression

In [4]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])
In [5]:
# fit/train model
pipe.fit(X_train, y_train)

# predict labels
y_pred = pipe.predict(X_test)

# metrics
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f'RMSE: {rmse:0.4}')
print(f'R^2 Train: {pipe.score(X_train, y_train):0.4}')
print(f'R^2 Test: {pipe.score(X_test, y_test):0.4}')
RMSE: 5.784
R^2 Train: 0.773
R^2 Test: 0.5892
In [6]:
pipe['regressor'].coef_
Out[6]:
array([-0.97082019,  1.05714873,  0.03831099,  0.59450642, -1.8551476 ,
        2.57321942, -0.08761547, -2.88094259,  2.11224542, -1.87533131,
       -2.29276735,  0.71817947, -3.59245482])
In [7]:
pipe['regressor'].intercept_
Out[7]:
22.611881188118836
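
The raw coefficient array above is hard to interpret on its own. As a small optional sketch (using only the pipe and X_train objects already defined above), the coefficients can be paired with their feature names:

In [ ]:
# Pair each standardized coefficient with its feature name for readability
coef_series = pd.Series(pipe['regressor'].coef_, index=X_train.columns)
coef_series.sort_values()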

LASSO Regression

The Least Absolute Shrinkage and Selection Operator (LASSO) is a variation of linear regression in which an L1 penalty term is added to the Mean Squared Error (MSE).

This penalty term is the sum of the absolute values of the regression weights (i.e. the coefficients), multiplied by a constant known as the regularization strength, denoted as lambda in the formula below (but as alpha in sklearn).

The L1 penalty not only shrinks the regression weights, it also drives some of them exactly to zero, which makes LASSO very useful for feature selection (demonstrated after the fitted coefficients below).

The loss function for LASSO regression is given by:

$$J(\mathbf{w}) = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2 + \lambda\sum_{j=1}^{p}\lvert w_j\rvert$$

(scikit-learn's Lasso scales the error term by $\frac{1}{2n}$ rather than $\frac{1}{n}$, but the behavior is qualitatively the same.)

In [8]:
lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', Lasso(alpha=0.1))
])
In [9]:
# fit lasso model

lasso.fit(X_train, y_train)
print(f'R^2 Train: {lasso.score(X_train, y_train):0.4}')
print(f'R^2 Test: {lasso.score(X_test, y_test):0.4}')
R^2 Train: 0.7677
R^2 Test: 0.5664
In [10]:
lasso['regressor'].coef_
Out[10]:
array([-0.66346811,  0.70152397, -0.13072368,  0.5889339 , -1.35874893,
        2.72275426, -0.        , -2.14093195,  0.64085327, -0.65877899,
       -2.17221721,  0.60266575, -3.61579998])
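
As noted earlier, the L1 penalty can drive coefficients exactly to zero (the -0. entry above). A small sketch, using only the lasso pipeline fitted above, lists which features were kept and which were dropped:

In [ ]:
# Features dropped (coefficient exactly zero) vs. kept by the LASSO model
lasso_coef = pd.Series(lasso['regressor'].coef_, index=X_train.columns)
print('Dropped:', list(lasso_coef[lasso_coef == 0].index))
print('Kept:   ', list(lasso_coef[lasso_coef != 0].index))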

Effect of Lambda on the Model

In [11]:
alpha_range = np.linspace(0.01,10,100)

la_coef = []
la_score = []
la_rmse = []

for i in alpha_range:
    
    lasso = Pipeline([('scaler', StandardScaler()), ('regressor', Lasso(alpha=i))])
    lasso.fit(X_train, y_train)
    y_pred = lasso.predict(X_test)
    
    la_coef.append(lasso['regressor'].coef_)
    la_rmse.append(mean_squared_error(y_test, y_pred, squared = False))
    la_score.append(r2_score(y_test, y_pred))

Coefficients as a function of Lambda

In [12]:
# Plot Coefficients
fig = plt.figure(figsize=(20,8))

ax = plt.axes()
ax.plot(alpha_range, la_coef)

ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis

ax.set_title('Lasso coefficients as a function of lambda')
ax.set_xlabel(r'$\lambda$ in descending order')
ax.set_ylabel(r'$\mathbf{w}$');

Each line above traces one coefficient of the LASSO regression model.

We see that when lambda is large (around 10), all of the coefficients have been shrunk to zero, and when lambda is 1, most of them (all but three) are already close to zero.
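
To make this concrete, a quick sketch (reusing the la_coef list and alpha_range grid collected above) counts how many coefficients remain non-zero at a few representative values of lambda:

In [ ]:
# Count non-zero coefficients at a few representative lambdas from the grid
for target in [0.1, 1, 5, 10]:
    idx = np.argmin(np.abs(alpha_range - target))  # nearest lambda in the grid
    print(f'lambda ~ {alpha_range[idx]:.2f}: {np.count_nonzero(la_coef[idx])} non-zero coefficients')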

RMSE and R2 as a function of lambda

In [13]:
# Plot RMSE
fig, ax = plt.subplots(1,2, figsize=(20,8))
ax[0].plot(alpha_range, la_rmse, 'orange')
ax[0].set_title('Root Mean Square Error')
ax[0].set_xlabel('Alpha')
ax[0].set_ylabel('RMSE')

# Plot R^2
ax[1].plot(alpha_range, la_score, 'r-')
ax[1].set_title('Coefficient of Determination')
ax[1].set_xlabel('Alpha')
ax[1].set_ylabel('R-Squared');

Ridge Regression

In ridge regression, the cost function is altered by adding an L2 penalty equal to the sum of the squared coefficients. Ridge regression shrinks the coefficients and helps to reduce multicollinearity.

Its cost function is given by:

$$J(\mathbf{w}) = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2 + \lambda\sum_{j=1}^{p} w_j^2$$

In [14]:
rid = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', Ridge(alpha=1))
])
In [15]:
# fit ridge model
rid.fit(X_train, y_train)
print(f'R^2 Train: {rid.score(X_train, y_train):0.4}')
print(f'R^2 Test: {rid.score(X_test, y_test):0.4}')
R^2 Train: 0.773
R^2 Test: 0.5881
In [16]:
rid['regressor'].coef_
Out[16]:
array([-0.96225708,  1.04087205,  0.01168009,  0.59871898, -1.82013351,
        2.58378594, -0.09518768, -2.8482632 ,  2.03623116, -1.80609168,
       -2.28319087,  0.71831036, -3.5760731 ])
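
To illustrate the shrinkage, a short sketch compares the overall magnitude of the plain linear regression coefficients with the ridge coefficients (it assumes the pipe and rid pipelines fitted above are still in scope):

In [ ]:
# Compare coefficient magnitudes: plain linear regression vs. ridge (alpha=1)
print(f"L2 norm of linear regression coefficients: {np.linalg.norm(pipe['regressor'].coef_):.4f}")
print(f"L2 norm of ridge coefficients:             {np.linalg.norm(rid['regressor'].coef_):.4f}")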

Effect of Lambda on the Model

In [17]:
alpha_range = 10**np.linspace(6,-2,100)*0.5
rid_score = []
rid_rmse = []
rid_coef = []

for i in alpha_range:
    rid = Pipeline([('scaler', StandardScaler()), ('regressor', Ridge(alpha=i))])
    rid.fit(X_train, y_train)    
    y_pred = rid.predict(X_test)  
          
    rid_coef.append(rid['regressor'].coef_)
    rid_rmse.append(mean_squared_error(y_test, y_pred, squared = False))
    rid_score.append(r2_score(y_test, y_pred))

Coefficients as a function of Lambda

In [18]:
# Plot Coefficients
fig = plt.figure(figsize=(20,8))
ax = plt.axes()
ax.plot(alpha_range, rid_coef)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
ax.set_title('Ridge coefficients as a function of lambda')
ax.set_xlabel(r'$\lambda$ in descending order')
ax.set_ylabel(r'$\mathbf{w}$');

RMSE and R2 as a function of lambda

In [19]:
# Plot RMSE
fig, ax = plt.subplots(1,2, figsize=(20,8))
ax[0].plot(alpha_range, rid_rmse, 'orange')
ax[0].set_title('Root Mean Square Error')
ax[0].set_xlabel(r'$\lambda$')
ax[0].set_ylabel('RMSE')
ax[0].set_xscale('log')

# Plot R^2
ax[1].plot(alpha_range, rid_score, 'r-')
ax[1].set_title('Coefficient of Determination')
ax[1].set_xlabel(r'$\lambda$')
ax[1].set_ylabel('$R^2$')
ax[1].set_xscale('log');

ElasticNet Regression

ElasticNet combines the properties of both LASSO and Ridge regression: it penalizes the model using both the L1 and L2 norms.

In the formula below, Elastic Net is the same as LASSO when alpha = 1 and the same as Ridge when alpha = 0. For intermediate values of alpha, the penalty interpolates between the L1 and L2 penalty terms.

Note: lambda corresponds to alpha and alpha corresponds to l1_ratio in sklearn.

$$J(\mathbf{w}) = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2 + \lambda\left(\alpha\sum_{j=1}^{p}\lvert w_j\rvert + \frac{1-\alpha}{2}\sum_{j=1}^{p} w_j^2\right)$$
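
As a quick sanity check on the note above (a sketch, not part of the original analysis): with l1_ratio=1 the ElasticNet objective reduces to the LASSO, so its coefficients should closely match a Lasso fitted with the same alpha:

In [ ]:
# ElasticNet with l1_ratio=1 should reproduce the LASSO solution
en_as_lasso = Pipeline([('scaler', StandardScaler()),
                        ('regressor', ElasticNet(alpha=0.1, l1_ratio=1.0))]).fit(X_train, y_train)
plain_lasso = Pipeline([('scaler', StandardScaler()),
                        ('regressor', Lasso(alpha=0.1))]).fit(X_train, y_train)

diff = np.max(np.abs(en_as_lasso['regressor'].coef_ - plain_lasso['regressor'].coef_))
print(f'Max coefficient difference: {diff:.2e}')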

In [20]:
elastic = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5))
])
In [21]:
# fit elasticnet model
elastic.fit(X_train, y_train)
print(f'R^2 Train: {elastic.score(X_train, y_train):0.4}')
print(f'R^2 Test: {elastic.score(X_test, y_test):0.4}')
R^2 Train: 0.7671
R^2 Test: 0.5617
In [22]:
elastic['regressor'].coef_
Out[22]:
array([-0.74093903,  0.72243247, -0.26704266,  0.62522263, -1.18854988,
        2.73484113, -0.09746715, -2.04841124,  0.7449307 , -0.78039169,
       -2.10099018,  0.66923402, -3.34109508])

Effect of Lambda on the Model

In [23]:
alpha_range = 10**np.linspace(2,-2,100)*0.5
elastic_score = []
elastic_rmse = []
elastic_coef = []

for i in alpha_range:
    elastic = Pipeline([('scaler', StandardScaler()), ('regressor', ElasticNet(alpha=i))])
    elastic.fit(X_train, y_train)
    y_pred = elastic.predict(X_test)
    
    elastic_rmse.append(mean_squared_error(y_test, y_pred, squared = False))
    elastic_coef.append(elastic['regressor'].coef_)
    elastic_score.append(r2_score(y_test, y_pred))

Coefficients as a function of Lambda

In [24]:
# Plot Coefficients
fig = plt.figure(figsize=(20,8))
ax = plt.axes()
ax.plot(alpha_range, elastic_coef)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
ax.set_title('ElasticNet coefficients as a function of the regularization')
ax.set_xlabel(r'$\lambda$ in descending order')
ax.set_ylabel(r'$\mathbf{w}$');

RMSE and R2 as a function of lambda

In [25]:
# Plot RMSE
fig, ax = plt.subplots(1,2, figsize=(20,8))
ax[0].plot(alpha_range, elastic_rmse, 'orange')
ax[0].set_title('Root Mean Square Error')
ax[0].set_xlabel('Alpha')
ax[0].set_ylabel('RMSE')

# Plot R^2
ax[1].plot(alpha_range, elastic_score, 'r-')
ax[1].set_title('Coefficient of Determination')
ax[1].set_xlabel('Alpha')
ax[1].set_ylabel('R-Squared');