Import Libraries

In [1]:
# Base Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import yfinance as yf

# Preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit, cross_val_score

# Classifier
from sklearn.neighbors import KNeighborsClassifier

# Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import plot_confusion_matrix, auc, roc_curve

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

Load Data and Create New Features

In [2]:
# df = yf.download('SPY', start='2000-01-01', end = '2021-12-31', progress=False)
df = pd.read_csv('spy.csv', index_col=0, parse_dates=True)
In [3]:
df.head()
Out[3]:
Open High Low Close Adj Close Volume
Date
1999-12-31 146.84375 147.50000 146.250000 146.8750 98.155975 3172700
2000-01-03 148.25000 148.25000 143.875000 145.4375 97.195312 8164300
2000-01-04 143.53125 144.06250 139.640625 139.7500 93.394363 8089800
2000-01-05 139.93750 141.53125 137.250000 140.0000 93.561455 12177900
2000-01-06 139.62500 141.50000 137.750000 137.7500 92.057747 6227200
In [4]:
df.describe()
Out[4]:
Open High Low Close Adj Close Volume
count 5499.000000 5499.000000 5499.000000 5499.000000 5499.000000 5.499000e+03
mean 175.315666 176.330719 174.201723 175.323582 150.385093 1.101146e+08
std 82.277623 82.557352 81.969557 82.306747 90.324600 9.597972e+07
min 67.949997 70.000000 67.099998 68.110001 52.985550 1.436600e+06
25% 117.459999 118.239998 116.720001 117.430000 86.700020 4.897305e+07
50% 140.687500 141.570007 139.940002 140.809998 107.377617 8.085580e+07
75% 211.235001 211.955002 210.474998 211.185005 188.797386 1.458263e+08
max 469.279999 470.649994 466.920013 468.529999 468.529999 8.710263e+08
In [5]:
# Predictors
df['O-C'] = df['Open'] - df['Close']
df['H-L'] = df['High'] - df['Low']
In [6]:
# Target

'''
y = 1 if tomorrow's close is greater than 99.5% of today's close
y = -1 otherwise

'''

df['y'] = np.where(df['Adj Close'].shift(-1)>0.995*df['Adj Close'],1,-1) 
In [7]:
df.head(10)
Out[7]:
Open High Low Close Adj Close Volume O-C H-L y
Date
1999-12-31 146.84375 147.50000 146.250000 146.8750 98.155975 3172700 -0.03125 1.250000 -1
2000-01-03 148.25000 148.25000 143.875000 145.4375 97.195312 8164300 2.81250 4.375000 -1
2000-01-04 143.53125 144.06250 139.640625 139.7500 93.394363 8089800 3.78125 4.421875 1
2000-01-05 139.93750 141.53125 137.250000 140.0000 93.561455 12177900 -0.06250 4.281250 -1
2000-01-06 139.62500 141.50000 137.750000 137.7500 92.057747 6227200 1.87500 3.750000 1
2000-01-07 140.31250 145.75000 140.062500 145.7500 97.404129 8066500 -5.43750 5.687500 1
2000-01-10 146.25000 146.90625 145.031250 146.2500 97.738312 5741700 0.00000 1.875000 -1
2000-01-11 145.81250 146.09375 143.500000 144.5000 96.568756 7503700 1.31250 2.593750 -1
2000-01-12 144.59375 144.59375 142.875000 143.0625 95.608109 6907700 1.53125 1.718750 1
2000-01-13 144.46875 145.75000 143.281250 145.0000 96.902931 5158300 -0.53125 2.468750 1
In [8]:
# Convert to numpy

data = df[['O-C', 'H-L', 'y']].to_numpy()
data[:5]
Out[8]:
array([[-0.03125 ,  1.25    , -1.      ],
       [ 2.8125  ,  4.375   , -1.      ],
       [ 3.78125 ,  4.421875,  1.      ],
       [-0.0625  ,  4.28125 , -1.      ],
       [ 1.875   ,  3.75    ,  1.      ]])

Train Test Split (with no shuffle)

In [9]:
# Splitting the datasets into training and testing data.
# Always keep shuffle = False for financial time series
X_train, X_test, y_train, y_test = train_test_split(data[:, :-1], data[:, -1], test_size=0.2, random_state=0, shuffle=False)

# Output the train and test data size
print(f"Train and Test Size {len(X_train)}, {len(X_test)}")
Train and Test Size 4399, 1100

Train KNN Model

In [10]:
# Scale and fit the model
pipe = Pipeline([
    ("scaler", MinMaxScaler()), 
    ("classifier", KNeighborsClassifier())
]) 
pipe.fit(X_train, y_train)
Out[10]:
Pipeline(memory=None,
         steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('classifier',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=2,
                                      weights='uniform'))],
         verbose=False)

Accuracy and Confusion Matrix

In [11]:
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
In [12]:
acc_train = accuracy_score(y_train, y_train_pred)
acc_test = accuracy_score(y_test, y_test_pred)
print(f'Train Accuracy: {acc_train:0.4}, Test Accuracy: {acc_test:0.4}')
Train Accuracy: 0.7854, Test Accuracy: 0.7018
In [13]:
# Demonstration of Series.ravel()

mySeries = pd.Series([11, 32, 53, 14])
a, b, c, d = mySeries.ravel()
print(a, b, c, d)
print()
print(mySeries)
11 32 53 14

0    11
1    32
2    53
3    14
dtype: int64
In [14]:
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
print(tn, fp, fn, tp)

# Plot confusion matrix
plot_confusion_matrix(pipe, X_test, y_test, values_format = 'd', cmap='Blues')
plt.title('Confusion Matrix')
plt.grid(False)
51 165 163 721

Classification Report

In [15]:
# Classification Report
print(classification_report(y_test, y_test_pred))
              precision    recall  f1-score   support

        -1.0       0.24      0.24      0.24       216
         1.0       0.81      0.82      0.81       884

    accuracy                           0.70      1100
   macro avg       0.53      0.53      0.53      1100
weighted avg       0.70      0.70      0.70      1100

Calculations

ROC curve

Understanding the predict_proba() function (Default threshold = 0.5)

In [16]:
# Predict Probabilities
probs = pipe.predict_proba(X_test)
preds1 = probs[:, 0]
preds2 = probs[:, 1]
In [17]:
# display first 10 rows for probs
probs[:10]
Out[17]:
array([[0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.4, 0.6],
       [0. , 1. ],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.6, 0.4],
       [0.4, 0.6],
       [0.6, 0.4]])
In [18]:
y_pred = pipe.predict(X_test)
# display first 10 rows for y_pred
y_pred[:10]
Out[18]:
array([ 1.,  1.,  1.,  1.,  1.,  1.,  1., -1.,  1., -1.])

Note that the 1s correspond to instances where the second column in probs >= 0.5

Changing the threshold

In [19]:
p = 0.2
probs = pipe.predict_proba(X_test)

# y_pred_1 = True if second column in pipe.predict_proba(X_test) is greater than or equal to p
# y_pred_1 = False otherwise
y_pred_1 = (probs[:,1] >= p)
In [20]:
# Display first 10 rows in y_pred_1
y_pred_1[:10]
Out[20]:
array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])
In [21]:
# Replace True with 1 and False with -1

y_pred_1 = np.where(y_pred_1 == True, 1, y_pred_1) 
y_pred_1 = np.where(y_pred_1 == False, -1, y_pred_1) 
In [22]:
# Getting the TP, TN, FP, FN values when threshold = p

tn, fp, fn, tp = confusion_matrix(y_test, y_pred_1).ravel()
print(tn, fp, fn, tp)

print(f'FPR = {(fp/(tn+fp)).round(6)}, TPR = {(tp/(tp+fn)).round(6)}')
2 214 12 872
FPR = 0.990741, TPR = 0.986425
In [23]:
# probs[:, 1] gives the probability of y = 1

fpr, tpr, threshold = roc_curve(y_test, probs[:, 1], pos_label=1)
In [24]:
fpr, tpr, threshold
Out[24]:
(array([0.        , 0.14814815, 0.45833333, 0.76388889, 0.90740741,
        0.99074074, 1.        ]),
 array([0.        , 0.16742081, 0.50904977, 0.81561086, 0.95135747,
        0.98642534, 1.        ]),
 array([2. , 1. , 0.8, 0.6, 0.4, 0.2, 0. ]))

FPR (0.990741) and TPR (0.986425) calculated using threshold = 0.2 match the values given in the arrays above (2nd last value in each array).

Plotting the ROC curve

In [25]:
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20,10))

fpr1, tpr1, threshold1 = roc_curve(y_test, preds1, pos_label=-1)
roc_auc1 = auc(fpr1, tpr1)

fpr2, tpr2, threshold2 = roc_curve(y_test, preds2, pos_label=1)
roc_auc2 = auc(fpr2, tpr2)

ax[0].plot([0, 1], [0, 1], 'r--')
ax[0].plot(fpr1, tpr1, 'cornflowerblue', label=f'AUC = {roc_auc1:0.2}', marker='o', ms = 15)
ax[0].set_title("Receiver Operating Characteristic for Down Moves")
ax[0].set_xlabel('False Positive Rate')
ax[0].set_ylabel('True Positive Rate')

ax[1].plot([0, 1], [0, 1], 'r--')
ax[1].plot(fpr2, tpr2, 'cornflowerblue', label=f'AUC = {roc_auc2:0.2}', marker='o', ms = 15)
ax[1].set_title("Receiver Operating Characteristic for Up Moves")
ax[1].set_xlabel('False Positive Rate')
ax[1].set_ylabel('True Positive Rate')

# Define legend
ax[0].legend(), ax[1].legend();

TimeSeriesSplit

Demonstrating how TimeSeriesSplit works

In [26]:
tscv = TimeSeriesSplit(n_splits=3)

for train, test in tscv.split(data[:, :-1]):
    print(train, test)
[   0    1    2 ... 1374 1375 1376] [1377 1378 1379 ... 2748 2749 2750]
[   0    1    2 ... 2748 2749 2750] [2751 2752 2753 ... 4122 4123 4124]
[   0    1    2 ... 4122 4123 4124] [4125 4126 4127 ... 5496 5497 5498]

Performing a Grid Search using TimeSeriesSplit

In [27]:
# Perform Gridsearch and fit
param_grid = {"classifier__n_neighbors": np.arange(1,51,1)}

grid_search = GridSearchCV(pipe, param_grid, scoring='roc_auc', n_jobs=-1, cv=TimeSeriesSplit(n_splits=5), verbose=1)
grid_search.fit(X_train, y_train)
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    1.8s finished
Out[27]:
GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=5),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        MinMaxScaler(copy=True,
                                                     feature_range=(0, 1))),
                                       ('classifier',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             metric='minkowski',
                                                             metric_params=None,
                                                             n_jobs=None,
                                                             n_neighbors=5, p=2,
                                                             weights='uniform'))],
                                verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'classifier__n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=1)
In [28]:
# Getting the best parameters and best score

# Best Params
print(f'Best Parameters = {grid_search.best_params_}')

# Best Score
print(f'Best AUC Score = {grid_search.best_score_}')
Best Parameters = {'classifier__n_neighbors': 21}
Best AUC Score = 0.541663998793043

Retrain Model with Best Parameters

In [29]:
clf = KNeighborsClassifier(n_neighbors = grid_search.best_params_['classifier__n_neighbors'])

# Fit the model
clf.fit(X_train, y_train)

# Predicting the test dataset
y_pred = clf.predict(X_test)

# Measure Accuracy
acc_train = accuracy_score(y_train, clf.predict(X_train))
acc_test = accuracy_score(y_test, y_pred)

# Print Accuracy
print(f'\n Training Accuracy \t: {acc_train :0.4} \n Test Accuracy \t\t: {acc_test :0.4}')
 Training Accuracy 	: 0.7531 
 Test Accuracy 		: 0.7818

Backtest The Trading Strategy (on X_test, y_test)

In [30]:
df.head()
Out[30]:
Open High Low Close Adj Close Volume O-C H-L y
Date
1999-12-31 146.84375 147.50000 146.250000 146.8750 98.155975 3172700 -0.03125 1.250000 -1
2000-01-03 148.25000 148.25000 143.875000 145.4375 97.195312 8164300 2.81250 4.375000 -1
2000-01-04 143.53125 144.06250 139.640625 139.7500 93.394363 8089800 3.78125 4.421875 1
2000-01-05 139.93750 141.53125 137.250000 140.0000 93.561455 12177900 -0.06250 4.281250 -1
2000-01-06 139.62500 141.50000 137.750000 137.7500 92.057747 6227200 1.87500 3.750000 1
In [31]:
# Create a new Dataframe using the X_test instances

btdf = df[-len(X_test):]
btdf.head()
Out[31]:
Open High Low Close Adj Close Volume O-C H-L y
Date
2017-06-27 243.039993 243.380005 241.309998 241.330002 223.699539 82247700 1.709991 2.070007 1
2017-06-28 242.500000 243.720001 242.229996 243.490005 225.701736 70042600 -0.990005 1.490005 -1
2017-06-29 243.660004 243.720001 239.960007 241.350006 223.718109 106949700 2.309998 3.759995 1
2017-06-30 242.279999 242.710007 241.580002 241.800003 224.135193 86820700 0.479996 1.130005 1
2017-07-03 242.880005 243.380005 242.210007 242.210007 224.515259 39153800 0.669998 1.169998 1
In [32]:
# Add a Signal Column
# Signal = predicted y values

btdf['Signal'] = clf.predict(X_test)
In [33]:
# Compute SPY's log returns
# log returns today = log of adj close today - log of adj close yesterday

btdf['Returns'] = np.log(btdf['Adj Close']).diff().fillna(0)
btdf.head()
Out[33]:
Open High Low Close Adj Close Volume O-C H-L y Signal Returns
Date
2017-06-27 243.039993 243.380005 241.309998 241.330002 223.699539 82247700 1.709991 2.070007 1 1.0 0.000000
2017-06-28 242.500000 243.720001 242.229996 243.490005 225.701736 70042600 -0.990005 1.490005 -1 1.0 0.008911
2017-06-29 243.660004 243.720001 239.960007 241.350006 223.718109 106949700 2.309998 3.759995 1 1.0 -0.008828
2017-06-30 242.279999 242.710007 241.580002 241.800003 224.135193 86820700 0.479996 1.130005 1 1.0 0.001863
2017-07-03 242.880005 243.380005 242.210007 242.210007 224.515259 39153800 0.669998 1.169998 1 1.0 0.001694
In [34]:
# Compute log returns of KNN strategy

# today's knn returns = yesterday's signal * today's spy returns 
btdf['Strategy'] = btdf['Returns'] * (btdf['Signal'].shift(1)).fillna(0)
btdf.tail(20)
Out[34]:
Open High Low Close Adj Close Volume O-C H-L y Signal Returns Strategy
Date
2021-10-11 437.160004 440.260010 434.619995 434.690002 434.690002 65233300 2.470001 5.640015 1 -1.0 -0.007266 -0.007266
2021-10-12 435.670013 436.100006 432.779999 433.619995 433.619995 71181200 2.050018 3.320007 1 -1.0 -0.002465 0.002465
2021-10-13 434.709991 436.049988 431.540009 435.179993 435.179993 72974000 -0.470001 4.509979 1 1.0 0.003591 -0.003591
2021-10-14 439.079987 442.660004 438.579987 442.500000 442.500000 70236800 -3.420013 4.080017 1 1.0 0.016681 0.016681
2021-10-15 444.750000 446.260010 444.089996 445.869995 445.869995 66226800 -1.119995 2.170013 1 1.0 0.007587 0.007587
2021-10-18 443.970001 447.549988 443.269989 447.190002 447.190002 62213200 -3.220001 4.279999 1 1.0 0.002956 0.002956
2021-10-19 448.920013 450.709991 448.269989 450.640015 450.640015 46996800 -1.720001 2.440002 1 1.0 0.007685 0.007685
2021-10-20 451.130005 452.730011 451.010010 452.410004 452.410004 49571600 -1.279999 1.720001 1 1.0 0.003920 0.003920
2021-10-21 451.769989 453.829987 451.309998 453.589996 453.589996 41305400 -1.820007 2.519989 1 1.0 0.002605 0.002605
2021-10-22 453.130005 454.670013 451.049988 453.119995 453.119995 58845100 0.010010 3.620026 1 1.0 -0.001037 -0.001037
2021-10-25 454.279999 455.899994 452.390015 455.549988 455.549988 45214500 -1.269989 3.509979 1 1.0 0.005348 0.005348
2021-10-26 457.200012 458.489990 455.559998 455.959991 455.959991 56075100 1.240021 2.929993 1 1.0 0.000900 0.000900
2021-10-27 456.450012 457.160004 453.859985 453.940002 453.940002 72438000 2.510010 3.300018 1 1.0 -0.004440 -0.004440
2021-10-28 455.459991 458.399994 455.450012 458.320007 458.320007 51437900 -2.860016 2.949982 1 1.0 0.009603 0.009603
2021-10-29 455.869995 459.559998 455.559998 459.250000 459.250000 70108200 -3.380005 4.000000 1 1.0 0.002027 0.002027
2021-11-01 460.299988 460.700012 458.200012 460.040009 460.040009 48433600 0.259979 2.500000 1 1.0 0.001719 0.001719
2021-11-02 460.220001 462.230011 460.079987 461.899994 461.899994 48908400 -1.679993 2.150024 1 1.0 0.004035 0.004035
2021-11-03 461.299988 465.149994 460.829987 464.720001 464.720001 52509800 -3.420013 4.320007 1 1.0 0.006087 0.006087
2021-11-04 465.359985 467.000000 464.989990 466.910004 466.910004 52847100 -1.550018 2.010010 1 1.0 0.004701 0.004701
2021-11-05 469.279999 470.649994 466.920013 468.529999 468.529999 66332200 0.750000 3.729980 -1 -1.0 0.003464 0.003464
In [35]:
# Cumulative Returns 
# = exp(ln(today's close) - ln(yesterday's close) + ln(yesterday's close) - ..... - ln(Day 1's close))
# = exp(ln(today's close/close of Day 1)) 
# = today's close/close of Day 1

btdf['cumretspy'] = btdf['Returns'].cumsum().apply(np.exp)
In [36]:
btdf['cumretknn'] = btdf['Strategy'].cumsum().apply(np.exp)
In [37]:
btdf.head()
Out[37]:
Open High Low Close Adj Close Volume O-C H-L y Signal Returns Strategy cumretspy cumretknn
Date
2017-06-27 243.039993 243.380005 241.309998 241.330002 223.699539 82247700 1.709991 2.070007 1 1.0 0.000000 0.000000 1.000000 1.000000
2017-06-28 242.500000 243.720001 242.229996 243.490005 225.701736 70042600 -0.990005 1.490005 -1 1.0 0.008911 0.008911 1.008950 1.008950
2017-06-29 243.660004 243.720001 239.960007 241.350006 223.718109 106949700 2.309998 3.759995 1 1.0 -0.008828 -0.008828 1.000083 1.000083
2017-06-30 242.279999 242.710007 241.580002 241.800003 224.135193 86820700 0.479996 1.130005 1 1.0 0.001863 0.001863 1.001947 1.001947
2017-07-03 242.880005 243.380005 242.210007 242.210007 224.515259 39153800 0.669998 1.169998 1 1.0 0.001694 0.001694 1.003646 1.003646

Plot cumulative returns graphs comparing SPY and KNN strategy

In [38]:
# Plot graph iteratively
fig, ax = plt.subplots(3, 2, figsize=(20,20))
# fig, ax = plt.subplots(2, 2, figsize=(20,10))


# 2017
ax[0,0].plot(btdf['cumretspy']['2017'], label ='SPY', color ='cornflowerblue')
ax[0,0].plot(btdf['cumretknn']['2017'], label ='Strategy', color ='crimson')
# 2018
ax[0,1].plot(btdf['cumretspy']['2018'], label ='SPY', color ='cornflowerblue')
ax[0,1].plot(btdf['cumretknn']['2018'], label ='Strategy', color ='crimson')
# 2019
ax[1,0].plot(btdf['cumretspy']['2019'], label ='SPY', color ='cornflowerblue')
ax[1,0].plot(btdf['cumretknn']['2019'], label ='Strategy', color ='crimson')
# 2020
ax[1,1].plot(btdf['cumretspy']['2020'], label ='SPY', color ='cornflowerblue')
ax[1,1].plot(btdf['cumretknn']['2020'], label ='Strategy', color ='crimson')


# 2021
ax[2,0].plot(btdf['cumretspy']['2021'], label ='SPY', color ='cornflowerblue')
ax[2,0].plot(btdf['cumretknn']['2021'], label ='Strategy', color ='crimson')



# Set axis title
ax[0,0].set_title('2017'), ax[0,1].set_title('2018'), ax[1,0].set_title('2019'), ax[1,1].set_title('2020')
ax[2,0].set_title('2021')

# Define legend
ax[0,0].legend(), ax[0,1].legend(), ax[1,0].legend(), ax[1,1].legend()
ax[2,0].legend()

fig.suptitle('Trading Strategy Performance', fontsize=14);

Plot SPY vs KNN Portfolio returns

KNN

In [39]:
# Import pyfolio
import pyfolio as pf

pf.create_simple_tear_sheet(btdf['Strategy'])
Start date2017-06-27
End date2021-11-05
Total months52
Backtest
Annual return 23.133%
Cumulative returns 148.024%
Annual volatility 19.914%
Sharpe ratio 1.14
Calmar ratio 1.58
Stability 0.96
Max drawdown -14.658%
Omega ratio 1.27
Sortino ratio 1.66
Skew 0.11
Kurtosis 17.76
Tail ratio 0.89
Daily value at risk -2.418%

SPY

In [40]:
pf.create_simple_tear_sheet(btdf['Returns'])
Start date2017-06-27
End date2021-11-05
Total months52
Backtest
Annual return 16.097%
Cumulative returns 91.847%
Annual volatility 19.937%
Sharpe ratio 0.85
Calmar ratio 0.45
Stability 0.80
Max drawdown -35.746%
Omega ratio 1.20
Sortino ratio 1.15
Skew -1.12
Kurtosis 17.95
Tail ratio 0.81
Daily value at risk -2.445%