Import Libraries

# Base Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import yfinance as yf

# Preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit, cross_val_score

# Classifier
from sklearn.neighbors import KNeighborsClassifier

# Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import plot_confusion_matrix, auc, roc_curve

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

Load Data and Create New Features

# df = yf.download('SPY', start='2000-01-01', end = '2021-12-31', progress=False)
df = pd.read_csv('spy.csv', index_col=0, parse_dates=True)

df.head()

df.describe()

# Predictors
df['O-C'] = df['Open'] - df['Close']
df['H-L'] = df['High'] - df['Low']

# Target

'''
y = 1 if tomorrow's close is greater than 99.5% of today's close
y = -1 otherwise

'''

df['y'] = np.where(df['Adj Close'].shift(-1)>0.995*df['Adj Close'],1,-1)

df.head(10)

# Convert to numpy

data = df[['O-C', 'H-L', 'y']].to_numpy()
data[:5]

array([[-0.03125 ,  1.25    , -1.      ],
       [ 2.8125  ,  4.375   , -1.      ],
       [ 3.78125 ,  4.421875,  1.      ],
       [-0.0625  ,  4.28125 , -1.      ],
       [ 1.875   ,  3.75    ,  1.      ]])

Train Test Split (with no shuffle)

# Splitting the datasets into training and testing data.
# Always keep shuffle = False for financial time series
X_train, X_test, y_train, y_test = train_test_split(data[:, :-1], data[:, -1], test_size=0.2, random_state=0, shuffle=False)

# Output the train and test data size
print(f"Train and Test Size {len(X_train)}, {len(X_test)}")

Train and Test Size 4399, 1100

Train KNN Model

# Scale and fit the model
pipe = Pipeline([
    ("scaler", MinMaxScaler()), 
    ("classifier", KNeighborsClassifier())
]) 
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('classifier',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=2,
                                      weights='uniform'))],
         verbose=False)

Accuracy and Confusion Matrix

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

acc_train = accuracy_score(y_train, y_train_pred)
acc_test = accuracy_score(y_test, y_test_pred)
print(f'Train Accuracy: {acc_train:0.4}, Test Accuracy: {acc_test:0.4}')

Train Accuracy: 0.7854, Test Accuracy: 0.7018

# Demonstration of Series.ravel()

mySeries = pd.Series([11, 32, 53, 14])
a, b, c, d = mySeries.ravel()
print(a, b, c, d)
print()
print(mySeries)

11 32 53 14

0    11
1    32
2    53
3    14
dtype: int64

tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
print(tn, fp, fn, tp)

# Plot confusion matrix
plot_confusion_matrix(pipe, X_test, y_test, values_format = 'd', cmap='Blues')
plt.title('Confusion Matrix')
plt.grid(False)

51 165 163 721

Classification Report

# Classification Report
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

        -1.0       0.24      0.24      0.24       216
         1.0       0.81      0.82      0.81       884

    accuracy                           0.70      1100
   macro avg       0.53      0.53      0.53      1100
weighted avg       0.70      0.70      0.70      1100

Calculations¶

ROC curve

Understanding the predict_proba() function (Default threshold = 0.5)¶

# Predict Probabilities
probs = pipe.predict_proba(X_test)
preds1 = probs[:, 0]
preds2 = probs[:, 1]

# display first 10 rows for probs
probs[:10]

array([[0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.4, 0.6],
       [0. , 1. ],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.6, 0.4],
       [0.4, 0.6],
       [0.6, 0.4]])

y_pred = pipe.predict(X_test)
# display first 10 rows for y_pred
y_pred[:10]

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1., -1.,  1., -1.])

Note that the 1s correspond to instances where the second column in probs >= 0.5

Changing the threshold¶

p = 0.2
probs = pipe.predict_proba(X_test)

# y_pred_1 = True if second column in pipe.predict_proba(X_test) is greater than or equal to p
# y_pred_1 = False otherwise
y_pred_1 = (probs[:,1] >= p)

# Display first 10 rows in y_pred_1
y_pred_1[:10]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

# Replace True with 1 and False with -1

y_pred_1 = np.where(y_pred_1 == True, 1, y_pred_1) 
y_pred_1 = np.where(y_pred_1 == False, -1, y_pred_1)

# Getting the TP, TN, FP, FN values when threshold = p

tn, fp, fn, tp = confusion_matrix(y_test, y_pred_1).ravel()
print(tn, fp, fn, tp)

print(f'FPR = {(fp/(tn+fp)).round(6)}, TPR = {(tp/(tp+fn)).round(6)}')

2 214 12 872
FPR = 0.990741, TPR = 0.986425

# probs[:, 1] gives the probability of y = 1

fpr, tpr, threshold = roc_curve(y_test, probs[:, 1], pos_label=1)

fpr, tpr, threshold

(array([0.        , 0.14814815, 0.45833333, 0.76388889, 0.90740741,
        0.99074074, 1.        ]),
 array([0.        , 0.16742081, 0.50904977, 0.81561086, 0.95135747,
        0.98642534, 1.        ]),
 array([2. , 1. , 0.8, 0.6, 0.4, 0.2, 0. ]))

FPR (0.990741) and TPR (0.986425) calculated using threshold = 0.2 match the values given in the arrays above (2nd last value in each array).

Plotting the ROC curve¶

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20,10))

fpr1, tpr1, threshold1 = roc_curve(y_test, preds1, pos_label=-1)
roc_auc1 = auc(fpr1, tpr1)

fpr2, tpr2, threshold2 = roc_curve(y_test, preds2, pos_label=1)
roc_auc2 = auc(fpr2, tpr2)

ax[0].plot([0, 1], [0, 1], 'r--')
ax[0].plot(fpr1, tpr1, 'cornflowerblue', label=f'AUC = {roc_auc1:0.2}', marker='o', ms = 15)
ax[0].set_title("Receiver Operating Characteristic for Down Moves")
ax[0].set_xlabel('False Positive Rate')
ax[0].set_ylabel('True Positive Rate')

ax[1].plot([0, 1], [0, 1], 'r--')
ax[1].plot(fpr2, tpr2, 'cornflowerblue', label=f'AUC = {roc_auc2:0.2}', marker='o', ms = 15)
ax[1].set_title("Receiver Operating Characteristic for Up Moves")
ax[1].set_xlabel('False Positive Rate')
ax[1].set_ylabel('True Positive Rate')

# Define legend
ax[0].legend(), ax[1].legend();

TimeSeriesSplit

Demonstrating how TimeSeriesSplit works¶

tscv = TimeSeriesSplit(n_splits=3)

for train, test in tscv.split(data[:, :-1]):
    print(train, test)

[   0    1    2 ... 1374 1375 1376] [1377 1378 1379 ... 2748 2749 2750]
[   0    1    2 ... 2748 2749 2750] [2751 2752 2753 ... 4122 4123 4124]
[   0    1    2 ... 4122 4123 4124] [4125 4126 4127 ... 5496 5497 5498]

Performing a Grid Search using TimeSeriesSplit¶

# Perform Gridsearch and fit
param_grid = {"classifier__n_neighbors": np.arange(1,51,1)}

grid_search = GridSearchCV(pipe, param_grid, scoring='roc_auc', n_jobs=-1, cv=TimeSeriesSplit(n_splits=5), verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    1.8s finished

GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=5),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        MinMaxScaler(copy=True,
                                                     feature_range=(0, 1))),
                                       ('classifier',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             metric='minkowski',
                                                             metric_params=None,
                                                             n_jobs=None,
                                                             n_neighbors=5, p=2,
                                                             weights='uniform'))],
                                verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'classifier__n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=1)

# Getting the best parameters and best score

# Best Params
print(f'Best Parameters = {grid_search.best_params_}')

# Best Score
print(f'Best AUC Score = {grid_search.best_score_}')

Best Parameters = {'classifier__n_neighbors': 21}
Best AUC Score = 0.541663998793043

Retrain Model with Best Parameters¶

clf = KNeighborsClassifier(n_neighbors = grid_search.best_params_['classifier__n_neighbors'])

# Fit the model
clf.fit(X_train, y_train)

# Predicting the test dataset
y_pred = clf.predict(X_test)

# Measure Accuracy
acc_train = accuracy_score(y_train, clf.predict(X_train))
acc_test = accuracy_score(y_test, y_pred)

# Print Accuracy
print(f'\n Training Accuracy \t: {acc_train :0.4} \n Test Accuracy \t\t: {acc_test :0.4}')

 Training Accuracy 	: 0.7531 
 Test Accuracy 		: 0.7818

Backtest The Trading Strategy (on X_test, y_test)

df.head()

# Create a new Dataframe using the X_test instances

btdf = df[-len(X_test):]
btdf.head()

# Add a Signal Column
# Signal = predicted y values

btdf['Signal'] = clf.predict(X_test)

# Compute SPY's log returns
# log returns today = log of adj close today - log of adj close yesterday

btdf['Returns'] = np.log(btdf['Adj Close']).diff().fillna(0)
btdf.head()

# Compute log returns of KNN strategy

# today's knn returns = yesterday's signal * today's spy returns 
btdf['Strategy'] = btdf['Returns'] * (btdf['Signal'].shift(1)).fillna(0)
btdf.tail(20)

# Cumulative Returns 
# = exp(ln(today's close) - ln(yesterday's close) + ln(yesterday's close) - ..... - ln(Day 1's close))
# = exp(ln(today's close/close of Day 1)) 
# = today's close/close of Day 1

btdf['cumretspy'] = btdf['Returns'].cumsum().apply(np.exp)

btdf['cumretknn'] = btdf['Strategy'].cumsum().apply(np.exp)

btdf.head()

Plot cumulative returns graphs comparing SPY and KNN strategy¶

# Plot graph iteratively
fig, ax = plt.subplots(3, 2, figsize=(20,20))
# fig, ax = plt.subplots(2, 2, figsize=(20,10))


# 2017
ax[0,0].plot(btdf['cumretspy']['2017'], label ='SPY', color ='cornflowerblue')
ax[0,0].plot(btdf['cumretknn']['2017'], label ='Strategy', color ='crimson')
# 2018
ax[0,1].plot(btdf['cumretspy']['2018'], label ='SPY', color ='cornflowerblue')
ax[0,1].plot(btdf['cumretknn']['2018'], label ='Strategy', color ='crimson')
# 2019
ax[1,0].plot(btdf['cumretspy']['2019'], label ='SPY', color ='cornflowerblue')
ax[1,0].plot(btdf['cumretknn']['2019'], label ='Strategy', color ='crimson')
# 2020
ax[1,1].plot(btdf['cumretspy']['2020'], label ='SPY', color ='cornflowerblue')
ax[1,1].plot(btdf['cumretknn']['2020'], label ='Strategy', color ='crimson')


# 2021
ax[2,0].plot(btdf['cumretspy']['2021'], label ='SPY', color ='cornflowerblue')
ax[2,0].plot(btdf['cumretknn']['2021'], label ='Strategy', color ='crimson')



# Set axis title
ax[0,0].set_title('2017'), ax[0,1].set_title('2018'), ax[1,0].set_title('2019'), ax[1,1].set_title('2020')
ax[2,0].set_title('2021')

# Define legend
ax[0,0].legend(), ax[0,1].legend(), ax[1,0].legend(), ax[1,1].legend()
ax[2,0].legend()

fig.suptitle('Trading Strategy Performance', fontsize=14);

Plot SPY vs KNN Portfolio returns¶

KNN¶

# Import pyfolio
import pyfolio as pf

pf.create_simple_tear_sheet(btdf['Strategy'])

SPY¶

pf.create_simple_tear_sheet(btdf['Returns'])

	Open	High	Low	Close	Adj Close	Volume
Date
1999-12-31	146.84375	147.50000	146.250000	146.8750	98.155975	3172700
2000-01-03	148.25000	148.25000	143.875000	145.4375	97.195312	8164300
2000-01-04	143.53125	144.06250	139.640625	139.7500	93.394363	8089800
2000-01-05	139.93750	141.53125	137.250000	140.0000	93.561455	12177900
2000-01-06	139.62500	141.50000	137.750000	137.7500	92.057747	6227200

	Open	High	Low	Close	Adj Close	Volume
count	5499.000000	5499.000000	5499.000000	5499.000000	5499.000000	5.499000e+03
mean	175.315666	176.330719	174.201723	175.323582	150.385093	1.101146e+08
std	82.277623	82.557352	81.969557	82.306747	90.324600	9.597972e+07
min	67.949997	70.000000	67.099998	68.110001	52.985550	1.436600e+06
25%	117.459999	118.239998	116.720001	117.430000	86.700020	4.897305e+07
50%	140.687500	141.570007	139.940002	140.809998	107.377617	8.085580e+07
75%	211.235001	211.955002	210.474998	211.185005	188.797386	1.458263e+08
max	469.279999	470.649994	466.920013	468.529999	468.529999	8.710263e+08

	Open	High	Low	Close	Adj Close	Volume	O-C	H-L	y
Date
1999-12-31	146.84375	147.50000	146.250000	146.8750	98.155975	3172700	-0.03125	1.250000	-1
2000-01-03	148.25000	148.25000	143.875000	145.4375	97.195312	8164300	2.81250	4.375000	-1
2000-01-04	143.53125	144.06250	139.640625	139.7500	93.394363	8089800	3.78125	4.421875	1
2000-01-05	139.93750	141.53125	137.250000	140.0000	93.561455	12177900	-0.06250	4.281250	-1
2000-01-06	139.62500	141.50000	137.750000	137.7500	92.057747	6227200	1.87500	3.750000	1
2000-01-07	140.31250	145.75000	140.062500	145.7500	97.404129	8066500	-5.43750	5.687500	1
2000-01-10	146.25000	146.90625	145.031250	146.2500	97.738312	5741700	0.00000	1.875000	-1
2000-01-11	145.81250	146.09375	143.500000	144.5000	96.568756	7503700	1.31250	2.593750	-1
2000-01-12	144.59375	144.59375	142.875000	143.0625	95.608109	6907700	1.53125	1.718750	1
2000-01-13	144.46875	145.75000	143.281250	145.0000	96.902931	5158300	-0.53125	2.468750	1

	Open	High	Low	Close	Adj Close	Volume	O-C	H-L	y
Date
1999-12-31	146.84375	147.50000	146.250000	146.8750	98.155975	3172700	-0.03125	1.250000	-1
2000-01-03	148.25000	148.25000	143.875000	145.4375	97.195312	8164300	2.81250	4.375000	-1
2000-01-04	143.53125	144.06250	139.640625	139.7500	93.394363	8089800	3.78125	4.421875	1
2000-01-05	139.93750	141.53125	137.250000	140.0000	93.561455	12177900	-0.06250	4.281250	-1
2000-01-06	139.62500	141.50000	137.750000	137.7500	92.057747	6227200	1.87500	3.750000	1

	Open	High	Low	Close	Adj Close	Volume	O-C	H-L	y
Date
2017-06-27	243.039993	243.380005	241.309998	241.330002	223.699539	82247700	1.709991	2.070007	1
2017-06-28	242.500000	243.720001	242.229996	243.490005	225.701736	70042600	-0.990005	1.490005	-1
2017-06-29	243.660004	243.720001	239.960007	241.350006	223.718109	106949700	2.309998	3.759995	1
2017-06-30	242.279999	242.710007	241.580002	241.800003	224.135193	86820700	0.479996	1.130005	1
2017-07-03	242.880005	243.380005	242.210007	242.210007	224.515259	39153800	0.669998	1.169998	1

	Open	High	Low	Close	Adj Close	Volume	O-C	H-L	y	Signal	Returns	Strategy
Date
2021-10-11	437.160004	440.260010	434.619995	434.690002	434.690002	65233300	2.470001	5.640015	1	-1.0	-0.007266	-0.007266
2021-10-12	435.670013	436.100006	432.779999	433.619995	433.619995	71181200	2.050018	3.320007	1	-1.0	-0.002465	0.002465
2021-10-13	434.709991	436.049988	431.540009	435.179993	435.179993	72974000	-0.470001	4.509979	1	1.0	0.003591	-0.003591
2021-10-14	439.079987	442.660004	438.579987	442.500000	442.500000	70236800	-3.420013	4.080017	1	1.0	0.016681	0.016681
2021-10-15	444.750000	446.260010	444.089996	445.869995	445.869995	66226800	-1.119995	2.170013	1	1.0	0.007587	0.007587
2021-10-18	443.970001	447.549988	443.269989	447.190002	447.190002	62213200	-3.220001	4.279999	1	1.0	0.002956	0.002956
2021-10-19	448.920013	450.709991	448.269989	450.640015	450.640015	46996800	-1.720001	2.440002	1	1.0	0.007685	0.007685
2021-10-20	451.130005	452.730011	451.010010	452.410004	452.410004	49571600	-1.279999	1.720001	1	1.0	0.003920	0.003920
2021-10-21	451.769989	453.829987	451.309998	453.589996	453.589996	41305400	-1.820007	2.519989	1	1.0	0.002605	0.002605
2021-10-22	453.130005	454.670013	451.049988	453.119995	453.119995	58845100	0.010010	3.620026	1	1.0	-0.001037	-0.001037
2021-10-25	454.279999	455.899994	452.390015	455.549988	455.549988	45214500	-1.269989	3.509979	1	1.0	0.005348	0.005348
2021-10-26	457.200012	458.489990	455.559998	455.959991	455.959991	56075100	1.240021	2.929993	1	1.0	0.000900	0.000900
2021-10-27	456.450012	457.160004	453.859985	453.940002	453.940002	72438000	2.510010	3.300018	1	1.0	-0.004440	-0.004440
2021-10-28	455.459991	458.399994	455.450012	458.320007	458.320007	51437900	-2.860016	2.949982	1	1.0	0.009603	0.009603
2021-10-29	455.869995	459.559998	455.559998	459.250000	459.250000	70108200	-3.380005	4.000000	1	1.0	0.002027	0.002027
2021-11-01	460.299988	460.700012	458.200012	460.040009	460.040009	48433600	0.259979	2.500000	1	1.0	0.001719	0.001719
2021-11-02	460.220001	462.230011	460.079987	461.899994	461.899994	48908400	-1.679993	2.150024	1	1.0	0.004035	0.004035
2021-11-03	461.299988	465.149994	460.829987	464.720001	464.720001	52509800	-3.420013	4.320007	1	1.0	0.006087	0.006087
2021-11-04	465.359985	467.000000	464.989990	466.910004	466.910004	52847100	-1.550018	2.010010	1	1.0	0.004701	0.004701
2021-11-05	469.279999	470.649994	466.920013	468.529999	468.529999	66332200	0.750000	3.729980	-1	-1.0	0.003464	0.003464

	Backtest
Start date	2017-06-27
End date	2021-11-05
Total months	52
Annual return	23.133%
Cumulative returns	148.024%
Annual volatility	19.914%
Sharpe ratio	1.14
Calmar ratio	1.58
Stability	0.96
Max drawdown	-14.658%
Omega ratio	1.27
Sortino ratio	1.66
Skew	0.11
Kurtosis	17.76
Tail ratio	0.89
Daily value at risk	-2.418%