Import Libraries

In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Data manipulation 
import pandas as pd
import numpy as np

# Yahoo Finance
import yfinance as yf

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# XGBoost Classifier
from xgboost import XGBClassifier, plot_importance, Booster, plot_tree, to_graphviz

# Preprocessing & Cross validation
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit, cross_val_score, GridSearchCV

# Metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

Load and Visualize Data

In [2]:
df = yf.download('SPY', start='2000-01-01', end = '2021-11-10', progress=False)[['Adj Close']]
# df = pd.read_csv('data/spy.csv', index_col=0, parse_dates=True)[['Adj Close']]

# Calculate daily log returns
df['Returns'] = np.log(df['Adj Close']).diff()

# Output first five values
df.head()
Out[2]:
Adj Close Returns
Date
1999-12-31 98.155983 NaN
2000-01-03 97.195335 -0.009835
2000-01-04 93.394363 -0.039892
2000-01-05 93.561440 0.001787
2000-01-06 92.057770 -0.016202
In [3]:
# Visualize raw price series
plt.figure(figsize=(20,10))
plt.title('SPY Price Performance')
plt.plot(df['Adj Close'], color='cornflowerblue');

Create the Features

We'll create a total of 22 features: 11 based on rolling sums of log returns (the 10-day, 15-day, ..., 60-day sums of daily log returns) and 11 based on the rolling standard deviation of the daily log returns over the same windows.
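Because log returns are additive, each rolling sum telescopes into a single multi-day log return, where $P_t$ denotes the adjusted close:

$\text{Ret}_r(t) = \sum_{i=0}^{r-1} \ln\frac{P_{t-i}}{P_{t-i-1}} = \ln\frac{P_t}{P_{t-r}}$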

In [4]:
features_list = []
for r in range(10, 65, 5):
    df['Ret_'+str(r)] = df.Returns.rolling(r).sum()
    df['Std_'+str(r)] = df.Returns.rolling(r).std()
    features_list.append('Ret_'+str(r))
    features_list.append('Std_'+str(r))

# Drop NaN values
df.dropna(inplace=True)

Define Label

The label is defined as 1 if the next day's close is greater than 99.5% of today's close. Otherwise, it is defined as -1.
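In terms of log returns, this sets $\text{Target}_t = 1$ whenever $\ln(P_{t+1}/P_t) > \ln(0.995) \approx -0.005$, i.e. whenever the next day does not fall by more than roughly half a percent.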

In [5]:
df['Target'] = np.where(df['Adj Close'].shift(-1)>.995*df['Adj Close'],1,-1) 

# Check output
df
Out[5]:
Adj Close Returns Ret_10 Std_10 Ret_15 Std_15 Ret_20 Std_20 Ret_25 Std_25 ... Std_40 Ret_45 Std_45 Ret_50 Std_50 Ret_55 Std_55 Ret_60 Std_60 Target
Date
2000-03-28 101.211014 -0.005776 0.102991 0.016808 0.099908 0.017161 0.097062 0.016531 0.115188 0.016099 ... 0.015079 0.076137 0.015692 0.030012 0.015742 0.038339 0.015368 0.030650 0.017420 1
2000-03-29 101.315758 0.001034 0.080964 0.016388 0.102198 0.017096 0.090847 0.016542 0.104483 0.016044 ... 0.015027 0.065879 0.015623 0.038944 0.015694 0.035948 0.015363 0.041519 0.017367 -1
2000-03-30 99.619827 -0.016881 0.018427 0.011734 0.056512 0.016967 0.073290 0.017210 0.107945 0.015837 ... 0.015315 0.056956 0.015801 0.013952 0.015853 0.031106 0.015451 0.064530 0.016697 1
2000-03-31 100.750481 0.011286 0.023125 0.012035 0.073136 0.016873 0.066025 0.016954 0.122858 0.015806 ... 0.015243 0.072245 0.015850 0.040663 0.015763 0.052389 0.015448 0.074029 0.016749 1
2000-04-03 101.336700 0.005802 0.034044 0.011779 0.089926 0.016292 0.081618 0.016677 0.107899 0.015460 ... 0.015233 0.109737 0.015023 0.048626 0.015772 0.044739 0.015368 0.096032 0.016601 -1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2021-11-03 464.720001 0.006087 0.026846 0.003909 0.065675 0.004935 0.066319 0.005419 0.067354 0.007012 ... 0.008118 0.031925 0.007686 0.039907 0.007464 0.048719 0.007434 0.051786 0.007190 1
2021-11-04 466.910004 0.004701 0.028943 0.003960 0.053696 0.003588 0.062412 0.005287 0.084353 0.006284 ... 0.008096 0.036096 0.007709 0.042513 0.007483 0.064426 0.007269 0.054006 0.007204 1
2021-11-05 468.529999 0.003464 0.033443 0.003712 0.049573 0.003412 0.067701 0.005157 0.076003 0.006033 ... 0.007972 0.036487 0.007712 0.051897 0.007427 0.066342 0.007275 0.054477 0.007207 1
2021-11-08 468.929993 0.000853 0.028948 0.003714 0.047470 0.003471 0.075820 0.004559 0.089836 0.005059 ... 0.007970 0.037583 0.007710 0.043849 0.007340 0.059270 0.007216 0.053512 0.007206 1
2021-11-09 467.380005 -0.003311 0.024738 0.004175 0.036474 0.003606 0.074974 0.004624 0.076174 0.005036 ... 0.007931 0.037854 0.007706 0.036150 0.007345 0.047201 0.007162 0.047850 0.007223 -1

5441 rows × 25 columns

Check Correlation

In [6]:
corrmat = df.drop(['Adj Close', 'Returns', 'Target'],axis=1).corr()

# Visualize feature correlation
fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(corrmat, annot=True, annot_kws={"size": 10}, fmt="0.2f", linewidths=0.5, square=False, cbar=True, cmap='RdYlBu')
ax.set_title('Feature Correlation', fontsize=14, color='black')
Out[6]:
Text(0.5, 1.0, 'Feature Correlation')

Split into Training and Test Subsets

In [7]:
# Split the data into training and test sets (shuffle=False preserves the time order)
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 2:-1], df.iloc[:, -1], test_size=0.2, random_state=0, shuffle=False)

# Output the train and test data size
print(f"Train and Test Size {len(X_train)}, {len(X_test)}")
Train and Test Size 4352, 1089

Fit Model

In [8]:
# Fit the classifier model
# verbosity=0 suppresses warning messages, which can clutter the output

xgb =  XGBClassifier(verbosity = 0, random_state = 0)
xgb.fit(X_train, y_train)
Out[8]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='binary:logistic',
              predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=True, validate_parameters=1, verbosity=0)
In [9]:
# Predicting the test dataset
y_pred = xgb.predict(X_test)

# Output prediction scores
print(f'Train Accuracy: {xgb.score(X_train,y_train):0.4}')
print(f'Test Accuracy: {xgb.score(X_test,y_test):0.4}')
Train Accuracy: 0.9841
Test Accuracy: 0.7713

The large gap between train and test accuracy above points to serious overfitting.

In [10]:
# Cross-validation
tscv = TimeSeriesSplit(n_splits=5)
In [11]:
# Cross validation score
score=cross_val_score(xgb,X_train,y_train,cv=tscv)
print(score)
print(f'Mean CV Score : {score.mean():0.4}')
[0.41931034 0.61655172 0.63586207 0.80689655 0.76      ]
Mean CV Score : 0.6477

Hyper-Parameter Tuning

We'll tune the following hyperparameters using a randomized search:

  • learning_rate: step size shrinkage used in each update to prevent overfitting. Range is 0 to 1.
  • max_depth: maximum depth of a tree.
  • colsample_bytree: fraction of features sampled per tree. A high value can lead to overfitting.
  • min_child_weight: minimum sum of instance weight (hessian) needed in a child.
  • gamma: minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.

Explanation of the min_child_weight hyperparameter

Recall: The Hessian matrix is a matrix of second order partial derivatives.

Consider the case for a regression. The loss of each point in a node is

$\frac{1}{2}(y_i - \hat{y}_i)^2$

The second derivative of this expression with respect to $\hat{y}_i$ is 1. So when you sum the second derivative over all points in the node, you get the number of points in the node. Here, min_child_weight means something like "stop trying to split once your sample size in a node goes below a given threshold".

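A quick symbolic check of this claim (a minimal sketch; it uses sympy, which is not imported elsewhere in this notebook):

import sympy as sp

y, y_hat = sp.symbols('y y_hat')
loss = sp.Rational(1, 2) * (y - y_hat)**2

# Second derivative (hessian contribution) of the squared-error loss w.r.t. the prediction
print(sp.diff(loss, y_hat, 2))  # prints 1, so a node's hessian sum equals its sample count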

In [12]:
# Cross-validation
tscv = TimeSeriesSplit(n_splits=5)
In [13]:
# Hyper parameter optimization
param_grid = {'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
              'max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
              'min_child_weight': [1, 3, 5, 7],
              'gamma': [0.0, 0.1, 0.2 , 0.3, 0.4],
              'colsample_bytree': [0.3, 0.4, 0.5 , 0.7]}
In [14]:
random_search = RandomizedSearchCV(xgb, param_grid, n_iter=100, scoring='f1', n_jobs=-1, cv=tscv, random_state=0, verbose=1)
random_search.fit(X_train, y_train)
Fitting 5 folds for each of 100 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.6min finished
Out[14]:
RandomizedSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=5),
                   error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           enable_categorical=False, gamma=0,
                                           gpu_id=-1, importance_type=None,
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_delta_step=0, max_depth=6,
                                           min_...
                                           validate_parameters=1, verbosity=0),
                   iid='deprecated', n_iter=100, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.3, 0.4, 0.5,
                                                             0.7],
                                        'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
                                        'learning_rate': [0.05, 0.1, 0.15, 0.2,
                                                          0.25, 0.3],
                                        'max_depth': [3, 4, 5, 6, 8, 10, 12,
                                                      15],
                                        'min_child_weight': [1, 3, 5, 7]},
                   pre_dispatch='2*n_jobs', random_state=0, refit=True,
                   return_train_score=False, scoring='f1', verbose=1)
In [15]:
random_search.best_params_
Out[15]:
{'min_child_weight': 5,
 'max_depth': 3,
 'learning_rate': 0.05,
 'gamma': 0.3,
 'colsample_bytree': 0.7}
In [16]:
random_search.best_score_
Out[16]:
0.8288861038757614

Retrain Model with the Best Parameters

In [17]:
# Instantiate an XGB model with best_params_
# ** unpacks random_search.best_params_ into keyword arguments

cls = XGBClassifier(verbosity = 0, **random_search.best_params_, random_state = 0) 
cls.fit(X_train, y_train)
Out[17]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, gamma=0.3, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=3,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              objective='binary:logistic', predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=0)
In [18]:
# Output prediction scores
print(f'Train Accuracy: {cls.score(X_train,y_train):0.4}')
print(f'Test Accuracy: {cls.score(X_test,y_test):0.4}')
Train Accuracy: 0.7585
Test Accuracy: 0.8017
In [19]:
# Cross validation score
score=cross_val_score(cls,X_train,y_train,cv=tscv)
print(score)
print(f'Mean CV Score : {score.mean():0.4}')
[0.55586207 0.73793103 0.69931034 0.82206897 0.81517241]
Mean CV Score : 0.7261

Plot Feature Importances

Using the feature_importances_ attribute

In [20]:
fig, ax = plt.subplots(figsize=(10,8))
feature_imp = pd.DataFrame({'Importance Score': cls.feature_importances_,'Features': df.iloc[:, 2:-1].columns}).sort_values(by='Importance Score', ascending=False)

sns.barplot(x=feature_imp['Importance Score'], y=feature_imp['Features'])
ax.set_title('Feature Importance');

Using the plot_importance() method

In [21]:
plot_importance(cls, importance_type='gain', show_values = False, height = 0.7)
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d5f375a5e0>

Importance type can be one of the following (a short example of retrieving each follows the list):

  • weight: the number of times a feature is used to split the data across all trees.
  • gain: the average gain across all splits the feature is used in.
  • cover: the average coverage across all splits the feature is used in.
  • total_gain: the total gain across all splits the feature is used in.
  • total_cover: the total coverage across all splits the feature is used in.
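The same numbers are available programmatically from the underlying booster. A minimal sketch using Booster.get_score (part of the xgboost API); it simply reports the top-ranked feature under each importance type:

booster = cls.get_booster()
for imp_type in ['weight', 'gain', 'cover', 'total_gain', 'total_cover']:
    scores = booster.get_score(importance_type=imp_type)      # dict: feature -> score
    top = max(scores, key=scores.get)
    print(f'{imp_type:>12}: most important feature is {top}')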

Tree Visualization

In [22]:
# You can add Graphviz to your PATH or use the code below (for Windows)
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/'

# change tree number to see the corresponding plot
to_graphviz(cls, num_trees=1, rankdir='UT') 
Out[22]:
[Graphviz rendering of tree 1: the root splits on Std_10 < 0.009, with further splits on Std_40, Ret_25, Std_15, Ret_10 and Std_55, ending in leaf raw scores at depth 3.]

For a classification tree with two classes, the value of each leaf node represents a raw score. Summing the leaf scores over all boosted trees and passing the result through the logistic (sigmoid) function gives the predicted probability of the positive class.
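A minimal sketch of that conversion (output_margin=True returns the untransformed sum of leaf scores in xgboost's scikit-learn API):

# Raw margin (aggregate leaf scores) for the first few test samples
raw_margin = cls.predict(X_test.iloc[:3], output_margin=True)

# Sigmoid maps the raw margin to P(class = 1)
prob = 1 / (1 + np.exp(-raw_margin))
print(prob)
print(cls.predict_proba(X_test.iloc[:3])[:, 1])  # should agree up to floating-point precision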

Creating a Strategy

In [23]:
# Create a new dataframe holding the out-of-sample (test) data
btdf = df[-len(y_test):]

# Predict the signal and store it in the Signal column
btdf['Signal'] = cls.predict(X_test)

# Calculate the strategy returns (yesterday's signal applied to today's return)
btdf['Strategy'] = btdf['Returns'] * btdf['Signal'].shift(1).fillna(0)

# Buy & hold cumulative growth factor
cumret = btdf['Returns'].cumsum().apply(np.exp)

# Strategy (tree algorithm) cumulative growth factor
cumstg = btdf['Strategy'].cumsum().apply(np.exp)

print(f'SPY Returns \t {cumret[-1]:0.4}')
print(f'Strategy Returns \t {cumstg[-1]:0.4}')
SPY Returns 	 2.053
Strategy Returns 	 2.313
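Since the Returns column holds log returns, exponentiating their cumulative sum recovers a growth factor, where $P_0$ is the adjusted close at the start of the test window, $P_T$ the last close, and $r_t$ the daily log returns:

$\exp\left(\sum_{t=1}^{T} r_t\right) = \exp\left(\ln\frac{P_T}{P_0}\right) = \frac{P_T}{P_0}$

So the values printed above are growth factors: over the test window SPY grew by a factor of about 2.05, and applying the same identity to the signed strategy returns gives a factor of about 2.31.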
In [24]:
# Plot graph iteratively
fig, ax = plt.subplots(3,2, figsize=(20,20))

# 2017
ax[0,0].plot(cumret['2017'], label ='SPY', color ='cornflowerblue')
ax[0,0].plot(cumstg['2017'], label ='Strategy', color ='crimson')
# 2018
ax[0,1].plot(cumret['2018'], label ='SPY', color ='cornflowerblue')
ax[0,1].plot(cumstg['2018'], label ='Strategy', color ='crimson')
# 2019
ax[1,0].plot(cumret['2019'], label ='SPY', color ='cornflowerblue')
ax[1,0].plot(cumstg['2019'], label ='Strategy', color ='crimson')
# 2020
ax[1,1].plot(cumret['2020'], label ='SPY', color ='cornflowerblue')
ax[1,1].plot(cumstg['2020'], label ='Strategy', color ='crimson')
# 2021
ax[2,0].plot(cumret['2021'], label ='SPY', color ='cornflowerblue')
ax[2,0].plot(cumstg['2021'], label ='Strategy', color ='crimson')

# Set axis title
ax[0,0].set_title('2017'), ax[0,1].set_title('2018'), ax[1,0].set_title('2019'), ax[1,1].set_title('2020'), ax[2,0].set_title('2021')

# Define legend
ax[0,0].legend(), ax[0,1].legend(), ax[1,0].legend(), ax[1,1].legend(), ax[2,0].legend()

fig.suptitle('Trading Strategy Performance', fontsize=14);
In [25]:
# Performance measures
import pyfolio as pf
# Create tear sheet
pf.create_simple_tear_sheet(btdf['Strategy'])
Start date 2017-07-17
End date 2021-11-09
Total months 51
Backtest
Annual return 19.005%
Cumulative returns 112.102%
Annual volatility 20.008%
Sharpe ratio 0.97
Calmar ratio 0.96
Stability 0.89
Max drawdown -19.819%
Omega ratio 1.23
Sortino ratio 1.40
Skew 0.41
Kurtosis 17.52
Tail ratio 0.81
Daily value at risk -2.444%
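As a rough cross-check of the tear sheet, the headline figures can be approximated directly from the daily strategy series (a sketch assuming 252 trading days per year; pyfolio additionally compounds the daily returns for the annual return, so small differences are expected):

daily = btdf['Strategy']

ann_ret = daily.mean() * 252                        # simple annualization of the daily (log) returns
ann_vol = daily.std() * np.sqrt(252)                # annualized volatility
sharpe = daily.mean() / daily.std() * np.sqrt(252)  # annualized Sharpe ratio (zero risk-free rate)

print(f'Annual return ~ {ann_ret:0.2%}')
print(f'Annual volatility ~ {ann_vol:0.2%}')
print(f'Sharpe ratio ~ {sharpe:0.2f}')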