Import Libraries

In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Data manipulation 
import pandas as pd
import numpy as np

# Yahoo Finance
import yfinance as yf

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# XGBoost Classifier
from xgboost import XGBClassifier, plot_importance, Booster, plot_tree, to_graphviz

# Preprocessing & Cross validation
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit, cross_val_score, GridSearchCV

# Metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

Load and Visualize Data

In [2]:
df = yf.download('SPY', start='2000-01-01', end = '2021-11-10', progress=False)[['Adj Close']]
# df = pd.read_csv('data/spy.csv', index_col=0, parse_dates=True)[['Adj Close']]

# Calculate daily log returns
df['Returns'] = np.log(df['Adj Close']).diff()

# Output first five values
df.head()
Out[2]:
Adj Close Returns
Date
1999-12-31 98.155983 NaN
2000-01-03 97.195335 -0.009835
2000-01-04 93.394363 -0.039892
2000-01-05 93.561440 0.001787
2000-01-06 92.057770 -0.016202
In [3]:
# Visualize raw price series
plt.figure(figsize=(20,10))
plt.title('SPY Price Performance')
plt.plot(df['Adj Close'], color='cornflowerblue');

Create the Features

We'll create a total of 22 features: 11 based on rolling sums of log returns (the 10-day, 15-day, ..., 60-day sums of daily log returns) and 11 based on the rolling standard deviation of the daily log returns over the same windows.
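Because log returns are additive, each rolling sum telescopes into a single multi-day log return, where $P_t$ denotes the adjusted close:

$\text{Ret}_r(t) = \sum_{i=0}^{r-1} \ln\frac{P_{t-i}}{P_{t-i-1}} = \ln\frac{P_t}{P_{t-r}}$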

In [4]:
features_list = []
for r in range(10, 65, 5):
    df['Ret_'+str(r)] = df.Returns.rolling(r).sum()
    df['Std_'+str(r)] = df.Returns.rolling(r).std()
    features_list.append('Ret_'+str(r))
    features_list.append('Std_'+str(r))

# Drop NaN values
df.dropna(inplace=True)

Define Label

The label is defined as 1 if the next day's close is greater than 99.5% of today's close. Otherwise, it is defined as -1.
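In terms of log returns, this sets $\text{Target}_t = 1$ whenever $\ln(P_{t+1}/P_t) > \ln(0.995) \approx -0.005$, i.e. whenever the next day does not fall by more than roughly half a percent.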

In [5]:
df['Target'] = np.where(df['Adj Close'].shift(-1)>.995*df['Adj Close'],1,-1) 

# Check output
df
Out[5]:
Adj Close Returns Ret_10 Std_10 Ret_15 Std_15 Ret_20 Std_20 Ret_25 Std_25 ... Std_40 Ret_45 Std_45 Ret_50 Std_50 Ret_55 Std_55 Ret_60 Std_60 Target
Date
2000-03-28 101.211014 -0.005776 0.102991 0.016808 0.099908 0.017161 0.097062 0.016531 0.115188 0.016099 ... 0.015079 0.076137 0.015692 0.030012 0.015742 0.038339 0.015368 0.030650 0.017420 1
2000-03-29 101.315758 0.001034 0.080964 0.016388 0.102198 0.017096 0.090847 0.016542 0.104483 0.016044 ... 0.015027 0.065879 0.015623 0.038944 0.015694 0.035948 0.015363 0.041519 0.017367 -1
2000-03-30 99.619827 -0.016881 0.018427 0.011734 0.056512 0.016967 0.073290 0.017210 0.107945 0.015837 ... 0.015315 0.056956 0.015801 0.013952 0.015853 0.031106 0.015451 0.064530 0.016697 1
2000-03-31 100.750481 0.011286 0.023125 0.012035 0.073136 0.016873 0.066025 0.016954 0.122858 0.015806 ... 0.015243 0.072245 0.015850 0.040663 0.015763 0.052389 0.015448 0.074029 0.016749 1
2000-04-03 101.336700 0.005802 0.034044 0.011779 0.089926 0.016292 0.081618 0.016677 0.107899 0.015460 ... 0.015233 0.109737 0.015023 0.048626 0.015772 0.044739 0.015368 0.096032 0.016601 -1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2021-11-03 464.720001 0.006087 0.026846 0.003909 0.065675 0.004935 0.066319 0.005419 0.067354 0.007012 ... 0.008118 0.031925 0.007686 0.039907 0.007464 0.048719 0.007434 0.051786 0.007190 1
2021-11-04 466.910004 0.004701 0.028943 0.003960 0.053696 0.003588 0.062412 0.005287 0.084353 0.006284 ... 0.008096 0.036096 0.007709 0.042513 0.007483 0.064426 0.007269 0.054006 0.007204 1
2021-11-05 468.529999 0.003464 0.033443 0.003712 0.049573 0.003412 0.067701 0.005157 0.076003 0.006033 ... 0.007972 0.036487 0.007712 0.051897 0.007427 0.066342 0.007275 0.054477 0.007207 1
2021-11-08 468.929993 0.000853 0.028948 0.003714 0.047470 0.003471 0.075820 0.004559 0.089836 0.005059 ... 0.007970 0.037583 0.007710 0.043849 0.007340 0.059270 0.007216 0.053512 0.007206 1
2021-11-09 467.380005 -0.003311 0.024738 0.004175 0.036474 0.003606 0.074974 0.004624 0.076174 0.005036 ... 0.007931 0.037854 0.007706 0.036150 0.007345 0.047201 0.007162 0.047850 0.007223 -1

5441 rows × 25 columns

Check Correlation

In [6]:
corrmat = df.drop(['Adj Close', 'Returns', 'Target'],axis=1).corr()

# Visualize feature correlation
fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(corrmat, annot=True, annot_kws={"size": 10}, fmt="0.2f", linewidths=0.5, square=False, cbar=True, cmap='RdYlBu')
ax.set_title('Feature Correlation', fontsize=14, color='black')
Out[6]:
Text(0.5, 1.0, 'Feature Correlation')

Split into Training and Test Subsets

In [7]:
# Split the data into training and test sets (shuffle=False preserves the time order)
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 2:-1], df.iloc[:, -1], test_size=0.2, random_state=0, shuffle=False)

# Output the train and test data size
print(f"Train and Test Size {len(X_train)}, {len(X_test)}")
Train and Test Size 4352, 1089

Fit Model

In [8]:
# Fit the classifier model
# verbosity=0 suppresses warning messages, which can clutter the output

xgb =  XGBClassifier(verbosity = 0, random_state = 0)
xgb.fit(X_train, y_train)
Out[8]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='binary:logistic',
              predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=True, validate_parameters=1, verbosity=0)
In [9]:
# Predicting the test dataset
y_pred = xgb.predict(X_test)

# Output prediction scores
print(f'Train Accuracy: {xgb.score(X_train,y_train):0.4}')
print(f'Test Accuracy: {xgb.score(X_test,y_test):0.4}')
Train Accuracy: 0.9841
Test Accuracy: 0.7713

The large gap between train and test accuracy above points to serious overfitting.

In [10]:
# Cross-validation
tscv = TimeSeriesSplit(n_splits=5)
In [11]:
# Cross validation score
score=cross_val_score(xgb,X_train,y_train,cv=tscv)
print(score)
print(f'Mean CV Score : {score.mean():0.4}')
[0.41931034 0.61655172 0.63586207 0.80689655 0.76      ]
Mean CV Score : 0.6477

Hyper-Parameter Tuning

We'll tune the following hyperparameters using a randomized search:

  • learning_rate: step size shrinkage used in each update to prevent overfitting. Range is 0 to 1.
  • max_depth: maximum depth of a tree.
  • colsample_bytree: fraction of features sampled per tree. A high value can lead to overfitting.
  • min_child_weight: minimum sum of instance weight (hessian) needed in a child.
  • gamma: minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.

Explanation of the min_child_weight hyperparameter

Recall: The Hessian matrix is a matrix of second order partial derivatives.

Consider the case for a regression. The loss of each point in a node is

$\frac{1}{2}(y_i - \hat{y}_i)^2$

The second derivative of this expression with respect to $\hat{y}_i$ is 1. So when you sum the second derivative over all points in the node, you get the number of points in the node. Here, min_child_weight means something like "stop trying to split once your sample size in a node goes below a given threshold".

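A quick symbolic check of this claim (a minimal sketch; it uses sympy, which is not imported elsewhere in this notebook):

import sympy as sp

y, y_hat = sp.symbols('y y_hat')
loss = sp.Rational(1, 2) * (y - y_hat)**2

# Second derivative (hessian contribution) of the squared-error loss w.r.t. the prediction
print(sp.diff(loss, y_hat, 2))  # prints 1, so a node's hessian sum equals its sample count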

In [12]:
# Cross-validation
tscv = TimeSeriesSplit(n_splits=5)
In [13]:
# Hyper parameter optimization
param_grid = {'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
              'max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
              'min_child_weight': [1, 3, 5, 7],
              'gamma': [0.0, 0.1, 0.2 , 0.3, 0.4],
              'colsample_bytree': [0.3, 0.4, 0.5 , 0.7]}
In [14]:
random_search = RandomizedSearchCV(xgb, param_grid, n_iter=100, scoring='f1', n_jobs=-1, cv=tscv, random_state=0, verbose=1)
random_search.fit(X_train, y_train)
Fitting 5 folds for each of 100 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.6min finished
Out[14]:
RandomizedSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=5),
                   error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           enable_categorical=False, gamma=0,
                                           gpu_id=-1, importance_type=None,
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_delta_step=0, max_depth=6,
                                           min_...
                                           validate_parameters=1, verbosity=0),
                   iid='deprecated', n_iter=100, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.3, 0.4, 0.5,
                                                             0.7],
                                        'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
                                        'learning_rate': [0.05, 0.1, 0.15, 0.2,
                                                          0.25, 0.3],
                                        'max_depth': [3, 4, 5, 6, 8, 10, 12,
                                                      15],
                                        'min_child_weight': [1, 3, 5, 7]},
                   pre_dispatch='2*n_jobs', random_state=0, refit=True,
                   return_train_score=False, scoring='f1', verbose=1)
In [15]:
random_search.best_params_
Out[15]:
{'min_child_weight': 5,
 'max_depth': 3,
 'learning_rate': 0.05,
 'gamma': 0.3,
 'colsample_bytree': 0.7}
In [16]:
random_search.best_score_
Out[16]:
0.8288861038757614

Retrain Model with the Best Parameters

In [17]:
# Instantiate an XGB model with best_params_
# ** unpacks random_search.best_params_ into keyword arguments

cls = XGBClassifier(verbosity = 0, **random_search.best_params_, random_state = 0) 
cls.fit(X_train, y_train)
Out[17]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, gamma=0.3, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=3,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              objective='binary:logistic', predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=0)
In [18]:
# Output prediction scores
print(f'Train Accuracy: {cls.score(X_train,y_train):0.4}')
print(f'Test Accuracy: {cls.score(X_test,y_test):0.4}')
Train Accuracy: 0.7585
Test Accuracy: 0.8017
In [19]:
# Cross validation score
score=cross_val_score(cls,X_train,y_train,cv=tscv)
print(score)
print(f'Mean CV Score : {score.mean():0.4}')
[0.55586207 0.73793103 0.69931034 0.82206897 0.81517241]
Mean CV Score : 0.7261

Plot Feature Importances

Using the feature_importances_ attribute

In [20]:
fig, ax = plt.subplots(figsize=(10,8))
feature_imp = pd.DataFrame({'Importance Score': cls.feature_importances_,'Features': df.iloc[:, 2:-1].columns}).sort_values(by='Importance Score', ascending=False)

sns.barplot(x=feature_imp['Importance Score'], y=feature_imp['Features'])
ax.set_title('Feature Importance');

Using the plot_importance() method

In [21]:
plot_importance(cls, importance_type='gain', show_values = False, height = 0.7)
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d5f375a5e0>

Importance type can be one of the following (a short example of retrieving each follows the list):

  • weight: the number of times a feature is used to split the data across all trees.
  • gain: the average gain across all splits the feature is used in.
  • cover: the average coverage across all splits the feature is used in.
  • total_gain: the total gain across all splits the feature is used in.
  • total_cover: the total coverage across all splits the feature is used in.
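The same numbers are available programmatically from the underlying booster. A minimal sketch using Booster.get_score (part of the xgboost API); it simply reports the top-ranked feature under each importance type:

booster = cls.get_booster()
for imp_type in ['weight', 'gain', 'cover', 'total_gain', 'total_cover']:
    scores = booster.get_score(importance_type=imp_type)      # dict: feature -> score
    top = max(scores, key=scores.get)
    print(f'{imp_type:>12}: most important feature is {top}')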

Tree Visualization

In [22]:
# You can add Graphviz to your PATH or use the code below (for Windows)
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/'

# change tree number to see the corresponding plot
to_graphviz(cls, num_trees=1, rankdir='UT') 
Out[22]:
[Graphviz rendering of tree 1: the root splits on Std_10 < 0.009, with further splits on Std_40, Ret_25, Std_15, Ret_10 and Std_55, ending in leaf raw scores at depth 3.]

For a classification tree with two classes, the value of each leaf node represents a raw score. Summing the leaf scores over all boosted trees and passing the result through the logistic (sigmoid) function gives the predicted probability of the positive class.
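A minimal sketch of that conversion (output_margin=True returns the untransformed sum of leaf scores in xgboost's scikit-learn API):

# Raw margin (aggregate leaf scores) for the first few test samples
raw_margin = cls.predict(X_test.iloc[:3], output_margin=True)

# Sigmoid maps the raw margin to P(class = 1)
prob = 1 / (1 + np.exp(-raw_margin))
print(prob)
print(cls.predict_proba(X_test.iloc[:3])[:, 1])  # should agree up to floating-point precision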

Creating a Strategy

In [23]:
# Create a new dataframe holding the out-of-sample (test) data
btdf = df[-len(y_test):]

# Predict the signal and store it in the Signal column
btdf['Signal'] = cls.predict(X_test)

# Calculate the strategy returns (yesterday's signal applied to today's return)
btdf['Strategy'] = btdf['Returns'] * btdf['Signal'].shift(1).fillna(0)

# Buy & hold cumulative growth factor
cumret = btdf['Returns'].cumsum().apply(np.exp)

# Strategy (tree algorithm) cumulative growth factor
cumstg = btdf['Strategy'].cumsum().apply(np.exp)

print(f'SPY Returns \t {cumret[-1]:0.4}')
print(f'Strategy Returns \t {cumstg[-1]:0.4}')
SPY Returns 	 2.053
Strategy Returns 	 2.313
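Since the Returns column holds log returns, exponentiating their cumulative sum recovers a growth factor, where $P_0$ is the adjusted close at the start of the test window, $P_T$ the last close, and $r_t$ the daily log returns:

$\exp\left(\sum_{t=1}^{T} r_t\right) = \exp\left(\ln\frac{P_T}{P_0}\right) = \frac{P_T}{P_0}$

So the values printed above are growth factors: over the test window SPY grew by a factor of about 2.05, and applying the same identity to the signed strategy returns gives a factor of about 2.31.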
In [24]:
# Plot graph iteratively
fig, ax = plt.subplots(3,2, figsize=(20,20))

# 2017
ax[0,0].plot(cumret['2017'], label ='SPY', color ='cornflowerblue')
ax[0,0].plot(cumstg['2017'], label ='Strategy', color ='crimson')
# 2018
ax[0,1].plot(cumret['2018'], label ='SPY', color ='cornflowerblue')
ax[0,1].plot(cumstg['2018'], label ='Strategy', color ='crimson')
# 2019
ax[1,0].plot(cumret['2019'], label ='SPY', color ='cornflowerblue')
ax[1,0].plot(cumstg['2019'], label ='Strategy', color ='crimson')
# 2020
ax[1,1].plot(cumret['2020'], label ='SPY', color ='cornflowerblue')
ax[1,1].plot(cumstg['2020'], label ='Strategy', color ='crimson')
# 2021
ax[2,0].plot(cumret['2021'], label ='SPY', color ='cornflowerblue')
ax[2,0].plot(cumstg['2021'], label ='Strategy', color ='crimson')

# Set axis title
ax[0,0].set_title('2017'), ax[0,1].set_title('2018'), ax[1,0].set_title('2019'), ax[1,1].set_title('2020'), ax[2,0].set_title('2021')

# Define legend
ax[0,0].legend(), ax[0,1].legend(), ax[1,0].legend(), ax[1,1].legend(), ax[2,0].legend()

fig.suptitle('Trading Strategy Performance', fontsize=14);
In [25]:
# Performance measures
import pyfolio as pf
# Create tear sheet
pf.create_simple_tear_sheet(btdf['Strategy'])
Start date 2017-07-17
End date 2021-11-09
Total months 51
Backtest
Annual return 19.005%
Cumulative returns 112.102%
Annual volatility 20.008%
Sharpe ratio 0.97
Calmar ratio 0.96
Stability 0.89
Max drawdown -19.819%
Omega ratio 1.23
Sortino ratio 1.40
Skew 0.41
Kurtosis 17.52
Tail ratio 0.81
Daily value at risk -2.444%
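As a rough cross-check of the tear sheet, the headline figures can be approximated directly from the daily strategy series (a sketch assuming 252 trading days per year; pyfolio additionally compounds the daily returns for the annual return, so small differences are expected):

daily = btdf['Strategy']

ann_ret = daily.mean() * 252                        # simple annualization of the daily (log) returns
ann_vol = daily.std() * np.sqrt(252)                # annualized volatility
sharpe = daily.mean() / daily.std() * np.sqrt(252)  # annualized Sharpe ratio (zero risk-free rate)

print(f'Annual return ~ {ann_ret:0.2%}')
print(f'Annual volatility ~ {ann_vol:0.2%}')
print(f'Sharpe ratio ~ {sharpe:0.2f}')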