# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
# Data manipulation
import pandas as pd
import numpy as np
# Yahoo Finance
import yfinance as yf
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
# XGBoost Classifier
from xgboost import XGBClassifier, plot_importance, Booster, plot_tree, to_graphviz
# Preprocessing & Cross validation
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit, cross_val_score, GridSearchCV
# Metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay, RocCurveDisplay
df = yf.download('SPY', start='2000-01-01', end = '2021-11-10', progress=False)[['Adj Close']]
# df = pd.read_csv('data/spy.csv', index_col=0, parse_dates=True)[['Adj Close']]
# Calculate returns
df['Returns'] = np.log(df['Adj Close']).diff()
# Output first five values
df.head()
# Visualize raw price series
plt.figure(figsize=(20,10))
plt.title('SPY Price Performance')
plt.plot(df['Adj Close'], color='cornflowerblue');
We'll create a total of 22 features: 11 based on rolling log returns (e.g. 10-day log returns, 15-day log returns, etc.) and another 11 based on the rolling standard deviation of returns (e.g. 10-day std, 15-day std, etc.).
features_list = []
for r in range(10, 65, 5):
    df['Ret_'+str(r)] = df.Returns.rolling(r).sum()
    df['Std_'+str(r)] = df.Returns.rolling(r).std()
    features_list.append('Ret_'+str(r))
    features_list.append('Std_'+str(r))
# Drop NaN values
df.dropna(inplace=True)
The label is defined as 1 if the next day's close is greater than 99.5% of today's close. Otherwise, it is defined as -1.
df['Target'] = np.where(df['Adj Close'].shift(-1)>.995*df['Adj Close'],1,-1)
# Check output
df
corrmat = df.drop(['Adj Close', 'Returns', 'Target'],axis=1).corr()
# Visualize feature correlation
fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(corrmat, annot=True, annot_kws={"size": 10}, fmt="0.2f", linewidths=0.5, square=False, cbar=True, cmap='RdYlBu')
ax.set_title('Feature Correlation', fontsize=14, color='black')
# Splitting the datasets into training and testing data.
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 2:-1], df.iloc[:, -1], test_size=0.2, random_state=0, shuffle=False)
# Output the train and test data size
print(f"Train and Test Size {len(X_train)}, {len(X_test)}")
# Fit the classifier model
# verbosity=0 suppresses warning messages, which can clutter the output
xgb = XGBClassifier(verbosity=0, random_state=0)
xgb.fit(X_train, y_train)
# Predicting the test dataset
y_pred = xgb.predict(X_test)
# Output prediction scores
print(f'Train Accuracy: {xgb.score(X_train,y_train):0.4}')
print(f'Test Accuracy: {xgb.score(X_test,y_test):0.4}')
The results above show serious overfitting: training accuracy is far higher than test accuracy.
# Cross-validation
tscv = TimeSeriesSplit(n_splits=5)
# Cross validation score
score=cross_val_score(xgb,X_train,y_train,cv=tscv)
print(score)
print(f'Mean CV Score : {score.mean():0.4}')
We'll tune the following hyperparameters using a randomized search:
- learning_rate: step size shrinkage used in updates to prevent overfitting. Range is 0 to 1.
- max_depth: maximum depth of a tree.
- colsample_bytree: fraction of features used per tree. A high value can lead to overfitting.
- min_child_weight: minimum sum of instance weight (hessian) needed in a child.
- gamma: minimum loss reduction required to make a further partition on a leaf node of the tree. A larger gamma makes the algorithm more conservative.
The min_child_weight hyperparameter
Recall: the Hessian matrix is a matrix of second-order partial derivatives.
Consider the regression case. The loss for each point in a node is
$\frac{1}{2}(y_i - \hat{y}_i)^2$
The second derivative of this expression with respect to $\hat{y}_i$ is 1. So when you sum the second derivative over all points in the node, you get the number of points in the node. Here, min_child_weight means something like "stop trying to split once your sample size in a node goes below a given threshold".
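Spelling out the derivatives of that squared-error term makes the point explicit:
$\frac{\partial}{\partial \hat{y}_i}\left[\frac{1}{2}(y_i - \hat{y}_i)^2\right] = \hat{y}_i - y_i$, and $\frac{\partial^2}{\partial \hat{y}_i^2}\left[\frac{1}{2}(y_i - \hat{y}_i)^2\right] = 1$
Summing the second derivative over the $n$ points in a node therefore gives $n$, which is why, for squared-error loss, min_child_weight acts like a minimum node size.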
# Cross-validation
tscv = TimeSeriesSplit(n_splits=5)
# Hyperparameter optimization
param_grid = {'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
'max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
'min_child_weight': [1, 3, 5, 7],
'gamma': [0.0, 0.1, 0.2 , 0.3, 0.4],
'colsample_bytree': [0.3, 0.4, 0.5 , 0.7]}
random_search = RandomizedSearchCV(xgb, param_grid, n_iter=100, scoring='f1', n_jobs=-1, cv=tscv, random_state=0, verbose=1)
random_search.fit(X_train, y_train)
random_search.best_params_
random_search.best_score_
# Instantiate an XGB model with best_params_
# ** unpacks random_search.best_params_ into keyword arguments
cls = XGBClassifier(verbosity = 0, **random_search.best_params_, random_state = 0)
cls.fit(X_train, y_train)
# Output prediction scores
print(f'Train Accuracy: {cls.score(X_train,y_train):0.4}')
print(f'Test Accuracy: {cls.score(X_test,y_test):0.4}')
# Cross validation score
score=cross_val_score(cls,X_train,y_train,cv=tscv)
print(score)
print(f'Mean CV Score : {score.mean():0.4}')
The feature_importances_ attribute
fig, ax = plt.subplots(figsize=(10,8))
feature_imp = pd.DataFrame({'Importance Score': cls.feature_importances_, 'Features': df.iloc[:, 2:-1].columns}).sort_values(by='Importance Score', ascending=False)
sns.barplot(x=feature_imp['Importance Score'], y=feature_imp['Features'])
ax.set_title('Feature Importance');
The plot_importance() method
plot_importance(cls, importance_type='gain', show_values=False, height=0.7)
The importance type can be 'weight' (the number of times a feature is used to split the data across all trees), 'gain' (the average gain of the splits that use the feature), or 'cover' (the average number of samples affected by the splits that use the feature).
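To see how the choice of importance type changes the ranking, one rough sketch (assuming the fitted classifier is still named cls, as above) is to query the underlying booster directly:
# Query the booster for each supported importance type
booster = cls.get_booster()
for imp_type in ['weight', 'gain', 'cover']:
    scores = booster.get_score(importance_type=imp_type)
    # Print the three highest-ranked features under this definition of importance
    top3 = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:3]
    print(imp_type, top3)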
# You can add Graphviz to your PATH or use the code below (for Windows)
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/'
# Change the tree number to see the corresponding plot
to_graphviz(cls, num_trees=1, rankdir='LR')
For a classification tree with two classes, the value of the leaf node represents the raw score. It can be converted to a probability score by applying the logistic (sigmoid) function.
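A minimal sketch of that conversion (the leaf value below is a made-up raw score for illustration, not one read from the tree above):
# Hypothetical raw leaf score (log-odds); substitute a value read from the plotted tree
leaf_value = 0.12
# Apply the logistic (sigmoid) function to map the raw score to a probability
prob = 1 / (1 + np.exp(-leaf_value))
print(f'Probability implied by a raw score of {leaf_value}: {prob:0.4f}')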
# Create a new dataframe to hold the out-of-sample data
btdf = df.iloc[-len(y_test):].copy()
# Predict the signal and store in predicted signal column
btdf['Signal'] = cls.predict(X_test)
# Calculate the strategy returns
btdf['Strategy'] = btdf['Returns'] * btdf['Signal'].shift(1).fillna(0)
# Buy & Hold
cumret = btdf['Returns'].cumsum().apply(np.exp)
# Tree Algorithm
cumstg = btdf['Strategy'].cumsum().apply(np.exp)
print(f'SPY Returns \t {cumret.iloc[-1]:0.4}')
print(f'Strategy Returns \t {cumstg.iloc[-1]:0.4}')
# Plot graph iteratively
fig, ax = plt.subplots(3,2, figsize=(20,20))
# 2017
ax[0,0].plot(cumret.loc['2017'], label='SPY', color='cornflowerblue')
ax[0,0].plot(cumstg.loc['2017'], label='Strategy', color='crimson')
# 2018
ax[0,1].plot(cumret.loc['2018'], label='SPY', color='cornflowerblue')
ax[0,1].plot(cumstg.loc['2018'], label='Strategy', color='crimson')
# 2019
ax[1,0].plot(cumret.loc['2019'], label='SPY', color='cornflowerblue')
ax[1,0].plot(cumstg.loc['2019'], label='Strategy', color='crimson')
# 2020
ax[1,1].plot(cumret.loc['2020'], label='SPY', color='cornflowerblue')
ax[1,1].plot(cumstg.loc['2020'], label='Strategy', color='crimson')
# 2021
ax[2,0].plot(cumret.loc['2021'], label='SPY', color='cornflowerblue')
ax[2,0].plot(cumstg.loc['2021'], label='Strategy', color='crimson')
# Set axis title
ax[0,0].set_title('2017'), ax[0,1].set_title('2018'), ax[1,0].set_title('2019'), ax[1,1].set_title('2020'), ax[2,0].set_title('2021')
# Define legend
ax[0,0].legend(), ax[0,1].legend(), ax[1,0].legend(), ax[1,1].legend(), ax[2,0].legend()
fig.suptitle('Trading Strategy Performance', fontsize=14);
# Performance measures
import pyfolio as pf
# Create tear sheet
pf.create_simple_tear_sheet(btdf['Strategy'])