# Base Libraries
import pandas as pd
import numpy as np
import missingno
import yfinance as yf
# Plotting
import matplotlib.pyplot as plt
# Session-wide plot defaults: figure size is given as (width, height)
plt.rcParams["figure.figsize"] = (10,6)
plt.style.use('fivethirtyeight')
# Ignore warnings
# NOTE: the filter below silences every warning category for the rest of the
# session, including deprecation warnings raised by the libraries used here.
import warnings
warnings.filterwarnings('ignore')
# Scrape the Dow Jones Industrial Average page; read_html returns a list of
# tables and the one at index 1 is used for its 'Symbol' and 'Company' columns.
dow = pd.read_html('https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average')
dow[1]
# Get stock tickers and company names as lists
dow_stocks = dow[1]['Symbol'].tolist()
dow_company_name = dow[1]['Company'].tolist()
dow_stocks
dow_company_name
# The cached file loaded below can be regenerated with:
# ohlc = {symbol: yf.download(symbol, start='2010-01-01', end='2020-09-30', auto_adjust=True, progress=False) for symbol in dow_stocks}
# np.save('dow_ohlc.npy', ohlc)
# The .npy file holds a pickled dict wrapped in a 0-d object array, hence
# allow_pickle=True and .item() to unwrap it.
# NOTE: unpickling executes arbitrary code -- only load a file you created yourself.
ohlc = np.load('dow_ohlc.npy', allow_pickle=True).item()
type(ohlc)
ohlc
ohlc['MMM'].head()
# Function to calculate average true range
def ATR(dataframe, period=21):
    """Return the Average True Range of an OHLC frame as a Series named 'ATR'.

    The True Range of a row is the largest of:
      * High - Low
      * |High - previous Close|
      * |Low  - previous Close|
    and the ATR is the simple rolling mean of the True Range.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'High', 'Low' and 'Close' columns. It is not modified.
    period : int, default 21
        Length of the rolling window, in rows.

    Returns
    -------
    pandas.Series
        ATR values on the same index as ``dataframe``. The first ``period``
        rows are NaN: the first row has no previous close, so its True Range
        is undefined, and the rolling mean needs ``period`` valid values.
    """
    previous_close = dataframe['Close'].shift(1)
    # Only the three candidate ranges are materialised; the caller's frame is
    # never copied or mutated.
    candidates = pd.DataFrame({
        'H-L': (dataframe['High'] - dataframe['Low']).abs(),
        'H-PC': (dataframe['High'] - previous_close).abs(),
        'L-PC': (dataframe['Low'] - previous_close).abs(),
    })
    # skipna=False keeps the True Range undefined (NaN) whenever one of the
    # candidates is missing instead of silently using the remaining ones.
    true_range = candidates.max(axis=1, skipna=False)
    return true_range.rolling(period).mean().rename('ATR')
# Attach the default 21-period ATR to every ticker's OHLC frame
for symbol in dow_stocks:
    ohlc[symbol]['ATR'] = ATR(ohlc[symbol], period=21)
ohlc['MMM'].head()  # ATR column added
# Gather every ticker's closing prices into one frame (one column per ticker)
close = pd.DataFrame({ticker: ohlc[ticker]['Close'] for ticker in dow_stocks})
close.head(8)
# Count the missing closes per ticker
close.isna().sum()
# DOW is excluded from the rest of the analysis
close.drop(columns=['DOW'], inplace=True)
# resample('W-FRI')
# converts the time series to weekly data, taking Friday as the last day of the week.
# resample('W-FRI').last()
# therefore gives us the closing price on Friday (the last available close of each week).
# Week-over-week percentage change of the last close in each Friday-anchored week,
# transposed so that every row is a ticker and every column a week
weekly_change = close.resample('W-FRI').last().pct_change()
weekly_change = weekly_change.T
# Check output
weekly_change.head()
# Add ATR to each stocks
atr = pd.DataFrame({symbol: ohlc[symbol]['ATR'] for symbol in dow_stocks})
atr.isnull().sum()
# Backward-fill the missing values and drop DOW from the list.
# DataFrame.bfill replaces fillna(method='bfill'), which is deprecated in recent pandas.
# NOTE: back-filling copies later ATR values into earlier rows (e.g. the leading
# NaNs left by the rolling window), so those rows are not point-in-time values.
atr.bfill(axis=0, inplace=True)
atr.drop(['DOW'], axis=1, inplace=True)
# Manipulate dataframe: resample to weekly and transpose (rows = tickers)
weekly_atr = atr.resample('W-FRI').last()
weekly_atr = weekly_atr.T
# Check output
weekly_atr.head()
# Plot weekly atr values for MMM (selected by label so the plot does not
# depend on the order in which the tickers were scraped)
weekly_atr.loc['MMM'].plot()
# Import sklearn modules
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from datetime import datetime
# Elbow analysis: fit k-means for every k in 2..29 and record the model inertia
n_clusters = range(2, 30)
inertia = []
for n in n_clusters:
    kmeans = KMeans(n_clusters=n).fit(weekly_atr)
    inertia.append(kmeans.inertia_)
# Plot each inertia relative to the first one (k=2), so the curve starts at 1
plt.plot(n_clusters, np.asarray(inertia) / inertia[0])
# Dashed reference lines at 10% and 5% of the k=2 inertia
plt.hlines(0.1, xmin=n_clusters[0], xmax=n_clusters[-1], colors='r', linestyles='dashed')
plt.hlines(0.05, xmin=n_clusters[0], xmax=n_clusters[-1], colors='r', linestyles='dashed')
plt.xlabel('clusters')
plt.ylabel('relative inertia')
plt.legend(['inertia', '10% relative inertia', '5% relative inertia']);
# Min-max scale every weekly ATR column, then cluster the stocks with k-means
pipe = Pipeline([("normalization", MinMaxScaler()), ("cluster", KMeans(n_clusters=15))])
# Fit Model
pipe.fit(weekly_atr)
# Assign Label
labels = pipe.predict(weekly_atr)
# Check labels
labels
# Remove DOW from the tickers and its company name from the names.
# New lists are built instead of calling .remove() on aliases of dow_stocks /
# dow_company_name: that mutated the original lists in place and raised
# ValueError when this code was run a second time. Filtering ticker/name pairs
# by ticker also avoids depending on the exact spelling of the company name.
companies = [symbol for symbol in dow_stocks if symbol != 'DOW']
companies_name = [name for symbol, name in zip(dow_stocks, dow_company_name) if symbol != 'DOW']
# Create dataframe to hold cluster details: one row per stock with its cluster
# label, mean weekly ATR and mean weekly change, ordered by cluster
df = pd.DataFrame({'Cluster': labels,
                   'Companies Name': companies_name,
                   'Companies': companies,
                   'ATR': weekly_atr.mean(axis=1),
                   'CHG': weekly_change.mean(axis=1)}
                  ).sort_values(by=['Cluster'], axis=0)
df = df.reset_index(drop=True)
df
# Scatter of ticker vs. assigned cluster
plt.figure(figsize=(20,10))
plt.scatter(df.Companies, df.Cluster)
plt.xlabel('Stocks')
plt.ylabel('Cluster')
plt.title('Dow Stocks ATR Clustering');
# Download daily price history (note: the variable is called `spy` but the ticker requested is TSLA)
spy = yf.download('TSLA', start='2000-01-02', end='2020-12-31', progress=False)
spy
# Plot share price
plt.title('Share Price')
plt.plot(spy['Adj Close'])
plt.show()
# Calculate scaled volume and range on an independent copy of the price data
df = spy.copy()
df['ATR'] = ATR(df, period=20)
# Volume relative to its own 20-row rolling average
df['Svolume'] = df['Volume'].div(df['Volume'].rolling(20).mean())
# High-low range expressed in units of ATR
df['Range'] = df['High'].sub(df['Low']).div(df['ATR'])
# Calculate open-to-open returns: the open two rows ahead minus the open one row ahead
df['Return'] = df['Open'].shift(-2) - df['Open'].shift(-1)
# Initial target with zeros
df['Target'] = 0
# Drop nan values (rolling warm-up rows at the start, shifted rows at the end)
df = df.dropna()
# Verify last 5 values
df.tail()
# Split the data in row order (shuffle=False): the first 80% of rows train, the last 20% test
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit, cross_val_score
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0, shuffle=False)
# Take explicit copies so that the 'Target' assignments below write to
# independent frames rather than to slices derived from df (chained assignment).
df_train = df_train.copy()
df_test = df_test.copy()
X_train = df_train[['Svolume', 'Range']]
X_test = df_test[['Svolume', 'Range']]
# Fit the clusters on the training features only; the test rows are labelled
# with the same fitted model.
kmeans = KMeans(n_clusters=3).fit(X_train)
df_train['Target'] = kmeans.predict(X_train)
df_test['Target'] = kmeans.predict(X_test)
centers = kmeans.cluster_centers_
# Train plot, then test plot: points coloured by cluster, centroids as red crosses
for features, targets, title in (
    (X_train, df_train['Target'], 'Cluster Analysis on Training Data'),
    (X_test, df_test['Target'], 'Cluster Analysis on Test Data'),
):
    plt.scatter(features['Svolume'], features['Range'], c=targets)
    plt.scatter(centers[:, 0], centers[:, 1], c='red', s=100, marker='x')
    plt.xlabel('Volume')
    plt.ylabel('Range')
    plt.title(title)
    plt.show()
# Performance plots (training data first, then test data): cumulative 'Return'
# of the rows assigned to each cluster
for frame in (df_train, df_test):
    for cluster in range(kmeans.n_clusters):
        cluster_returns = frame.loc[frame['Target'] == cluster, 'Return']
        plt.plot(cluster_returns.cumsum(), label='Cluster {}'.format(cluster + 1))
    plt.title('Total Points Gained')
    plt.legend()
    plt.show()