Import Libraries

In [1]:
# Base libraries: tabular data (pandas), arrays / cache I/O (numpy),
# and Yahoo Finance price downloads (yfinance).
# NOTE(review): missingno is imported but not used in the visible part of this notebook.
import pandas as pd
import numpy as np
import missingno
import yfinance as yf

# Plotting: default figure size for every figure, as (width, height)
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10,6)

# Apply the 'fivethirtyeight' style sheet to all subsequent figures
plt.style.use('fivethirtyeight')

# Silence all warnings for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from
# pandas / yfinance; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

Get the Dow 30 constituents

In [2]:
# pd.read_html returns a list with one DataFrame per HTML table found on the page.
# In the captured output, index 1 is the constituents table
# (Company / Exchange / Symbol / Industry / ...).
# NOTE(review): that position depends on the live page layout — confirm the index on re-run.
dow = pd.read_html('https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average')
dow[1]
Out[2]:
Company Exchange Symbol Industry Date added Notes Index weighting
0 3M NYSE MMM Conglomerate 1976-08-09 As Minnesota Mining and Manufacturing 3.38%
1 American Express NYSE AXP Financial services 1982-08-30 NaN 3.29%
2 Amgen NASDAQ AMGN Biopharmaceutical 2020-08-31 NaN 3.84%
3 Apple NASDAQ AAPL Information technology 2015-03-19 NaN 2.76%
4 Boeing NYSE BA Aerospace and defense 1987-03-12 NaN 4.01%
5 Caterpillar NYSE CAT Construction and Mining 1991-05-06 NaN 3.73%
6 Chevron NYSE CVX Petroleum industry 2008-02-19 Also 1930-07-18 to 1999-11-01 2.07%
7 Cisco Systems NASDAQ CSCO Information technology 2009-06-08 NaN 1.03%
8 Coca-Cola NYSE KO Soft Drink 1987-03-12 Also 1932-05-26 to 1935-11-20 1.01%
9 Disney NYSE DIS Broadcasting and entertainment 1991-05-06 NaN 3.18%
10 Dow NYSE DOW Chemical industry 2019-04-02 NaN 1.10%
11 Goldman Sachs NYSE GS Financial services 2013-09-20 NaN 7.65%
12 Home Depot NYSE HD Home Improvement 1999-11-01 NaN 6.65%
13 Honeywell NASDAQ HON Conglomerate 2020-08-31 NaN 4.12%
14 IBM NYSE IBM Information technology 1979-06-29 Also 1932-05-26 to 1939-03-04 2.64%
15 Intel NASDAQ INTC Semiconductor industry 1999-11-01 NaN 1.03%
16 Johnson & Johnson NYSE JNJ Pharmaceutical industry 1997-03-17 NaN 3.04%
17 JPMorgan Chase NYSE JPM Financial services 1991-05-06 NaN 3.13%
18 McDonald's NYSE MCD Food industry 1985-10-30 NaN 4.51%
19 Merck NYSE MRK Pharmaceutical industry 1979-06-29 NaN 1.48%
20 Microsoft NASDAQ MSFT Information technology 1999-11-01 NaN 5.72%
21 Nike NYSE NKE Apparel 2013-09-20 NaN 2.93%
22 Procter & Gamble NYSE PG Fast-moving consumer goods 1932-05-26 NaN 2.61%
23 Salesforce NYSE CRM Information technology 2020-08-31 NaN 5.43%
24 Travelers NYSE TRV Insurance 2009-06-08 NaN 2.89%
25 UnitedHealth NYSE UNH Managed health care 2012-09-24 NaN 7.88%
26 Verizon NYSE VZ Telecommunication 2004-04-08 NaN 0.97%
27 Visa NYSE V Financial services 2013-09-20 NaN 4.34%
28 Walgreens Boots Alliance NASDAQ WBA Retailing 2018-06-26 NaN 0.89%
29 Walmart NYSE WMT Retailing 1997-03-17 NaN 2.69%
In [3]:
# Pull the ticker symbols and the matching company names out of the
# constituents table as plain Python lists (both keep the table's row order).

dow_stocks = dow[1]['Symbol'].tolist()
dow_company_name = dow[1]['Company'].tolist()
In [4]:
# Inspect the ticker list (one symbol per index constituent)
dow_stocks
Out[4]:
['MMM',
 'AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DIS',
 'DOW',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'CRM',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WBA',
 'WMT']
In [5]:
# Inspect the company names (same row order as dow_stocks)
dow_company_name
Out[5]:
['3M',
 'American Express',
 'Amgen',
 'Apple',
 'Boeing',
 'Caterpillar',
 'Chevron',
 'Cisco Systems',
 'Coca-Cola',
 'Disney',
 'Dow',
 'Goldman Sachs',
 'Home Depot',
 'Honeywell',
 'IBM',
 'Intel',
 'Johnson & Johnson',
 'JPMorgan Chase',
 "McDonald's",
 'Merck',
 'Microsoft',
 'Nike',
 'Procter & Gamble',
 'Salesforce',
 'Travelers',
 'UnitedHealth',
 'Verizon',
 'Visa',
 'Walgreens Boots Alliance',
 'Walmart']

Download/Load OHLC values of the 30 stocks

In [6]:
# Price data is read from a local cache instead of being re-downloaded on every run.
# To (re)create the cache, run the two lines below once (requires network access):
#
#   ohlc = {symbol: yf.download(symbol, start='2010-01-01', end='2020-09-30', auto_adjust=True, progress=False) for symbol in dow_stocks}
#   np.save('dow_ohlc.npy', ohlc)
#
# np.save stores the dict as a pickled 0-d object array, so loading it needs
# allow_pickle=True (a real bool, not the string 'TRUE') and .item() to unwrap the dict.
# NOTE(security): unpickling can execute arbitrary code — only load a cache file you created yourself.
ohlc = np.load('dow_ohlc.npy', allow_pickle=True).item()
In [7]:
# Confirm that the cache unwrapped to a plain dict
type(ohlc)
Out[7]:
dict
In [8]:
# Show the whole {ticker: OHLCV DataFrame} mapping.
# NOTE(review): this prints all 30 frames; a per-ticker row count would be a lighter check.
# In the captured output DOW has only 387 rows (starting 2019-03-20), while every
# other ticker has 2705 rows.
ohlc
Out[8]:
{'MMM':                   Open        High         Low       Close   Volume
 Date                                                               
 2009-12-31   60.447778   60.656991   59.546003   59.639786  2049800
 2010-01-04   59.942795   60.202507   59.639800   59.892296  3043700
 2010-01-05   59.733564   60.043775   58.939998   59.517136  2847000
 2010-01-06   60.512704   61.032127   60.245782   60.361206  5268500
 2010-01-07   60.108729   60.426156   59.243027   60.404514  4470100
 ...                ...         ...         ...         ...      ...
 2020-09-23  156.169088  157.821876  152.901944  153.276703  2248200
 2020-09-24  153.170978  155.630951  151.892965  154.151123  1569700
 2020-09-25  153.084512  154.689250  152.652100  154.007004  1948900
 2020-09-28  155.669392  157.725765  155.025576  155.342682  2098500
 2020-09-29  155.352303  155.621360  152.575238  153.564987  2006700
 
 [2705 rows x 5 columns],
 'AXP':                  Open       High        Low      Close    Volume
 Date                                                            
 2009-12-31  34.214254  34.364830  33.871275  33.896370   4030500
 2010-01-04  34.138971  34.381564  33.787625  34.230988   6894300
 2010-01-05  34.155693  34.490305  33.770885  34.155693  10641200
 2010-01-06  34.490313  34.858387  34.440119  34.707813   8399400
 2010-01-07  34.665803  35.489181  34.539778  35.270733   8981700
 ...               ...        ...        ...        ...       ...
 2020-09-23  94.262811  95.581856  93.603280  94.065933   9427200
 2020-09-24  93.849376  95.384991  92.244863  93.957657   4249000
 2020-09-25  93.199691  95.335761  92.983129  94.754990   2775600
 2020-09-28  97.452149  98.170739  97.087932  97.274963   3930900
 2020-09-29  97.196214  97.796678  95.709828  96.221695   3200400
 
 [2705 rows x 5 columns],
 'AMGN':                   Open        High         Low       Close    Volume
 Date                                                                
 2009-12-31   44.440469   44.463666   43.744514   43.744514   4027900
 2010-01-04   43.790935   44.749802   43.736805   44.633812   5277400
 2010-01-05   44.332231   44.610610   43.512551   44.247169   7882800
 2010-01-06   44.030652   44.378629   43.690409   43.914661   6015100
 2010-01-07   43.620794   43.713587   42.259820   43.512535  10371600
 ...                ...         ...         ...         ...       ...
 2020-09-23  238.260468  240.119914  232.653199  233.722626   2104300
 2020-09-24  233.298709  233.809335  229.088450  231.535614   1827200
 2020-09-25  230.755200  235.254496  229.117343  234.907669   2635700
 2020-09-28  238.626586  239.936874  235.822966  238.000351   2213900
 2020-09-29  238.501322  239.946493  236.506980  239.223907   2008000
 
 [2705 rows x 5 columns],
 'AAPL':                   Open        High         Low       Close     Volume
 Date                                                                 
 2009-12-31    6.526081    6.532817    6.447387    6.452592  352410800
 2010-01-04    6.535265    6.568028    6.503113    6.553024  493729600
 2010-01-05    6.571093    6.601407    6.529755    6.564356  601904800
 2010-01-06    6.564355    6.590383    6.453204    6.459940  552160000
 2010-01-07    6.483824    6.491479    6.401149    6.447998  477131200
 ...                ...         ...         ...         ...        ...
 2020-09-23  110.745037  111.231194  105.933049  106.280312  150718700
 2020-09-24  104.345594  109.385775  104.176928  107.371689  167743300
 2020-09-25  107.580045  111.558614  106.826000  111.399864  149981400
 2020-09-28  114.108464  114.416032  111.895941  114.058853  137672400
 2020-09-29  113.652080  114.406117  112.679758  113.195679   99382200
 
 [2705 rows x 5 columns],
 'BA':                   Open        High         Low       Close    Volume
 Date                                                                
 2009-12-31   42.858050   43.029483   42.117774   42.180115   2189400
 2010-01-04   43.419086   43.941174   42.702187   43.777534   6186700
 2010-01-05   43.832095   45.413946   43.637286   45.211346   8867800
 2010-01-06   45.374970   46.746429   45.102238   46.582787   8836500
 2010-01-07   46.372413   48.554280   45.990588   48.468563  14379100
 ...                ...         ...         ...         ...       ...
 2020-09-23  157.330002  160.419998  150.889999  151.179993  18247500
 2020-09-24  150.289993  150.309998  145.020004  146.050003  27098100
 2020-09-25  146.850006  156.309998  146.419998  156.029999  29123000
 2020-09-28  160.330002  168.000000  159.199997  166.080002  38868200
 2020-09-29  164.770004  165.500000  162.009995  163.600006  19874700
 
 [2705 rows x 5 columns],
 'CAT':                   Open        High         Low       Close   Volume
 Date                                                               
 2009-12-31   41.638513   41.898754   41.197552   41.197552  3859700
 2010-01-04   41.674664   42.787914   41.573457   42.325264  7325600
 2010-01-05   42.325259   42.968632   42.093935   42.831284  5697200
 2010-01-06   42.780684   43.322852   42.686707   42.961407  4718800
 2010-01-07   42.867434   43.301170   42.397557   43.134903  5432900
 ...                ...         ...         ...         ...      ...
 2020-09-23  143.585107  144.977349  139.807587  140.566986  2420100
 2020-09-24  140.216514  143.546192  138.960574  141.306931  2138800
 2020-09-25  140.089938  142.825725  139.233173  142.056595  1848900
 2020-09-28  144.091387  146.340379  143.516972  143.779831  2212700
 2020-09-29  144.451606  145.035766  142.348662  143.516968  1605600
 
 [2705 rows x 5 columns],
 'CVX':                  Open       High        Low      Close    Volume
 Date                                                            
 2009-12-31  48.138677  48.175839  47.649362  47.686523   4246600
 2010-01-04  48.435997  49.055384  48.411226  48.968670  10173800
 2010-01-05  49.067785  49.315540  48.758092  49.315540  10593700
 2010-01-06  49.204029  49.550883  49.049182  49.321709  11014600
 2010-01-07  49.154487  49.365076  48.912927  49.135906   9626900
 ...               ...        ...        ...        ...       ...
 2020-09-23  70.783812  71.083472  67.281611  67.375252  16649200
 2020-09-24  66.719762  68.386584  66.017449  67.234795  15203500
 2020-09-25  66.495024  67.871559  66.270278  67.262886  11070500
 2020-09-28  68.957793  70.006583  68.498950  69.229355  12863800
 2020-09-29  69.294914  69.426012  66.588667  67.328438  10553000
 
 [2705 rows x 5 columns],
 'CSCO':                  Open       High        Low      Close    Volume
 Date                                                            
 2009-12-31  17.651317  17.702586  17.534130  17.534130  25208100
 2010-01-04  17.658647  18.193313  17.585404  18.083450  59853700
 2010-01-05  18.017525  18.112739  17.856392  18.002876  45124500
 2010-01-06  17.973587  18.120071  17.827103  17.885696  35715700
 2010-01-07  17.797805  17.995559  17.702591  17.966263  31531200
 ...               ...        ...        ...        ...       ...
 2020-09-23  37.764108  37.918129  36.753344  36.849606  24843200
 2020-09-24  36.580065  36.734087  36.195011  36.435669  30910400
 2020-09-25  36.349033  37.099889  36.214265  37.013252  22966600
 2020-09-28  37.735223  37.821860  37.340544  37.667839  24275700
 2020-09-29  37.610081  37.908499  37.379051  37.600456  16353700
 
 [2705 rows x 5 columns],
 'KO':                  Open       High        Low      Close    Volume
 Date                                                            
 2009-12-31  19.832142  19.890695  19.608264  19.632374  10848800
 2010-01-04  19.687487  19.708153  19.597937  19.646156  13870400
 2010-01-05  19.580711  19.628932  19.336167  19.408497  23172400
 2010-01-06  19.408495  19.439493  19.281057  19.401608  19264600
 2010-01-07  19.401607  19.415383  19.201838  19.353386  13234600
 ...               ...        ...        ...        ...       ...
 2020-09-23  47.835048  48.017807  46.334490  46.372967  17121200
 2020-09-24  46.372965  47.229051  45.747736  46.844296  16788100
 2020-09-25  46.363347  46.902006  46.151729  46.863533  12603400
 2020-09-28  47.431054  47.815810  47.267530  47.402195  11215700
 2020-09-29  47.508004  47.671528  47.046295  47.055912  12426600
 
 [2705 rows x 5 columns],
 'DIS':                   Open        High         Low       Close    Volume
 Date                                                                
 2009-12-31   28.108127   28.526221   28.064576   28.090706  19651700
 2010-01-04   28.308465   28.526222   27.759717   27.933922  13700400
 2010-01-05   27.933921   28.012314   27.611641   27.864239  10307700
 2010-01-06   27.785844   27.872947   27.594218   27.716162  10709500
 2010-01-07   27.672615   27.751008   27.472279   27.724876   8202100
 ...                ...         ...         ...         ...       ...
 2020-09-23  127.070000  127.300003  122.900002  123.279999   8323600
 2020-09-24  121.930000  123.800003  120.779999  122.489998   8480000
 2020-09-25  121.360001  124.190002  120.980003  124.000000   6851800
 2020-09-28  125.750000  126.860001  125.370003  125.989998   6283700
 2020-09-29  125.919998  126.290001  123.680000  125.400002   7405800
 
 [2705 rows x 5 columns],
 'DOW':                  Open       High        Low      Close   Volume
 Date                                                           
 2019-03-20  45.409330  46.054960  42.611599  42.869850  2350800
 2019-03-21  43.033414  43.042021  41.492509  42.163963  1764700
 2019-03-22  42.009018  42.998986  41.458081  41.836849   844700
 2019-03-25  41.836845  42.525520  41.320343  42.310310   440900
 2019-03-26  42.181179  42.826809  41.475290  42.052052   504700
 ...               ...        ...        ...        ...      ...
 2020-09-23  45.054540  45.470311  43.911169  44.166302  4018000
 2020-09-24  44.100151  44.402534  43.174114  43.854469  4648700
 2020-09-25  43.249712  43.845022  43.117421  43.504845  4090200
 2020-09-28  43.939514  45.621499  43.939514  44.865551  3993100
 2020-09-29  44.629315  45.149029  43.712726  44.005657  2716800
 
 [387 rows x 5 columns],
 'GS':                   Open        High         Low       Close    Volume
 Date                                                                
 2009-12-31  139.744099  142.116476  139.443376  141.038879   6401800
 2010-01-04  142.049684  145.558112  141.598592  144.580765   9135000
 2010-01-05  144.513903  147.237108  144.154712  147.136871  11659400
 2010-01-06  146.501965  146.501965  145.148706  145.566376   7381100
 2010-01-07  145.616549  149.317101  145.307465  148.414932   8727400
 ...                ...         ...         ...         ...       ...
 2020-09-23  187.664717  188.563942  181.673125  181.917480   3116100
 2020-09-24  184.712892  193.001427  182.826477  190.704483   5114600
 2020-09-25  188.603054  190.978178  185.328689  190.548111   3106000
 2020-09-28  193.314208  197.311854  192.698428  194.575089   3280100
 2020-09-29  193.685634  194.281861  191.017265  192.346558   2400000
 
 [2705 rows x 5 columns],
 'HD':                   Open        High         Low       Close    Volume
 Date                                                                
 2009-12-31   21.971599   22.183083   21.820539   21.850752   7437100
 2010-01-04   22.016917   22.016917   21.563739   21.654375  13120900
 2010-01-05   21.699692   21.896070   21.367362   21.812986  15594300
 2010-01-06   21.812984   21.903620   21.677032   21.737455   8833200
 2010-01-07   21.797881   22.069787   21.699692   21.994259  12058200
 ...                ...         ...         ...         ...       ...
 2020-09-23  268.320994  269.490046  257.867552  259.679596   4467600
 2020-09-24  258.861223  262.748400  257.078405  258.851501   3095600
 2020-09-25  257.380416  263.186787  257.117386  261.628021   2457700
 2020-09-28  264.920924  267.151907  263.332933  265.310608   3061100
 2020-09-29  266.353041  266.781702  264.122058  265.096283   2048900
 
 [2705 rows x 5 columns],
 'HON':                   Open        High         Low       Close    Volume
 Date                                                                
 2009-12-31   29.078200   29.217331   28.668133   28.704746   2923630
 2010-01-04   29.004975   29.568819   28.909782   29.546850   7750615
 2010-01-05   29.407723   29.627402   29.356466   29.605436   6796106
 2010-01-06   29.612758   29.685985   29.458984   29.605436   6209345
 2010-01-07   29.612756   30.286437   29.349141   30.176598  10266533
 ...                ...         ...         ...         ...       ...
 2020-09-23  158.848141  160.394334  154.454173  155.393631   2507600
 2020-09-24  154.933697  157.478076  153.495137  155.364273   2238300
 2020-09-25  154.278011  158.593689  153.808282  158.035889   2593900
 2020-09-28  160.922796  162.596212  159.738670  161.118515   2512600
 2020-09-29  161.040221  162.674499  159.973538  160.991287   2169300
 
 [2705 rows x 5 columns],
 'IBM':                   Open        High         Low       Close   Volume
 Date                                                               
 2009-12-31   85.321740   85.605262   84.252077   84.348732  4417676
 2010-01-04   84.529154   85.682586   84.316514   85.347511  6438444
 2010-01-05   84.851342   84.960889   83.833231   84.316513  7156104
 2010-01-06   84.206957   84.728904   83.646355   83.768784  5863144
 2010-01-07   83.685026   83.929889   83.066428   83.478828  6109268
 ...                ...         ...         ...         ...      ...
 2020-09-23  108.383732  109.092531  106.257326  106.616219  4120612
 2020-09-24  105.961242  107.235288  104.507750  105.952271  3709325
 2020-09-25  105.512632  107.136592  104.920470  106.723869  3089570
 2020-09-28  108.177377  109.756479  108.033824  109.218147  3670623
 2020-09-29  108.931032  109.630860  107.854368  108.509338  2203504
 
 [2705 rows x 5 columns],
 'INTC':                  Open       High        Low      Close    Volume
 Date                                                            
 2009-12-31  14.381255  14.465028  14.241631  14.241631  26429200
 2010-01-04  14.513898  14.681447  14.472010  14.576728  47800900
 2010-01-05  14.618616  14.653521  14.381255  14.569748  52357700
 2010-01-06  14.527865  14.611640  14.430129  14.520884  40037400
 2010-01-07  14.472011  14.492955  14.199745  14.381256  54041500
 ...               ...        ...        ...        ...       ...
 2020-09-23  48.219855  48.529515  47.087659  47.242489  30078800
 2020-09-24  46.961862  48.035996  46.855416  47.571507  29343400
 2020-09-25  47.377965  48.655312  47.174751  48.326298  26633400
 2020-09-28  48.877884  49.797187  48.539195  49.768158  29652200
 2020-09-29  49.652033  50.068139  49.303665  49.535908  19558200
 
 [2705 rows x 5 columns],
 'JNJ':                   Open        High         Low       Close    Volume
 Date                                                                
 2009-12-31   45.751002   45.751002   45.195976   45.252182   6962300
 2010-01-04   45.462955   45.659673   45.252189   45.441879   9506200
 2010-01-05   45.371627   45.399730   44.837676   44.914959  10673100
 2010-01-06   44.893881   45.343522   44.844702   45.280289  13171500
 2010-01-07   45.139773   45.210029   44.781466   44.957108  10901600
 ...                ...         ...         ...         ...       ...
 2020-09-23  142.861884  142.929699  139.858965  139.917084   8784400
 2020-09-24  140.033336  140.701733  138.851538  140.139893   5614800
 2020-09-25  139.490869  141.670414  139.384313  141.098892   5531200
 2020-09-28  142.135404  143.646553  141.835113  142.503510   5843200
 2020-09-29  142.978162  143.288134  141.486376  142.455063   6768600
 
 [2705 rows x 5 columns],
 'JPM':                  Open       High        Low      Close    Volume
 Date                                                            
 2009-12-31  30.765731  31.142727  30.640067  30.802691  20143100
 2010-01-04  30.928493  31.816605  30.839680  31.712990  35460500
 2010-01-05  31.668592  32.445689  31.661189  32.327274  41208300
 2010-01-06  32.157049  32.630708  32.053436  32.504890  27729000
 2010-01-07  32.408687  33.393010  32.275470  33.148781  44864700
 ...               ...        ...        ...        ...       ...
 2020-09-23  91.608098  92.159022  89.559051  89.636368  19952700
 2020-09-24  89.230426  91.144167  88.321882  89.559052  18366500
 2020-09-25  88.969458  90.583568  88.795481  90.341934  13293100
 2020-09-28  91.878732  93.686153  91.192493  92.941925  20894100
 2020-09-29  92.816271  92.816271  91.173166  92.159027  12848200
 
 [2705 rows x 5 columns],
 'MCD':                   Open        High         Low       Close    Volume
 Date                                                                
 2009-12-31   44.580026   44.650821   44.169410   44.204807   4495300
 2010-01-04   44.339311   44.650811   44.112766   44.445503   5839300
 2010-01-05   44.360555   44.424271   44.027814   44.105690   7099000
 2010-01-06   44.034893   44.183563   43.227823   43.503925  10551300
 2010-01-07   43.362337   44.134010   43.263223   43.822510   7517700
 ...                ...         ...         ...         ...       ...
 2020-09-23  211.540485  213.503607  208.207080  208.916519   2777700
 2020-09-24  208.702702  212.259641  207.575364  210.034119   1727000
 2020-09-25  209.237227  213.075994  207.633682  212.036118   2196800
 2020-09-28  213.804868  215.729107  212.852468  214.057541   2111700
 2020-09-29  214.213041  214.737843  212.327669  212.522034   1731700
 
 [2705 rows x 5 columns],
 'MRK':                  Open       High        Low      Close    Volume
 Date                                                            
 2009-12-31  23.179495  23.349559  22.990530  23.015726   7403072
 2010-01-04  23.192089  23.462936  23.022023  23.311766  14563532
 2010-01-05  23.494421  23.588903  23.261366  23.406239  15452550
 2010-01-06  23.437742  23.765277  23.242479  23.721188  15971415
 2010-01-07  23.601515  23.872362  23.557423  23.758984  12488597
 ...               ...        ...        ...        ...       ...
 2020-09-23  75.906624  76.573520  75.440715  75.486397   7085842
 2020-09-24  75.002210  76.262899  74.773817  75.961433   7844280
 2020-09-25  75.047882  76.043649  74.728141  75.760445   6932101
 2020-09-28  76.007107  76.463878  75.422437  75.605148   6123674
 2020-09-29  75.532066  75.961438  74.636792  74.819504   6742413
 
 [2705 rows x 5 columns],
 'MSFT':                   Open        High         Low       Close    Volume
 Date                                                                
 2009-12-31   23.928155   23.935878   23.541967   23.541967  31929700
 2010-01-04   23.650104   24.020843   23.626932   23.904987  38409100
 2010-01-05   23.827750   24.020844   23.665551   23.912710  49749600
 2010-01-06   23.850913   24.005389   23.572860   23.765953  58182400
 2010-01-07   23.657817   23.711885   23.317974   23.518791  50559700
 ...                ...         ...         ...         ...       ...
 2020-09-23  205.633456  205.831288  197.849260  198.403152  30803800
 2020-09-24  197.671194  203.328835  197.028272  200.974777  31202500
 2020-09-25  201.330869  206.761007  200.331871  205.554321  29437300
 2020-09-28  208.580981  210.252559  205.791717  207.156677  32004900
 2020-09-29  207.067653  207.779805  204.555336  205.000427  24221900
 
 [2705 rows x 5 columns],
 'NKE':                   Open        High         Low       Close    Volume
 Date                                                                
 2009-12-31   14.325821   14.414533   14.291203   14.295529   6347600
 2010-01-04   14.293369   14.295533   14.089983   14.139747  11972400
 2010-01-05   14.094304   14.234945   13.999101   14.195998   6275200
 2010-01-06   14.150567   14.213314   14.051037   14.109456  13399200
 2010-01-07   14.089981   14.280385   14.074835   14.247930   7187600
 ...                ...         ...         ...         ...       ...
 2020-09-23  128.988612  129.147102  124.075516  125.908020  37822700
 2020-09-24  125.244357  126.333953  122.134050  123.570335  13485800
 2020-09-25  122.480742  123.570339  121.113787  123.055260   9182800
 2020-09-28  123.976462  124.996721  122.718467  123.144402   7525000
 2020-09-29  123.164210  125.898113  122.787806  125.155205   7796400
 
 [2705 rows x 5 columns],
 'PG':                   Open        High         Low       Close   Volume
 Date                                                               
 2009-12-31   42.746341   42.753289   42.079299   42.127937  5942200
 2010-01-04   42.461459   42.600427   42.127938   42.468407  9190800
 2010-01-05   42.475341   42.579565   42.107076   42.482288  8649400
 2010-01-06   42.343339   42.357236   42.037612   42.280804  9908400
 2010-01-07   42.093186   42.176565   41.856941   42.051495  8972800
 ...                ...         ...         ...         ...      ...
 2020-09-23  134.278281  134.278281  132.017996  132.231415  6373700
 2020-09-24  132.580646  133.046280  131.649379  132.823166  5708200
 2020-09-25  131.823981  133.861152  131.823981  133.502213  4969900
 2020-09-28  134.414085  135.752798  133.657425  133.880539  5810800
 2020-09-29  134.249178  134.278279  132.570946  133.152985  4492100
 
 [2705 rows x 5 columns],
 'CRM':                   Open        High         Low       Close   Volume
 Date                                                               
 2009-12-31   18.520000   18.590000   18.387501   18.442499  5436400
 2010-01-04   18.652500   18.882500   18.547501   18.705000  7906000
 2010-01-05   18.612499   18.750000   18.200001   18.625000  7942400
 2010-01-06   18.687500   18.750000   18.495001   18.592501  5122400
 2010-01-07   18.629999   18.737499   18.385000   18.510000  4840000
 ...                ...         ...         ...         ...      ...
 2020-09-23  248.490005  248.770004  235.350006  235.990005  7507900
 2020-09-24  235.839996  239.960007  233.630005  237.550003  4166800
 2020-09-25  237.389999  243.949997  235.050003  242.740005  5117800
 2020-09-28  247.559998  248.580002  242.210007  246.669998  4088300
 2020-09-29  245.259995  247.899994  244.500000  247.449997  3983700
 
 [2705 rows x 5 columns],
 'TRV':                   Open        High         Low       Close    Volume
 Date                                                                
 2009-12-31   37.371268   37.639323   37.110661   37.125553   2320000
 2010-01-04   37.341481   37.557411   36.976627   37.088318   3716000
 2010-01-05   36.954282   36.998956   36.127780   36.209686   5941900
 2010-01-06   35.934186   36.030984   35.256604   35.695915  10124500
 2010-01-07   35.606563   36.373495   35.606563   36.209686   4901600
 ...                ...         ...         ...         ...       ...
 2020-09-23  106.141815  107.259296  104.412134  104.451004   2883500
 2020-09-24  104.907724  106.336165  102.682467  104.771683   1746300
 2020-09-25  103.547309  105.257552  102.964273  105.053490   1714100
 2020-09-28  106.627676  108.736324  106.209833  107.103821   1555900
 2020-09-29  106.860890  107.132972  103.537583  104.402420   1306500
 
 [2705 rows x 5 columns],
 'UNH':                   Open        High         Low       Close    Volume
 Date                                                                
 2009-12-31   25.831849   25.998775   25.414533   25.439571   5354200
 2010-01-04   26.190741   26.374361   25.848542   26.315937  12199500
 2010-01-05   26.482870   26.683181   26.107286   26.274212  11180700
 2010-01-06   26.374361   26.758293   26.215781   26.532942   9761100
 2010-01-07   26.366016   27.584577   26.366016   27.551191  11789800
 ...                ...         ...         ...         ...       ...
 2020-09-23  289.866830  290.947695  286.751977  287.056610   3709900
 2020-09-24  287.105743  289.798056  284.600126  287.567566   2869100
 2020-09-25  286.182085  297.992919  286.152608  297.236328   2455800
 2020-09-28  301.058662  304.615667  297.560580  297.953644   2575700
 2020-09-29  298.385976  300.184120  295.447984  298.857605   1797800
 
 [2705 rows x 5 columns],
 'VZ':                  Open       High        Low      Close    Volume
 Date                                                            
 2009-12-31  17.840692  17.862026  17.670019  17.670019   7982714
 2010-01-04  17.814035  17.840703  17.648695  17.750032  16176648
 2010-01-05  17.798025  17.814026  17.483347  17.782024  23722957
 2010-01-06  17.660295  17.714401  17.205801  17.270729  37506464
 2010-01-07  17.368116  17.368116  17.059710  17.167923  25508242
 ...               ...        ...        ...        ...       ...
 2020-09-23  56.632848  56.926280  55.705214  55.762009  15416200
 2020-09-24  55.847204  56.377281  55.515909  56.027050  10875300
 2020-09-25  55.705217  56.273157  55.667357  56.216362  10676300
 2020-09-28  56.377282  56.604456  56.017588  56.187969  11910500
 2020-09-29  56.112243  56.311023  55.941861  56.074383  10005900
 
 [2705 rows x 5 columns],
 'V':                   Open        High         Low       Close    Volume
 Date                                                                
 2009-12-31   20.209995   20.255936   20.023927   20.090544  12643600
 2010-01-04   20.214587   20.391463   20.090542   20.246746  20180000
 2010-01-05   20.053787   20.106619   19.755161   20.014734  25833600
 2010-01-06   19.987177   20.003257   19.732197   19.745979  16254000
 2010-01-07   19.748265   19.984867   19.651787   19.929737  27841200
 ...                ...         ...         ...         ...       ...
 2020-09-23  199.273808  199.561597  193.091163  193.885071   6310600
 2020-09-24  193.666737  196.167588  191.662102  194.033936   9865300
 2020-09-25  193.537762  196.634044  192.039229  195.750809   5496100
 2020-09-28  199.105127  200.186839  198.311204  198.797485   5565300
 2020-09-29  199.055483  200.841805  197.854673  197.924149   5513500
 
 [2705 rows x 5 columns],
 'WBA':                  Open       High        Low      Close    Volume
 Date                                                            
 2009-12-31  27.462935  27.462935  27.086327  27.115866   2806100
 2010-01-04  27.330016  27.544167  27.056790  27.544167   8171000
 2010-01-05  27.396476  27.440784  27.042020  27.322632  10843200
 2010-01-06  26.702333  27.536782  26.525104  27.115866   8945400
 2010-01-07  27.093719  27.322640  26.931262  27.278332   5040700
 ...               ...        ...        ...        ...       ...
 2020-09-23  34.154243  34.535002  33.259453  33.373684   8479100
 2020-09-24  33.487906  33.564056  32.859651  33.345119   5498600
 2020-09-25  33.021471  33.849628  32.869167  33.725880   5955400
 2020-09-28  34.163756  34.763455  34.106644  34.335098   5901600
 2020-09-29  34.078085  34.525480  33.649728  33.678288   5079100
 
 [2705 rows x 5 columns],
 'WMT':                   Open        High         Low       Close    Volume
 Date                                                                
 2009-12-31   40.978737   41.069381   40.351780   40.374443   9764800
 2010-01-04   40.593513   41.296003   40.540635   40.963642  20753100
 2010-01-05   40.857885   40.933421   40.465093   40.555737  15648400
 2010-01-06   40.412224   40.661497   40.351793   40.465099  12517200
 2010-01-07   40.578413   40.601073   40.230941   40.487766  10662700
 ...                ...         ...         ...         ...       ...
 2020-09-23  136.325835  136.561223  133.207021  133.373749   7711100
 2020-09-24  133.118744  135.139110  132.471453  134.070084   9817700
 2020-09-25  133.893549  134.893915  133.344325  134.629120   7539600
 2020-09-28  134.521232  135.394109  134.089695  134.609497   7065700
 2020-09-29  134.609487  135.482363  133.766031  134.501602   9234300
 
 [2705 rows x 5 columns]}
In [9]:
# Peek at one ticker's frame: Date index with Open/High/Low/Close/Volume columns
ohlc['MMM'].head()
Out[9]:
Open High Low Close Volume
Date
2009-12-31 60.447778 60.656991 59.546003 59.639786 2049800
2010-01-04 59.942795 60.202507 59.639800 59.892296 3043700
2010-01-05 59.733564 60.043775 58.939998 59.517136 2847000
2010-01-06 60.512704 61.032127 60.245782 60.361206 5268500
2010-01-07 60.108729 60.426156 59.243027 60.404514 4470100

Compute the Average True Range (ATR)

Define Function

In [10]:
# Function to calculate average true range
def ATR(dataframe, period=21):
    """Return the Average True Range (ATR) of an OHLC price frame.

    The True Range of a row is the largest of ``|High - Low|``,
    ``|High - previous Close|`` and ``|Low - previous Close|``.  ATR is the
    simple rolling mean of the True Range over ``period`` rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide 'High', 'Low' and 'Close' columns.  It is not modified.
    period : int, default 21
        Length of the rolling window, in rows.

    Returns
    -------
    pandas.Series
        Series named 'ATR' on the index of ``dataframe``.  The first row has
        no previous close, so its True Range is NaN; the rolling mean needs a
        full window of valid values, hence the first ``period`` entries are NaN.
    """
    high = dataframe['High']
    low = dataframe['Low']
    prev_close = dataframe['Close'].shift(1)

    # Only the three candidate ranges are materialised; the caller's frame is
    # neither copied wholesale nor mutated.
    ranges = pd.DataFrame({
        'H-L': (high - low).abs(),
        'H-PC': (high - prev_close).abs(),
        'L-PC': (low - prev_close).abs(),
    })

    # skipna=False keeps the True Range undefined (NaN) whenever one of the
    # candidates is, instead of silently falling back to the others.
    true_range = ranges.max(axis=1, skipna=False)

    return true_range.rolling(period).mean().rename('ATR')

Create Column

In [11]:
# Attach an 'ATR' column (default look-back) to every ticker's OHLC frame
for ticker in dow_stocks:
    prices = ohlc[ticker]
    prices['ATR'] = ATR(prices)
In [12]:
# The leading ATR values are NaN until the default 21-row rolling window has filled
ohlc['MMM'].head() # ATR column added
Out[12]:
Open High Low Close Volume ATR
Date
2009-12-31 60.447778 60.656991 59.546003 59.639786 2049800 NaN
2010-01-04 59.942795 60.202507 59.639800 59.892296 3043700 NaN
2010-01-05 59.733564 60.043775 58.939998 59.517136 2847000 NaN
2010-01-06 60.512704 61.032127 60.245782 60.361206 5268500 NaN
2010-01-07 60.108729 60.426156 59.243027 60.404514 4470100 NaN

Create Features

Weekly Percentage Change

In [13]:
# Gather each ticker's closing prices into one wide frame (one column per symbol)
closing_prices = {}
for ticker in dow_stocks:
    closing_prices[ticker] = ohlc[ticker]['Close']

close = pd.DataFrame(closing_prices)
close.head(8)
Out[13]:
MMM AXP AMGN AAPL BA CAT CVX CSCO KO DIS ... MSFT NKE PG CRM TRV UNH VZ V WBA WMT
Date
2009-12-31 59.639786 33.896370 43.744514 6.452592 42.180115 41.197552 47.686523 17.534130 19.632374 28.090706 ... 23.541967 14.295529 42.127937 18.442499 37.125553 25.439571 17.670019 20.090544 27.115866 40.374443
2010-01-04 59.892296 34.230988 44.633812 6.553024 43.777534 42.325264 48.968670 18.083450 19.646156 27.933922 ... 23.904987 14.139747 42.468407 18.705000 37.088318 26.315937 17.750032 20.246746 27.544167 40.963642
2010-01-05 59.517136 34.155693 44.247169 6.564356 45.211346 42.831284 49.315540 18.002876 19.408497 27.864239 ... 23.912710 14.195998 42.482288 18.625000 36.209686 26.274212 17.782024 20.014734 27.322632 40.555737
2010-01-06 60.361206 34.707813 43.914661 6.459940 46.582787 42.961407 49.321709 17.885696 19.401608 27.716162 ... 23.765953 14.109456 42.280804 18.592501 35.695915 26.532942 17.270729 19.745979 27.115866 40.465099
2010-01-07 60.404514 35.270733 43.512535 6.447998 48.468563 43.134903 49.135906 17.966263 19.353386 27.724876 ... 23.518791 14.247930 42.051495 18.510000 36.209686 27.551191 17.167923 19.929737 27.278332 40.487766
2010-01-08 60.830135 35.245533 43.899181 6.490866 48.001011 43.619232 49.222626 18.061476 18.995190 27.768427 ... 23.680996 14.219799 41.995911 18.537500 36.157566 27.292458 17.178745 19.984875 27.315260 40.283806
2010-01-11 60.584869 34.842255 44.092499 6.433609 47.432178 46.358990 50.095951 18.010206 19.380943 27.315491 ... 23.379774 14.044546 41.829144 18.412500 36.142673 27.476078 17.249083 19.927444 27.359558 40.948528
2010-01-12 60.635357 35.304344 43.326958 6.360424 47.089306 44.992733 49.804836 17.724560 19.591042 26.845131 ... 23.225292 14.025070 42.308582 17.072500 36.373486 26.749947 17.265312 19.833267 27.130632 41.341309

8 rows × 30 columns

In [14]:
# Count missing closing prices per ticker to spot symbols with incomplete history
close.isnull().sum()
Out[14]:
MMM        0
AXP        0
AMGN       0
AAPL       0
BA         0
CAT        0
CVX        0
CSCO       0
KO         0
DIS        0
DOW     2318
GS         0
HD         0
HON        0
IBM        0
INTC       0
JNJ        0
JPM        0
MCD        0
MRK        0
MSFT       0
NKE        0
PG         0
CRM        0
TRV        0
UNH        0
VZ         0
V          0
WBA        0
WMT        0
dtype: int64
In [15]:
# DOW has no closing prices for most of the sample (2318 missing rows in the
# null count above), so it is excluded from the analysis.
# Rebinding with errors='ignore' instead of dropping in place keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
close = close.drop(columns=['DOW'], errors='ignore')

resample('W-FRI') converts the time series to weekly data, with Friday as the last day of each week.

resample('W-FRI').last() thus gives us the closing price on Friday

In [16]:
# Friday-anchored weekly closes -> week-over-week percentage change
weekly_close = close.resample('W-FRI').last()
# Transpose so that each row is a ticker and each column a week
weekly_change = weekly_close.pct_change().T

# Check output
weekly_change.head()
Out[16]:
Date 2010-01-01 2010-01-08 2010-01-15 2010-01-22 2010-01-29 2010-02-05 2010-02-12 2010-02-19 2010-02-26 2010-03-05 ... 2020-07-31 2020-08-07 2020-08-14 2020-08-21 2020-08-28 2020-09-04 2020-09-11 2020-09-18 2020-09-25 2020-10-02
MMM NaN 0.019959 -0.011266 -0.022670 -0.012151 -0.024226 0.008149 0.036314 -0.016806 0.028572 ... -0.058621 0.052236 0.049075 -0.017568 0.024426 0.000664 0.002413 0.020341 -0.054733 -0.002870
AXP NaN 0.039803 0.010489 -0.089644 -0.024100 0.005045 0.015059 0.016658 -0.022274 0.052632 ... -0.021085 0.062580 0.012606 -0.042426 0.066459 0.030525 -0.021861 0.000774 -0.069412 0.015479
AMGN NaN 0.003536 -0.009160 0.006223 0.033216 -0.013681 -0.020804 0.015935 -0.013419 0.011129 ... -0.013348 -0.016267 0.002600 -0.008635 0.065141 -0.018647 -0.020894 0.018544 -0.015744 0.018374
AAPL NaN 0.005932 -0.028541 -0.039722 -0.028774 0.017703 0.025171 0.006438 0.014628 0.070033 ... 0.147330 0.047552 0.034154 0.082349 0.003518 -0.030828 -0.074074 -0.046072 0.050917 0.016120
BA NaN 0.138001 -0.012662 -0.050148 0.048987 -0.029724 0.021404 0.066052 -0.006762 0.075523 ... -0.090700 0.076076 0.047406 -0.059412 0.049552 -0.027019 -0.063256 0.005679 -0.031712 0.048516

5 rows × 562 columns

Weekly ATR

In [17]:
# Gather each ticker's ATR series into one wide frame (one column per symbol)
atr_by_ticker = {}
for ticker in dow_stocks:
    atr_by_ticker[ticker] = ohlc[ticker]['ATR']

atr = pd.DataFrame(atr_by_ticker)

# Missing values per ticker (the rolling warm-up leaves leading NaNs)
atr.isnull().sum()
Out[17]:
MMM       21
AXP       21
AMGN      21
AAPL      21
BA        21
CAT       21
CVX       21
CSCO      21
KO        21
DIS       21
DOW     2339
GS        21
HD        21
HON       21
IBM       21
INTC      21
JNJ       21
JPM       21
MCD       21
MRK       21
MSFT      21
NKE       21
PG        21
CRM       21
TRV       21
UNH       21
VZ        21
V         21
WBA       21
WMT       21
dtype: int64
In [18]:
# Back-fill the leading NaNs left by the rolling warm-up and drop DOW from the frame.
# Note that back-filling copies the first valid ATR into earlier rows, so
# those rows carry information from later dates.
# `.bfill()` replaces `fillna(method='bfill')`, which is deprecated in recent
# pandas; rebinding with errors='ignore' keeps the cell safe to re-run.
atr = atr.bfill(axis=0).drop(columns=['DOW'], errors='ignore')
In [19]:
# Keep the last ATR reading of every Friday-anchored week
atr_weekly_last = atr.resample('W-FRI').last()
# Transpose so that each row is a ticker and each column a week
weekly_atr = atr_weekly_last.T

# Check output
weekly_atr.head()
Out[19]:
Date 2010-01-01 2010-01-08 2010-01-15 2010-01-22 2010-01-29 2010-02-05 2010-02-12 2010-02-19 2010-02-26 2010-03-05 ... 2020-07-31 2020-08-07 2020-08-14 2020-08-21 2020-08-28 2020-09-04 2020-09-11 2020-09-18 2020-09-25 2020-10-02
MMM 1.070107 1.070107 1.070107 1.070107 1.070107 1.150837 1.246684 1.233323 1.158499 1.002220 ... 3.297071 3.199575 3.141532 3.000859 2.519725 3.000982 3.061846 3.503506 4.055093 4.174521
AXP 1.018999 1.018999 1.018999 1.018999 1.018999 1.055425 1.077831 1.064228 0.832579 0.772566 ... 2.800283 2.557475 2.631537 2.625913 2.550446 2.825599 2.497008 2.627318 3.033252 3.018721
AMGN 0.909528 0.909528 0.909528 0.909528 0.909528 0.911000 0.831830 0.777332 0.706264 0.680488 ... 6.157390 5.567672 5.044478 4.839070 5.089008 5.783428 6.012785 6.435372 6.631731 6.434912
AAPL 0.188285 0.188285 0.188285 0.188285 0.188285 0.199950 0.200635 0.184815 0.150272 0.130778 ... 2.796396 3.137421 3.250342 3.351009 3.500987 4.567365 5.187703 5.692051 6.025253 5.990408
BA 1.431941 1.431941 1.431941 1.431941 1.431941 1.400376 1.382445 1.393515 1.246801 1.229516 ... 7.648096 7.380952 7.329522 6.879046 6.982855 6.893810 6.563810 6.975239 7.350477 7.441906

5 rows × 562 columns

In [20]:
# Weekly ATR history of the first row of weekly_atr, i.e. MMM
mmm_weekly_atr = weekly_atr.iloc[0]
mmm_weekly_atr.plot()
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x28b16b702b0>

Elbow Plot

In [21]:
# Import sklearn modules
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

from datetime import datetime
In [22]:
# Elbow plot: k-means inertia for every candidate cluster count, expressed
# relative to the inertia of the smallest k so the dashed 10% / 5% guide
# lines are comparable across runs.
# k-means cannot fit more clusters than there are samples (rows), so the
# upper bound is capped by the number of stocks.
n_clusters = range(2, min(30, len(weekly_atr) + 1))
inertia = []

for n in n_clusters:
    # Fixed seed and explicit n_init: k-means starts from random centroids,
    # so without them the curve changes on every execution.
    kmeans = KMeans(n_clusters=n, n_init=10, random_state=0)
    kmeans.fit(weekly_atr)
    inertia.append(kmeans.inertia_)

# NOTE(review): inertia is measured on the raw weekly ATR values here, while
# the model cell below clusters MinMax-scaled data -- confirm this is intended.
fig, ax = plt.subplots()
ax.plot(n_clusters, np.divide(inertia, inertia[0]), label='inertia')
# Labels are attached to the artists themselves so the legend cannot drift
# out of sync with the plotting order.
ax.hlines(0.1, n_clusters[0], n_clusters[-1], 'r', linestyles='dashed',
          label='10% relative inertia')
ax.hlines(0.05, n_clusters[0], n_clusters[-1], 'r', linestyles='dashed',
          label='5% relative inertia')
ax.set_xlabel('clusters')
ax.set_ylabel('relative inertia')
ax.legend();

Train Model

In [23]:
# Scale every weekly column to [0, 1] across stocks, then cluster the stocks.
# The seed and n_init are pinned so the cluster ids (and the summary table
# built from them) are the same on every run.
pipe = Pipeline([
    ("normalization", MinMaxScaler()),
    ("cluster", KMeans(n_clusters=15, n_init=10, random_state=0)),
])
# Fit Model
pipe.fit(weekly_atr)

# Assign Label
labels = pipe.predict(weekly_atr)

# Check labels
labels
Out[23]:
array([11,  0,  1,  3,  9, 12,  8,  3,  3,  0,  2, 14,  0,  5,  3,  0,  0,
        0,  7, 10,  7,  7,  6,  0,  4,  3, 10, 13,  7])
In [24]:
# Build DOW-free copies of the ticker and company-name lists.
# A plain assignment (`companies = dow_stocks`) would only alias the original
# list, so `.remove()` would also mutate `dow_stocks`: re-running this cell
# would then raise ValueError, and re-running earlier cells that expect a
# 'DOW' column would fail.  Filtering into new lists leaves the inputs intact.
companies = [symbol for symbol in dow_stocks if symbol != 'DOW']

# Remove Dow Inc from the company names
companies_name = [name for name in dow_company_name if name != 'Dow']
In [25]:
# Per-stock summary: assigned cluster, names, mean weekly ATR and mean weekly change
cluster_table = pd.DataFrame(
    {
        'Cluster': labels,
        'Companies Name': companies_name,
        'Companies': companies,
        'ATR': weekly_atr.mean(axis=1),
        'CHG': weekly_change.mean(axis=1),
    }
)

# Order the rows by cluster id and replace the ticker index with a plain counter
df = cluster_table.sort_values(by=['Cluster'], axis=0).reset_index(drop=True)
df
Out[25]:
Cluster Companies Name Companies ATR CHG
0 0 Disney DIS 1.485320 0.003183
1 0 American Express AXP 1.405039 0.002617
2 0 JPMorgan Chase JPM 1.283181 0.002647
3 0 Johnson & Johnson JNJ 1.266946 0.002280
4 0 Honeywell HON 1.580776 0.003610
5 0 Travelers TRV 1.385457 0.002259
6 0 McDonald's MCD 1.607499 0.003092
7 1 Amgen AMGN 2.572938 0.003558
8 2 Goldman Sachs GS 3.555737 0.001351
9 3 Intel INTC 0.677648 0.002865
10 3 Cisco Systems CSCO 0.527522 0.001977
11 3 Verizon VZ 0.573559 0.002336
12 3 Apple AAPL 0.677700 0.005884
13 3 Coca-Cola KO 0.477599 0.001874
14 4 UnitedHealth UNH 2.702716 0.005025
15 5 IBM IBM 1.895580 0.000909
16 6 Salesforce CRM 2.206371 0.005697
17 7 Procter & Gamble PG 1.002342 0.002292
18 7 Nike NKE 0.979289 0.004454
19 7 Walmart WMT 1.101889 0.002457
20 7 Merck MRK 0.790886 0.002460
21 8 Chevron CVX 1.528697 0.001211
22 9 Boeing BA 3.789684 0.004037
23 10 Visa V 1.509562 0.004574
24 10 Microsoft MSFT 1.283454 0.004348
25 11 3M MMM 2.019235 0.002096
26 12 Caterpillar CAT 2.038790 0.003032
27 13 Walgreens Boots Alliance WBA 1.067749 0.001035
28 14 Home Depot HD 1.935985 0.005014
In [26]:
# One marker per stock: ticker on the x-axis, assigned cluster id on the y-axis
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(df.Companies, df.Cluster)
ax.set_xlabel('Stocks')
ax.set_ylabel('Cluster')
ax.set_title('Dow Stocks ATR Clustering');

Trading Strategy

In [27]:
# Download daily price history for the strategy section.
# NOTE: despite the variable name, the ticker requested is TSLA, not SPY.
# auto_adjust=False is passed explicitly because the following cells read the
# 'Adj Close' column, which yfinance omits when prices are auto-adjusted
# (the default in newer releases).
spy = yf.download('TSLA', start='2000-01-02', end = '2020-12-31',
                  auto_adjust=False, progress=False)
In [28]:
# Inspect the downloaded price history (pandas truncates the display to the first/last rows)
spy
Out[28]:
Open High Low Close Adj Close Volume
Date
2010-06-29 3.800000 5.000000 3.508000 4.778000 4.778000 93831500
2010-06-30 5.158000 6.084000 4.660000 4.766000 4.766000 85935500
2010-07-01 5.000000 5.184000 4.054000 4.392000 4.392000 41094000
2010-07-02 4.600000 4.620000 3.742000 3.840000 3.840000 25699000
2010-07-06 4.000000 4.000000 3.166000 3.222000 3.222000 34334500
... ... ... ... ... ... ...
2020-12-23 632.200012 651.500000 622.570007 645.979980 645.979980 33173000
2020-12-24 642.989990 666.090027 641.000000 661.770020 661.770020 22865600
2020-12-28 674.510010 681.400024 660.799988 663.690002 663.690002 32278600
2020-12-29 661.000000 669.900024 655.000000 665.989990 665.989990 22910800
2020-12-30 672.000000 696.599976 668.359985 694.780029 694.780029 42846000

2646 rows × 6 columns

In [29]:
# Adjusted closing price over the whole download window
fig, ax = plt.subplots()
ax.plot(spy['Adj Close'])
ax.set_title('Share Price')
plt.show()
In [30]:
# Build the strategy frame in a single chain on a deep copy of the price data:
#   ATR     - 20-row average true range
#   Svolume - volume relative to its own 20-row rolling mean
#   Range   - the day's high-low range expressed in ATR units
#   Return  - open-to-open move in price points: Open two rows ahead minus
#             Open one row ahead
#   Target  - cluster label placeholder, initialised to zero
# Rows with any NaN (rolling warm-up at the start, forward shifts at the end)
# are dropped.
df = (
    spy.copy(deep=True)
    .assign(ATR=lambda prices: ATR(prices, 20))
    .assign(
        Svolume=lambda prices: prices['Volume'] / prices['Volume'].rolling(20).mean(),
        Range=lambda prices: (prices['High'] - prices['Low']) / prices['ATR'],
        Return=lambda prices: prices['Open'].shift(-2) - prices['Open'].shift(-1),
        Target=0,
    )
    .dropna()
)

# Verify last 5 values
df.tail()
Out[30]:
Open High Low Close Adj Close Volume ATR Svolume Range Return Target
Date
2020-12-21 666.239990 668.500000 646.070007 649.859985 649.859985 58045300 38.903503 0.972031 0.576555 -15.799988 0
2020-12-22 648.000000 649.880005 614.229980 640.340027 640.340027 51716000 38.866504 0.864986 0.917243 10.789978 0
2020-12-23 632.200012 651.500000 622.570007 645.979980 645.979980 33173000 38.406003 0.564508 0.753267 31.520020 0
2020-12-24 642.989990 666.090027 641.000000 661.770020 661.770020 22865600 38.229004 0.397931 0.656309 -13.510010 0
2020-12-28 674.510010 681.400024 660.799988 663.690002 663.690002 32278600 38.020004 0.564340 0.541821 11.000000 0
In [31]:
# Split the data
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit, cross_val_score

# shuffle=False keeps the rows in their original (chronological) order, so the
# test set is the most recent 20% of the sample.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0, shuffle=False)

# Later cells write a 'Target' column into both frames; take explicit copies so
# those writes land on independent frames rather than on selections of `df`.
df_train, df_test = df_train.copy(), df_test.copy()
In [32]:
# The two clustering features, selected identically for both splits
feature_cols = ['Svolume', 'Range']
X_train, X_test = df_train[feature_cols], df_test[feature_cols]
In [33]:
# Fit three clusters on the training features only.
# The seed and n_init are pinned: cluster ids are otherwise assigned
# arbitrarily on each run, which would shuffle which curve is called
# "Cluster 1/2/3" in the performance plots.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X_train)
df_train['Target']  = kmeans.predict(X_train)
In [34]:
# Training observations coloured by cluster, with the fitted centroids as red crosses
centers = kmeans.cluster_centers_
fig, ax = plt.subplots()
ax.scatter(X_train['Svolume'], X_train['Range'], c=df_train['Target'])
ax.scatter(centers[:, 0], centers[:, 1], c='red', s=100, marker='x')
ax.set_xlabel('Volume')
ax.set_ylabel('Range')
ax.set_title('Cluster Analysis on Training Data')
plt.show()
In [35]:
# Label the held-out rows with the clusters learned on the training set (the model is not refit)
df_test['Target']  = kmeans.predict(X_test)
In [36]:
# Test observations coloured by predicted cluster, with the training centroids as red crosses
centers = kmeans.cluster_centers_
fig, ax = plt.subplots()
ax.scatter(X_test['Svolume'], X_test['Range'], c=df_test['Target'])
ax.scatter(centers[:, 0], centers[:, 1], c='red', s=100, marker='x')
ax.set_xlabel('Volume')
ax.set_ylabel('Range')
ax.set_title('Cluster Analysis on Test Data')
plt.show()
In [37]:
# Cumulative open-to-open points per cluster on the training set
# (cluster ids 0, 1, 2 are shown as "Cluster 1", "Cluster 2", "Cluster 3")
for cluster_id in range(3):
    in_cluster = df_train['Target'] == cluster_id
    plt.plot(np.cumsum(df_train['Return'].loc[in_cluster]), label=f'Cluster {cluster_id + 1}')
plt.title('Total Points Gained')
plt.legend()
plt.show()
In [38]:
# Cumulative open-to-open points per cluster on the held-out test set
# (cluster ids 0, 1, 2 are shown as "Cluster 1", "Cluster 2", "Cluster 3")
for cluster_id in range(3):
    in_cluster = df_test['Target'] == cluster_id
    plt.plot(np.cumsum(df_test['Return'].loc[in_cluster]), label=f'Cluster {cluster_id + 1}')
plt.title('Total Points Gained')
plt.legend()
plt.show()
In [ ]: