### Regression – Forecasting and Predicting

This next tutorial covers using the trained regression model to forecast future data. Full notes here:

https://pythonprogramming.net/forecasting-predicting-machine-learning-tutorial/

Key takeaways:

- I used plt.savefig() to save the final chart instead of displaying it. Additional arguments and options can be found here:

https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.savefig.html

"""Regression – forecasting Adj. Close for GOOGL with scikit-learn.

Fetches daily GOOGL prices from Quandl, engineers two percentage-change
features, trains a linear regression to predict the adjusted close
`forecast_out` days ahead, then plots history plus forecast and saves the
chart to 'ML_Tutorial5.svg'.
"""

import datetime
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import quandl
from matplotlib import style
from sklearn import model_selection, preprocessing, svm
from sklearn.linear_model import LinearRegression

style.use('ggplot')

# NOTE(review): hard-coding an API key in source is a security risk — load it
# from an environment variable or config file instead of committing it.
quandl.ApiConfig.api_key = "9qfnyWSTDUpx6uhNX2dc"

df = quandl.get('WIKI/GOOGL')  # daily GOOGL prices from Quandl's WIKI dataset
# print(df.head())  # uncomment to inspect the raw columns

# Keep only the adjusted columns and derive two relative-change features.
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]
df['HL_pct'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close']) * 100  # high-to-close spread, %
df['pct_Change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open']) * 100  # open-to-close change, %
df = df[['Adj. Close', 'HL_pct', 'pct_Change', 'Adj. Volume']]

forecast_col = 'Adj. Close'  # the column we are forecasting
# Replace missing data with an outlier value (-99999) rather than dropping
# rows, so no data is lost; most algorithms treat it as an outlier.
df.fillna(-99999, inplace=True)

# Forecast ~1% of the dataset's length into the future, rounded up to a
# whole number of days.
forecast_out = int(math.ceil(0.01 * len(df)))
# 'label' holds the Adj. Close value forecast_out days in the future; the
# last forecast_out rows therefore have no label (NaN) yet.
df['label'] = df[forecast_col].shift(-forecast_out)
# print(df.head())  # uncomment to verify the shifted label column

# Features: everything except the label column. (Keyword form — the
# positional `axis` argument to drop() is deprecated/removed in pandas 2.x.)
X = np.array(df.drop(columns=['label']))
X = preprocessing.scale(X)  # standardize features; optional, but can help performance
X_lately = X[-forecast_out:]  # rows with no label yet — used only for prediction
X = X[:-forecast_out]  # slice after scaling so both parts share the same scaling

df.dropna(inplace=True)  # drop the label-less tail rows so X and y align
y = np.array(df['label'])  # (original duplicated this assignment — fixed)

### Create training and testing sets: hold out 20% of the data for testing.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)

### Swapping different algorithms:
# clf = LinearRegression()           # simple linear regression
# clf = LinearRegression(n_jobs=10)  # threaded, 10 jobs at a time = faster
# clf = svm.SVR()                    # base support vector regression
# clf = svm.SVR(kernel="poly")       # SVR with a polynomial kernel
clf = LinearRegression(n_jobs=-1)  # use as many jobs as the processor will handle

clf.fit(X_train, y_train)  # fit on the training split
accuracy = clf.score(X_test, y_test)  # R^2 score on the held-out test split
# print(accuracy)

### Prediction — easy once the classifier is trained.
forecast_set = clf.predict(X_lately)
print(forecast_set, accuracy, forecast_out)

# Append the forecast under future dates so it plots after the history.
df['Forecast'] = np.nan

last_date = df.iloc[-1].name  # index (timestamp) of the last historical row
last_unix = last_date.timestamp()
one_day = 86400  # seconds per day
next_unix = last_unix + one_day  # first date beyond the dataset

## Extend the DataFrame with one row per forecast value, keyed by future date.
for forecast_value in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += one_day  # reuse the constant (original repeated the literal 86400)
    # NaN for every feature column, forecast value in the 'Forecast' column.
    df.loc[next_date] = [np.nan for _ in range(len(df.columns) - 1)] + [forecast_value]

df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.savefig('ML_Tutorial5.svg', bbox_inches='tight')  # bbox_inches='tight' minimises whitespace around the fig
plt.show()