"""
Regression – Forecasting and Predicting

This tutorial covers using the trained regression model to forecast future data. Full notes here:
https://pythonprogramming.net/forecasting-predicting-machine-learning-tutorial/

Key takeaways:
- I used plt.savefig() to save the final chart instead of displaying it. Additional arguments and options can be found here:
https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.savefig.html
"""
import pandas as pd
import quandl, math, datetime #imports Math, datetime and Quandl
import numpy as np # support for arrays
from sklearn import preprocessing, model_selection, svm #machine learning and
from sklearn.linear_model import LinearRegression # regression
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')  # ggplot-like styling for every chart drawn below
# NOTE(review): hard-coded API key committed to source — load it from an
# environment variable or config file instead, and rotate this one.
quandl.ApiConfig.api_key = "9qfnyWSTDUpx6uhNX2dc"
df = quandl.get('WIKI/GOOGL') # import GOOGL daily price data from Quandl
# print (df.head()) # print out the head rows of the data to check what we're getting
# Keep only the adjusted (split/dividend-corrected) columns.
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close','Adj. Volume']]
# High-to-close spread as a percentage of the close — a rough volatility proxy.
df['HL_pct'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close']) * 100
# Daily percentage move from open to close.
df['pct_Change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open']) * 100
df = df[['Adj. Close','HL_pct','pct_Change','Adj. Volume']]
forecast_col = 'Adj. Close' # define what we're forecasting
df.fillna(-99999, inplace=True) # replace missing data with an outlier value (-99999) rather than discarding any rows
forecast_out = int(math.ceil(0.01*len(df))) # math.ceil rounds up to the nearest whole number - this takes 1% of the length of the dataframe, rounds it up, and converts it to an integer
df['label'] = df[forecast_col].shift(-forecast_out) # new column 'label' holding the 'Adj. Close' value from forecast_out days in the future
# print (df.head()) #just used to check data
# Features: every column except the 'label' target, as a numpy array.
# drop(columns=...) replaces the original positional-axis form
# df.drop(['label'], 1), which was deprecated and removed in pandas 2.0.
X = np.array(df.drop(columns=['label']))
X = preprocessing.scale(X) # scale features before training - can help performance but takes longer; optional
X_lately = X[-forecast_out:] # rows to predict against - note there are no y values for these to check against
X = X[:-forecast_out] # trimming must happen after scaling so X_lately shares the same scaling
df.dropna(inplace=True) # drop the trailing rows whose shifted label is NaN
y = np.array(df['label']) # target array (a duplicated assignment was removed here)
### create training and testing sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2) # 0.2 = 20% of the dataframe held out for testing
### Swapping different algorithms
# clf = LinearRegression() # simple linear regression
# clf = LinearRegression(n_jobs=10) # linear regression using threading, 10 jobs at a time = faster
clf = LinearRegression(n_jobs=-1) # linear regression using threading with as many jobs as the processor will handle
# clf = svm.SVR() # base support vector regression
# clf = svm.SVR(kernel="poly") # support vector regression with a specific kernel
clf.fit(X_train, y_train) # fit the model to the training data
accuracy = clf.score(X_test, y_test) # R^2 coefficient of determination on the held-out test set
# print(accuracy)
### prediction - easy once the classifier is set up
forecast_set = clf.predict(X_lately) # predicted 'Adj. Close' values for the unlabeled trailing rows
print (forecast_set, accuracy, forecast_out)
df['Forecast'] = np.nan # placeholder column so forecast rows align with history

# Build future dates for the forecast rows: start one day after the last
# known date in the index and step forward one calendar day per prediction.
last_date = df.iloc[-1].name # last timestamp in the index
last_unix = last_date.timestamp()
one_day = 86400 # seconds per day
next_unix = last_unix + one_day # moving to future dates not in the dataset

## Append one row per forecast value: NaN for every feature column, with the
## predicted price in the trailing 'Forecast' column.
for i in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += one_day # reuse the named constant (was a duplicated magic 86400)
    df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)]+[i]

df['Adj. Close'].plot() # historical prices
df['Forecast'].plot() # predictions - only populated for the appended future dates
plt.legend(loc=4) # lower-right corner
plt.xlabel('Date')
plt.ylabel('Price')
plt.savefig('ML_Tutorial5.svg', bbox_inches='tight') #bbox_inches='tight' minimises whitespace around the fig
plt.show()