Machine Learning – Tutorial 5

Regression – Forecasting and Predicting

This tutorial covers using the trained regression model to forecast future values that are not yet in the dataset. Full notes here:

Key takeaways:-

import pandas as pd
import quandl, math, datetime #imports Math, datetime and Quandl
import numpy as np # support for arrays
from sklearn import preprocessing, model_selection, svm #machine learning and
from sklearn.linear_model import LinearRegression # regression
import matplotlib.pyplot as plt
from matplotlib import style


# NOTE(review): this API key is committed in plain text. Consider rotating it
# and loading it from an environment variable or an untracked config file.
quandl.ApiConfig.api_key = "9qfnyWSTDUpx6uhNX2dc"

df = quandl.get('WIKI/GOOGL')  # download the GOOGL price history from Quandl
# print (df.head()) # print out the head rows of the data to check what we're getting

# Keep only the adjusted price/volume columns. The explicit .copy() makes the
# result an independent frame, so adding columns below does not raise pandas'
# SettingWithCopyWarning.
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()

# Gap between the day's high and its close, as a percentage of the close.
df['HL_pct'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close']) * 100
# Open-to-close move, as a percentage of the open.
df['pct_Change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open']) * 100

# Final feature set; copied again because more columns are added to it later.
df = df[['Adj. Close', 'HL_pct', 'pct_Change', 'Adj. Volume']].copy()

# Column whose future value the model learns to predict.
forecast_col = 'Adj. Close'

# Replace missing values with an extreme sentinel (-99999) instead of
# discarding the affected rows.
df = df.fillna(value=-99999)

# Forecast horizon in rows: 1% of the dataset length, rounded up to a whole number.
forecast_out = int(math.ceil(len(df) * 0.01))

# 'label' holds the 'Adj. Close' value from forecast_out rows further on.
# The final forecast_out rows therefore have no label (NaN).
df['label'] = df[forecast_col].shift(periods=-forecast_out)

# print (df.head()) #just used to check data

# Feature matrix: every column except 'label'. `columns=` is used because the
# positional axis argument (`df.drop([...], 1)`) is rejected by pandas >= 2.0.
X = np.array(df.drop(columns=['label']))
# Standardise the features before fitting; this must happen before the split
# below so that X_lately is scaled together with the training rows.
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]  # most recent rows: no label exists yet, these are what we forecast from
X = X[:-forecast_out]  # rows that do have a label

# Labels aligned with X: drop the trailing forecast_out entries, which are NaN
# because shift() had no future value for them. Slicing the array (rather than
# dropping rows from df) leaves df itself untouched for the later steps.
y = np.array(df['label'])[:-forecast_out]

### create training and testing sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)  # hold out 20% of the rows for testing

### Swapping different algorithms
# clf = LinearRegression() # simple linear regression
# clf = LinearRegression(n_jobs=10) # linear regression using 10 parallel jobs
clf = LinearRegression(n_jobs=-1)  # linear regression using as many parallel jobs as the processor allows
# clf = svm.SVR() # base support vector regression
# clf = svm.SVR(kernel="poly") # support vector regression with a specific kernel

# The model must be fitted before it can be scored or used for prediction.
clf.fit(X_train, y_train)  # fit the model to the training data
accuracy = clf.score(X_test, y_test)  # coefficient of determination (R^2) on the held-out test data

# print(accuracy)

### prediction - easy once the model is fitted

forecast_set = clf.predict(X_lately)
print (forecast_set, accuracy, forecast_out)
df['Forecast'] = np.nan  # existing rows have no forecast value

# Step forward one calendar day at a time using date arithmetic. Converting
# through POSIX seconds and datetime.fromtimestamp() would apply the local
# timezone and shift the new index labels by the machine's UTC offset.
last_date = df.iloc[-1].name
one_day = datetime.timedelta(days=1)
next_date = last_date + one_day  # first date beyond the dataset

## Append one row per forecast value: NaN in every existing column, the
## prediction in the last column ('Forecast').
# NOTE(review): dates advance by calendar day, so weekends are not skipped.
blank_columns = [np.nan] * (len(df.columns) - 1)
for prediction in forecast_set:
    df.loc[next_date] = blank_columns + [prediction]
    next_date += one_day

# Plot the historical prices together with the forecast so the prediction is
# actually visible in the saved figure.
df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)  # lower-right corner
plt.xlabel('Date')
plt.ylabel('Price')
plt.savefig('ML_Tutorial5.svg', bbox_inches='tight')  # bbox_inches='tight' minimises whitespace around the figure

Leave a Reply