Soft Margin Support Vector Machine
https://pythonprogramming.net/soft-margin-svm-machine-learning-tutorial/
https://pythonprogramming.net/predictions-svm-machine-learning-tutorial/
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
style.use('ggplot')

# build SVM class
class Support_Vector_Machine:
    # The __init__ method of a class is one that runs whenever an object is created with the class
    # calling self in the class allows sharing of variables across the class, so is included in all function defs
    def __init__(self, visualisation=True):
        # sets visualisations to what ever the user specifies (defaults to True)
        self.visualisation = visualisation
        # defines colours for the two states 1 & -1
        self.colors = {1: 'r', -1: 'b'}
        # sets some standards for the graphs
        if self.visualisation:
            self.fig = plt.figure()
            self.ax = self.fig.add_subplot(1, 1, 1)

    # train
    def fit(self, data):
        # set up access to the data that's passed when the function is called
        self.data = data
        # { ||w||: [w,b] }
        opt_dict = {}

        transforms = [[1, 1],
                      [-1, 1],
                      [-1, -1],
                      [1, -1]]

        # finding values to work with for our ranges.
        all_data = []  # set up a placeholder for the values
        # for loop to step through data and append it to all_data (list of values)
        for yi in self.data:
            for featureset in self.data[yi]:
                for feature in featureset:
                    all_data.append(feature)
        # next define the max and min value in list
        self.max_feature_value = max(all_data)
        self.min_feature_value = min(all_data)
        # free up memory once we've got the values
        all_data = None

        # define step sizes for optimisation, big through to small
        step_sizes = [self.max_feature_value * 0.1,
                      self.max_feature_value * 0.01,
                      # starts getting very high cost after this.
                      self.max_feature_value * 0.001]  # extremely expensive

        b_range_multiple = 5
        b_multiple = 5
        # first element in vector w
        latest_optimum = self.max_feature_value * 10

        ## Begin the stepping process
        for step in step_sizes:
            w = np.array([latest_optimum, latest_optimum])
            # we can do this because convex
            optimized = False
            while not optimized:
                # we're not optimising b as much as w (not needed)
                for b in np.arange(-1*(self.max_feature_value*b_range_multiple),
                                   self.max_feature_value*b_range_multiple,
                                   step*b_multiple):
                    for transformation in transforms:
                        w_t = w * transformation
                        found_option = True
                        # weakest link in the SVM fundamentally
                        # SMO attempts to fix this a bit
                        # yi(xi.w+b) >= 1
                        #
                        # #### add a break here later..
                        for i in self.data:
                            for xi in self.data[i]:
                                yi = i
                                if not yi*(np.dot(w_t, xi)+b) >= 1:
                                    found_option = False
                        if found_option:
                            opt_dict[np.linalg.norm(w_t)] = [w_t, b]

                if w[0] < 0:
                    optimized = True
                    print('optimised a step')
                else:
                    w = w - step
            # break out of while loop

            # take a list of the magnitudes and sort them
            norms = sorted([n for n in opt_dict])  # sorting lowest to highest
            # ||w|| : [w,b]
            opt_choice = opt_dict[norms[0]]  # smallest magnitude
            self.w = opt_choice[0]  # sets w to first element in the smallest mag
            self.b = opt_choice[1]  # sets b to second element in the smallest mag
            latest_optimum = opt_choice[0][0] + step*2  # resetting the opt to the latest

    def predict(self, features):
        # sign( x.w+b )
        classification = np.sign(np.dot(np.array(features), self.w) + self.b)
        if classification != 0 and self.visualisation:
            self.ax.scatter(features[0], features[1], s=100, marker='*', c=self.colors[classification])
        return classification

    def visualise(self):
        # scattering known featuresets using a one line for loop
        [[self.ax.scatter(x[0], x[1], s=100, color=self.colors[i]) for x in data_dict[i]] for i in data_dict]

        # hyperplane = x.w+b
        def hyperplane(x, w, b, v):
            # v = (w.x+b)
            return (-w[0]*x - b + v) / w[1]

        datarange = (self.min_feature_value*0.9, self.max_feature_value*1.1)  # gives space on the graph
        hyp_x_min = datarange[0]
        hyp_x_max = datarange[1]

        # w.x + b = 1
        # pos sv hyperplane
        psv1 = hyperplane(hyp_x_min, self.w, self.b, 1)  # define the ys
        psv2 = hyperplane(hyp_x_max, self.w, self.b, 1)  # define the ys
        self.ax.plot([hyp_x_min, hyp_x_max], [psv1, psv2], "k")  # plot xs, ys then colour; k=black, g-- = green dashed

        # w.x + b = -1
        # negative sv hyperplane
        nsv1 = hyperplane(hyp_x_min, self.w, self.b, -1)
        nsv2 = hyperplane(hyp_x_max, self.w, self.b, -1)
        self.ax.plot([hyp_x_min, hyp_x_max], [nsv1, nsv2], "k")

        # w.x + b = 0
        # decision boundary
        db1 = hyperplane(hyp_x_min, self.w, self.b, 0)
        db2 = hyperplane(hyp_x_max, self.w, self.b, 0)
        self.ax.plot([hyp_x_min, hyp_x_max], [db1, db2], "g--")

        plt.show()

# define data dictionary
data_dict = {-1: np.array([[1, 7], [2, 8], [3, 8]]),
             1: np.array([[5, 1], [6, -1], [7, 3]])}

svm = Support_Vector_Machine()
svm.fit(data=data_dict)

predict_us = [[0, 10], [1, 3], [3, 4], [3, 5], [5, 5], [5, 6], [6, -5], [5, 8]]

for p in predict_us:
    svm.predict(p)

svm.visualise()
https://pythonprogramming.net/svm-optimization-python-2-machine-learning-tutorial/
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
style.use('ggplot')

# build SVM class
class Support_Vector_Machine:
    # The __init__ method of a class is one that runs whenever an object is created with the class
    # calling self in the class allows sharing of variables across the class, so is included in all function defs
    def __init__(self, visualisation=True):
        # sets visualisations to what ever the user specifies (defaults to True)
        self.visualisation = visualisation
        # defines colours for the two states 1 & -1
        self.colors = {1: 'r', -1: 'b'}
        # sets some standards for the graphs
        if self.visualisation:
            self.fig = plt.figure()
            self.ax = self.fig.add_subplot(1, 1, 1)

    # train
    def fit(self, data):
        # set up access to the data that's passed when the function is called
        self.data = data
        # { ||w||: [w,b] }
        opt_dict = {}

        transforms = [[1, 1],
                      [-1, 1],
                      [-1, -1],
                      [1, -1]]

        # finding values to work with for our ranges.
        all_data = []  # set up a placeholder for the values
        # for loop to step through data and append it to all_data (list of values)
        for yi in self.data:
            for featureset in self.data[yi]:
                for feature in featureset:
                    all_data.append(feature)
        # next define the max and min value in list
        self.max_feature_value = max(all_data)
        self.min_feature_value = min(all_data)
        # free up memory once we've got the values
        all_data = None

        # define step sizes for optimisation, big through to small
        step_sizes = [self.max_feature_value * 0.1,
                      self.max_feature_value * 0.01,
                      # starts getting very high cost after this.
                      self.max_feature_value * 0.001]  # extremely expensive

        b_range_multiple = 5
        b_multiple = 5
        # first element in vector w
        latest_optimum = self.max_feature_value * 10

        ## Begin the stepping process
        for step in step_sizes:
            w = np.array([latest_optimum, latest_optimum])
            # we can do this because convex
            optimized = False
            while not optimized:
                # we're not optimising b as much as w (not needed)
                for b in np.arange(-1*(self.max_feature_value*b_range_multiple),
                                   self.max_feature_value*b_range_multiple,
                                   step*b_multiple):
                    for transformation in transforms:
                        w_t = w * transformation
                        found_option = True
                        # weakest link in the SVM fundamentally
                        # SMO attempts to fix this a bit
                        # yi(xi.w+b) >= 1
                        #
                        # #### add a break here later..
                        for i in self.data:
                            for xi in self.data[i]:
                                yi = i
                                if not yi*(np.dot(w_t, xi)+b) >= 1:
                                    found_option = False
                        if found_option:
                            opt_dict[np.linalg.norm(w_t)] = [w_t, b]

                if w[0] < 0:
                    optimized = True
                    print('optimised a step')
                else:
                    w = w - step
            # break out of while loop

            # take a list of the magnitudes and sort them
            norms = sorted([n for n in opt_dict])  # sorting lowest to highest
            # ||w|| : [w,b]
            opt_choice = opt_dict[norms[0]]  # smallest magnitude
            self.w = opt_choice[0]  # sets w to first element in the smallest mag
            self.b = opt_choice[1]  # sets b to second element in the smallest mag
            latest_optimum = opt_choice[0][0] + step*2  # resetting the opt to the latest

    def predict(self, features):
        # sign( x.w+b )
        classification = np.sign(np.dot(np.array(features), self.w) + self.b)
        return classification

# define data dictionary
data_dict = {-1: np.array([[1, 7], [2, 8], [3, 8]]),
             1: np.array([[5, 1], [6, -1], [7, 3]])}
https://pythonprogramming.net/svm-optimization-python-machine-learning-tutorial/
More resources:-
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
style.use('ggplot')

# build SVM class
class Support_Vector_Machine:
    # The __init__ method of a class is one that runs whenever an object is created with the class
    # calling self in the class allows sharing of variables across the class, so is included in all function defs
    def __init__(self, visualisation=True):
        # sets visualisations to what ever the user specifies (defaults to True)
        self.visualisation = visualisation
        # defines colours for the two states 1 & -1
        self.colors = {1: 'r', -1: 'b'}
        # sets some standards for the graphs
        if self.visualisation:
            self.fig = plt.figure()
            self.ax = self.fig.add_subplot(1, 1, 1)

    # train
    def fit(self, data):
        # set up access to the data that's passed when the function is called
        self.data = data
        # { ||w||: [w,b] }
        opt_dict = {}

        transforms = [[1, 1],
                      [-1, 1],
                      [-1, -1],
                      [1, -1]]

        # finding values to work with for our ranges.
        all_data = []  # set up a placeholder for the values
        # for loop to step through data and append it to all_data (list of values)
        for yi in self.data:
            for featureset in self.data[yi]:
                for feature in featureset:
                    all_data.append(feature)
        # next define the max and min value in list
        self.max_feature_value = max(all_data)
        self.min_feature_value = min(all_data)
        # free up memory once we've got the values
        all_data = None

        # define step sizes for optimisation, big through to small
        step_sizes = [self.max_feature_value * 0.1,
                      self.max_feature_value * 0.01,
                      # starts getting very high cost after this.
                      self.max_feature_value * 0.001]  # extremely expensive

        b_range_multiple = 5
        b_multiple = 5
        # first element in vector w
        latest_optimum = self.max_feature_value * 10

        ## Begin the stepping process
        for step in step_sizes:
            w = np.array([latest_optimum, latest_optimum])
            # we can do this because convex
            optimized = False
            while not optimized:
                pass

    def predict(self, features):
        # sign( x.w+b )
        classification = np.sign(np.dot(np.array(features), self.w) + self.b)
        return classification

# define data dictionary
data_dict = {-1: np.array([[1, 7], [2, 8], [3, 8]]),
             1: np.array([[5, 1], [6, -1], [7, 3]])}
https://pythonprogramming.net/svm-in-python-machine-learning-tutorial/
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
style.use('ggplot')

# build SVM class
class Support_Vector_Machine:
    # The __init__ method of a class is one that runs whenever an object is created with the class
    # calling self in the class allows sharing of variables across the class, so is included in all function defs
    def __init__(self, visualisation=True):
        # sets visualisations to what ever the user specifies (defaults to True)
        self.visualisation = visualisation
        # defines colours for the two states 1 & -1
        self.colors = {1: 'r', -1: 'b'}
        # sets some standards for the graphs
        if self.visualisation:
            self.fig = plt.figure()
            self.ax = self.fig.add_subplot(1, 1, 1)

    # train
    def fit(self, data):
        pass

    def predict(self, features):
        # sign( x.w+b )
        classification = np.sign(np.dot(np.array(features), self.w) + self.b)
        return classification

# define data dictionary
data_dict = {-1: np.array([[1, 7], [2, 8], [3, 8]]),
             1: np.array([[5, 1], [6, -1], [7, 3]])}
https://pythonprogramming.net/svm-constraint-optimization-machine-learning-tutorial/
https://pythonprogramming.net/support-vector-machine-fundamentals-machine-learning-tutorial/
https://pythonprogramming.net/support-vector-assertions-machine-learning-tutorial/
https://pythonprogramming.net/vector-basics-machine-learning-tutorial/
Covering the basics of vectors:
Magnitude = the square root of the sum of the squares of the vector's components (Pythagoras)
Dot product:
(1,3) · (4,2) = (1×4)+(3×2) = 4 + 6 = 10
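A quick sketch of both ideas in NumPy – my own example rather than something from the tutorial:

import numpy as np

A = np.array([1, 3])
B = np.array([4, 2])

magnitude_A = np.linalg.norm(A)  # sqrt(1**2 + 3**2) ≈ 3.162
dot_product = np.dot(A, B)       # (1*4) + (3*2) = 10

print(magnitude_A, dot_product)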
https://pythonprogramming.net/support-vector-machine-intro-machine-learning-tutorial/
# import libs
import numpy as np
from sklearn import preprocessing, neighbors, svm
# cross_validation is deprecated and train_test_split moved into model_selection
from sklearn.model_selection import train_test_split
import pandas as pd

df = pd.read_csv('breast-cancer-wisconsin.data')
# there are gaps in the data denoted by '?' - these need to be converted to -99999 so the algorithm treats them as outliers
df.replace('?', -99999, inplace=True)
# drop any useless data - in this case the ID
df.drop('id', 1, inplace=True)
#print(df)

# define X & y (X for features; y for labels)
# X is everything except 'class'
# In the datafile I had a space after 'class' which caused errors
X = np.array(df.drop(['class'], 1))
y = np.array(df['class'])

# split the data into train and test datasets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# define classifier (clf)
clf = svm.SVC()  # swapped out K Nearest Neighbours

# fit the classifier
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)

# important: the array needs to be 2D so double brackets are needed rather than reshaping the array
#example_measures = np.array([[4,2,1,1,1,2,3,2,1],[4,2,1,2,2,2,3,2,1]])
#prediction = clf.predict(example_measures)
#print(prediction)
https://pythonprogramming.net/final-thoughts-knn-machine-learning-tutorial/
# imports
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random

# define function
def K_nearest_neighbours(data, predict, k=3):
    if len(data) >= k:
        warnings.warn('K is set to value less than total voting groups!')
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])
    votes = [i[1] for i in sorted(distances)[:k]]
    #print(Counter(votes).most_common(1))
    vote_result = Counter(votes).most_common(1)[0][0]
    confidence = Counter(votes).most_common(1)[0][1] / k
    #print(vote_result, confidence)
    return vote_result, confidence

accuracies = []

for i in range(25):
    # import data
    df = pd.read_csv('breast-cancer-wisconsin.data')
    # there are gaps in the data denoted by '?' - these need to be converted to -99999 so the algorithm treats them as outliers
    df.replace('?', -99999, inplace=True)
    # drop any useless data - in this case the ID
    df.drop('id', 1, inplace=True)
    #print(df)
    # convert everything in the list to a number
    full_data = df.astype(float).values.tolist()
    #print(full_data[:5])  # print first 5 rows
    random.shuffle(full_data)  # no need to define the variable again i.e. full_data = random.shuffle(full_data)

    test_size = 0.2
    train_set = {2: [], 4: []}
    test_set = {2: [], 4: []}
    train_data = full_data[:-int(test_size*len(full_data))]  # slicing the full data set by the test_size
    test_data = full_data[-int(test_size*len(full_data)):]   # last 20%

    for i in train_data:
        train_set[i[-1]].append(i[:-1])  # -1 gives the last column
    for i in test_data:
        test_set[i[-1]].append(i[:-1])   # -1 gives the last column

    correct = 0
    total = 0

    for group in test_set:
        for data in test_set[group]:
            vote, confidence = K_nearest_neighbours(train_set, data, k=5)
            if group == vote:
                correct += 1
            # else:
            #     print(confidence)
            total += 1

    #print('Accuracy', correct/total)
    accuracies.append(correct / total)

print((sum(accuracies)/len(accuracies)*100))
https://pythonprogramming.net/testing-our-k-nearest-neighbors-machine-learning-tutorial/
# imports
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random

# define function
def K_nearest_neighbours(data, predict, k=3):
    if len(data) >= k:
        warnings.warn('K is set to value less than total voting groups!')
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])
    votes = [i[1] for i in sorted(distances)[:k]]
    #print(Counter(votes).most_common(1))
    vote_result = Counter(votes).most_common(1)[0][0]
    return vote_result

# import data
df = pd.read_csv('breast-cancer-wisconsin.data')
# there are gaps in the data denoted by '?' - these need to be converted to -99999 so the algorithm treats them as outliers
df.replace('?', -99999, inplace=True)
# drop any useless data - in this case the ID
df.drop('id', 1, inplace=True)
#print(df)
# convert everything in the list to a number
full_data = df.astype(float).values.tolist()
#print(full_data[:5])  # print first 5 rows
random.shuffle(full_data)  # no need to define the variable again i.e. full_data = random.shuffle(full_data)

test_size = 0.2
train_set = {2: [], 4: []}
test_set = {2: [], 4: []}
train_data = full_data[:-int(test_size*len(full_data))]  # slicing the full data set by the test_size
test_data = full_data[-int(test_size*len(full_data)):]   # last 20%

for i in train_data:
    train_set[i[-1]].append(i[:-1])  # -1 gives the last column
for i in test_data:
    test_set[i[-1]].append(i[:-1])   # -1 gives the last column

correct = 0
total = 0

for group in test_set:
    for data in test_set[group]:
        vote = K_nearest_neighbours(train_set, data, k=5)
        if group == vote:
            correct += 1
        total += 1

print('Accuracy', correct/total)
https://pythonprogramming.net/coding-k-nearest-neighbors-machine-learning-tutorial/
# imports
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import warnings
from collections import Counter
# To set charts to save as images we need to change the default behaviour
from matplotlib import style  # import style to change default behaviour of plot
style.use('ggplot')  # use ggplot

dataset = {'k': [[1,2],[2,3],[3,1]], 'r': [[6,5],[7,7],[8,6]]}  # defines as a dictionary 2 classes (k & r) each with 3 featuresets (lists of lists)
new_features = [5,7]

## Expanded one line for loop
#for i in dataset:
#    for ii in dataset[i]:
#        plt.scatter(ii[0], ii[1], s=100, color=i)

# define function
def K_nearest_neighbours(data, predict, k=3):
    if len(data) >= k:
        warnings.warn('K is set to value less than total voting groups!')
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])
    votes = [i[1] for i in sorted(distances)[:k]]
    print(Counter(votes).most_common(1))
    vote_result = Counter(votes).most_common(1)[0][0]
    return vote_result

# generate results
results = K_nearest_neighbours(dataset, new_features, k=3)
print(results)

[[plt.scatter(ii[0], ii[1], s=100, color=i) for ii in dataset[i]] for i in dataset]  # one line for loop
plt.scatter(new_features[0], new_features[1], color=results, s=100)
plt.show()
https://pythonprogramming.net/programming-k-nearest-neighbors-machine-learning-tutorial/
# imports
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import warnings
from collections import Counter
# To set charts to save as images we need to change the default behaviour
from matplotlib import style  # import style to change default behaviour of plot
style.use('ggplot')  # use ggplot

dataset = {'k': [[1,2],[2,3],[3,1]], 'r': [[6,5],[7,7],[8,6]]}  # defines as a dictionary 2 classes (k & r) each with 3 featuresets (lists of lists)
new_features = [5,7]

## Expanded one line for loop
#for i in dataset:
#    for ii in dataset[i]:
#        plt.scatter(ii[0], ii[1], s=100, color=i)

[[plt.scatter(ii[0], ii[1], s=100, color=i) for ii in dataset[i]] for i in dataset]  # one line for loop
plt.show()

# skeleton of the function - the body gets filled in during the next part of the tutorial
def K_nearest_neighbours(data, predict, k=3):
    if len(data) >= k:
        warnings.warn('K is set to value less than total voting groups!')
    return vote_result  # placeholder: vote_result is defined in the next part
https://pythonprogramming.net/euclidean-distance-machine-learning-tutorial/
This stuff makes my head hurt….just look at this formula….WTF man!
Basically, it’s just the square root of the sum of the squared differences between the two points’ coordinates….easy
So in Python this translates into:-
from math import sqrt

plot1 = [1,3]
plot2 = [2,5]

euclidean_distance = sqrt((plot1[0]-plot2[0])**2 + (plot1[1]-plot2[1])**2)
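The same idea generalises to any number of dimensions, which is what the KNN code above leans on – a small sketch of my own using NumPy's norm rather than writing the formula out by hand:

import numpy as np

plot1 = np.array([1, 3])
plot2 = np.array([2, 5])

# the norm of the difference vector is the Euclidean distance,
# and it works for featuresets of any length, not just 2D points
euclidean_distance = np.linalg.norm(plot1 - plot2)
print(euclidean_distance)  # ~2.236, same answer as the sqrt version above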
https://pythonprogramming.net/k-nearest-neighbors-application-machine-learning-tutorial/
Data links:-
# import libs
import numpy as np
from sklearn import preprocessing, neighbors
# cross_validation is deprecated and train_test_split moved into model_selection
from sklearn.model_selection import train_test_split
import pandas as pd

df = pd.read_csv('breast-cancer-wisconsin.data')
# there are gaps in the data denoted by '?' - these need to be converted to -99999 so the algorithm treats them as outliers
df.replace('?', -99999, inplace=True)
# drop any useless data - in this case the ID
df.drop('id', 1, inplace=True)
#print(df)

# define X & y (X for features; y for labels)
# X is everything except 'class'
# In the datafile I had a space after 'class' which caused errors
X = np.array(df.drop(['class'], 1))
y = np.array(df['class'])

# split the data into train and test datasets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# define classifier (clf)
clf = neighbors.KNeighborsClassifier()

# fit the classifier
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)

# important: the array needs to be 2D so double brackets are needed rather than reshaping the array
example_measures = np.array([[4,2,1,1,1,2,3,2,1],[4,2,1,2,2,2,3,2,1]])
prediction = clf.predict(example_measures)
print(prediction)
https://pythonprogramming.net/k-nearest-neighbors-intro-machine-learning-tutorial/
Intro to K Nearest Neighbours classification.
https://pythonprogramming.net/how-to-program-r-squared-machine-learning-tutorial/
Straightforward tutorial – plugging the R^2 calculation into a function.
# Import libs
from statistics import mean
import numpy as np
import matplotlib.pyplot as plt
# To set charts to save as images we need to change the default behaviour
from matplotlib import style  # import style to change default behaviour of plot
style.use('ggplot')  # use ggplot

# Define values
xs = np.array([1,2,3,4,5], dtype=np.float64)  # dtype lets you set the data type. Not needed for this example but useful in future
ys = np.array([5,4,6,5,6], dtype=np.float64)

# Define best fit function
def best_fit_slope_and_intercept(xs, ys):
    # defining function to calculate slope (m) - passing values of xs and ys
    m = (((mean(xs)*mean(ys)) - mean(xs*ys)) /  # bracket at the start and space-slash at the end allows for a carriage return in the code
         ((mean(xs)**2) - mean(xs**2)))  ## **2 raises to the power of 2
    b = mean(ys) - m*mean(xs)
    return m, b

# Define function to square error
def squared_error(ys_orig, ys_line):
    return sum((ys_line - ys_orig) * (ys_line - ys_orig))  # return used with the calc rather than separately first

def coefficient_of_determination(ys_orig, ys_line):
    y_mean_line = [mean(ys_orig) for y in ys_orig]  # one line for loop
    squared_error_regr = squared_error(ys_orig, ys_line)
    squared_error_y_mean = squared_error(ys_orig, y_mean_line)
    return 1 - (squared_error_regr/squared_error_y_mean)

m, b = best_fit_slope_and_intercept(xs, ys)
regression_line = [(m*x)+b for x in xs]

r_squared = coefficient_of_determination(ys, regression_line)
print(r_squared)

#plt.scatter(xs, ys)
#plt.savefig('ML_Tutorial8.png', bbox_inches='tight')  # Sets the output to save an image
#plt.show()  # exports the image
https://pythonprogramming.net/r-squared-coefficient-of-determination-machine-learning-tutorial/
https://pythonprogramming.net/how-to-program-best-fit-line-machine-learning-tutorial/
The next part of the equation works out the y intercept…
So Y intercept (b) equals the mean of the Ys minus the slope (m) times the mean of the Xs…easy.
# Import libs
from statistics import mean
import numpy as np
import matplotlib.pyplot as plt
# To set charts to save as images we need to change the default behaviour
from matplotlib import style  # import style to change default behaviour of plot
style.use('ggplot')  # use ggplot

# Define values
xs = np.array([1,2,3,4,5,6], dtype=np.float64)  # dtype lets you set the data type. Not needed for this example but useful in future
ys = np.array([5,4,6,5,6,7], dtype=np.float64)

def best_fit_slope_and_intercept(xs, ys):
    # defining function to calculate slope (m) - passing values of xs and ys
    m = (((mean(xs)*mean(ys)) - mean(xs*ys)) /  # bracket at the start and space-slash at the end allows for a carriage return in the code
         ((mean(xs)**2) - mean(xs**2)))  ## **2 raises to the power of 2
    b = mean(ys) - (m * mean(xs))
    return m, b  # add in b to be returned as well as m

m, b = best_fit_slope_and_intercept(xs, ys)  # define both using the function
print(m, b)

# calculate the line
regression_line = [(m*x)+b for x in xs]  # one line for loop to create the line for illustration

# plot the data
plt.scatter(xs, ys)
plt.plot(xs, regression_line)
plt.savefig('ML_Tutorial9.png', bbox_inches='tight')  # Sets the output to save an image
plt.show()  # exports the image
https://pythonprogramming.net/how-to-program-best-fit-line-slope-machine-learning-tutorial/
This covers building up a Linear Regression model in Python based on the standard equation:-
The slope of the best fit line equals the mean of the Xs times the mean of the Ys, minus the mean of the Xs times the Ys, all divided by the mean of the Xs squared minus the mean of the squared Xs (I know, confusing right lol). In code terms: m = (mean(xs)*mean(ys) - mean(xs*ys)) / (mean(xs)**2 - mean(xs**2)).
The code was a fairly straightforward application of the maths. Comments and learnings are in the code:-
# Import libs
from statistics import mean
import numpy as np
import matplotlib.pyplot as plt
# To set charts to save as images we need to change the default behaviour
from matplotlib import style  # import style to change default behaviour of plot
style.use('ggplot')  # use ggplot

# Define values
xs = np.array([1,2,3,4,5,6], dtype=np.float64)  # dtype lets you set the data type. Not needed for this example but useful in future
ys = np.array([5,4,6,5,6,7], dtype=np.float64)

def best_fit_slope(xs, ys):
    # defining function to calculate slope (m) - passing values of xs and ys
    m = (((mean(xs)*mean(ys)) - mean(xs*ys)) /  # bracket at the start and space-slash at the end allows for a carriage return in the code
         ((mean(xs)**2) - mean(xs**2)))  ## **2 raises to the power of 2
    return m

m = best_fit_slope(xs, ys)
print(m)

#plt.scatter(xs, ys)
#plt.savefig('ML_Tutorial8.png', bbox_inches='tight')  # Sets the output to save an image
#plt.show()  # exports the image
https://pythonprogramming.net/simple-linear-regression-machine-learning-tutorial/
Simple tutorial covering the maths behind a best fit line.
This next tutorial covers using the trained regression model to forecast future data. Full notes here:-
https://pythonprogramming.net/forecasting-predicting-machine-learning-tutorial/
Key takeaways:-
import pandas as pd
import quandl, math, datetime  # imports math, datetime and Quandl
import numpy as np  # support for arrays
from sklearn import preprocessing, model_selection, svm  # machine learning
from sklearn.linear_model import LinearRegression  # regression
import matplotlib.pyplot as plt
from matplotlib import style

style.use('ggplot')

quandl.ApiConfig.api_key = "9qfnyWSTDUpx6uhNX2dc"
df = quandl.get('WIKI/GOOGL')  # import data from Quandl
# print(df.head())  # print out the head rows of the data to check what we're getting

# create a dataframe
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]
df['HL_pct'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close']) * 100
df['pct_Change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open']) * 100
df = df[['Adj. Close', 'HL_pct', 'pct_Change', 'Adj. Volume']]

forecast_col = 'Adj. Close'  # define what we're forecasting
df.fillna(-99999, inplace=True)  # replaces missing data with an outlier value (-99999) rather than getting rid of any data

forecast_out = int(math.ceil(0.01*len(df)))  # math.ceil rounds up to the nearest whole - so this takes 1% of the length of the dataframe, rounds it up and converts it to an integer

df['label'] = df[forecast_col].shift(-forecast_out)  # adds a new column 'label' that contains the 'Adj. Close' value from forecast_out days in the future
# print(df.head())  # just used to check data

X = np.array(df.drop(['label'], 1))  # everything except the label column; this returns a new dataframe that is then converted to a numpy array and stored as X
X = preprocessing.scale(X)  # scale X before the classifier - this can help with performance but can also take longer: can be skipped
X_lately = X[-forecast_out:]  # used to predict against - note there are no y values for these to check against
X = X[:-forecast_out]  # needs to happen after scaling

df.dropna(inplace=True)
y = np.array(df['label'])  # array of labels

### create training and testing sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)  # 0.2 = 20% of the dataframe

### Swapping different algorithms
# clf = LinearRegression()  # simple linear regression
# clf = LinearRegression(n_jobs=10)  # linear regression using threading, 10 jobs at a time = faster
clf = LinearRegression(n_jobs=-1)  # linear regression using threading with as many jobs as the processor will handle
# clf = svm.SVR()  # base support vector regression
# clf = svm.SVR(kernel="poly")  # support vector regression with a specific kernel

clf.fit(X_train, y_train)  # fit the classifier to the training data
accuracy = clf.score(X_test, y_test)  # score it against the test data
# print(accuracy)

### prediction - easy once the classifier is set
forecast_set = clf.predict(X_lately)
print(forecast_set, accuracy, forecast_out)

df['Forecast'] = np.nan

last_date = df.iloc[-1].name
last_unix = last_date.timestamp()
one_day = 86400
next_unix = last_unix + one_day  # moving to future dates not in the dataset

## Set a new DataFrame including dates with the forecast values
for i in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += 86400
    df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)] + [i]

df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.savefig('ML_Tutorial5.svg', bbox_inches='tight')  # bbox_inches='tight' minimises whitespace around the fig
plt.show()
This tutorial covered the first application of Regression to sample data.
https://pythonprogramming.net/training-testing-machine-learning-tutorial/
Key takeaways being:-
Generally, you want your features in machine learning to be in a range of -1 to 1. This may do nothing, but it usually speeds up processing and can also help with accuracy. Because this range is so popularly used, it is included in the preprocessing module of Scikit-Learn. To utilize this, you can apply preprocessing.scale to your X variable:
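As a minimal illustration of what preprocessing.scale is doing here (my own toy example, not part of the tutorial script) – it standardises each feature column so the values sit around zero with unit variance:

import numpy as np
from sklearn import preprocessing

# toy feature matrix: two features on very different scales
X = np.array([[1.0, 2000.0],
              [2.0, 3000.0],
              [3.0, 4000.0]])

X_scaled = preprocessing.scale(X)  # standardise each column

print(X_scaled)
print(X_scaled.mean(axis=0))  # roughly 0 for each column
print(X_scaled.std(axis=0))   # roughly 1 for each column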
import pandas as pd
import quandl, math  # imports math and Quandl
import numpy as np  # support for arrays
from sklearn import preprocessing, model_selection, svm  # machine learning
from sklearn.linear_model import LinearRegression  # regression

quandl.ApiConfig.api_key = "9qfnyWSTDUpx6uhNX2dc"
df = quandl.get('WIKI/GOOGL')  # import data from Quandl
# print(df.head())  # print out the head rows of the data to check what we're getting

# create a dataframe
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]
df['HL_pct'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close']) * 100
df['pct_Change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open']) * 100
df = df[['Adj. Close', 'HL_pct', 'pct_Change', 'Adj. Volume']]

forecast_col = 'Adj. Close'  # define what we're forecasting
df.fillna(-99999, inplace=True)  # replaces missing data with an outlier value (-99999) rather than getting rid of any data

forecast_out = int(math.ceil(0.01*len(df)))  # math.ceil rounds up to the nearest whole - so this takes 1% of the length of the dataframe, rounds it up and converts it to an integer

df['label'] = df[forecast_col].shift(-forecast_out)  # adds a new column 'label' that contains the 'Adj. Close' value from forecast_out days in the future
df.dropna(inplace=True)
# print(df.head())  # just used to check data

X = np.array(df.drop(['label'], 1))  # everything except the label column; this returns a new dataframe that is then converted to a numpy array and stored as X
y = np.array(df['label'])  # array of labels
X = preprocessing.scale(X)  # scale X before the classifier - this can help with performance but can also take longer: can be skipped

### create training and testing sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)  # 0.2 = 20% of the dataframe

### Swapping different algorithms
# clf = LinearRegression()  # simple linear regression
# clf = LinearRegression(n_jobs=10)  # linear regression using threading, 10 jobs at a time = faster
clf = LinearRegression(n_jobs=-1)  # linear regression using threading with as many jobs as the processor will handle
# clf = svm.SVR()  # base support vector regression
# clf = svm.SVR(kernel="poly")  # support vector regression with a specific kernel

clf.fit(X_train, y_train)  # fit the classifier to the training data
accuracy = clf.score(X_test, y_test)  # score it against the test data
print(accuracy)
So the first two tutorials basically introduced the topic and imported some stock data – straightforward. Biggest takeaway being the use of Quandl – I’ll be doing some research into them at a later date.
So this tutorial gets into the meat of regression, using NumPy to convert the data into NumPy arrays for scikit-learn to do its thing.
Quick note on features and labels:-
A common example with regression might be to try to predict the dollar value of an insurance policy premium for someone. The company may collect your age, past driving infractions, public criminal record, and your credit score for example. The company will use past customers, taking this data, and feeding in the amount of the “ideal premium” that they think should have been given to that customer, or they will use the one they actually used if they thought it was a profitable amount.
Thus, for training the machine learning classifier, the features are customer attributes, the label is the premium associated with those attributes.
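To make that concrete, a tiny made-up sketch (the numbers and column choices are hypothetical, not from the tutorial): the features X are the customer attributes and the label y is the premium for each of those customers.

import numpy as np

# hypothetical customer attributes: [age, past infractions, credit score]
X = np.array([[25, 2, 640],
              [40, 0, 720],
              [58, 1, 690]])

# hypothetical label: the premium that was charged to each of those customers
y = np.array([1200.0, 650.0, 800.0])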
import pandas as pd
import quandl, math  # imports math and Quandl

df = quandl.get('WIKI/GOOGL')  # import data from Quandl
# print(df.head())  # print out the head rows of the data to check what we're getting

# create a dataframe
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]
df['HL_pct'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close']) * 100
df['pct_Change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open']) * 100
df = df[['Adj. Close', 'HL_pct', 'pct_Change', 'Adj. Volume']]

forecast_col = 'Adj. Close'  # define what we're forecasting
df.fillna(-99999, inplace=True)  # replaces missing data with an outlier value (-99999) rather than getting rid of any data

forecast_out = int(math.ceil(0.01*len(df)))  # math.ceil rounds up to the nearest whole - so this takes 1% of the length of the dataframe, rounds it up and converts it to an integer

df['label'] = df[forecast_col].shift(-forecast_out)  # adds a new column 'label' that contains the 'Adj. Close' value from forecast_out days in the future
df.dropna(inplace=True)
print(df.head())  # just used to check data
Tutorial script here:-
import pandas as pd
import quandl

df = quandl.get('WIKI/GOOGL')  # import data from Quandl
# print(df.head())  # print out the head rows of the data to check what we're getting

# create a dataframe
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]
df['HL_pct'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close']) * 100
df['pct_Change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open']) * 100
df = df[['Adj. Close', 'HL_pct', 'pct_Change', 'Adj. Volume']]

print(df.head())  # just used to check the data
So the key project I’ve been working on is looking at stock market data and trying to develop a set of Python tools that allow me to make better predictions.
This started with Beautiful Soup scripts designed to harvest company fundamentals from the London Stock Exchange website – integrating this into a SQL database for later analysis. This should give me all the raw data I need to determine whether a company is viable (passing a set of qualifier tests) along with (eventually) a way of predicting a base share value.
The second element to the project is then using sentiment analysis to look at how those same companies are being discussed on social media. This has been based on Sentdex’s tutorials using Twitter, but my hope is to adapt these to other platforms. This then complements the base data with some views on where current sentiment is – and hopefully there is a correlation between the two data sets.
However, the current virus has sent most stock indexes into free fall, which is probably going to skew my model. So I’m going to let it pass while working on a few supplementary modules that I would have gotten around to including at a later date.
Namely Machine Learning 🙂
So as before I’m going to follow Sentdex’s tutorials on this.
There are a few Udemy courses I’ve done in this area – so I might not keep extensive notes – just cover the key elements I need to keep track of.
Anyway….music not related…
G