Soft Margin Support Vector Machine
https://pythonprogramming.net/soft-margin-svm-machine-learning-tutorial/
https://pythonprogramming.net/predictions-svm-machine-learning-tutorial/
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
style.use('ggplot')
# build SVM class
class Support_Vector_Machine:
# The __init__ method of a class is one that runs whenever an object is created with the class
# calling self in the class allows sharing of variables across the class, so is included in all function defs
def __init__(self, visualisation=True):
# sets visualisations to what ever the user specifies (defaults to True)
self.visualisation = visualisation
# defines colours for the two states 1 & -1
self.colors = {1:'r', -1:'b'}
# sets some standards for the graphs
if self.visualisation:
self.fig = plt.figure()
self.ax = self.fig.add_subplot(1,1,1)
# train
def fit(self, data):
# set up access to the data that's passed when the function is called
self.data = data
# { ||w||: [w,b] }
opt_dict = {}
#
transforms = [[1,1],
[-1,1],
[-1,-1],
[1,-1]]
# finding values to work with for our ranges.
all_data = [] # set up a placeholder for the values
# for loop to step through data and append it to all_data (list of values)
for yi in self.data:
for featureset in self.data[yi]:
for feature in featureset:
all_data.append(feature)
# next define the max and min value in list
self.max_feature_value = max(all_data)
self.min_feature_value = min(all_data)
# free up memory once we've got the values
all_data=None
# define step size for optimisation Big through to small
step_sizes = [self.max_feature_value * 0.1,
self.max_feature_value * 0.01,
# starts getting very high cost after this.
self.max_feature_value * 0.001]
# extremely expensive
b_range_multiple = 5
b_multiple = 5
# first element in vector w
latest_optimum = self.max_feature_value*10
## Begin the stepping process
for step in step_sizes:
w = np.array([latest_optimum,latest_optimum])
# we can do this because convex
optimized = False
while not optimized:
# we're not optimising b as much as w (not needed)
for b in np.arange(-1*(self.max_feature_value*b_range_multiple),
self.max_feature_value*b_range_multiple,
step*b_multiple):
for transformation in transforms:
w_t = w*transformation
found_option = True
# weakest link in the SVM fundamentally
# SMO attempts to fix this a bit
# yi(xi.w+b) >= 1
#
# #### add a break here later..
for i in self.data:
for xi in self.data[i]:
yi=i
if not yi*(np.dot(w_t,xi)+b) >= 1:
found_option = False
if found_option:
opt_dict[np.linalg.norm(w_t)] = [w_t,b]
if w[0]<0:
optimized = True
print('optimised a step')
else:
w = w - step
# break out of while loop
# take a list of the magnitudes and sort them
norms = sorted([n for n in opt_dict]) # sorting lowest to highest
#||w|| : [w,b]
opt_choice = opt_dict[norms[0]] # smallest magnitude
self.w = opt_choice[0] # sets w to first element in the smallest mag
self.b = opt_choice[1] # sets b to second element in the smallest mag
latest_optimum = opt_choice[0][0]+step*2 # resetting the opt to the latest
def predict(self,features):
# sign( x.w+b )
classification = np.sign(np.dot(np.array(features),self.w)+self.b)
if classification !=0 and self.visualisation:
self.ax.scatter(features[0], features[1], s=100, marker='*', c=self.colors[classification])
return classification
def visualise(self):
#scattering known featuresets using a one line for loop
[[self.ax.scatter(x[0],x[1],s=100,color=self.colors[i]) for x in data_dict[i]] for i in data_dict]
# hyperplane = x.w+b
def hyperplane(x,w,b,v):
# v = (w.x+b)
return (-w[0]*x-b+v) / w[1]
datarange = (self.min_feature_value*0.9,self.max_feature_value*1.1) # gives space on the graph
hyp_x_min = datarange[0]
hyp_x_max = datarange[1]
# w.x + b = 1
# pos sv hyperplane
psv1 = hyperplane(hyp_x_min, self.w, self.b, 1) # define the ys
psv2 = hyperplane(hyp_x_max, self.w, self.b, 1) # define the ys
self.ax.plot([hyp_x_min,hyp_x_max], [psv1,psv2], "k") # plot xs, ys then colour k=black g-- = green
# w.x + b = -1
# negative sv hyperplane
nsv1 = hyperplane(hyp_x_min, self.w, self.b, -1)
nsv2 = hyperplane(hyp_x_max, self.w, self.b, -1)
self.ax.plot([hyp_x_min,hyp_x_max], [nsv1,nsv2], "k")
# w.x + b = 0
# decision
db1 = hyperplane(hyp_x_min, self.w, self.b, 0)
db2 = hyperplane(hyp_x_max, self.w, self.b, 0)
self.ax.plot([hyp_x_min,hyp_x_max], [db1,db2], "g--")
plt.show()
# define data dictionary
data_dict = {-1:np.array([[1,7],
[2,8],
[3,8],]),
1:np.array([[5,1],
[6,-1],
[7,3],])}
svm = Support_Vector_Machine()
svm.fit(data=data_dict)
predict_us = [[0,10],
[1,3],
[3,4],
[3,5],
[5,5],
[5,6],
[6,-5],
[5,8]]
for p in predict_us:
svm.predict(p)
svm.visualise()
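A quick sanity check I'd add here (my addition, not part of the tutorial) – once fit() has run, every training point should satisfy the constraint the optimiser was enforcing, yi*(xi.w+b) >= 1:-
# sanity check (my addition): every training point should satisfy yi*(xi.w+b) >= 1
for yi in data_dict:
    for xi in data_dict[yi]:
        print(xi, ':', yi*(np.dot(svm.w, xi)+svm.b))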
https://pythonprogramming.net/svm-optimization-python-2-machine-learning-tutorial/
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
style.use('ggplot')
# build SVM class
class Support_Vector_Machine:
# The __init__ method of a class is one that runs whenever an object is created with the class
# calling self in the class allows sharing of variables across the class, so is included in all function defs
def __init__(self, visualisation=True):
# sets visualisations to what ever the user specifies (defaults to True)
self.visualisation = visualisation
# defines colours for the two states 1 & -1
self.colors = {1:'r', -1:'b'}
# sets some standards for the graphs
if self.visualisation:
self.fig = plt.figure()
self.ax = self.fig.add_subplot(1,1,1)
# train
def fit(self, data):
# set up access to the data that's passed when the function is called
self.data = data
# { ||w||: [w,b] }
opt_dict = {}
#
transforms = [[1,1],
[-1,1],
[-1,-1],
[1,-1]]
# finding values to work with for our ranges.
all_data = [] # set up a placeholder for the values
# for loop to step through data and append it to all_data (list of values)
for yi in self.data:
for featureset in self.data[yi]:
for feature in featureset:
all_data.append(feature)
# next define the max and min value in list
self.max_feature_value = max(all_data)
self.min_feature_value = min(all_data)
# free up memory once we've got the values
all_data=None
# define step size for optimisation Big through to small
step_sizes = [self.max_feature_value * 0.1,
self.max_feature_value * 0.01,
# starts getting very high cost after this.
self.max_feature_value * 0.001]
# extremely expensive
b_range_multiple = 5
b_multiple = 5
# first element in vector w
latest_optimum = self.max_feature_value*10
## Begin the stepping process
for step in step_sizes:
w = np.array([latest_optimum,latest_optimum])
# we can do this because convex
optimized = False
while not optimized:
# we're not optimising b as much as w (not needed)
for b in np.arange(-1*(self.max_feature_value*b_range_multiple),
self.max_feature_value*b_range_multiple,
step*b_multiple):
for transformation in transforms:
w_t = w*transformation
found_option = True
# weakest link in the SVM fundamentally
# SMO attempts to fix this a bit
# yi(xi.w+b) >= 1
#
# #### add a break here later..
for i in self.data:
for xi in self.data[i]:
yi=i
if not yi*(np.dot(w_t,xi)+b) >= 1:
found_option = False
if found_option:
opt_dict[np.linalg.norm(w_t)] = [w_t,b]
if w[0]<0:
optimized = True
print('optimised a step')
else:
w = w - step
# break out of while loop
# take a list of the magnitudes and sort them
norms = sorted([n for n in opt_dict]) # sorting lowest to highest
#||w|| : [w,b]
opt_choice = opt_dict[norms[0]] # smallest magnitude
self.w = opt_choice[0] # sets w to first element in the smallest mag
self.b = opt_choice[1] # sets b to second element in the smallest mag
latest_optimum = opt_choice[0][0]+step*2 # resetting the opt to the latest
def predict(self,features):
# sign( x.w+b )
classification = np.sign(np.dot(np.array(features),self.w)+self.b)
return classification
# define data dictionary
data_dict = {-1:np.array([[1,7],
[2,8],
[3,8],]),
1:np.array([[5,1],
[6,-1],
[7,3],])}
https://pythonprogramming.net/svm-optimization-python-machine-learning-tutorial/
More resources:-
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
style.use('ggplot')
# build SVM class
class Support_Vector_Machine:
# The __init__ method of a class is one that runs whenever an object is created with the class
# calling self in the class allows sharing of variables across the class, so is included in all function defs
def __init__(self, visualisation=True):
# sets visualisations to what ever the user specifies (defaults to True)
self.visualisation = visualisation
# defines colours for the two states 1 & -1
self.colors = {1:'r', -1:'b'}
# sets some standards for the graphs
if self.visualisation:
self.fig = plt.figure()
self.ax = self.fig.add_subplot(1,1,1)
# train
def fit(self, data):
# set up access to the data that's passed when the function is called
self.data = data
# { ||w||: [w,b] }
opt_dict = {}
#
transforms = [[1,1],
[-1,1],
[-1,-1],
[1,-1]]
# finding values to work with for our ranges.
all_data = [] # set up a placeholder for the values
# for loop to step through data and append it to all_data (list of values)
for yi in self.data:
for featureset in self.data[yi]:
for feature in featureset:
all_data.append(feature)
# next define the max and min value in list
self.max_feature_value = max(all_data)
self.min_feature_value = min(all_data)
# free up memory once we've got the values
all_data=None
# define step size for optimisation Big through to small
step_sizes = [self.max_feature_value * 0.1,
self.max_feature_value * 0.01,
# starts getting very high cost after this.
self.max_feature_value * 0.001]
# extremely expensive
b_range_multiple = 5
b_multiple = 5
# first element in vector w
latest_optimum = self.max_feature_value*10
## Begin the stepping process
for step in step_sizes:
w = np.array([latest_optimum,latest_optimum])
# we can do this because convex
optimized = False
while not optimized:
pass
def predict(self,features):
# sign( x.w+b )
classification = np.sign(np.dot(np.array(features),self.w)+self.b)
return classification
# define data dictionary
data_dict = {-1:np.array([[1,7],
[2,8],
[3,8],]),
1:np.array([[5,1],
[6,-1],
[7,3],])}
https://pythonprogramming.net/svm-in-python-machine-learning-tutorial/
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
style.use('ggplot')
# build SVM class
class Support_Vector_Machine:
# The __init__ method of a class is one that runs whenever an object is created with the class
# calling self in the class allows sharing of variables across the class, so is included in all function defs
def __init__(self, visualisation=True):
# sets visualisations to what ever the user specifies (defaults to True)
self.visualisation = visualisation
# defines colours for the two states 1 & -1
self.colors = {1:'r', -1:'b'}
# sets some standards for the graphs
if self.visualisation:
self.fig = plt.figure()
self.ax = self.fig.add_subplot(1,1,1)
# train
def fit(self, data):
pass
def predict(self,features):
# sign( x.w+b )
classification = np.sign(np.dot(np.array(features),self.w)+self.b)
return classification
# define data dictionary
data_dict = {-1:np.array([[1,7],
[2,8],
[3,8],]),
1:np.array([[5,1],
[6,-1],
[7,3],])}
https://pythonprogramming.net/svm-constraint-optimization-machine-learning-tutorial/
https://pythonprogramming.net/support-vector-machine-fundamentals-machine-learning-tutorial/
https://pythonprogramming.net/support-vector-assertions-machine-learning-tutorial/
https://pythonprogramming.net/vector-basics-machine-learning-tutorial/
Covering the basics of vectors:
Magnitude = square root of the sum of the squares of the components (Pythagoras)
Dot product = multiply the matching components and sum the results
(1,3) · (4,2) = (1×4)+(3×2) = 4+6 = 10
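A minimal sketch of both operations using numpy (my addition – the SVM code above leans on np.dot and np.linalg.norm for exactly this):-
import numpy as np

A = np.array([1, 3])
B = np.array([4, 2])

# magnitude: square root of the sum of the squared components (Pythagoras)
print(np.linalg.norm(A))   # sqrt(1**2 + 3**2) ~= 3.16

# dot product: multiply matching components and add them up
print(np.dot(A, B))        # (1*4) + (3*2) = 10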
https://pythonprogramming.net/support-vector-machine-intro-machine-learning-tutorial/
# import libs
import numpy as np
from sklearn import preprocessing, neighbors, svm
# cross_validation is deprecated and train_test_split moved into model_selection
from sklearn.model_selection import train_test_split
import pandas as pd
df = pd.read_csv('breast-cancer-wisconsin.data')
# there are gaps in the data denoted by '?' - these need to be converted to -99999 so the algorithm treats them as outliers
df.replace('?',-99999,inplace=True)
# drop any useless data - in this case the ID
df.drop('id',1,inplace=True)
#print(df)
# define X & y (X for features; y for labels)
# X is everything except 'class'
# In the datafile I had a space after 'class' which caused errors
X = np.array(df.drop(['class'], 1))
y = np.array(df['class'])
# split the data into train and test datasets using train_Test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#define classifier (clf)
clf = svm.SVC() #swapped out K Nearest neighbors
# fit the classifier
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)
# important the array needs to be 2D so double brackets are needed rather than reshaping the array
#example_measures = np.array([[4,2,1,1,1,2,3,2,1],[4,2,1,2,2,2,3,2,1]])
#prediction = clf.predict(example_measures)
#print(prediction)
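The commented-out prediction above uses double brackets to get a 2D array; the alternative the comment alludes to is reshaping. A hedged sketch (the sample values are just made up):-
example_measures = np.array([4,2,1,1,1,2,3,2,1])   # a single hypothetical sample
example_measures = example_measures.reshape(1, -1) # reshape the flat array to 2D: 1 sample x 9 features
prediction = clf.predict(example_measures)
print(prediction)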
https://pythonprogramming.net/final-thoughts-knn-machine-learning-tutorial/
# imports
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random
# define function
def K_nearest_neighbours(data, predict, k=3):
if len(data) >= k:
warnings.warn('K is set to value less than total voting groups!')
distances = []
for group in data:
for features in data[group]:
euclidean_distance = np.linalg.norm(np.array(features)-np.array(predict))
distances.append([euclidean_distance, group])
votes = [i[1] for i in sorted(distances) [:k]]
#print(Counter(votes).most_common(1))
vote_result = Counter(votes).most_common(1)[0][0]
confidence = Counter(votes).most_common(1)[0][1] / k
#print(vote_result, confidence)
return vote_result, confidence
accuracies = []
for i in range(25):
# import data
df = pd.read_csv('breast-cancer-wisconsin.data')
# there are gaps in the data denoted by '?' - these need to be converted to -99999 so the algorithm treats them as outliers
df.replace('?',-99999,inplace=True)
# drop any useless data - in this case the ID
df.drop('id',1,inplace=True)
#print(df)
# convert everything in the list to a number
full_data = df.astype(float).values.tolist()
#print(full_data[:5]) # print first 5 rows
random.shuffle(full_data) # no need to define variable again i.e. full_data = random.shuffle(full_data)
test_size = 0.2
train_set = {2:[],4:[]}
test_set = {2:[],4:[]}
train_data = full_data[:-int(test_size*len(full_data))] # slicing the full data set by the test_size
test_data = full_data[-int(test_size*len(full_data)):] # last 20%
for i in train_data:
train_set[i[-1]].append(i[:-1]) # -1 gives the last column
for i in test_data:
test_set[i[-1]].append(i[:-1]) # -1 gives the last column
correct = 0
total = 0
for group in test_set:
for data in test_set[group]:
vote, confidence = K_nearest_neighbours(train_set,data, k=5)
if group == vote:
correct +=1
# else:
# print(confidence)
total +=1
#print('Accuracy', correct/total)
accuracies.append(correct / total)
print((sum(accuracies)/len(accuracies)*100))
https://pythonprogramming.net/testing-our-k-nearest-neighbors-machine-learning-tutorial/
# imports
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random
# define function
def K_nearest_neighbours(data, predict, k=3):
if len(data) >= k:
warnings.warn('K is set to value less than total voting groups!')
distances = []
for group in data:
for features in data[group]:
euclidean_distance = np.linalg.norm(np.array(features)-np.array(predict))
distances.append([euclidean_distance, group])
votes = [i[1] for i in sorted(distances) [:k]]
#print(Counter(votes).most_common(1))
vote_result = Counter(votes).most_common(1)[0][0]
return vote_result
# import data
df = pd.read_csv('breast-cancer-wisconsin.data')
# there are gaps in the data denoted by '?' - these need to be converted to -99999 so the algorithm treats them as outliers
df.replace('?',-99999,inplace=True)
# drop any useless data - in this case the ID
df.drop('id',1,inplace=True)
#print(df)
# convert everything in the list to a number
full_data = df.astype(float).values.tolist()
#print(full_data[:5]) # print first 5 rows
random.shuffle(full_data) # no need to define variable again i.e. full_data = random.shuffle(full_data)
test_size = 0.2
train_set = {2:[],4:[]}
test_set = {2:[],4:[]}
train_data = full_data[:-int(test_size*len(full_data))] # slicing the full data set by the test_size
test_data = full_data[-int(test_size*len(full_data)):] # last 20%
for i in train_data:
train_set[i[-1]].append(i[:-1]) # -1 gives the last column
for i in test_data:
test_set[i[-1]].append(i[:-1]) # -1 gives the last column
correct = 0
total = 0
for group in test_set:
for data in test_set[group]:
vote = K_nearest_neighbours(train_set,data, k=5)
if group == vote:
correct +=1
total +=1
print('Accuracy', correct/total)
https://pythonprogramming.net/coding-k-nearest-neighbors-machine-learning-tutorial/
# imports
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import warnings
from collections import Counter
# To set charts to save as images we need to change the default behaviour
from matplotlib import style # import style to change default behaviour of plot
style.use('ggplot') # use ggplot
dataset = {'k':[[1,2],[2,3],[3,1]], 'r':[[6,5],[7,7],[8,6]]} # a dictionary of 2 classes (k & r), each holding 3 two-feature data points (lists of lists)
new_features = [5,7]
## Expanded one line for loop
#for i in dataset:
# for ii in dataset[i]:
# plt.scatter(ii[0],ii[1],s=100, color=i)
# define function
def K_nearest_neighbours(data, predict, k=3):
if len(data) >= k:
warnings.warn('K is set to value less than total voting groups!')
distances = []
for group in data:
for features in data[group]:
euclidean_distance = np.linalg.norm(np.array(features)-np.array(predict))
distances.append([euclidean_distance, group])
votes = [i[1] for i in sorted(distances) [:k]]
print(Counter(votes).most_common(1))
vote_result = Counter(votes).most_common(1)[0][0]
return vote_result
# generate results
results = K_nearest_neighbours(dataset, new_features, k=3)
print(results)
[[plt.scatter(ii[0],ii[1],s=100,color=i) for ii in dataset[i]] for i in dataset] # one line for loop
plt.scatter(new_features[0], new_features[1], color=results,s=100)
plt.show()
https://pythonprogramming.net/programming-k-nearest-neighbors-machine-learning-tutorial/
# imports
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import warnings
from collections import Counter
# To set charts to save as images we need to change the default behaviour
from matplotlib import style # import style to change default behaviour of plot
style.use('ggplot') # use ggplot
dataset = {'k':[[1,2],[2,3],[3,1]], 'r':[[6,5],[7,7],[8,6]]} # a dictionary of 2 classes (k & r), each holding 3 two-feature data points (lists of lists)
new_features = [5,7]
## Expanded one line for loop
#for i in dataset:
# for ii in dataset[i]:
# plt.scatter(ii[0],ii[1],s=100, color=i)
[[plt.scatter(ii[0],ii[1],s=100,color=i) for ii in dataset[i]] for i in dataset] # one line for loop
plt.show()
def K_nearest_neighbours(data, predict, k=3):
if len(data) >= k:
warnings.warn('K is set to value less than total voting groups!')
return vote_result # placeholder - vote_result gets defined when the function is completed in the next tutorial
https://pythonprogramming.net/euclidean-distance-machine-learning-tutorial/
This stuff makes my head hurt….just look at this formula….WTF man!
Basically, it's just the square root of the sum of the squared differences between the two points' coordinates….easy
So in Python this translates into:-
from math import sqrt

plot1 = [1,3]
plot2 = [2,5]
euclidean_distance = sqrt( (plot1[0]-plot2[0])**2 + (plot1[1]-plot2[1])**2 )
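The KNN code elsewhere in these notes uses numpy's norm instead, which does the same calculation for any number of dimensions – a small check (my addition) showing the two agree:-
import numpy as np

# same distance via numpy - this is the form the K_nearest_neighbours function uses
print(np.linalg.norm(np.array(plot1) - np.array(plot2)))   # sqrt(1**2 + 2**2) ~= 2.24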
https://pythonprogramming.net/k-nearest-neighbors-application-machine-learning-tutorial/
Data links:-
# import libs
import numpy as np
from sklearn import preprocessing, neighbors
# cross_validation is deprecated and train_test_split moved into model_selection
from sklearn.model_selection import train_test_split
import pandas as pd
df = pd.read_csv('breast-cancer-wisconsin.data')
# there are gaps in the data denoted by '?' - these need to be converted to -99999 so the algorithm treats them as outliers
df.replace('?',-99999,inplace=True)
# drop any useless data - in this case the ID
df.drop('id',1,inplace=True)
#print(df)
# define X & y (X for features; y for labels)
# X is everything except 'class'
# In the datafile I had a space after 'class' which caused errors
X = np.array(df.drop(['class'], 1))
y = np.array(df['class'])
# split the data into train and test datasets using train_Test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#define classifier (clf)
clf = neighbors.KNeighborsClassifier()
# fit the classifier
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)
# important the array needs to be 2D so double brackets are needed rather than reshaping the array
example_measures = np.array([[4,2,1,1,1,2,3,2,1],[4,2,1,2,2,2,3,2,1]])
prediction = clf.predict(example_measures)
print(prediction)
https://pythonprogramming.net/k-nearest-neighbors-intro-machine-learning-tutorial/
Intro to K Nearest Neighbours classification.
https://pythonprogramming.net/how-to-program-r-squared-machine-learning-tutorial/
Straightforward tutorial – plugging the R^2 calculation into a function.
# Import Libs
from statistics import mean
import numpy as np
import matplotlib.pyplot as plt
# To set charts to save as images we need to change the default behaviour
from matplotlib import style # import style to change default behaviour of plot
style.use('ggplot') # use ggplot
# Define values
xs = np.array([1,2,3,4,5], dtype=np.float64) # dtype lets you set the data type. Not needed for this example but useful in future
ys = np.array([5,4,6,5,6], dtype=np.float64)
# Define best fit function
def best_fit_slope_and_intercept(xs, ys): # defining function to calculate slope (m) - passing values of xs and ys
m = ( ((mean(xs)*mean(ys)) - mean(xs * ys)) / # bracket space at the start and space slash at the end allows for a carriage return in the code
((mean(xs)**2)-mean(xs**2))) ## **2 raises to the power of 2
b = mean(ys) - m*mean(xs)
return m, b
m, b = best_fit_slope_and_intercept(xs,ys)
# Define function to square error
def squared_error(ys_orig,ys_line):
return sum((ys_line - ys_orig) * (ys_line - ys_orig)) # return used with calc rather than separately first
def coefficient_of_determination(ys_orig,ys_line):
y_mean_line = [mean(ys_orig) for y in ys_orig] # one line for loop
squared_error_regr = squared_error(ys_orig, ys_line)
squared_error_y_mean = squared_error(ys_orig, y_mean_line)
return 1 - (squared_error_regr/squared_error_y_mean)
m, b = best_fit_slope_and_intercept(xs,ys)
regression_line = [(m*x)+b for x in xs]
r_squared = coefficient_of_determination(ys,regression_line)
print(r_squared)
#plt.scatter(xs,ys)
#plt.savefig('ML_Tutorial8.png', bbox_inches='tight') #Sets the output to save an image
#plt.show() # exports the image
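A quick cross-check I'd add (my addition, not in the tutorial): for a least-squares line, R^2 should equal the squared Pearson correlation between xs and ys, which numpy gives directly:-
# quick check: squared correlation coefficient should match the R^2 printed above
print(np.corrcoef(xs, ys)[0][1]**2)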
https://pythonprogramming.net/r-squared-coefficient-of-determination-machine-learning-tutorial/
https://pythonprogramming.net/how-to-program-best-fit-line-machine-learning-tutorial/
The next part of the equation works out the y intercept…
So the y intercept (b) equals the mean of the Ys minus the slope (m) times the mean of the Xs…easy.
# Import Libs
from statistics import mean
import numpy as np
import matplotlib.pyplot as plt
# To set charts to save as images we need to change the default behaviour
from matplotlib import style # import style to change default behaviour of plot
style.use('ggplot') # use ggplot
# Define values
xs = np.array([1,2,3,4,5,6], dtype=np.float64) # dtype lets you set the data type. Not needed for this example but useful in future
ys = np.array([5,4,6,5,6,7], dtype=np.float64)
def best_fit_slope_and_intercept(xs, ys): # defining function to calculate slope (m) - passing values of xs and ys
m = ( ((mean(xs)*mean(ys)) - mean(xs * ys)) / # bracket space at the start and space slash at the end allows for a carriage return in the code
((mean(xs)**2)-mean(xs**2))) ## **2 raises to the power of 2
b = mean(ys) - (m * mean(xs))
return m, b # add in b to be returned as well as m
m, b = best_fit_slope_and_intercept(xs,ys) # define both using the function
print(m, b)
#calculate the line
regression_line = [(m*x)+b for x in xs] # one line for loop to create the line for illustration
#plot the data
plt.scatter(xs, ys)
plt.plot(xs, regression_line)
plt.savefig('ML_Tutorial9.png', bbox_inches='tight') #Sets the output to save an image
plt.show() # exports the image
https://pythonprogramming.net/how-to-program-best-fit-line-slope-machine-learning-tutorial/
This covers building up a Linear Regression model in Python based on the standard equation:-
The slope of the best fit line is equal to the mean of the X values times the mean of the Y values, minus the mean of (the Xs times the Ys), all divided by the mean of the Xs to the power of 2, minus the mean of (the Xs to the power of 2) (I know, confusing right lol).
The code was a fairly straightforward application of math. Comments and learnings are in the code:-
# Import Libs
from statistics import mean
import numpy as np
import matplotlib.pyplot as plt
# To set charts to save as images we need to change the default behaviour
from matplotlib import style # import style to change default behaviour of plot
style.use('ggplot') # use ggplot
# Define values
xs = np.array([1,2,3,4,5,6], dtype=np.float64) # dtype lets you set the data type. Not needed for this example but useful in future
ys = np.array([5,4,6,5,6,7], dtype=np.float64)
def best_fit_slope(xs, ys): # defining function to calculate slope (m) - passing values of xs and ys
m = ( ((mean(xs)*mean(ys)) - mean(xs * ys)) / # bracket space at the start and space slash at the end allows for a carriage return in the code
((mean(xs)**2)-mean(xs**2))) ## **2 raises to the power of 2
return m
m = best_fit_slope(xs,ys)
print(m)
#plt.scatter(xs,ys)
#plt.savefig('ML_Tutorial8.png', bbox_inches='tight') #Sets the output to save an image
#plt.show() # exports the image
https://pythonprogramming.net/simple-linear-regression-machine-learning-tutorial/
Simple tutorial covering the maths behind a best fit line.
https://pythonprogramming.net/pickling-scaling-machine-learning-tutorial/
I’ve covered Pickling before and gone through the steps – so not a lot of new stuff to cover.
Key elements being:-
Writing a Pickle
clf.fit(X_train, y_train) # fit the data to the training data
with open('LinearRegression.pickle','wb') as f:
pickle.dump(clf, f)
Reading a Pickle
pickle_in = open('LinearRegression.pickle', 'rb')
clf = pickle.load(pickle_in)
The other key concept in this tutorial was scaling on rented host systems, with the workflow being: spin up a large server, train the classifier there, pickle it, then pull the pickle onto a much smaller (cheaper) machine to serve predictions.
This means that you don't need to pay for the large server all the time.
Nice idea.
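A minimal sketch of that split (my addition – the helper names are just for illustration), reusing the pickle calls above. train_and_save would run once on the big rented box, load_and_predict runs anywhere:-
import pickle
from sklearn.linear_model import LinearRegression

def train_and_save(X_train, y_train, path='LinearRegression.pickle'):
    # run this once on the large (rented) server
    clf = LinearRegression(n_jobs=-1)
    clf.fit(X_train, y_train)
    with open(path, 'wb') as f:
        pickle.dump(clf, f)

def load_and_predict(X_new, path='LinearRegression.pickle'):
    # run this on the small/cheap machine - no retraining needed
    with open(path, 'rb') as f:
        clf = pickle.load(f)
    return clf.predict(X_new)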
This next tutorial covers using the trained regression model to forecast out data. Full notes here:-
https://pythonprogramming.net/forecasting-predicting-machine-learning-tutorial/
Key takeaways:-
import pandas as pd
import quandl, math, datetime #imports Math, datetime and Quandl
import numpy as np # support for arrays
from sklearn import preprocessing, model_selection, svm #machine learning and
from sklearn.linear_model import LinearRegression # regression
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
quandl.ApiConfig.api_key = "9qfnyWSTDUpx6uhNX2dc"
df = quandl.get('WIKI/GOOGL') #import data from Quandl
# print (df.head()) # print out the head rows of the data to check what we're getting
# create a dataframe
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close','Adj. Volume']]
df['HL_pct'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close']) * 100
df['pct_Change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open']) * 100
df = df[['Adj. Close','HL_pct','pct_Change','Adj. Volume']]
forecast_col = 'Adj. Close' # define what we're forecasting
df.fillna(-99999, inplace=True) #replaces missing data with an outlier value (-99999) rather than getting rid of any data
forecast_out = int(math.ceil(0.01*len(df))) # math.ceil rounds everything up to the nearest whole - so this takes 1% of the length of the dataframe, rounds it up and converts it to an integer
df['label'] = df[forecast_col].shift(-forecast_out) # adds a new column 'label' containing the 'Adj. Close' value from forecast_out days in the future
# print (df.head()) #just used to check data
X = np.array(df.drop(['label'],1)) # everything except the label column; this returns a new dataframe that is then converted to a numpy array and stored as X
X = preprocessing.scale(X) # scale X before classifier - this can help with performance but can also take longer: can be skipped
X_lately = X[-forecast_out:] # used to predict against - note there are no y values for these to check against
X = X[:-forecast_out] # needs to happen after scaling
df.dropna(inplace=True)
y = np.array(df['label']) # array of labels
### create training and testing sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2) # 0.2 = 20% of the dataframe
### Swapping different algorithms
# clf = LinearRegression() # simple linear regressions
# clf = LinearRegression(n_jobs=10) # linear regression using threading, 10 jobs at a time = faster
clf = LinearRegression(n_jobs=-1) # linear regression using threading with as many jobs as the processor will handle
# clf = svm.SVR() # base support vector regression
# clf = svm.SVR(kernel="poly") # support vector regression with specific kernel
clf.fit(X_train, y_train) # fit the data to the training data
accuracy = clf.score(X_test, y_test) # score it against test
# print(accuracy)
### prediction - easy once the classifier is set
forecast_set = clf.predict(X_lately)
print (forecast_set, accuracy, forecast_out)
df['Forecast'] = np.nan
last_date = df.iloc[-1].name
last_unix = last_date.timestamp()
one_day = 86400
next_unix = last_unix + one_day # moving to future dates not in dataset
## Set a new Data Frame including dates with the forecast values
for i in forecast_set:
next_date = datetime.datetime.fromtimestamp(next_unix)
next_unix += 86400
df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)]+[i] # NaN for every existing column on the future date, with the forecast value (i) in the last column ('Forecast')
df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.savefig('ML_Tutorial5.svg', bbox_inches='tight') #bbox_inches='tight' minimises whitespace around the fig
plt.show()
This tutorial covered the first application of Regression to sample data.
https://pythonprogramming.net/training-testing-machine-learning-tutorial/
Key takeaways being:-
Generally, you want your features in machine learning to be in a range of -1 to 1. This may do nothing, but it usually speeds up processing and can also help with accuracy. Because this range is so popularly used, it is included in the preprocessing module of Scikit-Learn. To utilize this, you can apply preprocessing.scale to your X variable:
import pandas as pd
import quandl, math #imports Math and Quandl
import numpy as np # support for arrays
from sklearn import preprocessing, model_selection, svm #machine learning and
from sklearn.linear_model import LinearRegression # regression
quandl.ApiConfig.api_key = "9qfnyWSTDUpx6uhNX2dc"
df = quandl.get('WIKI/GOOGL') #import data from Quandl
# print (df.head()) # print out the head rows of the data to check what we're getting
# create a dataframe
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close','Adj. Volume']]
df['HL_pct'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close']) * 100
df['pct_Change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open']) * 100
df = df[['Adj. Close','HL_pct','pct_Change','Adj. Volume']]
forecast_col = 'Adj. Close' # define what we're forecasting
df.fillna(-99999, inplace=True) #replaces missing data with an outlier value (-99999) rather than getting rid of any data
forecast_out = int(math.ceil(0.01*len(df))) # math.ceil rounds everything up to the nearest whole - so this takes 1% of the length of the dataframe, rounds it up and converts it to an integer
df['label'] = df[forecast_col].shift(-forecast_out) # adds a new column 'label' containing the 'Adj. Close' value from forecast_out days in the future
df.dropna(inplace=True)
# print (df.head()) #just used to check data
X = np.array(df.drop(['label'],1)) # everything except the label column; this returns a new dataframe that is then converted to a numpy array and stored as X
y = np.array(df['label']) # array of labels
X = preprocessing.scale(X) # scale X before classifier - this can help with performance but can also take longer: can be skipped
### create training and testing sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2) # 0.2 = 20% of the dataframe
### Swapping different algorithms
# clf = LinearRegression() # simple linear regressions
# clf = LinearRegression(n_jobs=10) # linear regression using threading, 10 jobs at a time = faster
clf = LinearRegression(n_jobs=-1) # linear regression using threading with as many jobs as the processor will handle
# clf = svm.SVR() # base support vector regression
# clf = svm.SVR(kernel="poly") # support vector regression with specific kernel
clf.fit(X_train, y_train) # fit the data to the training data
accuracy = clf.score(X_test, y_test) # score it against test
print(accuracy)
So the first two tutorials basically introduced the topic and imported some stock data – straightforward. The biggest takeaway being the use of Quandl – I'll be doing some research into them at a later date.
So this tutorial gets into the meat of regression, using Numpy to convert data into Numpy arrays for Scikit-learn to do its thing.
Quick note on features and labels:-
A common example with regression might be to try to predict the dollar value of an insurance policy premium for someone. The company may collect your age, past driving infractions, public criminal record, and your credit score for example. The company will use past customers, taking this data, and feeding in the amount of the “ideal premium” that they think should have been given to that customer, or they will use the one they actually used if they thought it was a profitable amount.
Thus, for training the machine learning classifier, the features are customer attributes, the label is the premium associated with those attributes.
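A tiny illustration of what that looks like as arrays (my addition, with made-up numbers) – each row of X is one customer's attributes, each entry of y is the premium:-
import numpy as np

# features: [age, past infractions, criminal record flag, credit score] - illustrative values only
X = np.array([[25, 2, 0, 610],
              [47, 0, 0, 720],
              [33, 1, 1, 580]])

# labels: the premium associated with each customer
y = np.array([1200.0, 650.0, 1500.0])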
import pandas as pd
import quandl, math #imports Math and Quandl
df = quandl.get('WIKI/GOOGL') #import data from Quandl
# print (df.head()) # print out the head rows of the data to check what we're getting
# create a dataframe
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close','Adj. Volume']]
df['HL_pct'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close']) * 100
df['pct_Change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open']) * 100
df = df[['Adj. Close','HL_pct','pct_Change','Adj. Volume']]
forecast_col = 'Adj. Close' # define what we're forecasting
df.fillna(-99999, inplace=True) #replaces missing data with an outlier value (-99999) rather than getting rid of any data
forecast_out = int(math.ceil(0.01*len(df))) # math.ceil rounds everything up to the nearest whole - so this takes 1% of the length of the dataframe, rounds it up and converts it to an integer
df['label'] = df[forecast_col].shift(-forecast_out) # adds a new column 'label' containing the 'Adj. Close' value from forecast_out days in the future
df.dropna(inplace=True)
print (df.head()) #just used to check data
Tutorial script here:-
import pandas as pd
import quandl
df = quandl.get('WIKI/GOOGL') #import data from Quandl
# print (df.head()) # print out the head rows of the data to check what we're getting
# create a dataframe
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close','Adj. Volume']]
df['HL_pct'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close']) * 100
df['pct_Change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open']) * 100
df = df[['Adj. Close','HL_pct','pct_Change','Adj. Volume']]
print (df.head()) #just used to check the data
So the key project I’ve been working on is looking at stock market data and trying to develop a set of python tools that allows me to make better predictions.
This started with Beautiful Soup scripts designed to harvest company fundamentals from the London Stock Exchange website – integrating this into a SQL database for later analysis. This should give me all the raw data I need to determine whether a company is viable (passing a set of qualifier tests) along with (eventually) a way of predicting a base share value.
The second element of the project is then using sentiment analysis to look at how those same companies are being discussed on social media. This has been based on Sentdex's tutorials using Twitter, but my hope is to adapt these to other platforms. This then complements the base data with some views on where current sentiment is – and hopefully there is a correlation between the two data-sets.
However, the current virus has meant free fall in most stock indexes which is probably going to skew my model. So I’m going to let it pass while working on a few supplementary modules that I would have gotten around to including at a later date.
Namely Machine Learning 🙂
So as before I'm going to follow Sentdex's tutorials on this.
There are a few Udemy courses I’ve done in this area – so I might not keep extensive notes – just cover the key elements I need to keep track of.
Anyway….music not related…
G
The final stage of this set of tutorials is graphing the sentiment output…based on a more in depth tutorial here:-
https://pythonprogramming.net/live-graphs-matplotlib-tutorial/
Code here:-
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib import style
import time
style.use("ggplot")
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
def animate(i):
pullData = open("twitter-out.txt","r").read()
lines = pullData.split('\n')
xar = []
yar = []
x = 0
y = 0
for l in lines[-200:]:
x += 1
if "pos" in l:
y += 1
elif "neg" in l:
y -= 1
xar.append(x)
yar.append(y)
ax1.clear()
ax1.plot(xar,yar)
ani = animation.FuncAnimation(fig, animate, interval=1000)
plt.show()
As I’m running this on a headless server it ran into issues straight away…
galiquis@localhost: $ python3 nltk_tutorial21.py
Unable to init server: Could not connect: Connection refused
Unable to init server: Could not connect: Connection refused
(nltk_tutorial21.py:26565): Gdk-CRITICAL **: 08:21:48.134: gdk_cursor_new_for_display: assertion ‘GDK_IS_DISPLAY (display)’ failed
(nltk_tutorial21.py:26565): Gdk-CRITICAL **: 08:21:48.137: gdk_cursor_new_for_display: assertion ‘GDK_IS_DISPLAY (display)’ failed
So after a little research I found a way of switching the ‘canvas’ to the Agg Buffer – allowing the output to be saved rather than shown.
import matplotlib as mpl
mpl.use('Agg')
It’s important that this is defined ahead of any other canvas calls/functions – otherwise it throws errors.
The other tweaks I made just switched off the animation for now.
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.animation as animation
#from matplotlib import style
import time
#style.use("ggplot")
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
def animate(i):
pullData = open("twitter-out.txt","r").read()
lines = pullData.split('\n')
xar = []
yar = []
x = 0
y = 0
for l in lines[-200:]:
x += 1
if "pos" in l:
y += 1
elif "neg" in l:
y -= 1
xar.append(x)
yar.append(y)
ax1.clear()
ax1.plot(xar,yar)
#ani = animation.FuncAnimation(fig, animate, interval=100)
animate(1)
fig.savefig('temp.png')
First up the Twitter API module needed installing:-
galiquis@raspberrypi: $ pip3 install tweepy
Next a Twitter App is required from this link:-
https://developer.twitter.com/en/apps
This required setting up a developer account – with more justification needed in the application form than I was expecting, especially around what I'd be using the app for. Anyway, once generated it gave a live stream of tweets based on this code:-
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
#consumer key, consumer secret, access token, access secret.
ckey="6ru23AnzOKAieH4eYXF0XuTPS"
csecret="74Oz560aRCfo5QzzXu2I0gfOm58qkNPfZx0oSl3tnWEnEND4ex"
atoken="241873929-QkQ1eN0Du1Cg6el6rJa3sMGRHBaiSp7Cxekq61Of"
asecret="hf0ECMVfcqlPWkgOGKeNNTU1m41QQuiTOLzktsiNqqIxD"
class listener(StreamListener):
def on_data(self, data):
print(data)
return(True)
def on_error(self, status):
print(status)
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
twitterStream.filter(track=["car"])
https://pythonprogramming.net/twitter-api-streaming-tweets-python-tutorial/
The below covers a few tweaks with the output of the sentiment engine being saved off into a text file.
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
import sentiment_mod as s
#consumer key, consumer secret, access token, access secret.
ckey="*"
csecret="*"
atoken="*"
asecret="*"
class listener(StreamListener):
def on_data(self, data):
all_data = json.loads(data)
tweet = all_data["text"]
sentiment_value, confidence = s.sentiment(tweet)
print(tweet, sentiment_value, confidence)
if confidence*100 >= 80:
output = open("twitter-out.txt", "a")
output.write(sentiment_value)
output.write('\n')
output.close()
return(True)
def on_error(self, status):
print(status)
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
twitterStream.filter(track=["car"]) # term searched for in tweets
Next we’ll look at graphing this data.
This section brings all the detail together to create a module that can be used to monitor twitter sentiment.
The code for this is separated into two blocks.
The first pickles most of the heavy training to save time in future iterations:-
import nltk
import random
#from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
#from unidecode import unidecode
class VoteClassifier(ClassifierI):
def __init__(self, *classifiers):
self._classifiers = classifiers
def classify(self, features):
votes = []
for c in self._classifiers:
v = c.classify(features)
votes.append(v)
return mode(votes)
def confidence(self, features):
votes = []
for c in self._classifiers:
v = c.classify(features)
votes.append(v)
choice_votes = votes.count(mode(votes))
conf = choice_votes / len(votes)
return conf
short_pos = open("positive.txt","r", encoding='utf-8', errors='replace').read() ## had to add a line to tell the open function to use utf-8
short_neg = open("negative.txt","r", encoding='utf-8', errors='replace').read()
# move this up here
all_words = []
documents = []
# j is adjective, r is adverb, and v is verb
#allowed_word_types = ["J","R","V"]
allowed_word_types = ["J"]
for p in short_pos.split('\n'):
documents.append( (p, "pos") )
words = word_tokenize(p)
pos = nltk.pos_tag(words)
for w in pos:
if w[1][0] in allowed_word_types:
all_words.append(w[0].lower())
for p in short_neg.split('\n'):
documents.append( (p, "neg") )
words = word_tokenize(p)
pos = nltk.pos_tag(words)
for w in pos:
if w[1][0] in allowed_word_types:
all_words.append(w[0].lower())
save_documents = open("pickled_algos/documents.pickle","wb")
pickle.dump(documents, save_documents)
save_documents.close()
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:5000]
save_word_features = open("pickled_algos/word_features5k.pickle","wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()
def find_features(document):
words = word_tokenize(document)
features = {}
for w in word_features:
features[w] = (w in words)
return features
featuresets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featuresets)
print(len(featuresets))
testing_set = featuresets[7400:] # memory limitations on the Pi meant this needed reducing from 10,000
training_set = featuresets[:7400]
save_featuresets = open("pickled_algos/featuresets.pickle","wb") ## added code to pickle featuresets
pickle.dump(featuresets, save_featuresets)
save_featuresets.close()
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)
###############
save_classifier = open("pickled_algos/originalnaivebayes5k.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)
save_classifier = open("pickled_algos/MNB_classifier5k.pickle","wb")
pickle.dump(MNB_classifier, save_classifier)
save_classifier.close()
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)
save_classifier = open("pickled_algos/BernoulliNB_classifier5k.pickle","wb")
pickle.dump(BernoulliNB_classifier, save_classifier)
save_classifier.close()
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)
save_classifier = open("pickled_algos/LogisticRegression_classifier5k.pickle","wb")
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)
save_classifier = open("pickled_algos/LinearSVC_classifier5k.pickle","wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()
##NuSVC_classifier = SklearnClassifier(NuSVC())
##NuSVC_classifier.train(training_set)
##print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)
SGDC_classifier = SklearnClassifier(SGDClassifier())
SGDC_classifier.train(training_set)
print("SGDClassifier accuracy percent:",nltk.classify.accuracy(SGDC_classifier, testing_set)*100)
save_classifier = open("pickled_algos/SGDC_classifier5k.pickle","wb")
pickle.dump(SGDC_classifier, save_classifier)
save_classifier.close()
I had to drop the size of the learning set given the memory restrictions on the Pi. I'm not sure there was another way around this, and it probably means that to start doing some of this stuff seriously… I'm going to need a bigger/better server.
Also I added a section to pickle the featuresets:-
save_featuresets = open("pickled_algos/featuresets.pickle","wb") ## added code to pickle featuresets
pickle.dump(featuresets, save_featuresets)
save_featuresets.close()
One option might be clustering the Pi or putting together a low power server – which then leads into building a full home server farm.
The next section is then the sentiment module:-
#File: sentiment_mod.py
import nltk
import random
#from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
class VoteClassifier(ClassifierI):
def __init__(self, *classifiers):
self._classifiers = classifiers
def classify(self, features):
votes = []
for c in self._classifiers:
v = c.classify(features)
votes.append(v)
return mode(votes)
def confidence(self, features):
votes = []
for c in self._classifiers:
v = c.classify(features)
votes.append(v)
choice_votes = votes.count(mode(votes))
conf = choice_votes / len(votes)
return conf
documents_f = open("pickled_algos/documents.pickle", "rb")
documents = pickle.load(documents_f)
documents_f.close()
word_features5k_f = open("pickled_algos/word_features5k.pickle", "rb")
word_features = pickle.load(word_features5k_f)
word_features5k_f.close()
def find_features(document):
words = word_tokenize(document)
features = {}
for w in word_features:
features[w] = (w in words)
return features
featuresets_f = open("pickled_algos/featuresets.pickle", "rb")
featuresets = pickle.load(featuresets_f)
featuresets_f.close()
random.shuffle(featuresets)
print(len(featuresets))
testing_set = featuresets[10000:]
training_set = featuresets[:10000]
open_file = open("pickled_algos/originalnaivebayes5k.pickle", "rb")
classifier = pickle.load(open_file)
open_file.close()
open_file = open("pickled_algos/MNB_classifier5k.pickle", "rb")
MNB_classifier = pickle.load(open_file)
open_file.close()
open_file = open("pickled_algos/BernoulliNB_classifier5k.pickle", "rb")
BernoulliNB_classifier = pickle.load(open_file)
open_file.close()
open_file = open("pickled_algos/LogisticRegression_classifier5k.pickle", "rb")
LogisticRegression_classifier = pickle.load(open_file)
open_file.close()
open_file = open("pickled_algos/LinearSVC_classifier5k.pickle", "rb")
LinearSVC_classifier = pickle.load(open_file)
open_file.close()
open_file = open("pickled_algos/SGDC_classifier5k.pickle", "rb")
SGDC_classifier = pickle.load(open_file)
open_file.close()
voted_classifier = VoteClassifier(
classifier,
LinearSVC_classifier,
MNB_classifier,
BernoulliNB_classifier,
LogisticRegression_classifier)
def sentiment(text):
feats = find_features(text)
return voted_classifier.classify(feats),voted_classifier.confidence(feats)
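Usage is then a one-liner from any other script – which is exactly how the Twitter streaming code above calls it (the example text here is just mine):-
import sentiment_mod as s

# returns (classification, confidence), e.g. ('pos', 1.0)
print(s.sentiment("This movie was awesome! The acting was great and the plot was wonderful."))
print(s.sentiment("This movie was utter junk. I don't see what the point was at all."))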
This tutorial covers training the algorithm on a new, more tailored data-set. The training data set used still covers movie reviews but contains ones that are a lot shorter – which should give better results.
In this section we combine the classifiers from previous steps into a voting classifier that will give an aggregated confidence level. Each algorithm gets one vote, and the classification with the most votes is the chosen one.
To do this we’ll import the modules we need:-
from nltk.classify import ClassifierI
from statistics import mode # to allow us to add up and compare votes
Next build a VoteClassifier class:-
class VoteClassifier(ClassifierI):
def __init__(self, *classifiers):
self._classifiers = classifiers
def classify(self, features):
votes = []
for c in self._classifiers:
v = c.classify(features)
votes.append(v)
return mode(votes)
def confidence(self, features):
votes = []
for c in self._classifiers:
v = c.classify(features)
votes.append(v)
choice_votes = votes.count(mode(votes))
conf = choice_votes / len(votes)
return conf
And then combine it all:-
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
class VoteClassifier(ClassifierI):
def __init__(self, *classifiers):
self._classifiers = classifiers
def classify(self, features):
votes = []
for c in self._classifiers:
v = c.classify(features)
votes.append(v)
return mode(votes)
def confidence(self, features):
votes = []
for c in self._classifiers:
v = c.classify(features)
votes.append(v)
choice_votes = votes.count(mode(votes))
conf = choice_votes / len(votes)
return conf
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = []
for w in movie_reviews.words():
all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:3000]
def find_features(document):
words = set(document)
features = {}
for w in word_features:
features[w] = (w in words)
return features
#print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:1900]
testing_set = featuresets[1900:]
#classifier = nltk.NaiveBayesClassifier.train(training_set)
classifier_f = open("naivebayes.pickle","rb")
classifier = pickle.load(classifier_f)
classifier_f.close()
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)
##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(training_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)
voted_classifier = VoteClassifier(classifier,
NuSVC_classifier,
LinearSVC_classifier,
SGDClassifier_classifier,
MNB_classifier,
BernoulliNB_classifier,
LogisticRegression_classifier)
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)
print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)
Output:-
Original Naive Bayes Algo accuracy percent: 66.0
Most Informative Features
thematic = True pos : neg = 9.1 : 1.0
secondly = True pos : neg = 8.5 : 1.0
narrates = True pos : neg = 7.8 : 1.0
layered = True pos : neg = 7.1 : 1.0
rounded = True pos : neg = 7.1 : 1.0
supreme = True pos : neg = 7.1 : 1.0
crappy = True neg : pos = 6.9 : 1.0
uplifting = True pos : neg = 6.2 : 1.0
ugh = True neg : pos = 5.3 : 1.0
gaining = True pos : neg = 5.1 : 1.0
mamet = True pos : neg = 5.1 : 1.0
wanda = True neg : pos = 4.9 : 1.0
onset = True neg : pos = 4.9 : 1.0
fantastic = True pos : neg = 4.5 : 1.0
milos = True pos : neg = 4.4 : 1.0
MNB_classifier accuracy percent: 67.0
BernoulliNB_classifier accuracy percent: 67.0
LogisticRegression_classifier accuracy percent: 68.0
SGDClassifier_classifier accuracy percent: 57.99999999999999
LinearSVC_classifier accuracy percent: 67.0
NuSVC_classifier accuracy percent: 65.0
voted_classifier accuracy percent: 65.0
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 57.14285714285714
Classification: neg Confidence %: 57.14285714285714
Classification: neg Confidence %: 57.14285714285714
Classification: pos Confidence %: 57.14285714285714
Classification: pos Confidence %: 85.71428571428571
One issue that did crop up on my system was the error:-
/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
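One way to address this (my own tweak rather than anything from the tutorial) is to give the solver more iterations when the classifier is wrapped, for example:-
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
# max_iter=1000 is an arbitrary bump over the sklearn default of 100; scaling the features is the other fix the warning suggests
LogisticRegression_classifier = SklearnClassifier(LogisticRegression(max_iter=1000))
LogisticRegression_classifier.train(training_set) # training_set as built in the script above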
The scikit-learn (sklearn) module adds a bunch of other classifiers and machine learning algorithms, so although NLTK has its own Naive Bayes classifier it’s straightforward to incorporate others from scikit-learn.
As with most Python modules this involves importing the various parts before using them.
To import:-
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
Then using the classifiers:-
## Multinomial Naive Bayes
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)
## Gaussian Naive Bayes
#GNB_classifier = SklearnClassifier(GaussianNB())
#GNB_classifier.train(training_set)
#print("GNB_classifier accuracy:", (nltk.classify.accuracy(GNB_classifier, testing_set))*100)
## Bernoulli Naive Bayes
BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BNB_classifier accuracy:", (nltk.classify.accuracy(BNB_classifier, testing_set))*100)
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)
This gives a range of different views on whether a review is +ve or -ve, and the next section will focus on a voting system to combine them.
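As a preview, below is a rough sketch of the kind of voting wrapper used with the complete script above; treat it as an illustration of the idea rather than the definitive code.
from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    # wraps several already-trained classifiers and lets them vote on each featureset
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    def classify(self, features):
        votes = [c.classify(features) for c in self._classifiers]
        return mode(votes) # the most common vote wins
    def confidence(self, features):
        votes = [c.classify(features) for c in self._classifiers]
        return votes.count(mode(votes)) / len(votes) # fraction of classifiers agreeing with the winning vote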
Once a classifier has been trained it’s often quicker in the long run to save the algorithm for re-use later, rather than training it each and every time.
Script to Save the classifier:-
save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()
Script to load the classifier once pickled:-
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()
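The same save and load can also be written with with-blocks so the file handles close automatically; a minimal equivalent sketch:-
import pickle
# save
with open("naivebayes.pickle", "wb") as save_classifier:
    pickle.dump(classifier, save_classifier)
# load
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)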
import nltk
import random
import pickle
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = []
for w in movie_reviews.words():
all_words.append(w.lower()) # normalise everything to lower case and append
all_words = nltk.FreqDist(all_words) # converts to a nltk frequency distribution
word_features = list(all_words.keys())[:3000] # from the frequency list we're taking just the words (keys) and only the first 3000
def find_features(document):
words = set(document) # this gives a list of the unique words - removes duplicates
features = {} # declare an empty dictionary
for w in word_features:
features[w] = (w in words) # this checks each word in the top 3000 to see if it is present in the passed text 'document' so gives a true/false against the 3000
return features
# print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:1900] # splits the featuresets into two separate groups, one to train and the other to test
testing_set = featuresets[1900:]
## Naive Bayes Algorithm
# classifier = nltk.NaiveBayesClassifier.train(training_set) # training the NaiveBayesClassifier on training data commented out once naivebayes.pickle is generated
classifier_f = open("naivebayes.pickle","rb")
classifier = pickle.load(classifier_f)
classifier_f.close()
print("Naive Bayes Algo accuracy:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15) # tells most popular words on both sides and if +ve or -ve
## Pickle allows you to save python obects and import them later
# save_classifier = open("naivebayes.pickle","wb") # commented out once naivebayes.pickle has been generated
# pickle.dump(classifier, save_classifier)
# save_classifier.close()
This section builds on the last 2 tutorials to choose an algorithm, separate the data into training and testing sets – and set it running.
The algorithm in this example is the Naive Bayes classifier.
But first the data needs to be split into training and test sets for some supervised machine learning. In essence we show the machine data and tell it “this data is positive” or “this data is negative”. Then, after the training is done, we show the machine some new data and ask what it thinks the category of the new data is.
import nltk
import random
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = []
for w in movie_reviews.words():
all_words.append(w.lower()) # normalise everything to lower case and append
all_words = nltk.FreqDist(all_words) # converts to a nltk frequency distribution
word_features = list(all_words.keys())[:3000] # from the frequency list we're taking just the words (keys) and only the first 3000
def find_features(document):
words = set(document) # this gives a list of the unique words - removes duplicates
features = {} # declare an empty dictionary
for w in word_features:
features[w] = (w in words) # this checks each word in the top 3000 to see if it is present in the passed text 'document' so gives a true/false against the 3000
return features
# print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:1900] # splits the featuresets into two separate groups, one to train and the other to test
testing_set = featuresets[1900:]
## Naive Bayes Algorithm
classifier = nltk.NaiveBayesClassifier.train(training_set) # training the NaiveBayesClassifier on training data
print("Naive Bayes Algo accuracy:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15) # tells most popular words on both sides and if +ve or -ve
Output:-
galiquis@raspberrypi: $ python3 ./nltk_tutorial13.py
Naive Bayes Algo accuracy: 80.0
Most Informative Features
annual = True pos : neg = 9.6 : 1.0
sucks = True neg : pos = 9.1 : 1.0
bothered = True neg : pos = 9.1 : 1.0
frances = True pos : neg = 8.9 : 1.0
idiotic = True neg : pos = 8.8 : 1.0
unimaginative = True neg : pos = 8.4 : 1.0
silverstone = True neg : pos = 7.7 : 1.0
shoddy = True neg : pos = 7.1 : 1.0
suvari = True neg : pos = 7.1 : 1.0
mena = True neg : pos = 7.1 : 1.0
sexist = True neg : pos = 7.1 : 1.0
regard = True pos : neg = 6.9 : 1.0
schumacher = True neg : pos = 6.7 : 1.0
uninspired = True neg : pos = 6.6 : 1.0
kidding = True neg : pos = 6.4 : 1.0
In order to apply some machine learning we need to identify features in the data, in this case the words themselves.
This section covers compiling feature lists from positive and negative reviews, to hopefully see trends.
import nltk
import random
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = []
for w in movie_reviews.words():
all_words.append(w.lower()) # normalise everything to lower case and append
all_words = nltk.FreqDist(all_words) # converts to a nltk frequency distribution
word_features = list(all_words.keys())[:3000] # from the frequency list we're taking just the words (keys) and only the first 3000
def find_features(document):
words = set(document) # this gives a list of the unique words - removes duplicates
features = {} # declare an empty dictionary
for w in word_features:
features[w] = (w in words) # this checks each word in the top 3000 to see if it is present in the passed text 'document' so gives a true/false against the 3000
return features
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents]
The next step on from looking at words and how they relate to each other is to broaden out to classifying sections of text. Classification can be focused on identifying what a piece of text is about (e.g. politics, the military etc.) or be as simple as identifying whether some text is spam or not, for comment/email filters.
import nltk
import random
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
print(documents[1])
all_words = []
for w in movie_reviews.words():
all_words.append(w.lower()) # normalise everything to lower case and append
all_words = nltk.FreqDist(all_words) # converts to a nltk frequency distribution
print(all_words.most_common(15)) # top 15 most common
print(all_words["stupid"]) # shows the frequency of a specific word
The first key piece of code is defining ‘documents’.
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
This uses a list comprehension to iterate through the movie reviews, wrapping each review’s words in the list() constructor and pairing them with their category in a tuple.
However, this is easier to see when broken down into multiple lines:-
documents = [] # declare the list
for category in movie_reviews.categories():
for fileid in movie_reviews.fileids(category):
documents.append((list(movie_reviews.words(fileid)), category)) # append a (word list, category) tuple
Next we shuffle the documents as we don’t want to train and test on the same data.
So the principle being used is to take all of the words from all of the reviews and compile them to find the most popular words and whether they appear in positive or negative reviews. Then when we look at a new review we can test which words appear and determine whether they lean more positive or negative… on to the next tutorial.
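As a taste of where that heads, classifying a fresh piece of text just means building its feature dictionary and handing it to a trained classifier. This is a hypothetical snippet: find_features and classifier are the helper and trained model built in the following tutorials, and the review text is made up.
from nltk.tokenize import word_tokenize
new_review = "A wonderful, uplifting film" # made-up example text
features = find_features(word_tokenize(new_review.lower())) # true/false for each of the top 3000 words
print(classifier.classify(features)) # 'pos' or 'neg'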
Useful links:-
Lists and Tuples – https://realpython.com/python-lists-tuples/
WordNet is a lexical database for the English language, created by Princeton, and is part of the NLTK corpus.
WordNet can be used alongside NLTK to find the meanings of words, synonyms, antonyms, context, etc.
Below is an example of calling various aspects of Synsets for the word program:-
from nltk.corpus import wordnet
syns = wordnet.synsets("program")
print(syns)
print(syns[0])
#synset 0
print(syns[0].name())
# just the word
print(syns[0].lemmas()[0].name())
# definition
print(syns[0].definition())
# examples
print(syns[0].examples())
So Synsets has a number of entries for ‘program’, all relating to different meanings of the word.
Once a Synset is selected its Lemma, definition or examples can be accessed.
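To see those different meanings without a screenshot, a short loop like this prints each entry’s name and definition (the exact entries depend on the WordNet data installed):-
from nltk.corpus import wordnet
for syn in wordnet.synsets("program"):
    print(syn.name(), "-", syn.definition()) # entry names such as plan.n.01, program.n.02, etc.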
Synsets can also be used to generate Synonyms and Antonyms for a given word, in the below example “Good”…
from nltk.corpus import wordnet
synonyms = [] #declare an empty list
antonyms = [] #declare an empty list
for syn in wordnet.synsets("good"):
for lemma in syn.lemmas():
synonyms.append(lemma.name())
if lemma.antonyms():
antonyms.append(lemma.antonyms()[0].name())
print("Synonyms",set(synonyms))
print("Antonyms",set(antonyms))
Giving the output:-
Synonyms {‘respectable’, ‘salutary’, ‘effective’, ‘dependable’, ‘thoroughly’, ‘beneficial’, ‘unspoilt’, ‘secure’, ‘good’, ‘skilful’, ‘in_effect’, ‘well’, ‘dear’, ‘undecomposed’, ‘serious’, ‘soundly’, ‘goodness’, ‘practiced’, ‘honorable’, ‘safe’, ‘expert’, ‘in_force’, ‘honest’, ‘sound’, ‘full’, ‘trade_good’, ‘skillful’, ‘ripe’, ‘upright’, ‘just’, ‘right’, ‘estimable’, ‘near’, ‘adept’, ‘unspoiled’, ‘proficient’, ‘commodity’}
Antonyms {‘evilness’, ‘evil’, ‘badness’, ‘ill’, ‘bad’}
WordNet can also be used to compare the similarity of two words and their tenses, by using the Wu and Palmer method for semantic comparison.
from nltk.corpus import wordnet
w1 = wordnet.synset("ship.n.01") # word.(n)oun.(1)st_occurance
w2 = wordnet.synset("boat.n.01")
print(w1.wup_similarity(w2)) # apllying the Wu & Palmer method
w1 = wordnet.synset("ship.n.01") # word.(n)oun.(1)st_occurance
w2 = wordnet.synset("car.n.01")
print(w1.wup_similarity(w2)) # apllying the Wu & Palmer method
w1 = wordnet.synset("ship.n.01") # word.(n)oun.(1)st_occurance
w2 = wordnet.synset("cat.n.01")
print(w1.wup_similarity(w2)) # apllying the Wu & Palmer method
This tutorial just covers navigating the corpora.
I used the following to find the data directory:-
galiquis@raspberrypi:~ $ sudo find / -type d -name "nltk_data"
Lemmatizing is very similar to stemming with the key difference being that lemmatizing ends up at a real word.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run",'v'))
Gives the output:-
galiquis@raspberrypi:$ python3 ./nltk_tutorial8.py
cat
cactus
goose
rock
python
good
best
run
run
Some points to note: lemmatize() assumes each word is a noun unless a pos argument is supplied, which is why “better” only becomes “good” when pos="a" is passed, and unlike stemming the result is always a real word.
A key method for chunking in natural language processing is called “Named Entity Recognition.” The concept being to have the code identify and pull out “entities” like people, places, things, locations, monetary figures, and more.
NE Type and Examples
ORGANIZATION – Georgia-Pacific Corp., WHO
PERSON – Eddy Bonte, President Obama
LOCATION – Murray River, Mount Everest
DATE – June, 2008-06-29
TIME – two fifty a m, 1:30 p.m.
MONEY – 175 million Canadian Dollars, GBP 10.40
PERCENT – twenty pct, 18.75 %
FACILITY – Washington Monument, Stonehenge
GPE – South East Asia, Midlothian
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
def process_content():
try:
for i in tokenized[5:]:
words = nltk.word_tokenize(i)
tagged = nltk.pos_tag(words)
namedEnt = nltk.ne_chunk(tagged, binary=True)
namedEnt.draw()
except Exception as e:
print(str(e))
process_content()
The key line that applies the entity recognition is:-
namedEnt = nltk.ne_chunk(tagged, binary=True)
This has two options for the binary parameter. In the example, when binary=False it picked up the same tagged items but split terms like White House into “White” and “House” as if they were separate entities, whereas with binary=True the named entity recognition correctly treated White House as a single named entity.
NLTK’s Named Entity Recognition can be a bit hit and miss leading to a lot of false positives.
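For reference, the non-binary form is just a change to that one argument, so each chunk carries its entity type rather than a generic NE label:-
namedEnt = nltk.ne_chunk(tagged, binary=False) # chunks labelled PERSON, GPE, ORGANIZATION, etc. (binary=False is also the default)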
Chinking is the process of excluding/removing things from Chunks. So there might be items in the Chunk that need to be removed.
This is done by expanding the Chunk code to include }{ brackets that contain the items to be excluded.
Example:-
chunkGram = r"""Chunk: {<.*>+} }<VB.?|IN|DT|TO>+{"""
The next step on from knowing the parts of speech (with tags) is to group them into meaningful chunks. The chunks are based around a subject (usually a noun, a naming word) with the relevant verbs and adverbs associated with it.
To enable chunking, regular expressions are employed, mainly the modifiers + (match 1 or more), ? (match 0 or 1) and * (match 0 or more), applied to the POS tags in <> brackets.
Regex cheat-sheet (full) – http://www.rexegg.com/regex-quickstart.html#ref
The main focus of Chunking is developing the right regular expression to pull together each proper noun (tag: NNP) along with the various types of verb and adverb.
The final expression was:-
r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
So breaking this down: <RB.?>* matches zero or more adverbs of any type, <VB.?>* matches zero or more verbs of any form, <NNP>+ requires one or more proper nouns, and <NN>? allows an optional singular noun on the end.
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
def process_content():
try:
for x in tokenized[:5]:
words = nltk.word_tokenize(x)
tagged = nltk.pos_tag(words)
chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunkParser = nltk.RegexpParser(chunkGram)
chunked = chunkParser.parse(tagged)
print(chunked)
except Exception as e:
print(str(e))
process_content()
Output:-
galiquis@raspberrypi: $ python3 ./nltk_tutorial5.py
(S
(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
‘S/POS
(Chunk ADDRESS/NNP)
BEFORE/IN
(Chunk A/NNP JOINT/NNP SESSION/NNP)
OF/IN
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
OF/IN
(Chunk THE/NNP UNION/NNP January/NNP)
31/CD
,/,
2006/CD
(Chunk THE/NNP PRESIDENT/NNP)
:/:
(Chunk Thank/NNP)
you/PRP
all/DT
./.)
(S
(Chunk Mr./NNP Speaker/NNP)
,/,
(Chunk Vice/NNP President/NNP Cheney/NNP)
,/,
members/NNS
of/IN
(Chunk Congress/NNP)
,/,
members/NNS
of/IN
the/DT
(Chunk Supreme/NNP Court/NNP)
and/CC
diplomatic/JJ
corps/NN
,/,
distinguished/JJ
guests/NNS
,/,
and/CC
fellow/JJ
citizens/NNS
:/:
Today/VB
our/PRP$
nation/NN
lost/VBD
a/DT
beloved/VBN
,/,
graceful/JJ
,/,
courageous/JJ
woman/NN
who/WP
(Chunk called/VBD America/NNP)
to/TO
its/PRP$
founding/NN
ideals/NNS
and/CC
carried/VBD
on/IN
a/DT
noble/JJ
dream/NN
./.)
(S
Tonight/NN
we/PRP
are/VBP
comforted/VBN
by/IN
the/DT
hope/NN
of/IN
a/DT
glad/JJ
reunion/NN
with/IN
the/DT
husband/NN
who/WP
was/VBD
taken/VBN
so/RB
long/RB
ago/RB
,/,
and/CC
we/PRP
are/VBP
grateful/JJ
for/IN
the/DT
good/JJ
life/NN
of/IN
(Chunk Coretta/NNP Scott/NNP King/NNP)
./.)
(S (/( (Chunk Applause/NNP) ./. )/))
(S
(Chunk President/NNP George/NNP W./NNP Bush/NNP)
reacts/VBZ
to/TO
applause/VB
during/IN
his/PRP$
(Chunk State/NNP)
of/IN
the/DT
(Chunk Union/NNP Address/NNP)
at/IN
the/DT
(Chunk Capitol/NNP)
,/,
(Chunk Tuesday/NNP)
,/,
(Chunk Jan/NNP)
./.)
Part of Speech Tagging is the process of adding a label to every single word in a text, identifying the type of word it is.
Below is a list of the POS tags and their meaning…
This time round a different Tokenizer is used – PunktSentenceTokenizer.
This is a machine learning tokenizer which can be trained first on some training data and then applied.
In the example two of GW Bush’s speeches are used: the 2005 speech trains the tokenizer, which is then applied to the 2006 speech.
The output of the tokenizer is then passed to POS tag function to create tuples of the words in the speech and the corresponding POS tag.
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
def process_content():
try:
for x in tokenized[:5]:
words = nltk.word_tokenize(x)
tagged = nltk.pos_tag(words)
print(tagged)
except Exception as e:
print(str(e))
process_content()
Note: tokenized[:5] limits the example to only a few lines of the speech
Giving the output….
galiquis@raspberrypi: $ python3 ./nltk_tutorial4.py
[(‘PRESIDENT’, ‘NNP’), (‘GEORGE’, ‘NNP’), (‘W.’, ‘NNP’), (‘BUSH’, ‘NNP’), (“‘S”, ‘POS’), (‘ADDRESS’, ‘NNP’), (‘BEFORE’, ‘IN’), (‘A’, ‘NNP’), (‘JOINT’, ‘NNP’), (‘SESSION’, ‘NNP’),
(‘OF’, ‘IN’), (‘THE’, ‘NNP’), (‘CONGRESS’, ‘NNP’), (‘ON’, ‘NNP’), (‘THE’, ‘NNP’), (‘STATE’, ‘NNP’), (‘OF’, ‘IN’), (‘THE’, ‘NNP’), (‘UNION’, ‘NNP’), (‘January’, ‘NNP’), (’31’, ‘CD’), (‘,’, ‘,’), (‘2006’, ‘CD’), (‘THE’, ‘NNP’), (‘PRESIDENT’, ‘NNP’), (‘:’, ‘:’), (‘Thank’, ‘NNP’), (‘you’, ‘PRP’), (‘all’, ‘DT’), (‘.’, ‘.’)][(‘Mr.’, ‘NNP’), (‘Speaker’, ‘NNP’), (‘,’, ‘,’), (‘Vice’, ‘NNP’), (‘President’, ‘NNP’), (‘Cheney’, ‘NNP’), (‘,’, ‘,’), (‘members’, ‘NNS’), (‘of’, ‘IN’), (‘Congress’, ‘NNP’), (‘,’, ‘,’), (‘members’, ‘NNS’), (‘of’, ‘IN’), (‘the’, ‘DT’), (‘Supreme’, ‘NNP’), (‘Court’, ‘NNP’), (‘and’, ‘CC’), (‘diplomatic’, ‘JJ’), (‘corps’, ‘NN’), (‘,’, ‘,’), (‘distinguished’, ‘JJ’), (‘guests’, ‘NNS’), (‘,’, ‘,’), (‘and’, ‘CC’), (‘fellow’, ‘JJ’), (‘citizens’, ‘NNS’), (‘:’, ‘:’), (‘Today’, ‘VB’), (‘our’, ‘PRP$’), (‘nation’, ‘NN’), (‘lost’, ‘VBD’), (‘a’, ‘DT’), (‘beloved’, ‘VBN’), (‘,’, ‘,’), (‘graceful’, ‘JJ’), (‘,’, ‘,’), (‘courageous’, ‘JJ’), (‘woman’, ‘NN’), (‘who’, ‘WP’), (‘called’, ‘VBD’), (‘America’, ‘NNP’), (‘to’, ‘TO’), (‘its’, ‘PRP$’), (‘founding’, ‘NN’), (‘ideals’, ‘NNS’), (‘and’, ‘CC’), (‘carried’, ‘VBD’), (‘on’, ‘IN’), (‘a’, ‘DT’), (‘noble’, ‘JJ’), (‘dream’, ‘NN’), (‘.’, ‘.’)][(‘Tonight’, ‘NN’), (‘we’, ‘PRP’), (‘are’, ‘VBP’), (‘comforted’, ‘VBN’), (‘by’, ‘IN’), (‘the’, ‘DT’), (‘hope’, ‘NN’), (‘of’, ‘IN’), (‘a’, ‘DT’), (‘glad’, ‘JJ’), (‘reunion’, ‘NN’), (‘with’, ‘IN’), (‘the’, ‘DT’), (‘husband’, ‘NN’), (‘who’, ‘WP’), (‘was’, ‘VBD’), (‘taken’, ‘VBN’), (‘so’, ‘RB’), (‘long’, ‘RB’), (‘ago’, ‘RB’), (‘,’, ‘,’), (‘and’, ‘CC’), (‘we’,
‘PRP’), (‘are’, ‘VBP’), (‘grateful’, ‘JJ’), (‘for’, ‘IN’), (‘the’, ‘DT’), (‘good’, ‘JJ’), (‘life’, ‘NN’), (‘of’, ‘IN’), (‘Coretta’, ‘NNP’), (‘Scott’, ‘NNP’), (‘King’, ‘NNP’), (‘.’, ‘.’)][(‘(‘, ‘(‘), (‘Applause’, ‘NNP’), (‘.’, ‘.’), (‘)’, ‘)’)][(‘President’, ‘NNP’), (‘George’, ‘NNP’), (‘W.’, ‘NNP’), (‘Bush’, ‘NNP’), (‘reacts’, ‘VBZ’), (‘to’, ‘TO’), (‘applause’, ‘VB’), (‘during’, ‘IN’), (‘his’, ‘PRP$’), (‘State’, ‘NNP’), (‘of’, ‘IN’), (‘the’, ‘DT’), (‘Union’, ‘NNP’), (‘Address’, ‘NNP’), (‘at’, ‘IN’), (‘the’, ‘DT’), (‘Capitol’, ‘NNP’), (‘,’, ‘,’), (‘Tuesday’, ‘NNP’), (‘,’, ‘,’), (‘Jan’, ‘NNP’), (‘.’, ‘.’)]
Stemming is the process of reducing words to their root forms, mapping a group of words to the same stem even if the stem itself is not a valid word in the Language.
Example:-
The stem (root) is the part of the word to which you add inflectional (changing/deriving) affixes such as -ed, -ize, -s, -de and mis-.
Stemmers remove these morphological affixes from words, leaving only the word stem – which may result in words that are not actual words.
In NLTK there are multiple stemmers, examples being Porter, Porter2, Paice-Husk, and Lovins.
In the tutorial Porter is used, first to demonstrate the basic function on a list of words.
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()
example_words = ["pyhton","pythoner","pyhtoning","pythoned","pythonly"]
for x in example_words:
print (ps.stem(x))
Giving an output of:-
galiquis@raspberrypi: $ python3 ./nltk_tutorial3.py
pyhton
python
pyhton
python
pythonli
It can also be used to stem words in a sentence, using tokenize to pull the sentence into individual words.
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()
new_text = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once"
words = word_tokenize(new_text)
for x in words:
print(ps.stem(x))
Giving an output of:-
galiquis@raspberrypi: $ python3 ./nltk_tutorial3.py
It
is
veri
import
to
be
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
Part 2 focuses on Stop words, those little structural words that humans rely on to make sense of a sentence but which just get in the way of algorithmic analysis. Words such as: a, is, the, it….etc
First we import from Corpus a list of predefined stop words, which using print(stopwords) shows:-
{‘whom’, ‘through’, ‘y’, “hadn’t”, ‘while’, ‘they’, ‘some’, ‘into’, ‘you’, ‘how’, ‘too’, ‘until’, ‘ourselves’, “should’ve”, ‘me’, ‘a’, ‘wouldn’, ‘or’, ‘yours’, ‘ve’, ‘themselves’, “you’ve”, ‘nor’, ‘so’, ‘not’, ‘haven’, ‘those’, ‘needn’, ‘didn’, ‘was’, ‘she’, ‘is’, ‘because’, ‘once’, ‘did’, ‘from’, ‘don’, ‘mustn’, ‘own’, ‘myself’, ‘doing’, ‘have’, “won’t”,
‘wasn’, ‘few’, ‘during’, ‘aren’, ‘out’, ‘having’, ‘both’, ‘who’, ‘all’, ‘d’, ‘which’, ‘for’, ‘if’, ‘her’, ‘any’, “don’t”, ‘won’, ‘between’, ‘your’, ‘ain’, ‘mightn’, “mustn’t”, “you’ll”, ‘hers’, ‘am’, ‘this’, ‘does’, ‘are’, ‘before’, ‘most’, ‘what’, ‘after’, “wouldn’t”, ‘we’, ‘re’, ‘isn’, ‘yourselves’, ‘down’, ‘it’, ‘our’, ‘he’, “shouldn’t”, ‘o’, ‘were’, ‘been’, ‘there’, “isn’t”, ‘but’, ‘yourself’, ‘other’, “couldn’t”, ‘again’, ‘herself’, “mightn’t”, ‘to’, ‘their’, ‘i’, ‘when’, ‘hasn’, “doesn’t”, “needn’t”, ‘same’, ‘m’, ‘its’, “haven’t”, “weren’t”, ‘an’, ‘had’, ‘weren’, ‘shan’, ‘against’, “aren’t”, ‘will’, “you’re”, ‘the’, ‘my’, ‘him’, ‘himself’, ‘s’, ‘ll’, ‘of’, ‘ours’, ‘in’, ‘itself’, ‘about’, ‘as’, ‘than’, ‘couldn’, “shan’t”, “hasn’t”, ‘theirs’, ‘just’, ‘where’, ‘be’, ‘with’, ‘why’, ‘below’, ‘now’, ‘off’, ‘up’, ‘each’, ‘only’, ‘here’, ‘further’, ‘shouldn’, “wasn’t”, ‘on’, “didn’t”, “you’d”, ‘do’, ‘no’, ‘more’, ‘over’, ‘can’, ‘that’, ‘being’, ‘such’, ‘by’, ‘at’, “that’ll”, ‘above’, ‘ma’, “it’s”, ‘should’, ‘these’, ‘has’, “she’s”, ‘very’, ‘t’, ‘under’, ‘them’, ‘doesn’, ‘then’, ‘his’, ‘and’, ‘hadn’}
Then using word_tokenize and a for loop we remove the stop words.
Example code:-
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
example_sentence = "This is an example showing off stop word filtration"
stop_words = set(stopwords.words("english"))
words = word_tokenize(example_sentence)
filtered_sentence = []
for x in words:
if x not in stop_words:
filtered_sentence.append(x)
print(filtered_sentence)
The for loop can be combined into one line of code but it’s not as easy to follow:-
filtered_sentence = [w for w in words if not w in stop_words]
A straightforward way of removing noise from the word lists.
So to learn about sentiment analysis I’m initially going to be working through a series of tutorials by Sentdex on YouTube.
The main focus of this was installing the Natural Language Toolkit (NLTK), which unlike a lot of Python libraries also requires you to download additional data packages (corpora, models and so on) on top of the module itself.
In the vid it recommends running the NLTK downloader to pull all of that data down.
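Something along these lines does the job (the full 'all' download is fairly hefty, so individual packages can be picked from the downloader UI instead):-
import nltk
nltk.download() # opens the interactive downloader so packages can be selected
# or non-interactively, e.g.:
# nltk.download("all")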
Recently I’ve started a project to look at tracking stocks combined with company sentiment – utilising Python running on a little Raspberry Pi. I’ll link to the detailed project posts at a later point.
Anyway the first hurdle I’ve hit was getting NumPy to install correctly with the usual pip install giving the following error when NumPy is called…
ImportError:
Importing the multiarray numpy extension module failed. Most
likely you are trying to import a failed build of numpy.
If you’re working with a numpy git repo, try `git clean -xdf` (removes all
files not under version control). Otherwise reinstall numpy.
Original error was: libf77blas.so.3: cannot open shared object file: No such file or directory
—————————————-
ERROR: Command errored out with exit status 1: /usr/bin/python3 /usr/local/lib/python3.7/dist-packages/pip/_vendor/pep517/_in_process.py prepare_metadata_for_build_wheel /tmp/tmpumibpogm Check the logs for full command output.
I tried a couple of things to fix this…
sudo pip3 install -U numpy
sudo pip3 uninstall -y numpy
sudo pip3 uninstall -y setuptools
sudo pip3 install -U setuptools
sudo pip3 install -U numpy
sudo apt-get remove python-numpy
But to no avail.
So after a little light browsing I came up with an answer: not all of the NumPy dependencies had been installed. I needed to run:-
sudo apt-get install python-dev libatlas-base-dev
Which then allowed NumPy to be reinstalled and worked a treat.
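As a quick sanity check afterwards (my own habit rather than part of the fix), importing NumPy and printing its version confirms the new build loads cleanly:-
import numpy as np
print(np.__version__) # if this prints without the ImportError above, the reinstall worked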