import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib import style
import time
style.use("ggplot")
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
def animate(i):
    pullData = open("twitter-out.txt","r").read()  # sentiment log written by the Twitter stream
    lines = pullData.split('\n')
    xar = []
    yar = []
    x = 0
    y = 0
    for l in lines[-200:]:  # only chart the most recent 200 classifications
        x += 1
        if "pos" in l:
            y += 1   # running tally: up for a positive line...
        elif "neg" in l:
            y -= 1   # ...down for a negative one
        xar.append(x)
        yar.append(y)
    ax1.clear()
    ax1.plot(xar,yar)
ani = animation.FuncAnimation(fig, animate, interval=1000)
plt.show()
As I’m running this on a headless server it ran into issues straight away…
galiquis@localhost: $ python3 nltk_tutorial21.py
Unable to init server: Could not connect: Connection refused
Unable to init server: Could not connect: Connection refused
So after a little research I found a way of switching the matplotlib backend (the ‘canvas’) to Agg – allowing the output to be saved to a file rather than shown on screen.
import matplotlib as mpl
mpl.use('Agg')
It’s important that this is set before importing pyplot or making any other canvas calls – otherwise it throws errors.
The other tweaks I made just switched off the animation for now.
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.animation as animation
#from matplotlib import style
import time
#style.use("ggplot")
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
def animate(i):
    pullData = open("twitter-out.txt","r").read()
    lines = pullData.split('\n')
    xar = []
    yar = []
    x = 0
    y = 0
    for l in lines[-200:]:
        x += 1
        if "pos" in l:
            y += 1
        elif "neg" in l:
            y -= 1
        xar.append(x)
        yar.append(y)
    ax1.clear()
    ax1.plot(xar,yar)
#ani = animation.FuncAnimation(fig, animate, interval=100)
animate(1)
fig.savefig('temp.png')
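With FuncAnimation commented out the chart is only rendered once. A simple workaround (my own sketch, not part of the original tutorial) is to loop and re-render on a timer, which approximates the 1000ms interval the animation would have used:

# Hypothetical polling loop: redraw and save the figure every second,
# roughly what FuncAnimation(interval=1000) would do on a display.
while True:
    animate(0)
    fig.savefig('temp.png')
    time.sleep(1)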
This required setting up a developer account – with more justification needed in the application form than I was expecting, especially around what I’d be using the app for… Anyway, once generated it gave a live stream of Twitter based on this code:-
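The streaming code itself isn’t reproduced here. For context, a minimal reconstruction using Tweepy 3.x (the class names changed in Tweepy 4) might look like the following – ckey, csecret, atoken and asecret stand in for the developer-account credentials, sentiment_mod is assumed to be the module built in the section below, and the track keyword is arbitrary:

import json
import tweepy
import sentiment_mod as s  # assumed: module wrapping the pickled classifiers

class Listener(tweepy.StreamListener):
    def on_data(self, data):
        tweet = json.loads(data)["text"]
        label, conf = s.sentiment(tweet)
        if conf >= 0.8:  # only log confident classifications
            with open("twitter-out.txt", "a") as f:
                f.write(label + "\n")
        return True

    def on_error(self, status):
        print(status)

# ckey, csecret, atoken, asecret: placeholders for the API credentials
auth = tweepy.OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
tweepy.Stream(auth, Listener()).filter(track=["happy"])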
This section brings all the detail together to create a module that can be used to monitor Twitter sentiment.
The code for this is separated into two blocks.
The first pickles most of the heavy training to save time in future iterations:-
import nltk
import random
#from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
#from unidecode import unidecode
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf
short_pos = open("positive.txt","r", encoding='utf-8', errors='replace').read() ## had to add a line to tell the open function to use utf-8
short_neg = open("negative.txt","r", encoding='utf-8', errors='replace').read()
# move this up here
all_words = []
documents = []
# J is adjective, R is adverb, and V is verb
#allowed_word_types = ["J","R","V"]
allowed_word_types = ["J"]
for p in short_pos.split('\n'):
    documents.append( (p, "pos") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

for p in short_neg.split('\n'):
    documents.append( (p, "neg") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())
save_documents = open("pickled_algos/documents.pickle","wb")
pickle.dump(documents, save_documents)
save_documents.close()
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:5000]
save_word_features = open("pickled_algos/word_features5k.pickle","wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
featuresets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featuresets)
print(len(featuresets))
testing_set = featuresets[7400:] # memory limitations on the Pi meant this needed reducing from 10,000
training_set = featuresets[:7400]
save_featuresets = open("pickled_algos/featuresets.pickle","wb") ## added code to pickle featuresets
pickle.dump(featuresets, save_featuresets)
save_featuresets.close()
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)
###############
save_classifier = open("pickled_algos/originalnaivebayes5k.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)
save_classifier = open("pickled_algos/MNB_classifier5k.pickle","wb")
pickle.dump(MNB_classifier, save_classifier)
save_classifier.close()
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)
save_classifier = open("pickled_algos/BernoulliNB_classifier5k.pickle","wb")
pickle.dump(BernoulliNB_classifier, save_classifier)
save_classifier.close()
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)
save_classifier = open("pickled_algos/LogisticRegression_classifier5k.pickle","wb")
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)
save_classifier = open("pickled_algos/LinearSVC_classifier5k.pickle","wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()
##NuSVC_classifier = SklearnClassifier(NuSVC())
##NuSVC_classifier.train(training_set)
##print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)
SGDC_classifier = SklearnClassifier(SGDClassifier())
SGDC_classifier.train(training_set)
print("SGDClassifier accuracy percent:",nltk.classify.accuracy(SGDC_classifier, testing_set)*100)
save_classifier = open("pickled_algos/SGDC_classifier5k.pickle","wb")
pickle.dump(SGDC_classifier, save_classifier)
save_classifier.close()
I had to drop the size of the learning set given the memory restrictions on the Pi. I’m not sure there was another way around this, and it probably means that to start doing some of this stuff seriously… I’m going to need a bigger/better server.
Also I added a section to pickle the featuresets, as shown in the code above.
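The second of the two blocks (the sentiment module itself) isn’t shown above. A minimal sketch of the loading side – assuming the pickle file names used in the training script and the VoteClassifier class defined earlier – would be:

import pickle
from nltk.tokenize import word_tokenize

def load(path):
    with open(path, "rb") as f:
        return pickle.load(f)

# Reload everything the training script pickled.
word_features = load("pickled_algos/word_features5k.pickle")
classifier = load("pickled_algos/originalnaivebayes5k.pickle")
MNB_classifier = load("pickled_algos/MNB_classifier5k.pickle")
BernoulliNB_classifier = load("pickled_algos/BernoulliNB_classifier5k.pickle")
LogisticRegression_classifier = load("pickled_algos/LogisticRegression_classifier5k.pickle")
LinearSVC_classifier = load("pickled_algos/LinearSVC_classifier5k.pickle")
SGDC_classifier = load("pickled_algos/SGDC_classifier5k.pickle")

def find_features(document):
    words = word_tokenize(document)
    return {w: (w in words) for w in word_features}

# VoteClassifier is the class defined earlier; note that with an even number
# of voters statistics.mode can raise StatisticsError on a tie (pre-Python 3.8).
voted_classifier = VoteClassifier(classifier, MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier,
                                  LinearSVC_classifier, SGDC_classifier)

def sentiment(text):
    feats = find_features(text)
    return voted_classifier.classify(feats), voted_classifier.confidence(feats)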
This tutorial covers training the algorithm on a new, more tailored data-set. The training data still covers movie reviews, but ones that are a lot shorter – which should give better results.
In this section we combine the classifiers from the previous steps into a voting classifier that will give an aggregated confidence level. Each algorithm gets one vote, and the classification with the most votes is the chosen one.
To do this we’ll import the modules we need:-
from nltk.classify import ClassifierI
from statistics import mode # to allow us to add up and compare votes
Next build a VoteClassifier class:-
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf
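As a quick sanity check of the voting logic, here’s what it does with some hypothetical stub classifiers (not part of the tutorial): three voters split two-to-one, so classify() returns the majority label and confidence() returns 2/3. Using an odd number of classifiers also avoids ties, which would make statistics.mode raise a StatisticsError on Python versions before 3.8.

class Stub:
    """Hypothetical classifier that always returns a fixed label."""
    def __init__(self, label):
        self.label = label
    def classify(self, features):
        return self.label

vc = VoteClassifier(Stub("pos"), Stub("pos"), Stub("neg"))
print(vc.classify({}))     # pos
print(vc.confidence({}))   # 0.666...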
And then combine it all:-
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:3000]
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
#print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:1900]
testing_set = featuresets[1900:]
#classifier = nltk.NaiveBayesClassifier.train(training_set)
classifier_f = open("naivebayes.pickle","rb")
classifier = pickle.load(classifier_f)
classifier_f.close()
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)
##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(training_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)
voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)
print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)
One issue that did crop up on my system was the error:-
/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
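The warning comes from scikit-learn’s lbfgs solver hitting its iteration cap before converging. One simple remedy – a sketch, where max_iter=1000 is an arbitrary choice and scaling the data is the alternative the message suggests – is to raise the cap when constructing the classifier:

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression

# Give the lbfgs solver more iterations to converge (the default is 100).
LogisticRegression_classifier = SklearnClassifier(LogisticRegression(max_iter=1000))
LogisticRegression_classifier.train(training_set)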
The scikit-learn (sklearn) module adds a bunch of other classifiers as well as machine learning algorithms – so although NLTK has its own Naive Bayes algorithm, it’s straightforward to incorporate others from scikit-learn.
As with most Python modules this involves importing the various parts before using them.
To import:-
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
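Each estimator then gets wrapped in SklearnClassifier so it accepts the same NLTK-style feature dictionaries, and the pattern is identical for every algorithm – a minimal sketch, assuming training_set and testing_set are built as in the sections below:

import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB

# Wrap the sklearn estimator, train it on NLTK (features, label) pairs,
# then score it with NLTK's accuracy helper.
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB accuracy:", nltk.classify.accuracy(MNB_classifier, testing_set) * 100)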
Once a classifier has been trained it’s often quicker in the long run to save the algorithm for re-use later, rather than training it each and every time.
import nltk
import random
import pickle
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower()) # normalise everything to lower case and append
all_words = nltk.FreqDist(all_words) # converts to a nltk frequency distribution
word_features = list(all_words.keys())[:3000] # from the frequency list we're taking just the words(keys) and only the top 3000
def find_features(document):
    words = set(document) # this gives a list of the unique words - removes duplicates
    features = {} # declare an empty dictionary
    for w in word_features:
        features[w] = (w in words) # checks each of the top 3000 words against the passed text 'document', giving a true/false for each
    return features
# print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:1900] # splits the featuresets into two separate groups, one to train and the other to test
testing_set = featuresets[1900:]
## Naive Bayes algorithm
# classifier = nltk.NaiveBayesClassifier.train(training_set) # training the NaiveBayesClassifier on training data; commented out once naivebayes.pickle is generated
classifier_f = open("naivebayes.pickle","rb")
classifier = pickle.load(classifier_f)
classifier_f.close()
print("Naive Bayes Algo accuracy:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15) # shows the most informative words on both sides and whether +ve or -ve
## Pickle allows you to save Python objects and load them again later
# save_classifier = open("naivebayes.pickle","wb") # commented out once naivebayes.pickle has been generated
# pickle.dump(classifier, save_classifier)
# save_classifier.close()
But first the data needs to be split into training and test sets for some supervised machine learning. In essence we show the machine data, telling it “this data is positive” or “this data is negative”. Then, after the training is done, we show the machine some new data and ask it what it thinks the category of the new data is.
import nltk
import random
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower()) # normalise everything to lower case and append
all_words = nltk.FreqDist(all_words) # converts to a nltk frequency distribution
word_features = list(all_words.keys())[:3000] # from the frequency list we're taking just the words(keys) and only the top 3000
def find_features(document):
    words = set(document) # this gives a list of the unique words - removes duplicates
    features = {} # declare an empty dictionary
    for w in word_features:
        features[w] = (w in words) # checks each of the top 3000 words against the passed text 'document', giving a true/false for each
    return features
# print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:1900] # splits the featuresets into two separate groups, one to train and the other to test
testing_set = featuresets[1900:]
## Naive Bayes algorithm
classifier = nltk.NaiveBayesClassifier.train(training_set) # training the NaiveBayesClassifier on training data
print("Naive Bayes Algo accuracy:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15) # shows the most informative words on both sides and whether +ve or -ve
In order to apply some machine learning to the algorithm we need to identify features in the data (words).
This section covers compiling feature lists from positive and negative reviews, to hopefully see trends.
import nltk
import random
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower()) # normalise everything to lower case and append
all_words = nltk.FreqDist(all_words) # converts to a nltk frequency distribution
word_features = list(all_words.keys())[:3000] # from the frequency list we're taking just the words(keys) and only the top 3000
def find_features(document):
    words = set(document) # this gives a list of the unique words - removes duplicates
    features = {} # declare an empty dictionary
    for w in word_features:
        features[w] = (w in words) # checks each of the top 3000 words against the passed text 'document', giving a true/false for each
    return features
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents]