Natural Language Tool Kit – Tutorial 16

Combining Algorithms with NLTK

In this section we combine the classifiers from previous steps into a voting classifier that will give an aggregated confidence level. Each algorithm gets one vote, and the classification that has the most votes is the chosen one.

To do this we’ll import the modules we need:-

from nltk.classify import ClassifierI
from statistics import mode # to allow us to add up and compare votes

Next build a VoteClassifier class:-

class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over several already-trained NLTK classifiers."""

    def __init__(self, *classifiers):
        # Store the wrapped classifiers; each gets exactly one vote.
        self._classifiers = classifiers

    def classify(self, features):
        """Return the label chosen by the majority of the classifiers."""
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        # mode() picks the most common vote (on Python < 3.8 it raises
        # StatisticsError on an exact tie).
        return mode(votes)

    def confidence(self, features):
        """Return the fraction of classifiers that agree with the winning label."""
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

And then combine it all:-

import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over several already-trained NLTK classifiers."""

    def __init__(self, *classifiers):
        # Store the wrapped classifiers; each gets exactly one vote.
        self._classifiers = classifiers

    def classify(self, features):
        """Return the label chosen by the majority of the classifiers."""
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        # mode() picks the most common vote (on Python < 3.8 it raises
        # StatisticsError on an exact tie).
        return mode(votes)

    def confidence(self, features):
        """Return the fraction of classifiers that agree with the winning label."""
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

# Build (word_list, label) pairs for every review in the movie_reviews corpus.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle so training/test splits are not all-neg followed by all-pos.
random.shuffle(documents)

# Count every word in the corpus, lower-cased.
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

# Take the 3000 most frequent words as the feature vocabulary.
# NOTE: the original used list(all_words.keys())[:3000], but FreqDist keys
# are in insertion order, not frequency order — most_common() is correct.
word_features = [w for (w, _count) in all_words.most_common(3000)]

def find_features(document):
    """Map each vocabulary word to True/False for its presence in *document*.

    *document* is an iterable of tokens; the returned dict has one boolean
    entry per word in the module-level ``word_features`` vocabulary.
    """
    words = set(document)  # set membership is O(1) per lookup
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

#print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

# Convert every document to a (feature_dict, label) pair.
featuresets = [(find_features(rev), category) for (rev, category) in documents]

training_set = featuresets[:1900]
testing_set = featuresets[1900:]

#classifier = nltk.NaiveBayesClassifier.train(training_set)

# Load the Naive Bayes classifier pickled in the previous tutorial.
# The original opened and closed the file without ever calling pickle.load,
# leaving `classifier` undefined.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

# Wrap each scikit-learn estimator in SklearnClassifier so it speaks the
# NLTK classifier interface, then train and score it.
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(training_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

# Seven classifiers -> an odd vote count, so a simple majority always exists.
voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

# Classify individual reviews: pass the feature dict testing_set[i][0],
# not the whole testing_set list as the original did.
for i in range(6):
    feats = testing_set[i][0]
    print("Classification:", voted_classifier.classify(feats),
          "Confidence %:", voted_classifier.confidence(feats)*100)

Output:-

Original Naive Bayes Algo accuracy percent: 66.0
Most Informative Features
thematic = True pos : neg = 9.1 : 1.0
secondly = True pos : neg = 8.5 : 1.0
narrates = True pos : neg = 7.8 : 1.0
layered = True pos : neg = 7.1 : 1.0
rounded = True pos : neg = 7.1 : 1.0
supreme = True pos : neg = 7.1 : 1.0
crappy = True neg : pos = 6.9 : 1.0
uplifting = True pos : neg = 6.2 : 1.0
ugh = True neg : pos = 5.3 : 1.0
gaining = True pos : neg = 5.1 : 1.0
mamet = True pos : neg = 5.1 : 1.0
wanda = True neg : pos = 4.9 : 1.0
onset = True neg : pos = 4.9 : 1.0
fantastic = True pos : neg = 4.5 : 1.0
milos = True pos : neg = 4.4 : 1.0
MNB_classifier accuracy percent: 67.0
BernoulliNB_classifier accuracy percent: 67.0
LogisticRegression_classifier accuracy percent: 68.0
SGDClassifier_classifier accuracy percent: 57.99999999999999
LinearSVC_classifier accuracy percent: 67.0
NuSVC_classifier accuracy percent: 65.0
voted_classifier accuracy percent: 65.0
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 57.14285714285714
Classification: neg Confidence %: 57.14285714285714
Classification: neg Confidence %: 57.14285714285714
Classification: pos Confidence %: 57.14285714285714
Classification: pos Confidence %: 85.71428571428571

One issue that did crop up on my system was the error:-

/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)