Words as Features for Learning
In order to apply machine learning we first need to identify features in the data (the words).
This section covers compiling feature lists from positive and negative reviews, to hopefully see trends.
import nltk
import random
from nltk.corpus import movie_reviews
# Build the labelled corpus: one (word_list, category) pair per review file,
# pairing each file's words with the corpus category it is listed under.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
# The list above is grouped by category; shuffle it in place to mix them.
random.shuffle(documents)

# Count every word in the corpus, lower-cased so case variants share a count.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Use the 3000 most frequent words as the feature vocabulary.
# most_common() is needed here: in NLTK 3 FreqDist behaves like
# collections.Counter, so slicing keys() would yield 3000 words in
# first-seen order rather than the 3000 highest counts.
word_features = [word for word, _count in all_words.most_common(3000)]
def find_fetures(document):
    """Map each vocabulary word to whether it appears in ``document``.

    Args:
        document: An iterable of hashable word tokens.

    Returns:
        A dict with one key per entry of the module-level ``word_features``
        list; the value is True when that word occurs in ``document`` and
        False otherwise.
    """
    # A set drops duplicate tokens and keeps each membership test cheap.
    present = set(document)
    return {word: (word in present) for word in word_features}
# Print the feature dictionary produced for a single review file.
print(find_fetures(movie_reviews.words('neg/cv000_29416.txt')))

# Pair each document's feature dictionary with that document's category.
featuresets = [
    (find_fetures(review_words), label)
    for review_words, label in documents
]