# Final thoughts on K Nearest Neighbors
# https://pythonprogramming.net/final-thoughts-knn-machine-learning-tutorial/
# imports
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random
# define function
def K_nearest_neighbours(data, predict, k=3):
    """Classify ``predict`` by majority vote among its ``k`` nearest points.

    Args:
        data: Mapping of group label -> list of feature vectors belonging to
            that group (the training set).
        predict: Feature vector to classify; it must have the same length as
            the training feature vectors.
        k: Number of nearest neighbours that get a vote.

    Returns:
        A ``(vote_result, confidence)`` tuple: the group label with the most
        votes and the fraction of the cast votes that went to that label.

    Raises:
        IndexError: If there are no votes to count, e.g. ``data`` holds no
            feature vectors.
    """
    if len(data) >= k:
        warnings.warn('K is set to value less than total voting groups!')

    # Convert the query point once instead of once per training point.
    target = np.array(predict)

    distances = []
    for group, points in data.items():
        for features in points:
            euclidean_distance = np.linalg.norm(np.array(features) - target)
            distances.append((euclidean_distance, group))

    # Sorting the (distance, group) pairs orders by distance first; equal
    # distances fall back to comparing the group labels.
    votes = [group for _, group in sorted(distances)[:k]]

    # Count the votes once and unpack both the winner and its tally.
    vote_result, vote_count = Counter(votes).most_common(1)[0]

    # Divide by the number of votes actually cast: when fewer than k training
    # points exist, dividing by k would understate the confidence.
    confidence = vote_count / len(votes)
    return vote_result, confidence
# Load and clean the dataset once: it is identical for every trial, so there
# is no need to re-read the file on each iteration.
df = pd.read_csv('breast-cancer-wisconsin.data')
# Gaps in the data are denoted by '?'; convert them to -99999 so the
# algorithm treats them as outliers.
df.replace('?', -99999, inplace=True)
# Drop the useless ID column. ``axis`` is passed by keyword because pandas 2.0
# removed the positional form ``df.drop('id', 1)``.
df.drop('id', axis=1, inplace=True)
# Convert everything to floats; each row is [features..., class label].
all_rows = df.astype(float).values.tolist()

test_size = 0.2  # fraction of the rows held out for testing
accuracies = []
for trial in range(25):
    # Shuffle a fresh copy so every trial starts from the same row order.
    full_data = list(all_rows)
    random.shuffle(full_data)  # shuffles in place, returns None

    # Rows are bucketed by their class label (the last column): 2 or 4.
    train_set = {2: [], 4: []}
    test_set = {2: [], 4: []}

    # Split on an explicit index rather than a negative slice, so a test
    # share that rounds down to zero rows does not empty the training set.
    split = len(full_data) - int(test_size * len(full_data))
    train_data = full_data[:split]  # first 80%
    test_data = full_data[split:]  # last 20%

    for row in train_data:
        train_set[row[-1]].append(row[:-1])  # label -> feature vector
    for row in test_data:
        test_set[row[-1]].append(row[:-1])

    correct = 0
    total = 0
    for group in test_set:
        for features in test_set[group]:
            vote, confidence = K_nearest_neighbours(train_set, features, k=5)
            if group == vote:
                correct += 1
            total += 1
    accuracies.append(correct / total)

# Mean accuracy over all trials, as a percentage.
print(sum(accuracies) / len(accuracies) * 100)