Final thoughts on K Nearest Neighbors
https://pythonprogramming.net/final-thoughts-knn-machine-learning-tutorial/
# imports
import random
import warnings
from collections import Counter
from math import sqrt

import numpy as np
import pandas as pd


# define function
def K_nearest_neighbours(data, predict, k=3):
    """Classify ``predict`` by majority vote among its ``k`` nearest points.

    Args:
        data: Mapping of group label -> list of feature vectors belonging
            to that group.
        predict: Feature vector to classify.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        A ``(vote_result, confidence)`` tuple: the label that received the
        most votes, and that label's vote count divided by ``k``.

    Warns:
        UserWarning: when the number of groups in ``data`` is greater than
            or equal to ``k``.
    """
    if len(data) >= k:
        warnings.warn('K is set to value less than total voting groups!')

    # Convert the query point once instead of once per training point.
    target = np.array(predict)

    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - target)
            distances.append([euclidean_distance, group])

    # Sorting the [distance, group] pairs orders by distance first; equal
    # distances fall back to comparing the group labels.
    votes = [group for _, group in sorted(distances)[:k]]

    # Count once and unpack both the winning label and its vote count.
    vote_result, vote_count = Counter(votes).most_common(1)[0]
    confidence = vote_count / k
    return vote_result, confidence


def _load_rows(path):
    """Read the CSV at ``path`` and return its rows as lists of floats."""
    df = pd.read_csv(path)
    # Gaps in the data are denoted by '?'; replace them with -99999 so the
    # algorithm treats them as outliers.
    df.replace('?', -99999, inplace=True)
    # Drop the 'id' column. ``columns=`` is used because pandas 2.0 no
    # longer accepts ``axis`` as a positional argument to ``drop``.
    df.drop(columns=['id'], inplace=True)
    return df.astype(float).values.tolist()


def _split_by_class(rows, test_size):
    """Split ``rows`` into train/test dicts keyed by class label (2 or 4).

    The last ``int(test_size * len(rows))`` rows form the test set. The last
    value of each row is its class label; the remaining values are features.
    """
    n_test = int(test_size * len(rows))
    # Slice on an explicit index: ``rows[:-0]`` would be empty, which would
    # leave no training data whenever n_test rounds down to zero.
    cut = len(rows) - n_test

    train_set = {2: [], 4: []}
    test_set = {2: [], 4: []}
    for row in rows[:cut]:
        train_set[row[-1]].append(row[:-1])
    for row in rows[cut:]:
        test_set[row[-1]].append(row[:-1])
    return train_set, test_set


def _accuracy(train_set, test_set, k):
    """Return the fraction of ``test_set`` points classified correctly."""
    correct = 0
    total = 0
    for group in test_set:
        for features in test_set[group]:
            vote, _confidence = K_nearest_neighbours(train_set, features, k=k)
            if group == vote:
                correct += 1
            total += 1
    return correct / total


def main():
    """Run 25 shuffled train/test trials and print the mean accuracy in %."""
    # The file contents do not change between trials, so load them once.
    full_data = _load_rows('breast-cancer-wisconsin.data')

    accuracies = []
    for _ in range(25):
        # Shuffle a fresh copy so every trial starts from the file order.
        # random.shuffle works in place and returns None.
        rows = list(full_data)
        random.shuffle(rows)

        train_set, test_set = _split_by_class(rows, test_size=0.2)
        accuracies.append(_accuracy(train_set, test_set, k=5))

    print((sum(accuracies)/len(accuracies)*100))


if __name__ == '__main__':
    main()