Machine Learning – Tutorial 18

Applying our K Nearest Neighbours Algorithm

https://pythonprogramming.net/testing-our-k-nearest-neighbors-machine-learning-tutorial/

# imports
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random

# define function
def K_nearest_neighbours(data, predict, k=3):
    """Classify ``predict`` by majority vote among its ``k`` nearest points.

    Args:
        data: Mapping of group label -> list of feature vectors that belong
            to that group (the training set).
        predict: Feature vector to classify; it is subtracted element-wise
            from every training vector, so the lengths must match.
        k: Number of nearest neighbours that take part in the vote.
            Must be at least 1.

    Returns:
        The group label that occurs most often among the ``k`` training
        points with the smallest Euclidean distance to ``predict``.

    Raises:
        ValueError: If ``k`` is smaller than 1, or if ``data`` contains no
            training points at all.
    """
    if k < 1:
        # A negative k would make the [:k] slice below silently drop points
        # from the far end instead of selecting the nearest ones.
        raise ValueError('k must be at least 1')
    if len(data) >= k:
        # With k not larger than the number of groups the vote can tie.
        warnings.warn('K is set to value less than total voting groups!')

    target = np.array(predict)  # convert once instead of once per point
    distances = []
    for group, points in data.items():
        for features in points:
            euclidean_distance = np.linalg.norm(np.array(features) - target)
            distances.append((euclidean_distance, group))

    if not distances:
        raise ValueError('data contains no training points')

    # Sort on the distance only. Sorting the whole (distance, group) pair
    # would compare the labels whenever two distances tie, which fails for
    # labels that cannot be ordered against each other. The sort is stable,
    # so tied points keep the order in which the groups appear in ``data``.
    distances.sort(key=lambda pair: pair[0])
    votes = [group for _, group in distances[:k]]

    return Counter(votes).most_common(1)[0][0]

# import data
df = pd.read_csv('breast-cancer-wisconsin.data')
# Gaps in the data are denoted by '?'. Replace them with -99999 so the
# algorithm treats them as outliers and the float conversion below succeeds.
df.replace('?', -99999, inplace=True)
# Drop any useless data - in this case the ID column. `axis` is passed by
# keyword because pandas 2.0 no longer accepts it as a positional argument.
df.drop('id', axis=1, inplace=True)
# Convert every value to float and turn the frame into a list of rows.
full_data = df.astype(float).values.tolist()
random.shuffle(full_data)  # shuffles in place; the return value is None

test_size = 0.2
train_set = {2: [], 4: []}
test_set = {2: [], 4: []}
# Split on an explicit index. Slicing with -int(...) would produce an empty
# training set whenever the test count truncates to 0, since [:-0] is [:0].
split_index = len(full_data) - int(test_size * len(full_data))
train_data = full_data[:split_index]  # everything before the last 20%
test_data = full_data[split_index:]  # last 20%

# The last column of each row is used as the group key; the remaining
# columns are the features. The float labels (e.g. 2.0) hash and compare
# equal to the int keys of the dicts above.
for row in train_data:
    train_set[row[-1]].append(row[:-1])
for row in test_data:
    test_set[row[-1]].append(row[:-1])

correct = 0
total = 0

for group in test_set:
    for features in test_set[group]:
        vote = K_nearest_neighbours(train_set, features, k=5)
        if group == vote:
            correct += 1
        total += 1

if total:
    print('Accuracy', correct/total)
else:
    # Avoid a ZeroDivisionError when the data set is too small to yield
    # any test rows.
    print('Accuracy cannot be computed: the test set is empty')

Leave a Reply