K Nearest Neighbours

Machine Learning – Tutorial 19

Final thoughts on K Nearest Neighbors

https://pythonprogramming.net/final-thoughts-knn-machine-learning-tutorial/

# imports
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random

# define function
def K_nearest_neighbours(data, predict, k=3):
    if len(data) >= k:
        warnings.warn('K is set to a value less than or equal to the total number of voting groups!')
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features)-np.array(predict))
            distances.append([euclidean_distance, group])

    votes = [i[1] for i in sorted(distances)[:k]]
    #print(Counter(votes).most_common(1))
    vote_result = Counter(votes).most_common(1)[0][0]
    confidence = Counter(votes).most_common(1)[0][1] / k # winning votes as a fraction of k
    #print(vote_result, confidence)
    return vote_result, confidence
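
To see how the voting and confidence work, here is a quick worked example with hypothetical votes:

from collections import Counter # already imported above

# suppose k=5 and the five nearest neighbours carry these class labels
votes = [2.0, 2.0, 4.0, 2.0, 4.0]
print(Counter(votes).most_common(1)) # [(2.0, 3)] - class 2.0 wins with 3 votes
# so vote_result = 2.0 and confidence = 3/5 = 0.6
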
accuracies = []

for _ in range(25): # run the whole experiment 25 times; '_' avoids shadowing the 'i' used below

    # import data
    df = pd.read_csv('breast-cancer-wisconsin.data')
    # gaps in the data are denoted by '?' - these need converting to -99999 so the algorithm treats them as outliers
    df.replace('?',-99999,inplace=True)
    # drop any useless data - in this case the ID
    df.drop(columns='id',inplace=True) # the positional 'axis' argument is removed in newer pandas, so use the keyword form
    #print(df)
    # convert everything in the list to a number
    full_data = df.astype(float).values.tolist()
    #print(full_data[:5]) # print first 5 rows
    random.shuffle(full_data) # shuffles in place and returns None, so full_data = random.shuffle(full_data) would store None

    test_size = 0.2
    train_set = {2:[],4:[]}
    test_set = {2:[],4:[]}
    train_data = full_data[:-int(test_size*len(full_data))] # slicing the full data set by the test_size
    test_data = full_data[-int(test_size*len(full_data)):] # last 20%

    for i in train_data:
        train_set[i[-1]].append(i[:-1]) # i[-1] is the last element - the class label
    for i in test_data:
        test_set[i[-1]].append(i[:-1]) # i[:-1] is everything except the label - the features

    correct = 0
    total = 0

    for group in test_set:
        for data in test_set[group]:
            vote, confidence = K_nearest_neighbours(train_set, data, k=5)
            if group == vote:
                correct +=1
        #    else:
        #        print(confidence)
            total +=1

    #print('Accuracy', correct/total)
    accuracies.append(correct / total)

print(sum(accuracies) / len(accuracies) * 100)
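
As a final sanity check, the same data can be pushed through sklearn's KNeighborsClassifier (exactly as Tutorial 14 below does) - a minimal sketch, assuming the same CSV file is present; the accuracies should land in the same ballpark:

import numpy as np
import pandas as pd
from sklearn import neighbors
from sklearn.model_selection import train_test_split

df = pd.read_csv('breast-cancer-wisconsin.data')
df.replace('?', -99999, inplace=True)
df.drop(columns='id', inplace=True)
X = np.array(df.drop(columns=['class']))
y = np.array(df['class'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = neighbors.KNeighborsClassifier()
clf.fit(X_train, y_train)
print('sklearn accuracy:', clf.score(X_test, y_test))
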

Machine Learning – Tutorial 18

Applying our K Nearest Neighbours Algorithm

https://pythonprogramming.net/testing-our-k-nearest-neighbors-machine-learning-tutorial/

# imports
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random

# define function
def K_nearest_neighbours(data, predict, k=3):
    if len(data) >= k:
        warnings.warn('K is set to a value less than or equal to the total number of voting groups!')
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features)-np.array(predict))
            distances.append([euclidean_distance, group])

    votes = [i[1] for i in sorted(distances)[:k]]
    #print(Counter(votes).most_common(1))
    vote_result = Counter(votes).most_common(1)[0][0]

    return vote_result

# import data
df = pd.read_csv('breast-cancer-wisconsin.data')
# gaps in the data are denoted by '?' - these need converting to -99999 so the algorithm treats them as outliers
df.replace('?',-99999,inplace=True)
# drop any useless data - in this case the ID
df.drop(columns='id',inplace=True) # the positional 'axis' argument is removed in newer pandas, so use the keyword form
#print(df)
# convert everything in the list to a number
full_data = df.astype(float).values.tolist()
#print(full_data[:5]) # print first 5 rows
random.shuffle(full_data) # shuffles in place and returns None, so full_data = random.shuffle(full_data) would store None

test_size = 0.2
train_set = {2:[],4:[]}
test_set = {2:[],4:[]}
train_data = full_data[:-int(test_size*len(full_data))] # slicing the full data set by the test_size
test_data = full_data[-int(test_size*len(full_data)):] # last 20%
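
A quick sanity check on the slicing arithmetic (assuming the usual 699-row Wisconsin file - adjust if yours differs):

# int(0.2 * 699) = 139
# full_data[:-139] -> the first 560 rows become the training data
# full_data[-139:] -> the last 139 rows become the test data
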

for i in train_data:
    train_set[i[-1]].append(i[:-1]) # i[-1] is the last element - the class label
for i in test_data:
    test_set[i[-1]].append(i[:-1]) # i[:-1] is everything except the label - the features

correct = 0
total = 0

for group in test_set:
    for data in test_set[group]:
        vote = K_nearest_neighbours(train_set, data, k=5)
        if group == vote:
            correct +=1
        total +=1

print('Accuracy', correct/total)

Machine Learning – Tutorial 17

Writing our own K Nearest Neighbours in Code

https://pythonprogramming.net/coding-k-nearest-neighbors-machine-learning-tutorial/

# imports
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import warnings
from collections import Counter

# To change the default look of the charts we need to change the default behaviour
from matplotlib import style # import style to change default behaviour of plots
style.use('ggplot') # use ggplot

dataset = {'k':[[1,2],[2,3],[3,1]], 'r':[[6,5],[7,7],[8,6]]} # a dictionary of 2 classes ('k' & 'r'), each with 3 data points of 2 features (lists of lists)
new_features = [5,7]

## Expanded one line for loop
#for i in dataset:
#    for ii in dataset[i]:
#        plt.scatter(ii[0],ii[1],s=100, color=i)


# define function
def K_nearest_neighbours(data, predict, k=3):
    if len(data) >= k:
        warnings.warn('K is set to a value less than or equal to the total number of voting groups!')
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features)-np.array(predict))
            distances.append([euclidean_distance, group])

    votes = [i[1] for i in sorted(distances)[:k]]
    print(Counter(votes).most_common(1))
    vote_result = Counter(votes).most_common(1)[0][0]

    return vote_result

# generate results
results = K_nearest_neighbours(dataset, new_features, k=3)
print(results)
[[plt.scatter(ii[0],ii[1],s=100,color=i) for ii in dataset[i]] for i in dataset] # one line for loop
plt.scatter(new_features[0], new_features[1], color=results,s=100)
plt.show()

Machine Learning – Tutorial 16

Creating Our K Nearest Neighbors Algorithm 

https://pythonprogramming.net/programming-k-nearest-neighbors-machine-learning-tutorial/

# imports
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import warnings
from collections import Counter

# To change the default look of the charts we need to change the default behaviour
from matplotlib import style # import style to change default behaviour of plots
style.use('ggplot') # use ggplot

dataset = {'k':[[1,2],[2,3],[3,1]], 'r':[[6,5],[7,7],[8,6]]} # a dictionary of 2 classes ('k' & 'r'), each with 3 data points of 2 features (lists of lists)
new_features = [5,7]

## Expanded one line for loop
#for i in dataset:
#    for ii in dataset[i]:
#        plt.scatter(ii[0],ii[1],s=100, color=i)
[[plt.scatter(ii[0],ii[1],s=100,color=i) for ii in dataset[i]] for i in dataset] # one line for loop
plt.show()

def K_nearest_neighbours(data, predict, k=3):
    if len(data) >= k:
        warnings.warn('K is set to a value less than or equal to the total number of voting groups!')

    vote_result = None # placeholder so the stub runs - the distance calculation and voting are added in Tutorial 17 (above)
    return vote_result
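
A quick hypothetical call to confirm the warning fires - with the 2 groups in the dataset above, k=2 trips the condition (the stub just returns None for now):

result = K_nearest_neighbours(dataset, new_features, k=2) # issues a UserWarning since len(data) >= k
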

Machine Learning – Tutorial 15

Euclidean Distance

https://pythonprogramming.net/euclidean-distance-machine-learning-tutorial/

 

The Euclidean distance formula - this stuff makes my head hurt….just look at it….WTF man!

d(q, p) = sqrt( (q1-p1)^2 + (q2-p2)^2 + … + (qn-pn)^2 )

Basically, it’s just the square root of the sum of the squared differences between the points’ coordinates….easy

So in Python this translates into:-

from math import sqrt

plot1 = [1,3]
plot2 = [2,5]
euclidean_distance = sqrt( (plot1[0]-plot2[0])**2 + (plot1[1]-plot2[1])**2 )
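
The later tutorials swap this hand-written two-dimensional version for numpy's norm, which computes the same value for any number of dimensions - a quick sketch:

import numpy as np

plot1 = np.array([1,3])
plot2 = np.array([2,5])
euclidean_distance = np.linalg.norm(plot1 - plot2) # sqrt((1-2)**2 + (3-5)**2) = sqrt(5), same answer as above
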

Machine Learning – Tutorial 14

Applying K Nearest Neighbors to Data

https://pythonprogramming.net/k-nearest-neighbors-application-machine-learning-tutorial/

Data links:-

 

# import libs
import numpy as np
from sklearn import preprocessing, neighbors
# cross_validation is deprecated and train_test_split has moved into model_selection
from sklearn.model_selection import train_test_split
import pandas as pd

df = pd.read_csv('breast-cancer-wisconsin.data')
# gaps in the data are denoted by '?' - these need converting to -99999 so the algorithm treats them as outliers
df.replace('?',-99999,inplace=True)
# drop any useless data - in this case the ID
df.drop(columns='id',inplace=True) # the positional 'axis' argument is removed in newer pandas, so use the keyword form
#print(df)

# define X & y (X for features; y for labels)
# X is everything except 'class'
# In the datafile I had a space after 'class' which caused errors
X = np.array(df.drop(columns=['class']))
y = np.array(df['class'])

# split the data into train and test datasets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#define classifier (clf)
clf = neighbors.KNeighborsClassifier()
# fit the classifier
clf.fit(X_train, y_train)

accuracy = clf.score(X_test, y_test)

print(accuracy)

# important: the input array needs to be 2D, so double brackets are used rather than reshaping the array
example_measures = np.array([[4,2,1,1,1,2,3,2,1],[4,2,1,2,2,2,3,2,1]])
prediction = clf.predict(example_measures)
print(prediction)
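
If there is only a single sample to classify, reshaping works instead of the double brackets - a minimal sketch reusing the classifier trained above:

single_measure = np.array([4,2,1,1,1,2,3,2,1]).reshape(1, -1) # 1 row, as many columns as needed
print(clf.predict(single_measure))
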