# Machine Learning – Tutorial 19

### Final thoughts on K Nearest Neighbors

https://pythonprogramming.net/final-thoughts-knn-machine-learning-tutorial/

```python
# imports
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random

# define function
def K_nearest_neighbours(data, predict, k=3):
    """Classify ``predict`` by majority vote among its k nearest neighbours.

    Args:
        data: dict mapping each class label to a list of feature vectors.
        predict: feature vector to classify (same length as the vectors in
            ``data``).
        k: number of nearest neighbours that get a vote.

    Returns:
        A ``(vote_result, confidence)`` tuple: the winning class label and
        the fraction of the votes cast that went to that label.

    Raises:
        ValueError: if ``data`` contains no feature vectors at all.
    """
    if len(data) >= k:
        warnings.warn('K is set to value less than total voting groups!')

    # Distance from the query point to every known point, tagged with its class.
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])

    # Sort on the distance only, so class labels are never compared with each
    # other on ties, then keep just the labels of the k closest points.
    nearest = sorted(distances, key=lambda pair: pair[0])[:k]
    votes = [group for _, group in nearest]
    if not votes:
        raise ValueError('data contains no feature vectors to vote with')

    vote_result, winning_votes = Counter(votes).most_common(1)[0]
    # Divide by the votes actually cast; this equals k unless fewer than k
    # points were available.
    confidence = winning_votes / len(votes)
    return vote_result, confidence
# Load and clean the data once; only the shuffle / split / score steps need
# repeating for each trial (dropping 'id' a second time would raise KeyError).
# NOTE(review): the load step was missing from this listing. The filename
# follows the linked tutorial - point it at your local copy of the data.
df = pd.read_csv('breast-cancer-wisconsin.data.txt')
# there are gaps in the data denoted by '?' - these need to be converted to
# -99999 so the algorithm treats them as outliers
df.replace('?', -99999, inplace=True)
# drop any useless data - in this case the ID
df.drop(columns=['id'], inplace=True)
# convert everything to a number and hand back plain Python lists
full_data = df.astype(float).values.tolist()

test_size = 0.2
n_trials = 25
accuracies = []

for _ in range(n_trials):
    # shuffles in place, so there is nothing to assign
    random.shuffle(full_data)

    # the last 20% of the shuffled rows become the test data
    split = int(test_size * len(full_data))
    train_data = full_data[:-split]
    test_data = full_data[-split:]

    # Group rows by class label (the last column). The labels are floats after
    # astype(float), but 2.0 == 2 and hashes the same, so the int keys match.
    train_set = {2: [], 4: []}
    test_set = {2: [], 4: []}
    for row in train_data:
        train_set[row[-1]].append(row[:-1])
    for row in test_data:
        test_set[row[-1]].append(row[:-1])

    correct = 0
    total = 0
    for group in test_set:
        for data in test_set[group]:
            # confidence is available here for inspecting wrong votes
            vote, confidence = K_nearest_neighbours(train_set, data, k=5)
            if group == vote:
                correct += 1
            total += 1

    accuracies.append(correct / total)

# mean accuracy over all trials, as a percentage
print((sum(accuracies) / len(accuracies) * 100))

```

# Machine Learning – Tutorial 18

### Applying our K Nearest Neighbours Algorithm

https://pythonprogramming.net/testing-our-k-nearest-neighbors-machine-learning-tutorial/

```python
# imports
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random

# define function
def K_nearest_neighbours(data, predict, k=3):
    """Classify ``predict`` by majority vote among its k nearest neighbours.

    Args:
        data: dict mapping each class label to a list of feature vectors.
        predict: feature vector to classify (same length as the vectors in
            ``data``).
        k: number of nearest neighbours that get a vote.

    Returns:
        The class label that received the most votes.

    Raises:
        ValueError: if ``data`` contains no feature vectors at all.
    """
    if len(data) >= k:
        warnings.warn('K is set to value less than total voting groups!')

    # Distance from the query point to every known point, tagged with its class.
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])

    # Sort on the distance only, so class labels are never compared with each
    # other on ties, then keep just the labels of the k closest points.
    nearest = sorted(distances, key=lambda pair: pair[0])[:k]
    votes = [group for _, group in nearest]
    if not votes:
        raise ValueError('data contains no feature vectors to vote with')

    vote_result = Counter(votes).most_common(1)[0][0]
    return vote_result

# import data
# NOTE(review): the load step was missing from this listing. The filename
# follows the linked tutorial - point it at your local copy of the data.
df = pd.read_csv('breast-cancer-wisconsin.data.txt')
# there are gaps in the data denoted by '?' - these need to be converted to
# -99999 so the algorithm treats them as outliers
df.replace('?', -99999, inplace=True)
# drop any useless data - in this case the ID
df.drop(columns=['id'], inplace=True)
# convert everything to a number and hand back plain Python lists
full_data = df.astype(float).values.tolist()
# shuffles in place, so there is nothing to assign
random.shuffle(full_data)

# the last 20% of the shuffled rows become the test data
test_size = 0.2
split = int(test_size * len(full_data))
train_data = full_data[:-split]
test_data = full_data[-split:]

# Group rows by class label (the last column). The labels are floats after
# astype(float), but 2.0 == 2 and hashes the same, so the int keys match.
train_set = {2: [], 4: []}
test_set = {2: [], 4: []}
for row in train_data:
    train_set[row[-1]].append(row[:-1])
for row in test_data:
    test_set[row[-1]].append(row[:-1])

correct = 0
total = 0

for group in test_set:
    for data in test_set[group]:
        vote = K_nearest_neighbours(train_set, data, k=5)
        if group == vote:
            correct += 1
        total += 1

print('Accuracy', correct/total)
```

# Machine Learning – Tutorial 17

### Writing our own K Nearest Neighbours in Code

https://pythonprogramming.net/coding-k-nearest-neighbors-machine-learning-tutorial/

```python
# imports
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import warnings
from collections import Counter

# Apply matplotlib's 'ggplot' style sheet to the plots drawn below
from matplotlib import style # import style to change the default look of plots
style.use('ggplot') # use ggplot

dataset = {'k':[[1,2],[2,3],[3,1]], 'r':[[6,5],[7,7],[8,6]]} # a dictionary with 2 classes ('k' and 'r'), each holding 3 [x, y] points (lists of lists); the keys are also passed to matplotlib as colours
new_features = [5,7]

## Plotting each point with an explicit nested loop (x = ii[0], y = ii[1])
#for i in dataset:
#    for ii in dataset[i]:
#        plt.scatter(ii[0],ii[1],s=100, color=i)

# define function
def K_nearest_neighbours(data, predict, k=3):
    """Classify ``predict`` by majority vote among its k nearest neighbours.

    Args:
        data: dict mapping each class label to a list of feature vectors.
        predict: feature vector to classify (same length as the vectors in
            ``data``).
        k: number of nearest neighbours that get a vote.

    Returns:
        The class label that received the most votes.

    Raises:
        ValueError: if ``data`` contains no feature vectors at all.
    """
    if len(data) >= k:
        warnings.warn('K is set to value less than total voting groups!')

    # Distance from the query point to every known point, tagged with its class.
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])

    # Sort on the distance only, so class labels are never compared with each
    # other on ties, then keep just the labels of the k closest points.
    nearest = sorted(distances, key=lambda pair: pair[0])[:k]
    votes = [group for _, group in nearest]
    if not votes:
        raise ValueError('data contains no feature vectors to vote with')

    vote_result = Counter(votes).most_common(1)[0][0]
    return vote_result

# generate results
results = K_nearest_neighbours(dataset, new_features, k=3)
print(results)

# Plot every known point in its class colour. Each point is [x, y], so the
# two coordinates are unpacked separately; passing the whole list as both x
# and y would draw (x, x) and (y, y) instead of the real point.
for colour, points in dataset.items():
    for x, y in points:
        plt.scatter(x, y, s=100, color=colour)

# The predicted class label doubles as the colour of the new point.
plt.scatter(new_features[0], new_features[1], color=results, s=100)
plt.show()

```

# Machine Learning – Tutorial 16

### Creating Our K Nearest Neighbors Algorithm

https://pythonprogramming.net/programming-k-nearest-neighbors-machine-learning-tutorial/

```python
# imports
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import warnings
from collections import Counter

# Apply matplotlib's 'ggplot' style sheet to the plots drawn below
from matplotlib import style  # import style to change the default look of plots
style.use('ggplot')

# A dictionary with 2 classes ('k' and 'r'), each holding 3 [x, y] points
# (lists of lists). The keys are also passed to matplotlib as colours.
dataset = {'k': [[1, 2], [2, 3], [3, 1]], 'r': [[6, 5], [7, 7], [8, 6]]}
new_features = [5, 7]

# Plot every known point in its class colour. Each point is [x, y], so the
# two coordinates are unpacked separately; passing the whole list as both x
# and y would draw (x, x) and (y, y) instead of the real point.
# One-line equivalent:
#   [[plt.scatter(ii[0], ii[1], s=100, color=i) for ii in dataset[i]] for i in dataset]
for colour, points in dataset.items():
    for x, y in points:
        plt.scatter(x, y, s=100, color=colour)
plt.show()

def K_nearest_neighbours(data, predict, k=3):
    """Skeleton of the K Nearest Neighbours classifier.

    Only the sanity check on ``k`` is written at this stage; the distance
    calculation and the vote are not implemented yet.

    Args:
        data: dict mapping each class label to a list of feature vectors.
        predict: feature vector to classify.
        k: number of nearest neighbours that will get a vote.

    Returns:
        ``None`` for now - a placeholder for the winning class label.
    """
    if len(data) >= k:
        warnings.warn('K is set to value less than total voting groups!')

    # Placeholder so that calling the skeleton returns None instead of
    # raising NameError on an undefined name.
    vote_result = None
    return vote_result
```

# Machine Learning – Tutorial 15

## Euclidean Distance

https://pythonprogramming.net/euclidean-distance-machine-learning-tutorial/

This stuff makes my head hurt… just look at this formula… WTF man!

$$d(q, p) = \sqrt{\sum_{i=1}^{n} (q_i - p_i)^2}$$

Basically, it’s just the square root of the sum of the squared differences between the two points’ coordinates… easy.

So in Python this translates into:-

```python
plot1 = [1,3]
from math import sqrt  # sqrt is not imported anywhere else in this snippet

plot2 = [2,5]
# Each point is [x, y]: subtract coordinate by coordinate (lists cannot be
# subtracted from each other directly), square each difference, sum the
# squares and take the square root.
euclidean_distance = sqrt( (plot1[0]-plot2[0])**2 + (plot1[1]-plot2[1])**2 )
```

# Machine Learning – Tutorial 14

## Applying K Nearest Neighbors to Data

https://pythonprogramming.net/k-nearest-neighbors-application-machine-learning-tutorial/

```python
# import libs
import numpy as np
from sklearn import preprocessing, neighbors
# cross_validation is deprecated; train_test_split has moved into model_selection
from sklearn.model_selection import train_test_split
import pandas as pd

# import data
# NOTE(review): the load step was missing from this listing. The filename
# follows the linked tutorial - point it at your local copy of the data.
df = pd.read_csv('breast-cancer-wisconsin.data.txt')
# there are gaps in the data denoted by '?' - these need to be converted to
# -99999 so the algorithm treats them as outliers
df.replace('?', -99999, inplace=True)
# drop any useless data - in this case the ID
df.drop(columns=['id'], inplace=True)
#print(df)

# define X and y (X for features; y for labels)
# X is everything except 'class'
# In the datafile I had a space after 'class' which caused errors
X = np.array(df.drop(columns=['class']))
y = np.array(df['class'])

# split the data into train and test datasets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# define classifier (clf)
clf = neighbors.KNeighborsClassifier()
# fit the classifier
clf.fit(X_train, y_train)

accuracy = clf.score(X_test, y_test)

print(accuracy)

# important: predict() needs a 2D array, so the samples are written with
# double brackets rather than reshaping a flat array afterwards
example_measures = np.array([[4,2,1,1,1,2,3,2,1],[4,2,1,2,2,2,3,2,1]])
prediction = clf.predict(example_measures)
print(prediction)

```