Machine Learning – Tutorial 14

Applying K Nearest Neighbors to Data

https://pythonprogramming.net/k-nearest-neighbors-application-machine-learning-tutorial/

Data links:-

 

# import libs
import numpy as np
from sklearn import preprocessing, neighbors
# cross_validation is deprecated; train_test_split now lives in model_selection
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset. The file needs a header row naming the 'id' and 'class'
# columns that are referenced below.
df = pd.read_csv('breast-cancer-wisconsin.data')
# Gaps in the data are denoted by '?'. Convert them to -99999 so the
# algorithm treats them as outliers.
df.replace('?', -99999, inplace=True)
# Drop any useless data - in this case the ID.
# `columns=` is used instead of a positional axis (`drop('id', 1)`): pandas
# 2.0 made every argument after `labels` keyword-only, so the positional
# form raises TypeError on current versions.
df.drop(columns=['id'], inplace=True)

# Define X & y (X for features; y for labels).
# X is everything except 'class'.
# NOTE: a trailing space after 'class' in the data file's header makes this
# lookup fail, so the header must read exactly 'class'.
X = np.array(df.drop(columns=['class']))
y = np.array(df['class'])

# Split the data into train and test datasets using train_test_split;
# 20% of the rows are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Define the classifier (clf)
clf = neighbors.KNeighborsClassifier()
# Fit the classifier on the training split
clf.fit(X_train, y_train)

# Score the fitted classifier on the held-out test split
accuracy = clf.score(X_test, y_test)

print(accuracy)

# Important: the array needs to be 2D (one row per sample), so double
# brackets are used rather than reshaping the array.
example_measures = np.array([[4,2,1,1,1,2,3,2,1],[4,2,1,2,2,2,3,2,1]])
prediction = clf.predict(example_measures)
print(prediction)

Leave a Reply