import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Mount Google Drive so the notebook can read/write files under /content/drive
# (required later for the nbconvert export at the end of the notebook).
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
Generisanje podataka
Generisanje podataka (dve klase, normalne raspodele u 2-D prostoru). Varirati parametre normalnih raspodela za prikaz različitih slučajeva primene KNN.
# Generate two 2-D Gaussian classes: class 0 ~ N(0, 1), class 1 ~ N(2, 2),
# 50 samples each, stacked so rows 0-49 are class 0 and rows 50-99 class 1.
X_1 = np.random.normal(0, 1, size=(50, 2))
X_2 = np.random.normal(2, 2, size=(50, 2))
X = np.concatenate((X_1, X_2), axis=0)
# Build the labels as a 1-D vector of shape (100,): sklearn estimators expect
# y with shape (n_samples,), and a (n, 1) column vector triggers a
# DataConversionWarning inside KNeighborsClassifier.fit.
Y = np.concatenate((np.zeros(50), np.ones(50)), axis=0)
# Visualise the two generated classes: rows 0-49 are class 0 (circles),
# rows 50-99 are class 1 (stars).
plt.scatter(X[:50,0], X[:50,1], label='0', marker='o')
plt.scatter(X[50:,0], X[50:,1], label='1', marker='*')
plt.legend()
<matplotlib.legend.Legend at 0x7e7ed0b8edd0>
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for testing; stratify keeps the class ratio
# equal in both splits, random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=10, stratify=Y
)
# Re-plot both classes and highlight the held-out test samples on top.
plt.scatter(X[:50,0], X[:50,1], label='0', marker='o')
plt.scatter(X[50:,0], X[50:,1], label='1', marker='*')
plt.scatter(X_test[:,0], X_test[:,1], marker='.', c='yellow') # test samples are marked in yellow
plt.legend()
<matplotlib.legend.Legend at 0x7e7e95a87e90>
Primena KNN algoritma na generisanim podacima i poređenje pravih i dobijenih labela (oznaka klase).
from sklearn.neighbors import KNeighborsClassifier

# Initialise and train the classifier.
# Key KNeighborsClassifier hyperparameters:
#   n_neighbors - number of neighbours voted over (default 5)
#   weights     - 'uniform', 'distance' or a user-defined callable
#   p           - Minkowski power (2 = euclidean, 1 = manhattan)
#   metric      - default 'minkowski'; also 'euclidean', 'manhattan',
#                 'chebyshev', 'hamming' (int), 'jaccard', 'dice', ...
classifier = KNeighborsClassifier(n_neighbors=10, metric='euclidean')
# ravel() flattens the (n, 1) label column to shape (n,), which is what
# sklearn expects - this silences the DataConversionWarning raised by fit.
classifier.fit(X_train, Y_train.ravel())

# Testing: predict labels for the held-out samples.
Y_pred = classifier.predict(X_test)
print(Y_pred)
[0. 0. 1. 0. 1. 0. 1. 0. 1. 1.]
/usr/local/lib/python3.11/dist-packages/sklearn/neighbors/_classification.py:239: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel(). return self._fit(X, y)
Vizualizacija dobijenih rezultata. Pogrešno klasifikovani uzorci imaće tačku suprotne boje.
## NOTE: in this run every test sample was classified correctly
# Plot the true classes, then overlay each test sample as a dot coloured by
# its PREDICTED class - a misclassified sample therefore shows a dot of the
# opposite colour to the marker underneath it.
plt.scatter(X[:50,0], X[:50,1], label='0', marker='o', c='blue')
plt.scatter(X[50:,0], X[50:,1], label='1', marker='*', c='orange')
for point, predicted in zip(X_test, Y_pred):
    if predicted == 0:
        plt.scatter(point[0], point[1], marker='.', c='blue')
    elif predicted == 1:
        plt.scatter(point[0], point[1], marker='.', c='orange')
plt.legend()
<matplotlib.legend.Legend at 0x7e7e94f7fd10>
Matrica konfuzije (kolone su predviđene, a vrste su prave klase).
# Class 0 is the negative class, class 1 the positive one.
# Tally the four confusion-matrix cells by comparing every prediction
# with its true label.
TN = FP = FN = TP = 0
for predicted, actual in zip(Y_pred, Y_test):
    if predicted == 0 and actual == 0:
        TN += 1
    elif predicted == 1 and actual == 0:
        FP += 1
    elif predicted == 0 and actual == 1:
        FN += 1
    elif predicted == 1 and actual == 1:
        TP += 1

# Rows are the true classes, columns the predicted ones.
conf_mat = pd.DataFrame(columns=['0','1'], index=['0','1'])
conf_mat.iloc[0,0]=TN
conf_mat.iloc[0,1]=FP
conf_mat.iloc[1,0]=FN
conf_mat.iloc[1,1]=TP
conf_mat
|   | 0 | 1 |
|---|---|---|
| 0 | 5 | 0 |
| 1 | 0 | 5 |
from sklearn.metrics import confusion_matrix

# sklearn's layout matches the manual one: rows = true classes,
# columns = predicted classes, i.e. [[TN, FP], [FN, TP]].
conf_mat = confusion_matrix(Y_test, Y_pred)
print(conf_mat)
[[5 0] [0 5]]
Mere uspešnosti klasifikatora.
- $ precision = \frac{TP}{TP+FP} $
- $ accuracy = \frac{TP+TN}{TP+TN+FP+FN} $
- $ sensitivity = \frac{TP}{TP+FN} $
- $ specificity = \frac{TN}{TN+FP} $
- $ F\_score = \frac{2*precision*sensitivity}{precision+sensitivity} $
# Classifier performance measures derived from the confusion-matrix counts.
total = TP + TN + FP + FN
precision = TP / (TP + FP)
accuracy = (TP + TN) / total
sensitivity = TP / (TP + FN)                 # a.k.a. recall / true-positive rate
specificity = TN / (TN + FP)                 # true-negative rate
# F score is the harmonic mean of precision and sensitivity.
F_score = (2 * precision * sensitivity) / (precision + sensitivity)

print('precision: ', precision)
print('accuracy: ', accuracy)
print('sensitivity/recall: ', sensitivity)
print('specificity: ', specificity)
print('F score: ', F_score)
precision: 1.0 accuracy: 1.0 sensitivity/recall: 1.0 specificity: 1.0 F score: 1.0
Zadatak 1¶
Implementirati algoritam za klasifikaciju ispitanika na one koji se izjašnjavaju kao srećni i one koji se ne izjašnjavaju kao srećni. Koristiti bazu podataka koja sadrži rezultate ankete od 143 osobe. U anketi se postavlja 6 pitanja u kojima se proverava koliko je anketirana osoba na skali od 1 do 5 zadovoljna sa: dostupnošću informacija o gradskim službama, visinom troškova stanovanja, kvalitetom državnog školstva, poverenjem u lokalnu policiju, održavanjem ulica i trotoara, i dostupnošću društvenih događaja. Odgovori anketirane osobe su obeležja za tu osobu i svaka osoba rekla je da li je srećna ili nije, a taj odgovor predstavlja klasnu labelu.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Read dataset to pandas dataframe
# Download the survey CSV from Google Drive by file id (Colab shell command).
!gdown 18LShd5kJch5nxHbHxYkkI_qRlrQ60WmL
Učitati podatke u DataFrame. Koliko ima uzoraka? Koliko ima obeležja i kog su tipa?
# Column names: the class label first, then the six survey answers.
cols = ['class', 'services', 'cost', 'schools', 'police', 'streets', 'events']

# The survey file is UTF-16 encoded; load it and relabel the columns.
dataset = pd.read_csv('SomervilleHappinessSurvey2015.csv', encoding = "utf-16")
dataset.columns = cols

print(dataset.shape)
dataset.head()

X = dataset.iloc[:, 1:]  # features (the six answers)
y = dataset.iloc[:, 0]   # class labels
Proveriti koliko uzoraka ima u kojoj klasi, kao i da li ima nedostajućih vrednosti.
# Per-class sample counts and a check for missing values.
print("broj uzoraka u klasi 1 je: ", (y == 1).sum())
print("broj uzoraka u klasi 0 je: ", (y == 0).sum())
print("broj nedostajućih podataka je: ", X.isnull().sum().sum())
Trostruka podela skupa¶
from sklearn.model_selection import train_test_split
# Split the data into a training set (80%) and a held-out remainder (20%).
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=10, stratify=y)
# Split the held-out 20% in half into the final test and validation sets.
# NOTE(review): the original comment said the *training* set was being split
# here, but the code actually splits the held-out portion (X_test, y_test).
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=10, stratify=y_test)
Kada koristiti Hamming metriku? Primer: a = [1, 0, 1, 1], b = [1, 1, 0, 1]
Hamming distanca = (broj različitih pozicija) / (ukupan broj pozicija) = 2 / 4 = 0.5
- Kada radiš sa binarno kodiranim podacima
- Kada koristiš one-hot enkodirane kategorije
- Kada meriš sličnost između nizova ili vektora znakova
from sklearn import metrics

# Initialise and fit a first candidate model (k = 11, Hamming distance -
# suited to these ordinal/categorical survey answers).
classifier = KNeighborsClassifier(n_neighbors=11, metric='hamming')
classifier.fit(X_train, y_train)

# Validate on the held-out validation set.
y_pred = classifier.predict(X_val)
accuracy = metrics.accuracy_score(y_val, y_pred)
print(accuracy)
# Adjust the hyperparameter (k = 3) and validate again.
classifier = KNeighborsClassifier(n_neighbors=3, metric='hamming')
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_val)
accuracy = metrics.accuracy_score(y_val, y_pred)
print(accuracy)
acc = []
# Validation accuracy for every k from 1 to 20 with the Hamming metric.
for k in range(1, 21):
    classifier = KNeighborsClassifier(n_neighbors=k, metric='hamming')
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_val)
    acc.append(metrics.accuracy_score(y_val, y_pred))

# Plot accuracy vs. number of neighbours.
plt.figure(figsize=(12, 6))
plt.plot(range(1, 21), acc, color='red', linestyle='dashed', marker='o')
plt.title('Postignuta tacnost za razlicito k na validacionom skupu (hamming metrika)')
plt.xlabel('broj suseda')
plt.ylabel('tacnost')
The Bray-Curtis distance, also known as Bray-Curtis dissimilarity, is a statistical measure used to quantify the dissimilarity between two samples, especially in ecology and biology. It is often used to compare species abundance or diversity between different sites or environments. The distance ranges from 0 to 1, where 0 indicates identical samples and 1 indicates completely dissimilar samples
acc = []
# Repeat the sweep over k = 1..20, this time with the Euclidean metric.
for k in range(1, 21):
    classifier = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_val)
    acc.append(metrics.accuracy_score(y_val, y_pred))

# Plot accuracy vs. number of neighbours.
plt.figure(figsize=(12, 6))
plt.plot(range(1, 21), acc, color='red', linestyle='dashed', marker='o')
plt.title('Postignuta tacnost za razlicito k na validacionom skupu (euklidska metrika)')
plt.xlabel('broj suseda')
plt.ylabel('tacnost')
acc = []
# Repeat the sweep over k = 1..20 with the Bray-Curtis dissimilarity.
for k in range(1, 21):
    classifier = KNeighborsClassifier(n_neighbors=k, metric='braycurtis')
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_val)
    acc.append(metrics.accuracy_score(y_val, y_pred))

# Plot accuracy vs. number of neighbours.
plt.figure(figsize=(12, 6))
plt.plot(range(1, 21), acc, color='red', linestyle='dashed', marker='o')
plt.title('Postignuta tacnost za razlicito k na validacionom skupu (braycurtis metrika)')
plt.xlabel('broj suseda')
plt.ylabel('tacnost')
# Train the final model (chosen hyperparameters) on the merged
# training + validation data, keeping the test set untouched.
X_train_full = pd.concat([X_train, X_val], axis=0)
y_train_full = pd.concat([y_train, y_val], axis=0)
classifier = KNeighborsClassifier(n_neighbors=18, metric='hamming')
classifier.fit(X_train_full, y_train_full)

# Testing: print each predicted label next to the true one.
y_pred = classifier.predict(X_test)
for predicted, actual in zip(y_pred, y_test):
    print(predicted, actual)
from sklearn.metrics import confusion_matrix

# Test-set confusion matrix: rows = true classes, columns = predicted,
# i.e. [[TN, FP], [FN, TP]].
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)
# Final test-set performance (positive class = 1).
precision = metrics.precision_score(y_test, y_pred)
accuracy = metrics.accuracy_score(y_test, y_pred)
sensitivity = metrics.recall_score(y_test, y_pred)
f_score = metrics.f1_score(y_test, y_pred)

for label, value in [('precision: ', precision),
                     ('accuracy: ', accuracy),
                     ('sensitivity/recall: ', sensitivity),
                     ('F score: ', f_score)]:
    print(label, value)

# Per-class precision/recall/F1 summary.
print(metrics.classification_report(y_test, y_pred))
# Export this notebook to HTML via the mounted Drive path (Colab shell command).
!jupyter nbconvert --to html "/content/drive/MyDrive/Colab Notebooks/KNN_cas1_reseno_moja_rjesenja.ipynb"