import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
Load the data into a DataFrame. How many samples are there? How many features, and of what type?
# Read dataset to pandas dataframe
!gdown 18LShd5kJch5nxHbHxYkkI_qRlrQ60WmL
cols = ['class', 'services', 'cost', 'schools', 'police', 'streets', 'events']
dataset = pd.read_csv('SomervilleHappinessSurvey2015.csv', encoding = "utf-16")
dataset.columns = cols
print(dataset.shape)
dataset.head()
Downloading...
From: https://drive.google.com/uc?id=18LShd5kJch5nxHbHxYkkI_qRlrQ60WmL
To: /content/SomervilleHappinessSurvey2015.csv
100% 4.33k/4.33k [00:00<00:00, 8.50MB/s]
(143, 7)
| | class | services | cost | schools | police | streets | events |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 3 | 3 | 4 | 2 | 4 |
| 1 | 0 | 3 | 2 | 3 | 5 | 4 | 3 |
| 2 | 1 | 5 | 3 | 3 | 3 | 3 | 5 |
| 3 | 0 | 5 | 4 | 3 | 3 | 3 | 5 |
| 4 | 0 | 5 | 4 | 3 | 3 | 3 | 5 |
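The question above also asks what type the features are, which the notebook never inspects explicitly. A minimal sketch (not part of the original cells, reusing the dataset DataFrame loaded above):
# All seven columns are integer-coded survey responses; the first one is the label
print(dataset.shape[1] - 1, 'features')
print(dataset.dtypes)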
X = dataset.iloc[:, 1:]  # features
y = dataset.iloc[:, 0]   # labels
Check how many samples there are in each class, and whether there are any missing values.
print("broj uzoraka u klasi 1 je: ", sum(y==1))
print("broj uzoraka u klasi 0 je: ", sum(y==0))
print("broj nedostajućih podataka je: ", X.isnull().sum().sum())
broj uzoraka u klasi 1 je: 77 broj uzoraka u klasi 0 je: 66 broj nedostajućih podataka je: 0
Cross-validation
K-fold
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=10, stratify=y)
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
kf = StratifiedKFold(n_splits=5, shuffle=True)  # no random_state, so the folds (and the matrices below) change between runs
indexes = kf.split(x_train, y_train)
fin_conf_mat = np.zeros((len(np.unique(y)),len(np.unique(y))))
# print(fin_conf_mat)
for train_index, test_index in indexes:
    classifier = KNeighborsClassifier(n_neighbors=5, metric='hamming')
    classifier.fit(x_train.iloc[train_index, :], y_train.iloc[train_index])
    y_pred = classifier.predict(x_train.iloc[test_index, :])
    conf_mat = confusion_matrix(y_train.iloc[test_index], y_pred)
    print(conf_mat)
    fin_conf_mat += conf_mat
print('final confusion matrix:')
print(fin_conf_mat)
[[ 3  9]
 [ 4 10]]
[[ 8  4]
 [ 3 11]]
[[ 4  8]
 [ 4 10]]
[[ 8  3]
 [ 4 10]]
[[ 3  9]
 [ 2 11]]
final confusion matrix:
[[26. 33.]
 [17. 52.]]
TP = fin_conf_mat[1, 1]
TN = fin_conf_mat[0, 0]
FP = fin_conf_mat[0, 1]
FN = fin_conf_mat[1, 0]
precision = TP/(TP+FP)
accuracy = (TP+TN)/(TP+TN+FP+FN)
sensitivity = TP/(TP+FN)
F_score = 2*precision*sensitivity/(precision+sensitivity)
print('precision: ', precision)
print('accuracy: ', accuracy)
print('sensitivity/recall: ', sensitivity)
print('F score: ', F_score)
precision:  0.611764705882353
accuracy:  0.609375
sensitivity/recall:  0.7536231884057971
F score:  0.6753246753246754
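As a cross-check, the pooled confusion matrix from the manual K-fold loop can be reproduced in a single call with cross_val_predict, which collects one out-of-fold prediction per training sample. A sketch under the same setup as above (exact numbers will differ because the folds are reshuffled):
from sklearn.model_selection import cross_val_predict
# One out-of-fold prediction per sample, then one pooled confusion matrix
y_oof = cross_val_predict(KNeighborsClassifier(n_neighbors=5, metric='hamming'),
                          x_train, y_train,
                          cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0))
print(confusion_matrix(y_train, y_oof))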
GridSearchCV = systematic evaluation of every parameter combination + model evaluation via cross-validation.
It is used for hyperparameter optimization:
- e.g. in KNN: the number of neighbors (n_neighbors) and the distance metric (metric)
- in Ridge/Lasso: the alpha value
- in RandomForest: the number of trees (n_estimators) and the maximum depth (max_depth)
from sklearn.model_selection import GridSearchCV
parameters = {'n_neighbors':[1,2,3,4,5], 'metric':('hamming', 'euclidean', 'manhattan')}
classifier = KNeighborsClassifier()
clf = GridSearchCV(classifier, parameters, scoring='accuracy', cv=5, verbose=3)
clf.fit(x_train, y_train)
# clf.best_params_     # best hyperparameter combination
# clf.best_score_      # best mean validation score (e.g. accuracy)
# clf.best_estimator_  # the full model retrained with the best parameters
Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END .....metric=hamming, n_neighbors=1;, score=0.654 total time=   0.0s
[CV 2/5] END .....metric=hamming, n_neighbors=1;, score=0.538 total time=   0.0s
[CV 3/5] END .....metric=hamming, n_neighbors=1;, score=0.769 total time=   0.0s
[CV 4/5] END .....metric=hamming, n_neighbors=1;, score=0.600 total time=   0.0s
[CV 5/5] END .....metric=hamming, n_neighbors=1;, score=0.520 total time=   0.0s
[CV 1/5] END .....metric=hamming, n_neighbors=2;, score=0.692 total time=   0.0s
[CV 2/5] END .....metric=hamming, n_neighbors=2;, score=0.462 total time=   0.0s
[CV 3/5] END .....metric=hamming, n_neighbors=2;, score=0.808 total time=   0.0s
[CV 4/5] END .....metric=hamming, n_neighbors=2;, score=0.560 total time=   0.0s
[CV 5/5] END .....metric=hamming, n_neighbors=2;, score=0.640 total time=   0.0s
[CV 1/5] END .....metric=hamming, n_neighbors=3;, score=0.615 total time=   0.0s
[CV 2/5] END .....metric=hamming, n_neighbors=3;, score=0.577 total time=   0.0s
[CV 3/5] END .....metric=hamming, n_neighbors=3;, score=0.731 total time=   0.0s
[CV 4/5] END .....metric=hamming, n_neighbors=3;, score=0.520 total time=   0.0s
[CV 5/5] END .....metric=hamming, n_neighbors=3;, score=0.480 total time=   0.0s
[CV 1/5] END .....metric=hamming, n_neighbors=4;, score=0.615 total time=   0.0s
[CV 2/5] END .....metric=hamming, n_neighbors=4;, score=0.577 total time=   0.0s
[CV 3/5] END .....metric=hamming, n_neighbors=4;, score=0.808 total time=   0.0s
[CV 4/5] END .....metric=hamming, n_neighbors=4;, score=0.480 total time=   0.0s
[CV 5/5] END .....metric=hamming, n_neighbors=4;, score=0.520 total time=   0.0s
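Beyond the verbose log, the whole grid can be inspected through the fitted search's cv_results_ attribute. A sketch (the column names are the standard cv_results_ keys for these two parameters):
# Mean and spread of the validation accuracy for every (metric, n_neighbors) pair
results = pd.DataFrame(clf.cv_results_)
print(results[['param_metric', 'param_n_neighbors', 'mean_test_score', 'std_test_score']]
      .sort_values('mean_test_score', ascending=False))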
print("najbolji skor: ", clf.best_score_)
print("najbolji hiperparametri: ", clf.best_params_)
# training the final model
classifier = KNeighborsClassifier(n_neighbors=2, metric='hamming')
classifier.fit(x_train, y_train)
# testing
y_pred = classifier.predict(x_test)
for i, j in zip(y_pred, y_test):
    print(i, j)
classifier.classes_
conf_mat = confusion_matrix(y_test, y_pred, labels=classifier.classes_)  # layout: [[TN, FP], [FN, TP]]
print(conf_mat)
from sklearn.metrics import ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat, display_labels=classifier.classes_)
disp.plot(cmap="Blues")
from sklearn import metrics
precision = metrics.precision_score(y_test, y_pred)
accuracy = metrics.accuracy_score(y_test, y_pred)
sensitivity = metrics.recall_score(y_test, y_pred)
f_score = metrics.f1_score(y_test, y_pred)
print('precision: ', precision)
print('accuracy: ', accuracy)
print('sensitivity/recall: ', sensitivity)
print('F score: ', f_score)
print(metrics.classification_report(y_test, y_pred))
Leave one out
Repeat the computation of the metrics for the classifier with the optimal parameters, using leave-one-out cross-validation (LeaveOneOut). Do the results differ? Why?
LeaveOneOut(): every sample except one forms the training set, and that single sample is the test set; this is repeated once per sample. The results here barely differ: roughly the same average accuracy is obtained with KFold(5) and with LeaveOneOut(), only the LeaveOneOut estimate has higher variance. It is used when:
- the dataset is small (e.g. fewer than 100 samples)
- you want to make maximal use of every sample for training
- you want a detailed validation of the model
from sklearn.model_selection import LeaveOneOut, cross_val_score
cv = LeaveOneOut()
model = KNeighborsClassifier(n_neighbors=2, metric='hamming')
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
# print(scores)
print(np.mean(scores))  # accuracy scores are non-negative, so no abs is needed
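The task asks for all of the metrics, not just accuracy. Since LeaveOneOut produces exactly one out-of-fold prediction per sample, the pooled predictions from cross_val_predict yield the full confusion matrix and report. A sketch reusing the model and cv objects defined above:
from sklearn.model_selection import cross_val_predict
# Pool the 143 leave-one-out predictions into a single confusion matrix
y_loo = cross_val_predict(model, X, y, cv=cv)
print(confusion_matrix(y, y_loo))
print(metrics.classification_report(y, y_loo))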
Standardization?
Is it important to standardize the features for the KNN algorithm? Does standardizing the features make sense in this particular problem? If so, repeat the procedure with standardized features.
from sklearn.preprocessing import StandardScaler
classifier = KNeighborsClassifier(n_neighbors=2, metric='hamming')
s = StandardScaler()
s.fit(x_train)
x_train_std = s.transform(x_train)
x_test_std = s.transform(x_test)
x_train_std = pd.DataFrame(x_train_std)
x_test_std = pd.DataFrame(x_test_std)
x_train_std.columns = list(X.columns)
x_test_std.columns = list(X.columns)
classifier.fit(x_train_std, y_train)
# testiranje
y_pred = classifier.predict(x_test_std)
for i, j in zip(y_pred, y_test):
    print(i, j)
conf_mat = confusion_matrix(y_test, y_pred, labels=classifier.classes_)
print(conf_mat)
print(metrics.classification_report(y_test, y_pred))
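Note (my reasoning, not part of the assignment): with metric='hamming' the distance depends only on whether coordinates are equal, and StandardScaler applies an invertible per-column affine map that preserves equality within each column, so the standardized run should reproduce the unstandardized predictions exactly. Standardization matters for magnitude-based metrics such as euclidean or manhattan. A quick sketch to verify (clf_raw and y_pred_raw are hypothetical names; y_pred holds the standardized-run predictions from above):
# Hamming-KNN predictions should be identical with and without standardization
clf_raw = KNeighborsClassifier(n_neighbors=2, metric='hamming').fit(x_train, y_train)
y_pred_raw = clf_raw.predict(x_test)
print((y_pred_raw == y_pred).all())  # expected: True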
x_train_std.head(5)
x_train.head()
!jupyter nbconvert --to html "/content/drive/MyDrive/Colab Notebooks/zadatak_KNN_cas2_reseno_moje_rjesenje.ipynb"