import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# load the dataset
data = pd.read_csv('train.csv')
# data = pd.read_csv('sample_data/train.csv')
data.head()
|   | label | pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | ... | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 785 columns
Store the features in X and the class labels in y. Check the class label values, and check whether there is any imbalance among the classes.
X = data.iloc[:,1:]
y = data.iloc[:,0]
target_names = y.unique()
print(target_names)
[1 0 4 7 3 5 8 9 2 6]
target_names = np.sort(target_names)
print(target_names)
[0 1 2 3 4 5 6 7 8 9]
num_k = np.zeros(10)
nan_count = 0
unaccounted_for = 0
for i in range(len(y)):
    if pd.isna(y[i]):  # np.nan never compares equal with ==, so test explicitly
        nan_count += 1
        continue
    for k in target_names:
        if y[i] == k:
            num_k[k] = num_k[k] + 1
            break
    else:  # for-else: runs only if no class matched the label
        unaccounted_for += 1
print(num_k)
print('nan_count:', nan_count)
print('unaccounted_for:', unaccounted_for)
[4132. 4684. 4177. 4351. 4072. 3795. 4137. 4401. 4063. 4188.] nan_count: 0 unaccounted_for: 0
Each digit occurs roughly 4,000 times, so the classes are approximately balanced.
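As a cross-check, the same per-class counts and the NaN check can be obtained more compactly with pandas built-ins, e.g.:
print(y.value_counts().sort_index())  # per-class counts, ordered by label
print('NaN labels:', y.isna().sum())  # number of missing labels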
from sklearn.preprocessing import StandardScaler
X = X.drop(X.shape[0]-1, axis=0)  # the last row contains NaN values
y = y.drop(y.shape[0]-1, axis=0)  # the last row contains NaN values
# print(X)
# print(X.isna().sum())
# print(X.iloc[-1,:])
s = StandardScaler()
X_std = s.fit_transform(X)
# print(X_std)
Linear Discriminant Analysis (LDA) is a technique used for classification and dimensionality reduction, similar to PCA but with a different objective.
- LDA is a supervised method (it uses class labels).
- Goal: maximize the separation between classes in the new (lower-dimensional) space.
- Unlike PCA, which knows nothing about the classes, LDA uses the class labels to project the data so that the classes are as well separated as possible.
It is used:
- In classification problems, e.g. before training a model (KNN, SVM, ...)
- For visualizing high-dimensional data
- For dimensionality reduction while preserving class separation
Imagine you have 3 classes in a 10-dimensional space. LDA:
1. Computes the mean of each class and of the data as a whole.
2. Computes the within-class and between-class variance:
   - Within-class variance: how the samples scatter within the same class
   - Between-class variance: how far apart the classes are from one another
3. Finds new axes (linear combinations of the features) that maximize the difference between classes while minimizing the scatter within each class.
LDA optimizes the so-called Fisher criterion: $$ \max_{\omega} J(\omega) = \frac{\omega^T S_B \omega}{\omega^T S_W \omega} $$ where:
- $S_B$: between-class scatter matrix
- $S_W$: within-class scatter matrix
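To make the criterion concrete, here is a minimal NumPy sketch on synthetic two-class 2-D data (an illustration only, not part of the original solution; the closed-form two-class maximizer $w \propto S_W^{-1}(\mu_B - \mu_A)$ is the standard textbook result):
import numpy as np
rng = np.random.default_rng(0)
A = rng.normal(loc=[0.0, 0.0], scale=1.0, size=(100, 2))  # class 0 samples
B = rng.normal(loc=[4.0, 2.0], scale=1.0, size=(100, 2))  # class 1 samples
mu_A, mu_B = A.mean(axis=0), B.mean(axis=0)
# within-class scatter S_W: sum of the per-class scatter matrices
S_W = (A - mu_A).T @ (A - mu_A) + (B - mu_B).T @ (B - mu_B)
# between-class scatter S_B (two-class case): outer product of the mean difference
d = mu_B - mu_A
S_B = np.outer(d, d)
# direction maximizing J(w) for two classes: w ∝ S_W^{-1} (mu_B - mu_A)
w = np.linalg.solve(S_W, d)
J = (w @ S_B @ w) / (w @ S_W @ w)
print('Fisher criterion J(w) =', J)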
Key differences between PCA and LDA:
| Aspect | PCA | LDA |
|---|---|---|
| Type of method | Unsupervised | Supervised |
| Uses class labels | No | Yes |
| Objective | Maximum variance | Maximum class separation |
| Depends on y | No | Yes |
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
pca = PCA(n_components=2)
pca.fit(X_std)
X_pca = pca.transform(X_std)
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(X_std, y)
X_lda = lda.transform(X_std)
plt.figure(figsize=(16,9))
for i in target_names:
plt.scatter(X_pca[y == i, 0], X_pca[y == i, 1], alpha=.5, label=i)
plt.legend()
plt.title('PCA of MNIST dataset')
plt.figure(figsize=(16,9))
for i in target_names:
plt.scatter(X_lda[y == i, 0], X_lda[y == i, 1], alpha=.5, label=i)
plt.legend()
plt.title('LDA of MNIST dataset')
pca = PCA(n_components=3)
pca.fit(X_std)
X_pca = pca.transform(X_std)
lda = LinearDiscriminantAnalysis(n_components=3)
lda.fit(X_std, y)
X_lda = lda.transform(X_std)
import plotly.express as px
fig = px.scatter_3d(x=X_pca[:, 0],y=X_pca[:, 1],z=X_pca[:, 2],color=y,title="PCA of dataset", width = 1000, height = 800)
fig.update_traces(marker_size = 3)
fig.show()
fig = px.scatter_3d(x=X_lda[:, 0],y=X_lda[:, 1],z=X_lda[:, 2],color=y,title="LDA of dataset", width = 1000, height = 800)
fig.update_traces(marker_size = 3)
fig.show()
from sklearn.model_selection import train_test_split
# Split the data into a training set and a test set (70% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, stratify=y, random_state=42)
# standardization (fit the scaler on the training set only, to avoid leaking test-set statistics)
s = StandardScaler()
s.fit(X_train)
X_train_std = s.transform(X_train)
X_test_std = s.transform(X_test)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
# classification without dimensionality reduction
import time
start = time.time()
knn1 = KNeighborsClassifier(n_neighbors=1)
knn1.fit(X_train_std, y_train)
y_pred1 = knn1.predict(X_test_std)
c1 = confusion_matrix(y_test, y_pred1)
print(c1)
print("udeo ispravno pogodjenih org: ", np.trace(c1)/sum(sum(c1)))
end = time.time()
print('vreme trajanja je: ', end - start, ' sekundi.')
# explained variance as a function of the number of PCA components
pca = PCA(n_components=None)
pca.fit(X_train_std)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
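One way to read the required dimensionality directly off this curve (a small check using the full PCA fit above, not part of the original solution):
cum = np.cumsum(pca.explained_variance_ratio_)
n90 = int(np.argmax(cum >= 0.9)) + 1  # smallest number of components reaching 90% variance
print('components for 90% of the variance:', n90)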
# WITH PCA
pca = PCA(n_components=0.9)  # n_components=0.9 => keep enough components to retain 90% of the variance of the original data
pca.fit(X_train_std)
X_train_r = pca.transform(X_train_std)
X_test_r = pca.transform(X_test_std)
print('The reduced space has dimension: ', pca.n_components_)
import time
start = time.time()
knn2 = KNeighborsClassifier(n_neighbors=1)
knn2.fit(X_train_r, y_train)
y_pred2 = knn2.predict(X_test_r)
c2 = confusion_matrix(y_test, y_pred2)
print(c2)
print("udeo ispravno pogodjenih pca: ", np.trace(c2)/sum(sum(c2)))
end = time.time()
print('vreme trajanja je: ', end - start, ' sekundi.')
# WITH LDA
lda = LinearDiscriminantAnalysis(n_components=9)  # LDA supports at most n_classes - 1 = 9 components
lda.fit(X_train_std, y_train)
X_train_r = lda.transform(X_train_std)
X_test_r = lda.transform(X_test_std)
knn3 = KNeighborsClassifier(n_neighbors=1)
knn3.fit(X_train_r, y_train)
y_pred3 = knn3.predict(X_test_r)
c3 = confusion_matrix(y_test, y_pred3)
print(c3)
print("udeo ispravno pogodjenih lda: ", np.trace(c3)/sum(sum(c2)))
!jupyter nbconvert --to html "/content/drive/MyDrive/Colab Notebooks/zadatak2_PCAiLDA_reseno_moje_rjesenje.ipynb"