In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [5]:
# load the dataset
data = pd.read_csv('train.csv')
# data = pd.read_csv('sample_data/train.csv')
data.head()
Out[5]:
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 785 columns

Place the features in X and the class labels in y. Check the class-label values, and whether the classes are imbalanced.

In [6]:
X = data.iloc[:,1:]
y = data.iloc[:,0]
target_names = y.unique()
print(target_names)
[1 0 4 7 3 5 8 9 2 6]
In [7]:
target_names = np.sort(target_names)
print(target_names)
[0 1 2 3 4 5 6 7 8 9]
In [8]:
num_k = np.zeros(10, dtype=int)
nan_count = 0
unaccounted_for = 0
for i in range(len(y)):
  if pd.isna(y[i]):            # y[i] == np.nan is always False, so NaN must be detected with pd.isna
    nan_count += 1
  elif y[i] in target_names:
    num_k[y[i]] += 1
  else:
    unaccounted_for += 1
print(num_k)
print('nan_count:',nan_count)
print('unaccounted_for:',unaccounted_for)
[4132 4684 4177 4351 4072 3795 4137 4401 4063 4188]
nan_count: 0
unaccounted_for: 0
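For reference, pandas can perform the same class-balance check in a couple of lines; a minimal sketch using value_counts, with y being the label Series from above:

# per-class sample counts, sorted by label (NaN labels are excluded by default)
print(y.value_counts().sort_index())
# count NaN labels separately
print('nan_count:', y.isna().sum())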
In [9]:
from sklearn.preprocessing import StandardScaler

X = X.drop(X.shape[0]-1,axis=0) ## the last row contains None values
y = y.drop(y.shape[0]-1,axis=0) ## drop the matching label to keep X and y aligned
# print(X)
# print(X.isna().sum())
# print(X.iloc[-1,:])

s = StandardScaler()
X_std = s.fit_transform(X)
# print(X_std)

Linear Discriminant Analysis (LDA) is a technique used for classification and dimensionality reduction, similar to PCA but with a different goal.

  • LDA is a supervised method (it uses the class labels).
  • Goal: maximize the separation between classes in the new (lower-dimensional) space.
  • Unlike PCA, which knows nothing about the classes, LDA uses the class labels to project the data so that the classes are separated as much as possible.

It is used:

  • In classification problems, e.g. before training a model (KNN, SVM, ...)
  • For visualizing high-dimensional data
  • For dimensionality reduction while preserving the separation between classes

Imagine you have 3 classes in a 10-dimensional space. LDA:

  1. Computes the mean of each class and the overall mean of the data.

  2. Computes the within-class and between-class scatter:

    Within-class scatter: how the samples spread within the same class

    Between-class scatter: how far apart the classes are from one another

  3. Finds new axes (linear combinations of the features) that maximize the separation between classes while minimizing the spread within each class.

LDA optimizes the so-called Fisher criterion: $$ \max_{\omega} J(\omega) = \frac{\omega^T S_B \omega}{\omega^T S_W \omega} $$ where:

  • $S_B$: between-class scatter (covariance) matrix
  • $S_W$: within-class scatter (covariance) matrix

(A from-scratch sketch of this procedure is given after the comparison table below.)

Key differences, PCA vs. LDA:

  Metric          PCA                  LDA
  Type of method  Unsupervised         Supervised
  Uses classes    No                   Yes
  Goal            Maximum variance     Maximum class separation
  Depends on y    No                   Yes
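Below is a minimal NumPy sketch of the Fisher procedure described above, for illustration only: fisher_lda and the demo variables are hypothetical names, not part of this notebook, and scikit-learn's LinearDiscriminantAnalysis (used in the next cell) remains the practical choice.

import numpy as np

def fisher_lda(X, y, n_components):
    # Fisher's criterion: maximize J(w) = (w^T S_B w) / (w^T S_W w)
    classes = np.unique(y)
    overall_mean = X.mean(axis=0)
    n_features = X.shape[1]
    S_W = np.zeros((n_features, n_features))   # within-class scatter
    S_B = np.zeros((n_features, n_features))   # between-class scatter
    for c in classes:
        X_c = X[y == c]
        mean_c = X_c.mean(axis=0)
        S_W += (X_c - mean_c).T @ (X_c - mean_c)
        diff = (mean_c - overall_mean).reshape(-1, 1)
        S_B += len(X_c) * (diff @ diff.T)
    # generalized eigenproblem S_B w = lambda * S_W w, solved via pinv(S_W) @ S_B
    eigvals, eigvecs = np.linalg.eig(np.linalg.pinv(S_W) @ S_B)
    order = np.argsort(eigvals.real)[::-1]         # descending eigenvalues
    W = eigvecs[:, order[:n_components]].real      # top discriminant directions
    return X @ W

# tiny synthetic check: three 4-D Gaussian blobs projected to 2-D
rng = np.random.default_rng(0)
X_demo = np.vstack([rng.normal(loc=m, scale=1.0, size=(50, 4)) for m in (0, 3, 6)])
y_demo = np.repeat([0, 1, 2], 50)
print(fisher_lda(X_demo, y_demo, 2).shape)   # (150, 2)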
In [10]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

pca = PCA(n_components=2)
pca.fit(X_std)
X_pca = pca.transform(X_std)

lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(X_std, y)
X_lda = lda.transform(X_std)
In [11]:
plt.figure(figsize=(16,9))

for i in target_names:
    plt.scatter(X_pca[y == i, 0], X_pca[y == i, 1], alpha=.5, label=i)
plt.legend()
plt.title('PCA of MNIST dataset')
Out[11]:
Text(0.5, 1.0, 'PCA of MNIST dataset')
[figure: 2-D scatter of the first two PCA components, colored by digit class]
In [12]:
plt.figure(figsize=(16,9))

for i in target_names:
    plt.scatter(X_lda[y == i, 0], X_lda[y == i, 1], alpha=.5, label=i)
plt.legend()
plt.title('LDA of MNIST dataset')
Out[12]:
Text(0.5, 1.0, 'LDA of MNIST dataset')
[figure: 2-D scatter of the first two LDA components, colored by digit class]
In [13]:
pca = PCA(n_components=3)
pca.fit(X_std)
X_pca = pca.transform(X_std)

lda = LinearDiscriminantAnalysis(n_components=3)
lda.fit(X_std, y)
X_lda = lda.transform(X_std)
In [14]:
import plotly.express as px

fig = px.scatter_3d(x=X_pca[:, 0],y=X_pca[:, 1],z=X_pca[:, 2],color=y,title="PCA of dataset", width = 1000, height = 800)
fig.update_traces(marker_size = 3)
fig.show()
Output hidden; open in https://colab.research.google.com to view.
In [15]:
fig = px.scatter_3d(x=X_lda[:, 0],y=X_lda[:, 1],z=X_lda[:, 2],color=y,title="LDA of dataset", width = 1000, height = 800)
fig.update_traces(marker_size = 3)
fig.show()
Output hidden; open in https://colab.research.google.com to view.
In [16]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets (stratified by class label)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, stratify=y, random_state=42)
In [17]:
# standardization: fit the scaler on the training set only, then transform both sets
s = StandardScaler()
s.fit(X_train)
X_train_std = s.transform(X_train)
X_test_std = s.transform(X_test)
In [ ]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

# classification without dimensionality reduction

import time
start = time.time()

knn1 = KNeighborsClassifier(n_neighbors=1)
knn1.fit(X_train_std, y_train)
y_pred1 = knn1.predict(X_test_std)
c1 = confusion_matrix(y_test, y_pred1)
print(c1)
print("udeo ispravno pogodjenih org: ", np.trace(c1)/sum(sum(c1)))

end = time.time()
print('elapsed time: ', end - start, ' seconds.')
In [ ]:
# cumulative explained variance as a function of the number of PCA components
pca = PCA(n_components=None)
pca.fit(X_train_std)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
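The component count for a given variance threshold can also be read off numerically; a small sketch, assuming pca is the full PCA fitted in the cell above:

cum = np.cumsum(pca.explained_variance_ratio_)
n_90 = np.argmax(cum >= 0.9) + 1   # first index where cumulative variance reaches 90%
print('components for 90% of the variance:', n_90)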
In [ ]:
# WITH PCA
pca = PCA(n_components=0.9) ## n_components=0.9 => keep 90% of the information (variance) of the original data
pca.fit(X_train_std)
X_train_r = pca.transform(X_train_std)
X_test_r = pca.transform(X_test_std)
print('Dimension of the reduced space: ', pca.n_components_)

import time
start = time.time()

knn2 = KNeighborsClassifier(n_neighbors=1)
knn2.fit(X_train_r, y_train)
y_pred2 = knn2.predict(X_test_r)
c2 = confusion_matrix(y_test, y_pred2)
print(c2)
print("udeo ispravno pogodjenih pca: ", np.trace(c2)/sum(sum(c2)))

end = time.time()
print('elapsed time: ', end - start, ' seconds.')
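As a design note, the scale → reduce → classify chain above can also be packaged as a single scikit-learn Pipeline, which guarantees that the scaler and PCA are fit on the training data only. This is an alternative packaging of the same steps, not part of the original notebook:

from sklearn.pipeline import make_pipeline

# each step is fit on the training set only; the test set is transformed consistently
pipe = make_pipeline(StandardScaler(),
                     PCA(n_components=0.9),
                     KNeighborsClassifier(n_neighbors=1))
pipe.fit(X_train, y_train)
print('pipeline accuracy:', pipe.score(X_test, y_test))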
In [ ]:
# WITH LDA
lda = LinearDiscriminantAnalysis(n_components=9) ## LDA allows at most n_classes - 1 = 9 components
lda.fit(X_train_std, y_train)
X_train_r = lda.transform(X_train_std)
X_test_r = lda.transform(X_test_std)

knn3 = KNeighborsClassifier(n_neighbors=1)
knn3.fit(X_train_r, y_train)
y_pred3 = knn3.predict(X_test_r)
c3 = confusion_matrix(y_test, y_pred3)
print(c3)
print("udeo ispravno pogodjenih lda: ", np.trace(c3)/sum(sum(c2)))
In [ ]:
!jupyter nbconvert --to html "/content/drive/MyDrive/Colab Notebooks/zadatak2_PCAiLDA_reseno_moje_rjesenje.ipynb"