import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn import datasets
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

# preuzimanje baze
!gdown 1NpFP8jxoW-CwL4v73wigrg7hI7DfK3nN

Downloading...
From: https://drive.google.com/uc?id=1NpFP8jxoW-CwL4v73wigrg7hI7DfK3nN
To: /content/weatherHistory.csv
100% 16.3M/16.3M [00:00<00:00, 70.7MB/s]

# weatherHistory.csv
df = pd.read_csv('weatherHistory.csv')
df.head()

print(df.shape)  ##df.shape[0] je broj uzoraka a df.shape[1] je broj obelezja
print(df.dtypes)  ## su tipovi obelezja, NOTE: moras sam reci da je NPR: fromatted date datum a ne tip 'object'

(96453, 12)
Formatted Date               object
Summary                      object
Precip Type                  object
Temperature (C)             float64
Apparent Temperature (C)    float64
Humidity                    float64
Wind Speed (km/h)           float64
Wind Bearing (degrees)      float64
Visibility (km)             float64
Loud Cover                  float64
Pressure (millibars)        float64
Daily Summary                object
dtype: object

## kategoricka obelezja su  'Summary', 'Precip Type','Daily Summary'
df['Summary'].unique()

array(['Partly Cloudy', 'Mostly Cloudy', 'Overcast', 'Foggy',
       'Breezy and Mostly Cloudy', 'Clear', 'Breezy and Partly Cloudy',
       'Breezy and Overcast', 'Humid and Mostly Cloudy',
       'Humid and Partly Cloudy', 'Windy and Foggy', 'Windy and Overcast',
       'Breezy and Foggy', 'Windy and Partly Cloudy', 'Breezy',
       'Dry and Partly Cloudy', 'Windy and Mostly Cloudy',
       'Dangerously Windy and Partly Cloudy', 'Dry', 'Windy',
       'Humid and Overcast', 'Light Rain', 'Drizzle', 'Windy and Dry',
       'Dry and Mostly Cloudy', 'Breezy and Dry', 'Rain'], dtype=object)

df['Daily Summary'].unique()  ## 214 unique values sa jako puno teksta, summary type nam vise znaci

array(['Partly cloudy throughout the day.',
       'Mostly cloudy throughout the day.', 'Foggy in the evening.',
       'Foggy overnight and breezy in the morning.',
       'Overcast throughout the day.', 'Partly cloudy until night.',
       'Mostly cloudy until night.',
       'Foggy starting overnight continuing until morning.',
       'Foggy in the morning.', 'Partly cloudy until evening.',
       'Partly cloudy starting in the morning.',
       'Mostly cloudy starting overnight continuing until night.',
       'Mostly cloudy until evening.',
       'Partly cloudy starting in the morning continuing until evening.',
       'Partly cloudy starting in the afternoon.',
       'Partly cloudy starting overnight.',
       'Partly cloudy until morning.',
       'Partly cloudy starting overnight continuing until night.',
       'Partly cloudy starting in the afternoon continuing until night.',
       'Mostly cloudy starting overnight.',
       'Partly cloudy until afternoon.',
       'Mostly cloudy until night and breezy in the afternoon.',
       'Foggy starting in the evening.', 'Foggy throughout the day.',
       'Foggy starting in the evening continuing until night.',
       'Mostly cloudy until morning.',
       'Foggy starting in the morning continuing until evening.',
       'Foggy starting overnight continuing until afternoon.',
       'Partly cloudy starting in the morning continuing until afternoon.',
       'Foggy starting overnight.', 'Foggy until morning.',
       'Foggy starting overnight continuing until evening.',
       'Foggy starting in the afternoon.',
       'Partly cloudy starting overnight continuing until afternoon.',
       'Partly cloudy starting in the morning continuing until night.',
       'Overcast until night.',
       'Mostly cloudy starting overnight continuing until evening.',
       'Foggy overnight.', 'Partly cloudy in the morning.',
       'Mostly cloudy starting in the morning.',
       'Foggy starting in the afternoon continuing until evening.',
       'Mostly cloudy until afternoon.',
       'Foggy starting overnight continuing until night.',
       'Mostly cloudy throughout the day and breezy in the evening.',
       'Foggy starting in the morning continuing until afternoon.',
       'Partly cloudy in the afternoon.', 'Clear throughout the day.',
       'Partly cloudy starting in the afternoon continuing until evening.',
       'Partly cloudy overnight.', 'Overcast until evening.',
       'Foggy in the morning and breezy starting in the afternoon continuing until night.',
       'Breezy starting overnight continuing until afternoon and foggy starting in the morning continuing until evening.',
       'Partly cloudy starting overnight continuing until morning.',
       'Mostly cloudy throughout the day and breezy in the afternoon.',
       'Mostly cloudy starting overnight and breezy in the afternoon.',
       'Partly cloudy throughout the day and breezy starting in the morning continuing until night.',
       'Mostly cloudy throughout the day and breezy in the morning.',
       'Partly cloudy starting in the evening continuing until night.',
       'Mostly cloudy until night and breezy starting in the morning continuing until afternoon.',
       'Partly cloudy starting in the morning continuing until evening and breezy starting in the morning continuing until afternoon.',
       'Partly cloudy throughout the day and breezy starting in the morning continuing until afternoon.',
       'Partly cloudy throughout the day and breezy starting in the morning continuing until evening.',
       'Foggy until afternoon.',
       'Overcast until night and breezy overnight.',
       'Breezy until morning and mostly cloudy throughout the day.',
       'Mostly cloudy starting in the morning continuing until night.',
       'Breezy starting overnight continuing until morning and partly cloudy starting overnight continuing until evening.',
       'Partly cloudy in the evening.',
       'Mostly cloudy starting overnight continuing until afternoon.',
       'Mostly cloudy starting in the morning continuing until afternoon.',
       'Mostly cloudy starting in the afternoon.',
       'Mostly cloudy starting in the morning continuing until evening.',
       'Partly cloudy starting overnight continuing until afternoon and breezy in the afternoon.',
       'Partly cloudy starting overnight and breezy in the afternoon.',
       'Mostly cloudy starting in the morning and breezy in the evening.',
       'Foggy starting in the afternoon continuing until night.',
       'Foggy until night.',
       'Foggy starting in the morning continuing until night.',
       'Foggy until evening.', 'Foggy starting in the morning.',
       'Partly cloudy starting overnight continuing until evening.',
       'Partly cloudy starting overnight continuing until evening and breezy starting in the morning continuing until evening.',
       'Breezy starting overnight continuing until morning and foggy in the evening.',
       'Mostly cloudy throughout the day and breezy starting in the morning continuing until evening.',
       'Partly cloudy until evening and breezy starting in the morning continuing until afternoon.',
       'Mostly cloudy starting in the afternoon continuing until night.',
       'Breezy starting overnight continuing until afternoon and mostly cloudy starting overnight continuing until evening.',
       'Mostly cloudy throughout the day and windy starting in the morning continuing until evening.',
       'Breezy and partly cloudy in the afternoon.',
       'Mostly cloudy starting overnight and breezy starting in the morning continuing until afternoon.',
       'Partly cloudy until night and breezy starting in the morning continuing until afternoon.',
       'Breezy and mostly cloudy overnight.',
       'Mostly cloudy throughout the day and breezy overnight.',
       'Mostly cloudy throughout the day and breezy starting in the morning continuing until afternoon.',
       'Partly cloudy throughout the day and breezy in the morning.',
       'Partly cloudy starting in the morning continuing until evening and breezy starting in the afternoon continuing until evening.',
       'Partly cloudy throughout the day and breezy starting in the afternoon continuing until evening.',
       'Mostly cloudy starting overnight and breezy in the morning.',
       'Partly cloudy starting in the afternoon and breezy in the afternoon.',
       'Partly cloudy starting in the morning and breezy in the evening.',
       'Partly cloudy until evening and breezy in the morning.',
       'Partly cloudy starting overnight continuing until evening and breezy starting in the morning continuing until afternoon.',
       'Partly cloudy starting overnight continuing until evening and breezy in the evening.',
       'Mostly cloudy throughout the day and breezy starting in the evening.',
       'Mostly cloudy throughout the day and windy starting in the morning continuing until night.',
       'Breezy starting overnight continuing until morning and partly cloudy starting in the morning.',
       'Mostly cloudy starting in the morning and breezy overnight.',
       'Overcast throughout the day and breezy starting overnight continuing until morning.',
       'Partly cloudy throughout the day and breezy in the evening.',
       'Mostly cloudy until evening and breezy starting in the morning continuing until afternoon.',
       'Mostly cloudy until night and breezy in the evening.',
       'Partly cloudy starting in the evening.',
       'Overcast starting in the morning.',
       'Mostly cloudy starting overnight continuing until evening and breezy starting overnight continuing until morning.',
       'Partly cloudy starting overnight continuing until morning and breezy starting in the morning continuing until afternoon.',
       'Partly cloudy until evening and breezy starting in the morning continuing until evening.',
       'Breezy starting in the morning continuing until afternoon and partly cloudy starting in the morning.',
       'Partly cloudy starting in the morning and breezy starting in the afternoon continuing until evening.',
       'Mostly cloudy starting overnight continuing until morning.',
       'Mostly cloudy throughout the day and breezy starting overnight continuing until afternoon.',
       'Breezy starting overnight continuing until morning and foggy overnight.',
       'Mostly cloudy throughout the day and breezy starting overnight continuing until morning.',
       'Overcast throughout the day and breezy in the morning.',
       'Overcast throughout the day and breezy in the evening.',
       'Mostly cloudy starting in the morning continuing until night and breezy in the afternoon.',
       'Mostly cloudy until night and breezy starting in the evening continuing until night.',
       'Partly cloudy until night and breezy in the morning.',
       'Partly cloudy until evening and breezy overnight.',
       'Partly cloudy starting overnight continuing until night and windy starting in the morning continuing until afternoon.',
       'Breezy starting in the morning continuing until afternoon and mostly cloudy starting in the morning.',
       'Foggy starting overnight continuing until morning and breezy starting in the evening.',
       'Mostly cloudy until night and breezy starting in the afternoon.',
       'Foggy in the afternoon.',
       'Mostly cloudy until night and breezy starting in the afternoon continuing until night.',
       'Foggy starting overnight continuing until morning and breezy starting in the evening continuing until night.',
       'Breezy until afternoon and mostly cloudy throughout the day.',
       'Mostly cloudy throughout the day and breezy starting in the morning continuing until night.',
       'Partly cloudy starting overnight continuing until evening and breezy in the morning.',
       'Mostly cloudy starting in the morning and breezy in the afternoon.',
       'Mostly cloudy starting overnight continuing until night and breezy starting in the morning continuing until evening.',
       'Foggy starting overnight continuing until morning and breezy starting in the morning continuing until afternoon.',
       'Mostly cloudy until evening and windy starting in the morning continuing until afternoon.',
       'Foggy starting overnight continuing until afternoon and breezy in the morning.',
       'Foggy starting in the morning continuing until afternoon and breezy starting in the evening.',
       'Partly cloudy starting overnight and breezy starting in the morning continuing until afternoon.',
       'Foggy starting overnight continuing until morning and breezy in the afternoon.',
       'Mostly cloudy starting overnight and breezy starting in the afternoon continuing until evening.',
       'Overcast throughout the day and breezy starting overnight continuing until afternoon.',
       'Partly cloudy starting in the morning continuing until evening and breezy in the afternoon.',
       'Partly cloudy starting in the morning continuing until night and breezy starting in the afternoon continuing until evening.',
       'Mostly cloudy until night and breezy starting in the evening.',
       'Breezy in the morning and mostly cloudy starting in the morning.',
       'Mostly cloudy until night and breezy starting in the morning continuing until evening.',
       'Partly cloudy starting overnight continuing until evening and windy starting in the morning continuing until evening.',
       'Breezy in the morning and partly cloudy starting in the evening continuing until night.',
       'Partly cloudy overnight and breezy starting in the morning continuing until afternoon.',
       'Light rain in the morning.', 'Light rain until morning.',
       'Light rain in the morning and afternoon.',
       'Partly cloudy starting in the morning continuing until night and breezy starting in the morning continuing until afternoon.',
       'Breezy starting in the afternoon continuing until night and mostly cloudy starting in the evening.',
       'Mostly cloudy throughout the day and breezy starting in the evening continuing until night.',
       'Foggy starting in the afternoon and breezy starting in the afternoon continuing until evening.',
       'Breezy and foggy until morning.',
       'Mostly cloudy until night and breezy starting overnight continuing until morning.',
       'Partly cloudy starting overnight continuing until night and breezy in the morning.',
       'Partly cloudy starting overnight continuing until night and breezy in the afternoon.',
       'Mostly cloudy starting in the morning and breezy starting in the afternoon continuing until evening.',
       'Partly cloudy starting overnight and breezy starting in the evening.',
       'Breezy overnight and overcast throughout the day.',
       'Partly cloudy until night and breezy in the afternoon.',
       'Mostly cloudy starting overnight and breezy starting in the evening.',
       'Breezy overnight and partly cloudy until evening.',
       'Mostly cloudy starting in the evening.',
       'Mostly cloudy throughout the day and breezy starting in the afternoon.',
       'Mostly cloudy throughout the day and breezy starting in the afternoon continuing until evening.',
       'Mostly cloudy until night and windy starting in the morning continuing until afternoon.',
       'Breezy and foggy starting in the evening.',
       'Breezy overnight and partly cloudy throughout the day.',
       'Overcast throughout the day and breezy starting in the evening.',
       'Breezy until evening and foggy in the morning.',
       'Breezy overnight and mostly cloudy throughout the day.',
       'Partly cloudy until evening and breezy in the afternoon.',
       'Partly cloudy starting in the morning and breezy starting in the morning continuing until afternoon.',
       'Mostly cloudy until evening and breezy in the evening.',
       'Windy in the afternoon.', 'Overcast until morning.',
       'Mostly cloudy overnight.',
       'Foggy starting in the morning continuing until evening and breezy in the evening.',
       'Breezy starting overnight continuing until morning.',
       'Breezy starting in the afternoon continuing until evening and foggy starting in the evening.',
       'Mostly cloudy until night and breezy overnight.',
       'Mostly cloudy starting in the morning and windy in the evening.',
       'Partly cloudy throughout the day and windy starting in the morning continuing until afternoon.',
       'Breezy until afternoon and overcast throughout the day.',
       'Breezy in the morning and foggy in the evening.',
       'Breezy starting in the afternoon continuing until evening and foggy in the evening.',
       'Breezy starting in the morning continuing until night.',
       'Breezy in the morning and mostly cloudy starting in the evening.',
       'Mostly cloudy until evening and breezy in the afternoon.',
       'Mostly cloudy until night and breezy starting in the afternoon continuing until evening.',
       'Mostly cloudy until evening and breezy starting overnight continuing until morning.',
       'Overcast throughout the day and breezy in the afternoon.',
       'Overcast throughout the day and breezy starting in the morning continuing until evening.',
       'Overcast throughout the day and breezy overnight.',
       'Overcast starting in the afternoon.',
       'Partly cloudy throughout the day and breezy in the afternoon.',
       'Light rain starting overnight.',
       'Drizzle starting in the evening.', 'Drizzle until morning.',
       'Rain throughout the day.', 'Rain until morning.',
       'Light rain overnight.', 'Rain until afternoon.'], dtype=object)

df['Wind Bearing (degrees)'].unique() ## 360 unqiue values... why tf would this be a categorical when it can be a numercial one

array([251., 259., 204., 269., 258., 260., 279., 290., 316., 281., 289.,
       262., 288., 230., 163., 139., 147., 160., 152., 150., 149., 180.,
       161., 135., 141., 151., 169., 170., 187., 179., 162., 159., 168.,
        32., 140., 103., 113., 129., 207., 153.,   4., 341.,  15., 348.,
       321., 311., 339., 340., 330.,  19., 277.,   9.,   0., 350., 349.,
       338., 320., 310., 328.,  20.,  28.,  11., 326., 309., 193., 273.,
       300., 307., 319., 318., 243., 177., 172., 142., 130., 359., 166.,
       145., 178., 223., 240., 231., 214., 222., 241., 235., 238., 211.,
       221., 215., 224., 358.,   8.,  59.,  63.,  65., 146., 305., 327.,
       271., 297., 301., 308., 272., 351., 175., 138., 158., 132., 209.,
       250., 295., 280., 270., 239., 242., 266., 278., 325., 282., 274.,
       255.,  46., 284., 283., 313., 345.,  16., 332.,  12.,  39.,   3.,
        33.,  24.,  25.,  31.,  47.,  67.,  60., 144.,  57.,   2.,  18.,
        48.,  29., 335., 228., 315.,  40., 143., 133., 136., 355., 123.,
       199., 227., 156., 261., 357., 190., 298.,  10., 127.,  81., 120.,
       134., 115., 347.,  72.,   7.,   1.,  30., 352., 353.,  85.,  49.,
        21.,  23.,  35.,  56., 337., 285., 306.,   6., 116., 197., 275.,
       100., 333., 304.,  90.,  58., 157., 329., 268., 317., 210., 226.,
       188., 292., 154., 205., 181., 185., 148.,  68., 155., 137.,  95.,
        50.,  80.,  83.,  73.,  70., 106.,  94., 117.,  53.,  82., 101.,
       102.,  54.,  52.,  38., 128., 164.,  71., 108., 121., 122., 189.,
       246., 302., 245., 291., 312., 296., 264.,  41.,  62., 119.,  98.,
       124., 299., 173., 176., 194., 171., 183., 167., 229., 303., 212.,
       322., 331., 342., 324., 265., 267., 165., 236., 110., 216., 218.,
       354.,  75., 186., 249.,  55.,  64.,  17.,   5., 263., 256.,  69.,
        89., 200., 112., 192., 287., 237., 213., 220., 217., 202., 208.,
       198., 201., 111., 104., 126., 191., 254., 107., 118., 131., 232.,
       196., 174., 248., 182., 294., 252., 247., 233., 244., 276., 225.,
       234., 219.,  84.,  96., 336., 253., 346., 314., 203.,  13., 343.,
       125.,  61.,  79.,  93., 286.,  27., 323., 195.,  87., 105., 206.,
        44.,  51.,  36.,  43.,  26.,  74., 109., 356., 334.,  99.,  88.,
       257., 293.,  45., 344., 184.,  22.,  42.,  37.,  14.,  34.,  76.,
        91.,  86.,  97.,  66., 114.,  77.,  92.,  78.])

df['Precip Type'].unique()    ## 'rain', 'snow', nan

array(['rain', 'snow', nan], dtype=object)

df.describe()

df=df.drop(['Summary','Daily Summary', 'Formatted Date', 'Loud Cover'], axis=1)

# TODO
X=df.drop(['Visibility (km)'], axis=1)
y=df['Visibility (km)']
print(X.shape)
print(X.columns)
X.head()

(96453, 7)
Index(['Precip Type', 'Temperature (C)', 'Apparent Temperature (C)',
       'Humidity', 'Wind Speed (km/h)', 'Wind Bearing (degrees)',
       'Pressure (millibars)'],
      dtype='object')

X['Precip Type'].unique()

array(['rain', 'snow', nan], dtype=object)

# print(pd.get_dummies(X, columns=['Precip Type']))
X = pd.get_dummies(X, columns=['Precip Type'])
X

X.isna().sum()

X.describe()

plt.hist(X['Pressure (millibars)'], bins=30, label='pressure')
plt.legend()
plt.show()

X['Pressure (millibars)'] = X['Pressure (millibars)'].replace(0.0, method='ffill') ## ja bi zamjenio sa mean/median jer je numericko al oni kazu da je ffill bolje
X.describe()

/tmp/ipython-input-17-2157937382.py:1: FutureWarning: The 'method' keyword in Series.replace is deprecated and will be removed in a future version.
  X['Pressure (millibars)'] = X['Pressure (millibars)'].replace(0.0, method='ffill') ## ja bi zamjenio sa mean/median jer je numericko al oni kazu da je ffill bolje

plt.figure(figsize=(10,5))
plt.hist(y, density=True, bins=500)
plt.show()

y.describe()
# sorted(y.unique(),reverse=True) ## 0.0 pa 0.0161 0.322...

def get_wind_direction(direction):
  if 45 <= direction < 135:
    return 'N'
  elif 135 <= direction < 225:
    return 'W'
  elif 225 <= direction < 315:
    return 'S'
  else:
    return 'E'

X['Wind bearing cat'] = X['Wind Bearing (degrees)'].apply(get_wind_direction)
X.head()

X.drop('Wind Bearing (degrees)', axis=1, inplace=True)
X = pd.get_dummies(X, columns=['Wind bearing cat'], drop_first=True)
X.head()

y_normalizovano = (y-min(y))/(max(y)-min(y))
y_normalizovano.describe()

plt.figure(figsize=(10,5))
plt.hist(y_normalizovano, density=True, bins=50)
plt.title('normalizovano')
plt.show()

y_standardizovano = (y-np.mean(y))/np.std(y)
y_standardizovano.describe()

plt.figure(figsize=(10,5))
plt.hist(y_standardizovano, density=True, bins=50)
plt.title('standardizovano')
plt.show()

from sklearn.preprocessing import MinMaxScaler

s = MinMaxScaler()
s.fit(y.array.reshape(-1,1))
y_norm = s.transform(y.array.reshape(-1,1))
y_norm = pd.DataFrame(y_norm)
print(y_norm.describe())
print(y_norm.head())

                  0
count  96453.000000
mean       0.642691
std        0.260380
min        0.000000
25%        0.518000
50%        0.624000
75%        0.920000
max        1.000000
       0
0  0.983
1  0.983
2  0.929
3  0.983
4  0.983

from sklearn.preprocessing import StandardScaler

s = StandardScaler()
s.fit(y.array.reshape(-1,1))
y_std = s.transform(y.array.reshape(-1,1))
y_std = pd.DataFrame(y_std)
y_std.describe()

# TODO
def model_evaluation(y_test, y_predicted, N, d):
    mse = np.mean((y_test-y_predicted)**2)
    # mse = mean_squared_error(y_test,y_predicted)
    mae = np.mean(np.abs(y_test-y_predicted))
    # mae = mean_absolute_error(y_test,y_predicted)
    rmse = np.sqrt(mse)
    r2 = 1-np.sum((y_test-y_predicted)**2)/np.sum((y_test-np.mean(y_test))**2)
    # r2 = r2_score(y_test, y_predicted)
    r2_adj = 1-((1-r2)*(N-1))/(N-d-1)

    # printing values
    print('Mean squared error: ', mse)
    print('Mean absolute error: ', mae)
    print('Root mean squared error: ', rmse)
    print('R2 score: ', r2)
    print('R2 adjusted score: ', r2_adj)

    # Uporedni prikaz nekoliko pravih i predvidjenih vrednosti
    res=pd.concat([pd.DataFrame(y_test.values), pd.DataFrame(y_predicted)], axis=1)
    res.columns = ['y', 'y_pred']
    print(res.head(20))
    return mse,mae,rmse,r2,r2_adj

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# TODO
# tmp_in = np.random.normal(size=(10,))
tmp_true = df.loc[:9,'Wind Speed (km/h)']
tmp_predicted = np.arange(0,10)
# print(tmp_in)
# print(tmp_out)

tmp_res = model_evaluation(y_test=tmp_true, y_predicted=tmp_predicted, N=10, d=1)
print(tmp_res)
print('mean_squared_error:',mean_squared_error(tmp_true,tmp_predicted))
print('mean_absolute_error:',mean_absolute_error(tmp_true,tmp_predicted))
sk_learn_r2 = r2_score(tmp_true,tmp_predicted)
print('r2_score:',sk_learn_r2)
custom_r2_adjusted = 1 - ((1 - sk_learn_r2) * (10-1)) / (10 - 1 - 1)
print('r2_score custom adjusted:', custom_r2_adjusted )
del tmp_true, tmp_predicted, tmp_res

Mean squared error:  74.73116288000001
Mean absolute error:  7.678040000000001
Root mean squared error:  8.644718785478219
R2 score:  -7.384075767581049
R2 adjusted score:  -8.43208523852868
         y  y_pred
0  14.1197       0
1  14.2646       1
2   3.9284       2
3  14.1036       3
4  11.0446       4
5  13.9587       5
6  12.3648       6
7  14.1519       7
8  11.3183       8
9  12.5258       9
(np.float64(74.73116288000001), np.float64(7.678040000000001), np.float64(8.644718785478219), np.float64(-7.384075767581049), np.float64(-8.43208523852868))
mean_squared_error: 74.73116288000001
mean_absolute_error: 7.678040000000001
r2_score: -7.384075767581049
r2_score custom adjusted: -8.43208523852868

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Inicijalizacija modela (podesavanje hiperparametara)
model = LinearRegression(fit_intercept=True)

# Obuka modela
model.fit(X_train, y_train)

# Testiranje obucenog modela (primena modela)
y_predicted = model.predict(X_test)

# Evaluacija (racunanje mera uspesnosti)
model_evaluation(y_test, y_predicted, X_train.shape[0], X_train.shape[1])

Mean squared error:  14.020892363319918
Mean absolute error:  3.0559024248680258
Root mean squared error:  3.7444482054529633
R2 score:  0.21754464246307192
R2 adjusted score:  0.21745449368230585
          y     y_pred
0   15.5526  12.945771
1    9.9820   9.740866
2    9.6278  10.706595
3    8.0500  10.253302
4    7.5509   6.345266
5   13.7977   9.030456
6    9.9820  13.091935
7   11.2700  10.720846
8    4.5080   6.901841
9    8.4203   9.517001
10   8.0983  10.340767
11  10.3523  11.267772
12   3.4937   5.366907
13   9.9820  10.033562
14  16.1000  12.207898
15   0.0000   7.770094
16   8.0500   9.694605
17   9.9820  11.428571
18   0.5313   9.255958
19  11.2700   9.727986

(np.float64(14.020892363319918),
 np.float64(3.0559024248680258),
 np.float64(3.7444482054529633),
 np.float64(0.21754464246307192),
 np.float64(0.21745449368230585))

# Ilustracija koeficijenata
plt.figure(figsize=(10,5))
plt.bar(model.feature_names_in_,model.coef_)
plt.xticks(rotation=45, ha="right")
print(f'coef: {model.coef_}')           ## koef. modela
print(f'intercept: {model.intercept_}') ## slobodni clan
plt.show()

coef: [ 9.16231002e-02 -3.56597522e-02 -5.29981407e+00  2.77470063e-03
 -3.73660177e-02  2.86953981e+00  6.22868232e-01 -3.09246313e-01
  1.67222542e-01 -3.88999643e-01]
intercept: 49.0224861405965

# podela skupa na trening (90%) i test (10%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

cat_feats = X_train.columns[['_' in column for column in X_train.columns]]
print(f'cat feats: {cat_feats}')
numeric_feats = X_train.columns[['_' not in column for column in X_train.columns]]
print(f'numeric feats: {numeric_feats}')

cat feats: Index(['Precip Type_rain', 'Precip Type_snow', 'Wind bearing cat_N',
       'Wind bearing cat_S', 'Wind bearing cat_W'],
      dtype='object')
numeric feats: Index(['Temperature (C)', 'Apparent Temperature (C)', 'Humidity',
       'Wind Speed (km/h)', 'Pressure (millibars)'],
      dtype='object')

# print(X_train[numeric_feats].head(5))
# print('----------------')
# print(X_test[numeric_feats].head(5))
# print('----------------')
s = StandardScaler()
s.fit(X_train[numeric_feats])
X_train[numeric_feats] = s.transform(X_train[numeric_feats])
X_test[numeric_feats] = s.transform(X_test[numeric_feats])
# print(X_train[numeric_feats].head(5))
# print('----------------')
# print(X_test[numeric_feats].head(5))
# print('----------------')

# Inicijalizacija modela (podesavanje hiperparametara)
model = LinearRegression(fit_intercept=True)

# Obuka modela
model.fit(X_train, y_train)

# Testiranje obucenog modela (primena modela)
y_predicted = model.predict(X_test)

# Evaluacija (racunanje mera uspesnosti)
model_evaluation(y_test, y_predicted, X_train.shape[0], X_train.shape[1])

Mean squared error:  14.020892363319915
Mean absolute error:  3.0559024248680258
Root mean squared error:  3.744448205452963
R2 score:  0.21754464246307215
R2 adjusted score:  0.21745449368230618
          y     y_pred
0   15.5526  12.945771
1    9.9820   9.740866
2    9.6278  10.706595
3    8.0500  10.253302
4    7.5509   6.345266
5   13.7977   9.030456
6    9.9820  13.091935
7   11.2700  10.720846
8    4.5080   6.901841
9    8.4203   9.517001
10   8.0983  10.340767
11  10.3523  11.267772
12   3.4937   5.366907
13   9.9820  10.033562
14  16.1000  12.207898
15   0.0000   7.770094
16   8.0500   9.694605
17   9.9820  11.428571
18   0.5313   9.255958
19  11.2700   9.727986

(np.float64(14.020892363319915),
 np.float64(3.0559024248680258),
 np.float64(3.744448205452963),
 np.float64(0.21754464246307215),
 np.float64(0.21745449368230618))

# Ilustracija koeficijenata
plt.figure(figsize=(10,5))
plt.bar(model.feature_names_in_,model.coef_)
plt.xticks(rotation=45, ha="right")
print(f'coef: {model.coef_}')
print(f'intercept: {model.intercept_}')
plt.show()

coef: [ 0.87431593 -0.38103252 -1.03544497  0.01917541 -0.2904449   2.86953981
  0.62286823 -0.30924631  0.16722254 -0.38899964]
intercept: 7.870049301791442

# podela skupa na trening (90%) i test (10%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# standardizacija spram trening seta
s = StandardScaler()
s.fit(X_train[numeric_feats])
X_train[numeric_feats] = s.transform(X_train[numeric_feats])
X_test[numeric_feats] = s.transform(X_test[numeric_feats])

# Inicijalizacija modela (podesavanje hiperparametara)
from sklearn.linear_model import Ridge
model_ridge = Ridge(alpha=5, fit_intercept=True)

# Obuka modela
model_ridge.fit(X_train, y_train)

# Testiranje obucenog modela (primena modela)
y_predicted = model_ridge.predict(X_test)

# Evaluacija (racunanje mera uspesnosti)
model_evaluation(y_test, y_predicted, X_train.shape[0], X_train.shape[1])

Mean squared error:  14.020800010360787
Mean absolute error:  3.055845320516795
Root mean squared error:  3.7444358734475327
R2 score:  0.21754979634813032
R2 adjusted score:  0.21745964816115726
          y     y_pred
0   15.5526  12.945527
1    9.9820   9.740490
2    9.6278  10.706926
3    8.0500  10.252578
4    7.5509   6.344797
5   13.7977   9.030272
6    9.9820  13.091452
7   11.2700  10.721259
8    4.5080   6.938428
9    8.4203   9.517242
10   8.0983  10.340601
11  10.3523  11.267687
12   3.4937   5.366064
13   9.9820  10.034060
14  16.1000  12.207746
15   0.0000   7.806896
16   8.0500   9.695141
17   9.9820  11.428267
18   0.5313   9.256618
19  11.2700   9.727803

(np.float64(14.020800010360787),
 np.float64(3.055845320516795),
 np.float64(3.7444358734475327),
 np.float64(0.21754979634813032),
 np.float64(0.21745964816115726))

# Inicijalizacija modela (podesavanje hiperparametara)
model_lasso = Lasso(alpha=0.01, fit_intercept=True)

# Obuka modela
model_lasso.fit(X_train, y_train)

# Testiranje obucenog modela (primena modela)
y_predicted = model_lasso.predict(X_test)

# Evaluacija (racunanje mera uspesnosti)
model_evaluation(y_test, y_predicted, X_train.shape[0], X_train.shape[1])

Mean squared error:  14.029134958291259
Mean absolute error:  3.055760798733383
Root mean squared error:  3.745548685879181
R2 score:  0.21708465301103064
R2 adjusted score:  0.2169944512336458
          y     y_pred
0   15.5526  12.907872
1    9.9820   9.748854
2    9.6278  10.759736
3    8.0500  10.145137
4    7.5509   6.470670
5   13.7977   9.066796
6    9.9820  13.031737
7   11.2700  10.783161
8    4.5080   7.640638
9    8.4203   9.557600
10   8.0983  10.259546
11  10.3523  11.247211
12   3.4937   5.460913
13   9.9820  10.105863
14  16.1000  12.276990
15   0.0000   8.459230
16   8.0500   9.782073
17   9.9820  11.398009
18   0.5313   9.251604
19  11.2700   9.754239

(np.float64(14.029134958291259),
 np.float64(3.055760798733383),
 np.float64(3.745548685879181),
 np.float64(0.21708465301103064),
 np.float64(0.2169944512336458))

# Ilustracija koeficijenata
plt.figure(figsize=(10,5))
plt.bar(model_ridge.feature_names_in_,model_ridge.coef_, label='ridge')
plt.legend()
plt.xticks(rotation=45, ha="right")
print(f'coef: {model_ridge.coef_}')
print(f'intercept: {model_ridge.intercept_}')
plt.show()

coef: [ 0.86798416 -0.37476099 -1.03554388  0.01951315 -0.29055701  2.83286984
  0.58626602 -0.3090594   0.16718629 -0.38880959]
intercept: 7.906437747983643

# Ilustracija koeficijenata
plt.figure(figsize=(10,5))
plt.bar(model_lasso.feature_names_in_,model_lasso.coef_, label='lasso')
plt.legend()
plt.xticks(rotation=45, ha="right")
print(f'coef: {model_lasso.coef_}')
print(f'intercept: {model_lasso.intercept_}')
plt.show()

coef: [ 0.50646735  0.         -1.03066045  0.04108523 -0.28764133  2.15662212
 -0.         -0.20699429  0.15832752 -0.31984528]
intercept: 8.53517625666293

# podela skupa na trening (80%), val(10%) i test (10%)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# standardizacija spram trening seta
s = StandardScaler()
s.fit(X_train[numeric_feats])
X_train[numeric_feats] = s.transform(X_train[numeric_feats])
X_val[numeric_feats] = s.transform(X_val[numeric_feats])
X_test[numeric_feats] = s.transform(X_test[numeric_feats])

# Inicijalizacija modela (podesavanje hiperparametara)
model_ridge = Ridge(alpha=5)
# Obuka modela
model_ridge.fit(X_train, y_train)

Ridge(alpha=5)

Ridge(alpha=5)

# Inicijalizacija modela (podesavanje hiperparametara)
model_lasso = Lasso(alpha=0.01)
# Obuka modela
model_lasso.fit(X_train, y_train)

Lasso(alpha=0.01)

Lasso(alpha=0.01)

# Testiranje obucenog modela (primena modela)
y_predicted_ridge = model_ridge.predict(X_val)
# Evaluacija (racunanje mera uspesnosti)
model_evaluation(y_val, y_predicted_ridge, X_train.shape[0], X_train.shape[1])

Mean squared error:  14.07552718783539
Mean absolute error:  3.0583353090592866
Root mean squared error:  3.75173655629435
R2 score:  0.21580370876882204
R2 adjusted score:  0.2157020644231582
          y     y_pred
0    9.9820  10.171718
1   16.1000  11.144381
2   15.1823  10.756456
3    6.0697   6.469956
4   10.2557  11.468892
5   15.8263  12.054888
6    9.9820  14.211538
7   14.9569  10.060617
8   11.2700  10.123561
9    4.4436   8.859296
10  10.4006  11.423057
11  10.3523  12.729105
12   9.9820  10.209543
13  15.8263   7.033093
14  16.1000  11.093510
15   0.1610   8.568039
16   9.9820  10.523047
17  10.3523  12.513890
18   2.0769   6.139311
19  15.8263  12.946812

(np.float64(14.07552718783539),
 np.float64(3.0583353090592866),
 np.float64(3.75173655629435),
 np.float64(0.21580370876882204),
 np.float64(0.2157020644231582))

# Testiranje obucenog modela (primena modela)
y_predicted_lasso = model_lasso.predict(X_val)
# Evaluacija (racunanje mera uspesnosti)
model_evaluation(y_val, y_predicted_lasso, X_train.shape[0], X_train.shape[1])

Mean squared error:  14.074672381479967
Mean absolute error:  3.057906587294417
Root mean squared error:  3.7516226331388887
R2 score:  0.21585133298670733
R2 adjusted score:  0.2157496948139016
          y     y_pred
0    9.9820  10.144456
1   16.1000  11.098217
2   15.1823  10.673193
3    6.0697   6.463707
4   10.2557  11.349296
5   15.8263  12.020563
6    9.9820  14.180232
7   14.9569  10.139356
8   11.2700  10.171205
9    4.4436   8.915285
10  10.4006  11.435534
11  10.3523  12.726415
12   9.9820  10.202198
13  15.8263   7.012660
14  16.1000  11.151853
15   0.1610   8.601278
16   9.9820  10.611018
17  10.3523  12.453106
18   2.0769   6.248733
19  15.8263  12.948846

(np.float64(14.074672381479967),
 np.float64(3.057906587294417),
 np.float64(3.7516226331388887),
 np.float64(0.21585133298670733),
 np.float64(0.2157496948139016))

X_train_full = pd.concat((X_train, X_val))
y_train_full = pd.concat((y_train, y_val))
X_train_full

# Inicijalizacija modela (podesavanje hiperparametara)
model_lasso = Lasso(alpha=0.1)
# Obuka modela
model_lasso.fit(X_train_full, y_train_full)

Lasso(alpha=0.1)

Lasso(alpha=0.1)

# Testiranje obucenog modela (primena modela)
y_predicted_ridge = model_lasso.predict(X_test)
# Evaluacija (racunanje mera uspesnosti)
model_evaluation(y_test, y_predicted_ridge, X_train.shape[0], X_train.shape[1])

Mean squared error:  13.971763071250301
Mean absolute error:  3.037403500670552
Root mean squared error:  3.737882163906495
R2 score:  0.19822978355000698
R2 adjusted score:  0.19812586134336674
          y     y_pred
0   15.5526  11.636798
1    4.1216   7.814812
2    9.9820  11.965885
3   15.8263  10.039523
4   15.1340   9.420337
5    7.9695   8.147696
6    9.9820  11.826974
7    9.9820  14.021446
8   15.8263  11.452480
9   10.3523  13.645432
10  10.2557   8.847953
11   4.0250   7.288054
12   8.9677   8.742788
13   4.9105   6.801529
14   1.9642   8.888358
15   9.9820  10.148058
16  16.1000   8.516962
17  14.9569  10.328438
18  14.0231   8.714493
19   3.0268   7.170125

(np.float64(13.971763071250301),
 np.float64(3.037403500670552),
 np.float64(3.737882163906495),
 np.float64(0.19822978355000698),
 np.float64(0.19812586134336674))

# podela skupa na trening (90%) i test (10%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# TODO
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

kf_ridge = KFold(n_splits=5,shuffle=False) ## ,random_state=42
kf_ridge.get_n_splits(X_train)
# print(kf_ridge)
# print(kf_ridge.split(X_train))

# print(numeric_feats)

ridge_results = []
for split_id,(train_index,test_index) in enumerate(kf_ridge.split(X_train)):
  # print('train_index:',train_index, len(train_index))
  # print('test_index:',test_index, len(test_index))

  ## can not find the indexes in 'X_train' even though we split it. LIKE WHAT THE FUCK
  # train_df = X_train.loc[train_index,:]
  # test_df = y_train.loc[test_index,:]

  train_df_x = X.loc[train_index]
  test_df_x = X.loc[test_index]
  train_df_y = y.loc[train_index]
  test_df_y = y.loc[test_index]
  # print(train_index.shape)
  # print(train_df)
  # print(test_df)

  tmp_sc = StandardScaler()
  tmp_sc.fit(train_df_x[numeric_feats])
  # print(tmp_sc.mean_)

  train_df_x[numeric_feats] = tmp_sc.transform(train_df_x[numeric_feats])
  test_df_x[numeric_feats] = tmp_sc.transform(test_df_x[numeric_feats])
  # print(train_df_x[numeric_feats])

  my_ridge_model = Ridge(alpha=5, fit_intercept=True)
  my_ridge_model.fit(train_df_x, train_df_y)
  # print(my_ridge_model.coef_)

  my_y_predicted = my_ridge_model.predict(train_df_x)
  # print(my_y_predicted.shape)
  # print("--------------")
  # print(train_df_y.shape)
  # print("--------------")

  mean_squared_error(y_true=train_df_y,y_pred=my_y_predicted)
  mean_absolute_error(y_true=train_df_y,y_pred=my_y_predicted)
  r2_score(y_true=train_df_y,y_pred=my_y_predicted)
  my_res = model_evaluation(train_df_y,my_y_predicted,train_df_y.shape[0],train_df_x.shape[1])
  # print(train_df_y.shape[0],train_df_x.shape[1])
  # print("--------------")
  # print(my_res)
  # print("--------------")
  ridge_results.append(my_res)

mean_mse = 0
mean_mAe = 0
mean_root_square_error = 0
mean_r2 = 0
mean_r2_adj = 0
for result in ridge_results:
  mean_mse += result[0]
  mean_mAe += result[1]
  mean_root_square_error = result[2]
  mean_r2 += result[3]
  mean_r2_adj += result[4]

mean_mse /= len(ridge_results)
mean_mAe /= len(ridge_results)
mean_root_square_error /= len(ridge_results)
mean_r2 /= len(ridge_results)
mean_r2_adj /= len(ridge_results)

print("MSE:",mean_mse)
print("Mean AbsoluteE:",mean_mAe)
print("mean_root_square_error: ",mean_root_square_error)
print("r2:",mean_r2)
print("r2_adj:",mean_r2_adj)

del tmp_sc, train_df_x, train_df_y, test_df_x,  test_df_y , my_ridge_model, my_y_predicted, my_res, mean_mse, mean_mAe,mean_r2,mean_r2_adj, mean_root_square_error

Mean squared error:  13.993916688217425
Mean absolute error:  3.0454294957806023
Root mean squared error:  3.740844381716169
R2 score:  0.21445290689183205
R2 adjusted score:  0.2143397710948005
          y     y_pred
0    9.9820  13.182870
1   10.3523  13.447281
2   11.2056  13.172528
3    9.9820  13.839584
4   10.3523  14.197236
5    9.9820  13.659735
6    9.9820  14.245082
7   10.3523  14.071979
8    9.9820  13.833627
9    9.9820  13.775057
10  10.3523  12.882075
11   9.9820  11.185861
12  15.8263  11.333326
13  14.9569  10.941102
14   9.9820   9.316291
15   6.2951   9.332095
16  14.1680   9.418226
17   6.2951   9.165078
18   6.2951   9.254108
19   6.8425   9.158915
Mean squared error:  13.943623197415658
Mean absolute error:  3.05111515961069
Root mean squared error:  3.7341161199694444
R2 score:  0.21862038210729162
R2 adjusted score:  0.2185078465169623
          y     y_pred
0   15.8263  10.040441
1   15.8263  10.184886
2   14.9569   9.262247
3   15.8263  10.273044
4   15.8263  10.261926
5   14.9569  10.193982
6    9.9820   9.552946
7    9.9820   9.946408
8    9.9820  10.313834
9    9.9820  11.046320
10  11.2056  11.454064
11  11.4471  11.979353
12  11.2700  12.224600
13  11.2700  12.422178
14  11.4471  12.735427
15  11.2700  12.810080
16  11.2700  11.848270
17  11.4471  11.684506
18  11.2056  10.665760
19  11.2056  10.544664
Mean squared error:  14.120209333556948
Mean absolute error:  3.0684599495694
Root mean squared error:  3.7576866997604985
R2 score:  0.18244066753023125
R2 adjusted score:  0.18232292297309582
          y     y_pred
0   15.8263  10.076561
1   15.8263  10.209325
2   14.9569   9.471395
3   15.8263  10.291836
4   15.8263  10.294370
5   14.9569  10.211679
6    9.9820   9.617736
7    9.9820   9.976242
8    9.9820  10.387983
9    9.9820  11.042999
10  11.2056  11.403906
11  11.4471  11.999587
12  11.2700  12.114609
13  11.2700  12.307495
14  11.4471  12.575736
15  11.2700  12.643415
16  11.2700  11.791877
17  11.4471  11.628643
18  11.2056  10.740144
19  11.2056  10.639857
Mean squared error:  13.55406866484194
Mean absolute error:  2.969827670230717
Root mean squared error:  3.6815850750514976
R2 score:  0.216961088578021
R2 adjusted score:  0.21684831563765627
          y     y_pred
0   15.8263   9.899432
1   15.8263  10.041198
2   14.9569   9.127865
3   15.8263  10.123315
4   15.8263  10.095732
5   14.9569  10.045953
6    9.9820   9.402599
7    9.9820   9.801979
8    9.9820  10.154153
9    9.9820  10.887186
10  11.2056  11.321937
11  11.4471  11.864421
12  11.2700  12.109351
13  11.2700  12.295882
14  11.4471  12.574952
15  11.2700  12.619854
16  11.2700  11.692831
17  11.4471  11.512042
18  11.2056  10.544776
19  11.2056  10.418596
Mean squared error:  12.269043996179049
Mean absolute error:  2.700273180177254
Root mean squared error:  3.5027195143458245
R2 score:  0.19615032709738522
R2 adjusted score:  0.19603455699975392
          y     y_pred
0   15.8263  10.122521
1   15.8263  10.224259
2   14.9569   9.221741
3   15.8263  10.285211
4   15.8263  10.226435
5   14.9569  10.226191
6    9.9820   9.765225
7    9.9820  10.061082
8    9.9820  10.239218
9    9.9820  10.760370
10  11.2056  11.111724
11  11.4471  11.487560
12  11.2700  11.702019
13  11.2700  11.819483
14  11.4471  11.961616
15  11.2700  11.943505
16  11.2700  11.336992
17  11.4471  11.179818
18  11.2056  10.251249
19  11.2056  10.155006
MSE: 13.576172376042203
Mean AbsoluteE: 2.967021091073733
mean_root_square_error:  0.7005439028691649
r2: 0.20572507444095223
r2_adj: 0.20561068264445376

# TODO
# TODO
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

kf_ridge = KFold(n_splits=5,shuffle=False) ## ,random_state=42
kf_ridge.get_n_splits(X_train)
# print(kf_ridge)
# print(kf_ridge.split(X_train))

# print(numeric_feats)

lasso_results = []
for split_id,(train_index,test_index) in enumerate(kf_ridge.split(X_train)):
  # print('train_index:',train_index, len(train_index))
  # print('test_index:',test_index, len(test_index))

  ## can not find the indexes in 'X_train' even though we split it. LIKE WHAT THE FUCK
  # train_df = X_train.loc[train_index,:]
  # test_df = y_train.loc[test_index,:]

  train_df_x = X.loc[train_index]
  test_df_x = X.loc[test_index]
  train_df_y = y.loc[train_index]
  test_df_y = y.loc[test_index]
  # print(train_index.shape)
  # print(train_df)
  # print(test_df)

  tmp_sc = StandardScaler()
  tmp_sc.fit(train_df_x[numeric_feats])
  # print(tmp_sc.mean_)

  train_df_x[numeric_feats] = tmp_sc.transform(train_df_x[numeric_feats])
  test_df_x[numeric_feats] = tmp_sc.transform(test_df_x[numeric_feats])
  # print(train_df_x[numeric_feats])

  my_lasso_model = Lasso(alpha=5, fit_intercept=True)
  my_lasso_model.fit(train_df_x, train_df_y)
  # print(my_lasso_model.coef_)

  my_y_predicted = my_lasso_model.predict(train_df_x)
  # print(my_y_predicted.shape)
  # print("--------------")
  # print(train_df_y.shape)
  # print("--------------")

  mean_squared_error(y_true=train_df_y,y_pred=my_y_predicted)
  mean_absolute_error(y_true=train_df_y,y_pred=my_y_predicted)
  r2_score(y_true=train_df_y,y_pred=my_y_predicted)
  my_res = model_evaluation(train_df_y,my_y_predicted,train_df_y.shape[0],train_df_x.shape[1])
  # print(train_df_y.shape[0],train_df_x.shape[1])
  # print("--------------")
  # print(my_res)
  # print("--------------")
  lasso_results.append(my_res)

mean_mse = 0
mean_mAe = 0
mean_root_square_error = 0
mean_r2 = 0
mean_r2_adj = 0
for result in lasso_results:
  mean_mse += result[0]
  mean_mAe += result[1]
  mean_root_square_error = result[2]
  mean_r2 += result[3]
  mean_r2_adj += result[4]

mean_mse /= len(ridge_results)
mean_mAe /= len(ridge_results)
mean_root_square_error /= len(ridge_results)
mean_r2 /= len(ridge_results)
mean_r2_adj /= len(ridge_results)

print("MSE:",mean_mse)
print("Mean AbsoluteE:",mean_mAe)
print("mean_root_square_error: ",mean_root_square_error)
print("r2:",mean_r2)
print("r2_adj:",mean_r2_adj)

del tmp_sc, train_df_x, train_df_y, test_df_x,  test_df_y , my_lasso_model, my_y_predicted, my_res, mean_mse, mean_mAe,mean_r2,mean_r2_adj

Mean squared error:  17.814230121898618
Mean absolute error:  3.1370521780775253
Root mean squared error:  4.220690716209685
R2 score:  0.0
R2 adjusted score:  -0.00014402166085769608
          y     y_pred
0    9.9820  10.280538
1   10.3523  10.280538
2   11.2056  10.280538
3    9.9820  10.280538
4   10.3523  10.280538
5    9.9820  10.280538
6    9.9820  10.280538
7   10.3523  10.280538
8    9.9820  10.280538
9    9.9820  10.280538
10  10.3523  10.280538
11   9.9820  10.280538
12  15.8263  10.280538
13  14.9569  10.280538
14   9.9820  10.280538
15   6.2951  10.280538
16  14.1680  10.280538
17   6.2951  10.280538
18   6.2951  10.280538
19   6.8425  10.280538
Mean squared error:  17.844877033035512
Mean absolute error:  3.1556347938495377
Root mean squared error:  4.224319712454955
R2 score:  0.0
R2 adjusted score:  -0.00014402166085769608
          y     y_pred
0   15.8263  10.276835
1   15.8263  10.276835
2   14.9569  10.276835
3   15.8263  10.276835
4   15.8263  10.276835
5   14.9569  10.276835
6    9.9820  10.276835
7    9.9820  10.276835
8    9.9820  10.276835
9    9.9820  10.276835
10  11.2056  10.276835
11  11.4471  10.276835
12  11.2700  10.276835
13  11.2700  10.276835
14  11.4471  10.276835
15  11.2700  10.276835
16  11.2700  10.276835
17  11.4471  10.276835
18  11.2056  10.276835
19  11.2056  10.276835
Mean squared error:  17.271173813038306
Mean absolute error:  3.1006709860106625
Root mean squared error:  4.155860177272366
R2 score:  0.0
R2 adjusted score:  -0.00014401958666376835
          y     y_pred
0   15.8263  10.474373
1   15.8263  10.474373
2   14.9569  10.474373
3   15.8263  10.474373
4   15.8263  10.474373
5   14.9569  10.474373
6    9.9820  10.474373
7    9.9820  10.474373
8    9.9820  10.474373
9    9.9820  10.474373
10  11.2056  10.474373
11  11.4471  10.474373
12  11.2700  10.474373
13  11.2700  10.474373
14  11.4471  10.474373
15  11.2700  10.474373
16  11.2700  10.474373
17  11.4471  10.474373
18  11.2056  10.474373
19  11.2056  10.474373
Mean squared error:  17.309572317712913
Mean absolute error:  3.0630281759525784
Root mean squared error:  4.160477414638002
R2 score:  0.0
R2 adjusted score:  -0.00014401958666376835
          y     y_pred
0   15.8263  10.166142
1   15.8263  10.166142
2   14.9569  10.166142
3   15.8263  10.166142
4   15.8263  10.166142
5   14.9569  10.166142
6    9.9820  10.166142
7    9.9820  10.166142
8    9.9820  10.166142
9    9.9820  10.166142
10  11.2056  10.166142
11  11.4471  10.166142
12  11.2700  10.166142
13  11.2700  10.166142
14  11.4471  10.166142
15  11.2700  10.166142
16  11.2700  10.166142
17  11.4471  10.166142
18  11.2056  10.166142
19  11.2056  10.166142
Mean squared error:  15.262858728146083
Mean absolute error:  2.7558326699385938
Root mean squared error:  3.906770882473924
R2 score:  0.0
R2 adjusted score:  -0.00014401958666376835
          y     y_pred
0   15.8263  10.009671
1   15.8263  10.009671
2   14.9569  10.009671
3   15.8263  10.009671
4   15.8263  10.009671
5   14.9569  10.009671
6    9.9820  10.009671
7    9.9820  10.009671
8    9.9820  10.009671
9    9.9820  10.009671
10  11.2056  10.009671
11  11.4471  10.009671
12  11.2700  10.009671
13  11.2700  10.009671
14  11.4471  10.009671
15  11.2700  10.009671
16  11.2700  10.009671
17  11.4471  10.009671
18  11.2056  10.009671
19  11.2056  10.009671
MSE: 17.100542402766287
Mean AbsoluteE: 3.0424437607657793
mean_root_square_error:  0.7813541764947848
r2: 0.0
r2_adj: -0.00014402041634133944

# TODO
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


# print(numeric_feats)

train_df_x = X_train
test_df_x = X_test
train_df_y = y_train
test_df_y = y_test
# print(train_index.shape)
# print(train_df)
# print(test_df)

tmp_sc = StandardScaler()
tmp_sc.fit(train_df_x[numeric_feats])
# print(tmp_sc.mean_)

train_df_x[numeric_feats] = tmp_sc.transform(train_df_x[numeric_feats])
test_df_x[numeric_feats] = tmp_sc.transform(test_df_x[numeric_feats])
# print(train_df_x[numeric_feats])

my_ridge_model = Ridge(alpha=5, fit_intercept=True)
my_ridge_model.fit(train_df_x, train_df_y)
# print(my_ridge_model.coef_)

my_y_predicted = my_ridge_model.predict(train_df_x)
# print(my_y_predicted.shape)
# print("--------------")
# print(train_df_y.shape)
# print("--------------")

my_res = model_evaluation(train_df_y,my_y_predicted,train_df_y.shape[0],train_df_x.shape[1])
# print(train_df_y.shape[0],train_df_x.shape[1])
# print("--------------")
print(my_res)
print("--------------")

my_y_predicted = my_ridge_model.predict(test_df_x)
# print(my_y_predicted.shape)
# print("--------------")
# print(train_df_y.shape)
# print("--------------")

my_res = model_evaluation(test_df_y,my_y_predicted,test_df_y.shape[0],test_df_x.shape[1])
# print(train_df_y.shape[0],train_df_x.shape[1])
# print("--------------")
print(my_res)
print("--------------")

del tmp_sc, train_df_x, train_df_y, test_df_x,  test_df_y , my_ridge_model, my_y_predicted, my_res

Mean squared error:  13.878578644356596
Mean absolute error:  3.0340208667273254
Root mean squared error:  3.7253964412336837
R2 score:  0.20853077845901935
R2 adjusted score:  0.20843959116680066
          y     y_pred
0   10.2557  11.696899
1   15.8263  11.042516
2   11.2700  13.786124
3   11.2700  10.471661
4    9.9820   9.913415
5    3.3327   9.250926
6   15.8263  11.733761
7   15.5526  10.730600
8    1.4812   6.762791
9    9.9820   9.882093
10   6.7781   9.104298
11   7.9051   9.225040
12  15.5526  14.171902
13   5.5706   8.885414
14  16.1000  12.227292
15   0.1610   5.931609
16   0.1610   5.967857
17   6.1180   7.114873
18  11.2700  11.389195
19  16.1000  10.850203
(np.float64(13.878578644356596), np.float64(3.0340208667273254), np.float64(3.7253964412336837), np.float64(0.20853077845901935), np.float64(0.20843959116680066))
--------------
Mean squared error:  14.020800010360787
Mean absolute error:  3.055845320516795
Root mean squared error:  3.7444358734475327
R2 score:  0.21754979634813032
R2 adjusted score:  0.2167377048030843
          y     y_pred
0   15.5526  12.945527
1    9.9820   9.740490
2    9.6278  10.706926
3    8.0500  10.252578
4    7.5509   6.344797
5   13.7977   9.030272
6    9.9820  13.091452
7   11.2700  10.721259
8    4.5080   6.938428
9    8.4203   9.517242
10   8.0983  10.340601
11  10.3523  11.267687
12   3.4937   5.366064
13   9.9820  10.034060
14  16.1000  12.207746
15   0.0000   7.806896
16   8.0500   9.695141
17   9.9820  11.428267
18   0.5313   9.256618
19  11.2700   9.727803
(np.float64(14.020800010360787), np.float64(3.055845320516795), np.float64(3.7444358734475327), np.float64(0.21754979634813032), np.float64(0.2167377048030843))
--------------

# TODO
## done in the above one

from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline

s = StandardScaler()
model_ridge = Ridge(alpha=5)
cv = KFold(n_splits = 5, random_state = 42, shuffle=True)

pipeline = Pipeline([('scaler', s), ('LinReg', model_ridge)])
scores = cross_val_score(pipeline, X_train, y_train, cv = cv, scoring='r2')

print(f'r2: {scores}')

r2: [0.21452231 0.21133761 0.20963071 0.20078304 0.20420529]

# podela skupa na trening (90%) i test (10%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# TODO
from sklearn.feature_selection import SequentialFeatureSelector

pipeline = Pipeline([('scaler', s), ('LinReg', LinearRegression())])
cv = KFold(n_splits = 5, random_state = 42, shuffle=True)

sfs = SequentialFeatureSelector(pipeline, n_features_to_select = 2, direction='forward', scoring='r2', cv=cv)
sfs.fit(X_train, y_train)
forward_features = sfs.get_feature_names_out()
print(f"odabrana obelezja, forward: {forward_features}, score: {cross_val_score(pipeline, X_train[forward_features], y_train, cv = cv, scoring='r2').mean()}")

sfs = SequentialFeatureSelector(pipeline, n_features_to_select = 2, direction='backward', scoring='r2', cv=cv)
sfs.fit(X_train, y_train)
backward_features = sfs.get_feature_names_out()
print(f"odabrana obelezja, backward: {backward_features}, score: {cross_val_score(pipeline, X_train[backward_features], y_train, cv = cv, scoring='r2').mean()}")

odabrana obelezja, forward: ['Temperature (C)' 'Humidity'], score: 0.17749488981647626

# standardizacija spram trening seta
s = StandardScaler()
s.fit(X_train[numeric_feats])
X_train[numeric_feats] = s.transform(X_train[numeric_feats])
X_test[numeric_feats] = s.transform(X_test[numeric_feats])

model.fit(X_train[backward_features], y_train)
y_predicted = model.predict(X_test[backward_features])
model_evaluation(y_test, y_predicted, X_train[backward_features].shape[0], X_train[backward_features].shape[1])

plt.figure(figsize=(10,5))
plt.bar(model.feature_names_in_,model.coef_, label='osnovna hipoteza')
plt.legend()
plt.xticks(rotation=45, ha="right")
print(f'coef: {model.coef_}')
print(f'intercept: {model.intercept_}')
plt.show()

corr = X_train[numeric_feats].corr()
f = plt.figure(figsize=(12, 9))
sb.heatmap(corr.abs(), annot=True);

# podela skupa na trening (80%), val(10%) i test (10%)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# standardizacija spram trening seta
s = StandardScaler()
s.fit(X_train[numeric_feats])
X_train[numeric_feats] = s.transform(X_train[numeric_feats])
X_val[numeric_feats] = s.transform(X_val[numeric_feats])
X_test[numeric_feats] = s.transform(X_test[numeric_feats])

# TODO
## PolynomialFeatures(degree=2,interaction_only=True)
## interaction_only=True ne generise x_i^2 nego samo x_i*x_j
## include_bias=True dodaj bias tj. slobodni clan

from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

poly_feat1 = PolynomialFeatures(degree=2,interaction_only=True, include_bias=True)
X_train_poly_feat1 = poly_feat1.fit_transform(X_train)
print(X_train_poly_feat1.shape)

# TODO
poly_feat1.get_feature_names_out()

# TODO
## should I put the scaler?
my_pipeline1 = Pipeline([
    # ('scaler',s),
    ('polyFeat',poly_feat1),
    ('LinReg', LinearRegression()) ])
my_pipeline1.fit(X_train, y_train)
y_pred1 = my_pipeline1.predict(X_val)
print(my_pipeline1['LinReg'].coef_.shape)
print(y_pred1)
# del y_pred1
# X_train[numeric_feats] = s.transform(X_train[numeric_feats])
# X_val[numeric_feats] = s.transform(X_val[numeric_feats])
# X_test[numeric_feats] = s.transform(X_test[numeric_feats])

# TODO
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

poly_feat2 = PolynomialFeatures(degree=2,interaction_only=False, include_bias=True)
X_train_poly_feat2 = poly_feat2.fit_transform(X_train)
print(X_train_poly_feat2.shape)

# poly_feat2.get_feature_names_out()

## should I put the scaler?
my_pipeline2 = Pipeline([
    # ('scaler',s),
    ('polyFeat',poly_feat2 ),
    ('LinReg', LinearRegression()) ])

my_pipeline2.fit(X_train, y_train)
y_pred2 = my_pipeline2.predict(X_val)
print(my_pipeline2['LinReg'].coef_.shape)
print(y_pred2)
# del y_pred2
# X_train[numeric_feats] = s.transform(X_train[numeric_feats])
# X_val[numeric_feats] = s.transform(X_val[numeric_feats])
# X_test[numeric_feats] = s.transform(X_test[numeric_feats])

# TODO
# TODO
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

poly_feat3 = PolynomialFeatures(degree=3,interaction_only=False, include_bias=True)
X_train_poly_feat3 = poly_feat3.fit_transform(X_train)
print(X_train_poly_feat3.shape)

# poly_feat3.get_feature_names_out()

## should I put the scaler?
my_pipeline3 = Pipeline([
    # ('scaler',s),
    ('polyFeat',poly_feat3 ),
    ('LinReg', LinearRegression()) ])

my_pipeline3.fit(X_train, y_train)
y_pred3 = my_pipeline3.predict(X_val)
print(my_pipeline3['LinReg'].coef_.shape)
print(y_pred3)
# del y_pred
# X_train[numeric_feats] = s.transform(X_train[numeric_feats])
# X_val[numeric_feats] = s.transform(X_val[numeric_feats])
# X_test[numeric_feats] = s.transform(X_test[numeric_feats])

# TODO
# TODO
from sklearn.feature_selection import SequentialFeatureSelector

# cv = KFold(n_splits = 5, random_state = 42, shuffle=True)
# # print(f"model1 r2_score: {cross_val_score(pipeline, X_train[forward_features], y_train, cv = cv, scoring='r2').mean()}")
# print(f"model1 r2_score: {cross_val_score(my_pipeline1, X_train_poly_feat1, y_train, cv = cv, scoring='r2').mean()}")
# print(f"model2 r2_score: {cross_val_score(my_pipeline2, X_train_poly_feat2, y_train, cv = cv, scoring='r2').mean()}")
# print(f"model3 r2_score: {cross_val_score(my_pipeline3, X_train_poly_feat3, y_train, cv = cv, scoring='r2').mean()}")

# print(f"model3 r2_score: {cross_val_score(my_pipeline3, X_train_poly_feat3, y_train, cv = cv, scoring='r2').mean()}")

##
X_val_poly_feat1 = poly_feat1.transform(X_val)
X_val_poly_feat2 = poly_feat2.transform(X_val)
X_val_poly_feat3 = poly_feat3.transform(X_val)

y_val_poly_feat1_predicted = my_pipeline1.predict(X_val)
y_val_poly_feat2_predicted = my_pipeline2.predict(X_val)
y_val_poly_feat3_predicted = my_pipeline3.predict(X_val)
# y_val_poly_feat3_predicted = my_pipeline3.predict(X_val_poly_feat3)

print(X_val_poly_feat1.shape)
print(X_val_poly_feat2.shape)
print(X_val_poly_feat3.shape)
# print(my_pipeline1['linReg'].coef_.shape[0])
model_evaluation(y_test=y_val, y_predicted=y_val_poly_feat1_predicted, N=X_val_poly_feat1.shape[0],  d=X_val_poly_feat1.shape[1])
# Mean squared error:  11.629556853227895
# Mean absolute error:  2.796682534744267
# Root mean squared error:  3.410213608152412
# R2 score:  0.3520771740013281
# R2 adjusted score:  0.34829289383279183
print("--------------------------")
model_evaluation(y_test=y_val, y_predicted=y_val_poly_feat2_predicted, N=X_val_poly_feat2.shape[0],  d=X_val_poly_feat2.shape[1])
# Mean squared error:  10.718202047577192
# Mean absolute error:  2.7256805994361852
# Root mean squared error:  3.273866528674801
# R2 score:  0.40285190158699524
# R2 adjusted score:  0.3987370786077451
print("--------------------------")
model_evaluation(y_test=y_val, y_predicted=y_val_poly_feat3_predicted, N=X_val_poly_feat3.shape[0],  d=X_val_poly_feat3.shape[1])
# Mean squared error:  9.830574762996207
# Mean absolute error:  2.594140518624325
# Root mean squared error:  3.135374740441118
# R2 score:  0.4523046869267574
# R2 adjusted score:  0.4355659757129353
print("--------------------------")
# X_train[numeric_feats] = s.transform(X_train[numeric_feats])
# X_val[numeric_feats] = s.transform(X_val[numeric_feats])
# X_test[numeric_feats] = s.transform(X_test[numeric_feats])

# TODO
# print(X_train.corr())
sb.heatmap(X_train.corr(),annot=True,fmt=".2f")
plt.show()
# print(X_train.columns)
# 'Temperature (C)','Humidity','Precip Type_snow','Precip Type_rain','Pressure (millibars)','Apparent Temperature (C)'
# kolone bez |corr| > 0.3: 'Wind bearing cat_N', 'Wind Speed (km/h)',

# sb.heatmap(X.corr(),annot=True,fmt=".2f")
# plt.show()
# print(X.columns)

# TODO
# from sklearn.feature_selection import SequentialFeatureSelector

# pipeline = Pipeline([('scaler', s), ('LinReg', LinearRegression())])
# cv = KFold(n_splits = 5, random_state = 42, shuffle=True)

# sfs = SequentialFeatureSelector(pipeline, n_features_to_select = 2, direction='forward', scoring='r2', cv=cv)
# sfs.fit(X_train, y_train)
# forward_features = sfs.get_feature_names_out()
# print(f"odabrana obelezja, forward: {forward_features}, score: {cross_val_score(pipeline, X_train[forward_features], y_train, cv = cv, scoring='r2').mean()}")

# sfs = SequentialFeatureSelector(pipeline, n_features_to_select = 2, direction='backward', scoring='r2', cv=cv)
# sfs.fit(X_train, y_train)
# backward_features = sfs.get_feature_names_out()
# print(f"odabrana obelezja, backward: {backward_features}, score: {cross_val_score(pipeline, X_train[backward_features], y_train, cv = cv, scoring='r2').mean()}")

# TODO
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

# s = StandardScaler()
# s.fit(X_train[numeric_feats])
# X_train[numeric_feats] = s.transform(X_train[numeric_feats])
# X_val[numeric_feats] = s.transform(X_val[numeric_feats])
# X_test[numeric_feats] = s.transform(X_test[numeric_feats])

# pipeline = Pipeline([
#     ('poly', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)),
#     ('select', SelectKBest(score_func=f_regression, k=10)),
#     ('ridge', Ridge(alpha=1.0))
# ])
my_tmp_pipeline = Pipeline([('scaler', s), ('LinReg', LinearRegression())])
my_tmp_sfs = SequentialFeatureSelector(pipeline, n_features_to_select = 9, direction='forward', scoring='r2') ## , cv=cv
my_tmp_sfs.fit(X_train, y_train)

## n_features_to_select = 9: 'Wind Speed (km/h)' was the 1st to go

print(my_tmp_sfs.get_feature_names_out())
# print(X_train.columns)  ## 'Temperature (C)', 'Apparent Temperature (C)', 'Humidity','Wind Speed (km/h)', 'Pressure (millibars)', 'Precip Type_rain','Precip Type_snow', 'Wind bearing cat_N', 'Wind bearing cat_S','Wind bearing cat_W'
# print(len(X_train.columns)) ## 10

del my_tmp_pipeline, my_tmp_sfs

!jupyter nbconvert --to html "/content/drive/MyDrive/Colab Notebooks/Linearna_Regresija_moja_rjesenja.ipynb"

	Temperature (C)	Apparent Temperature (C)	Humidity	Wind Speed (km/h)	Wind Bearing (degrees)	Visibility (km)	Loud Cover	Pressure (millibars)
count	96453.000000	96453.000000	96453.000000	96453.000000	96453.000000	96453.000000	96453.0	96453.000000
mean	11.932678	10.855029	0.734899	10.810640	187.509232	10.347325	0.0	1003.235956
std	9.551546	10.696847	0.195473	6.913571	107.383428	4.192123	0.0	116.969906
min	-21.822222	-27.716667	0.000000	0.000000	0.000000	0.000000	0.0	0.000000
25%	4.688889	2.311111	0.600000	5.828200	116.000000	8.339800	0.0	1011.900000
50%	12.000000	12.000000	0.780000	9.965900	180.000000	10.046400	0.0	1016.450000
75%	18.838889	18.838889	0.890000	14.135800	290.000000	14.812000	0.0	1021.090000
max	39.905556	39.344444	1.000000	63.852600	359.000000	16.100000	0.0	1046.380000

	Precip Type	Temperature (C)	Apparent Temperature (C)	Humidity	Wind Speed (km/h)	Wind Bearing (degrees)	Pressure (millibars)
0	rain	9.472222	7.388889	0.89	14.1197	251.0	1015.13
1	rain	9.355556	7.227778	0.86	14.2646	259.0	1015.63
2	rain	9.377778	9.377778	0.89	3.9284	204.0	1015.94
3	rain	8.288889	5.944444	0.83	14.1036	269.0	1016.41
4	rain	8.755556	6.977778	0.83	11.0446	259.0	1016.51

	Temperature (C)	Apparent Temperature (C)	Humidity	Wind Speed (km/h)	Wind Bearing (degrees)	Pressure (millibars)	Precip Type_rain	Precip Type_snow
0	9.472222	7.388889	0.89	14.1197	251.0	1015.13	True	False
1	9.355556	7.227778	0.86	14.2646	259.0	1015.63	True	False
2	9.377778	9.377778	0.89	3.9284	204.0	1015.94	True	False
3	8.288889	5.944444	0.83	14.1036	269.0	1016.41	True	False
4	8.755556	6.977778	0.83	11.0446	259.0	1016.51	True	False
...	...	...	...	...	...	...	...	...
96448	26.016667	26.016667	0.43	10.9963	31.0	1014.36	True	False
96449	24.583333	24.583333	0.48	10.0947	20.0	1015.16	True	False
96450	22.038889	22.038889	0.56	8.9838	30.0	1015.66	True	False
96451	21.522222	21.522222	0.60	10.5294	20.0	1015.95	True	False
96452	20.438889	20.438889	0.61	5.8765	39.0	1016.16	True	False

	Temperature (C)	Apparent Temperature (C)	Humidity	Wind Speed (km/h)	Wind Bearing (degrees)	Pressure (millibars)
count	96453.000000	96453.000000	96453.000000	96453.000000	96453.000000	96453.000000
mean	11.932678	10.855029	0.734899	10.810640	187.509232	1003.235956
std	9.551546	10.696847	0.195473	6.913571	107.383428	116.969906
min	-21.822222	-27.716667	0.000000	0.000000	0.000000	0.000000
25%	4.688889	2.311111	0.600000	5.828200	116.000000	1011.900000
50%	12.000000	12.000000	0.780000	9.965900	180.000000	1016.450000
75%	18.838889	18.838889	0.890000	14.135800	290.000000	1021.090000
max	39.905556	39.344444	1.000000	63.852600	359.000000	1046.380000

	Temperature (C)	Apparent Temperature (C)	Humidity	Wind Speed (km/h)	Wind Bearing (degrees)	Pressure (millibars)
count	96453.000000	96453.000000	96453.000000	96453.000000	96453.000000	96453.000000
mean	11.932678	10.855029	0.734899	10.810640	187.509232	1016.817525
std	9.551546	10.696847	0.195473	6.913571	107.383428	7.779863
min	-21.822222	-27.716667	0.000000	0.000000	0.000000	973.780000
25%	4.688889	2.311111	0.600000	5.828200	116.000000	1012.120000
50%	12.000000	12.000000	0.780000	9.965900	180.000000	1016.560000
75%	18.838889	18.838889	0.890000	14.135800	290.000000	1021.170000
max	39.905556	39.344444	1.000000	63.852600	359.000000	1046.380000

Linearna regresija¶

Upoznavanje baze podataka¶

Normalizacija obeležja¶

Mere uspešnosti regresora¶

Obuka modela linearne regresije¶

I model¶

I model - standardizacija¶

I model - regularizacija¶

Ridge¶

Lasso¶

I model - trostruka podela podataka¶

I model - unakrsna validacija¶

Selekcija obeležja¶

II model¶

Isprobavanje različitih hipoteza¶

III model - hipoteza sa interakcijama drugog stepena (bez kvadrata)¶

III model - hipoteza interakcijama drugog stepena¶

III model - hipoteza sa interakcijama trećeg stepena¶

III model - naša hipoteza¶

	Formatted Date	Summary	Precip Type	Temperature (C)	Apparent Temperature (C)	Humidity	Wind Speed (km/h)	Wind Bearing (degrees)	Visibility (km)	Pressure (millibars)	Daily Summary
0	2006-04-01 00:00:00.000 +0200	Partly Cloudy	rain	9.472222	7.388889	0.89	14.1197	251.0	15.8263	1015.13	Partly cloudy throughout the day.
1	2006-04-01 01:00:00.000 +0200	Partly Cloudy	rain	9.355556	7.227778	0.86	14.2646	259.0	15.8263	1015.63	Partly cloudy throughout the day.
2	2006-04-01 02:00:00.000 +0200	Mostly Cloudy	rain	9.377778	9.377778	0.89	3.9284	204.0	14.9569	1015.94	Partly cloudy throughout the day.
3	2006-04-01 03:00:00.000 +0200	Partly Cloudy	rain	8.288889	5.944444	0.83	14.1036	269.0	15.8263	1016.41	Partly cloudy throughout the day.
4	2006-04-01 04:00:00.000 +0200	Mostly Cloudy	rain	8.755556	6.977778	0.83	11.0446	259.0	15.8263	1016.51	Partly cloudy throughout the day.

	Visibility (km)
count	9.645300e+04
mean	-2.357352e-18
std	1.000005e+00
min	-2.468291e+00
25%	-4.788827e-01
50%	-7.178379e-02
75%	1.065021e+00
max	1.372265e+00

Karakteristika	Standardizacija	Normalizacija
Opseg	Može biti negativan / neograničen	[0, 1] (ili [a, b])
Srednja vrednost	0	Ne mora biti 0
Standardna devijacija	1	Nije relevantna
Pogodno za	Linearne modele, PCA, SVM	k-NN, neuronske mreže, slike
Osetljivost na outliere	Manje osetljiva	Više osetljiva

	Temperature (C)	Apparent Temperature (C)	Humidity	Wind Speed (km/h)	Pressure (millibars)	Precip Type_rain	Precip Type_snow	Wind bearing cat_N	Wind bearing cat_S	Wind bearing cat_W
43345	0.366321	0.427717	1.153114	0.007331	-0.234906	True	False	False	False	False
66832	-0.620241	-0.911529	-0.331279	3.275592	-0.865304	True	False	False	True	False
92142	0.659843	0.689845	1.204300	-0.734188	-0.843433	True	False	False	True	False
24092	-0.179957	-0.060133	-0.382464	-1.096811	0.215378	True	False	False	True	False
35372	-0.418735	-0.499094	0.129395	0.460610	0.010820	True	False	False	True	False
...	...	...	...	...	...	...	...	...	...	...
13309	2.002242	1.860059	-1.815671	0.507101	-0.959220	True	False	False	False	True
9459	-0.384957	-0.323301	0.129395	-0.652830	0.317013	True	False	False	True	False
18571	1.875282	1.645260	-2.327531	-0.160034	-0.556537	True	False	False	False	False
7862	-0.310412	-0.414838	-1.713299	0.885996	0.365901	True	False	False	False	False
86561	-0.736136	-0.866281	1.050742	0.653545	-0.039354	True	False	False	False	False