Creating a correlation matrix (for the dataframe "df")
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sb

# Correlation matrix of the numeric columns, rounded to 2 decimals
corr_matrix = df.corr(numeric_only=True).round(2)

# Heatmap of the correlation matrix computed above
plt.figure(figsize=(10, 8))
sb.heatmap(corr_matrix, annot=True,
           cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation matrix")
plt.tight_layout()
plt.show()
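To read the matrix more easily, the sketch below lists each column's correlation with the target, sorted by absolute value. This is only a sketch: it assumes medv is the target column (as in the VIF section further down) and pandas ≥ 1.1 for the key argument of sort_values.
In [ ]:
# Correlations with the (assumed) target medv, strongest first by absolute value
print(corr_matrix['medv']
      .drop('medv')
      .sort_values(key=abs, ascending=False))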
Diagnostic plots (for the model "model_sm")
In [ ]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
# Residuals vs Fitted
plt.scatter(model_sm.fittedvalues, model_sm.resid)
plt.axhline(0, color='gray', linestyle='--')
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
plt.title("Residuals vs Fitted")
plt.show()
# Normal Q-Q (wider figure so the reference line appears less steep)
sm.qqplot(model_sm.resid, line='45', fit=True)
plt.gcf().set_size_inches(8, 4)
plt.title("Normal Q-Q (adjusted aspect)")
plt.show()
# Scale-Location (spread of standardized residuals vs fitted values; checks homoscedasticity)
influence = model_sm.get_influence()
std_resid = influence.resid_studentized_internal
abs_sqrt_resid = np.sqrt(np.abs(std_resid))
plt.scatter(model_sm.fittedvalues, abs_sqrt_resid)
plt.axhline(np.mean(abs_sqrt_resid), color='gray', linestyle='--')
plt.xlabel("Fitted values")
plt.ylabel("√|Standardized Residuals|")
plt.title("Scale-Location")
plt.show()
# Residuals vs Leverage, with point size proportional to Cook's distance
leverage = influence.hat_matrix_diag
stud_resid = influence.resid_studentized_external
cooks_d = influence.cooks_distance[0]
n = len(model_sm.model.endog)
# Rule-of-thumb Cook's distance cutoff (4/n), drawn below as a vertical reference line
thresh = 4 / n
plt.scatter(leverage, stud_resid, s=1000 * cooks_d, alpha=0.5)
plt.axhline(0, color='gray', linestyle='--')
plt.axhline(2, color='red', linestyle='--')   # |studentized residual| > 2 is suspicious
plt.axhline(-2, color='red', linestyle='--')
plt.axvline(thresh, color='red', linestyle='--')
plt.text(thresh, plt.ylim()[1] * 0.9, f"Cook's D ≈ {thresh:.3f}", color='red')
plt.xlabel("Leverage")
plt.ylabel("Studentized Residuals")
plt.title("Residuals vs Leverage")
plt.show()
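The visual checks can be complemented numerically. A minimal sketch (assuming model_sm is a fitted statsmodels OLS results object, as above) that lists observations exceeding the 4/n Cook's distance cutoff, runs a Breusch-Pagan test for heteroscedasticity, and reports the Durbin-Watson statistic:
In [ ]:
import numpy as np
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson

# Observations whose Cook's distance exceeds the 4/n rule of thumb
influential = np.where(cooks_d > thresh)[0]
print("Influential observations (Cook's D > 4/n):", influential)

# Breusch-Pagan: H0 = homoscedastic residuals
bp_stat, bp_pvalue, _, _ = het_breuschpagan(model_sm.resid, model_sm.model.exog)
print(f"Breusch-Pagan p-value: {bp_pvalue:.4f}")

# Durbin-Watson: values near 2 suggest no first-order autocorrelation
print(f"Durbin-Watson: {durbin_watson(model_sm.resid):.2f}")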
VIF (for the train set train_df)
In [ ]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
import statsmodels.api as sm
import pandas as pd
# Prepare X: all predictors from train_df plus a constant. "Predictors" here means
# the variables used when fitting the model. Since we are looking at lm3, we take
# every variable from the train set except the response medv.
X = sm.add_constant(train_df.drop(columns='medv'))

# Compute VIF for each predictor and take the square root (√VIF)
# √VIF above 2 - potential collinearity (the threshold is a matter of choice)
print("√VIF with all predictors:")
print((pd.Series([vif(X.values, i) for i in range(X.shape[1])],
                 index=X.columns) ** 0.5).sort_values(ascending=False))
# Recompute VIF after dropping 'tax'
X_no_tax = sm.add_constant(train_df.drop(columns=['medv', 'tax']))

# √VIF again, without 'tax'
print("√VIF without 'tax':")
print((pd.Series([vif(X_no_tax.values, i) for i in range(X_no_tax.shape[1])],
                 index=X_no_tax.columns) ** 0.5).sort_values(ascending=False))
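For reference, variance_inflation_factor computes VIF_j = 1 / (1 − R_j²), where R_j² is the R² from regressing predictor j on all the other predictors. A minimal sketch verifying this for a single predictor ('tax' is chosen purely as an illustration):
In [ ]:
import statsmodels.api as sm

# Auxiliary regression of 'tax' on the remaining predictors (with a constant)
others = sm.add_constant(train_df.drop(columns=['medv', 'tax']))
aux = sm.OLS(train_df['tax'], others).fit()

# VIF for 'tax' from the definition; should match variance_inflation_factor above
vif_tax = 1 / (1 - aux.rsquared)
print(f"VIF('tax') = {vif_tax:.2f}, √VIF = {vif_tax ** 0.5:.2f}")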