# End-to-End: California Housing
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso
)
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import (
    train_test_split, cross_val_score
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Load the California Housing data: 20,640 samples (census block groups,
# not individual houses) with 8 numeric features. The target is the median
# house value of each block group, expressed in units of $100,000.
X, y = fetch_california_housing(return_X_y=True)

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize with statistics learned from the training split only, then
# apply the same transform to the test split, so no test-set information
# reaches preprocessing.
scaler = StandardScaler()
X_tr_s = scaler.fit_transform(X_tr)
X_te_s = scaler.transform(X_te)

# Compare three models
models = {
    "OLS": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.01),
}

for name, m in models.items():
    # Held-out performance: fit on the scaled training data, score on the
    # scaled test data. RMSE is in target units ($100,000).
    m.fit(X_tr_s, y_tr)
    y_pred = m.predict(X_te_s)
    r2 = r2_score(y_te, y_pred)
    rmse = np.sqrt(mean_squared_error(y_te, y_pred))

    # 5-fold CV on the *raw* training data with the scaler inside a
    # pipeline, so scaling statistics are re-estimated on each fold's
    # training part. Cross-validating the pre-scaled X_tr_s would let every
    # validation fold influence the mean/variance used to scale it.
    # cross_val_score clones the estimator, so the fitted `m` is untouched.
    cv = cross_val_score(
        make_pipeline(StandardScaler(), m), X_tr, y_tr,
        cv=5, scoring="r2",
    )

    print(f"{name:6s} R²={r2:.3f} RMSE={rmse:.3f} "
          f"CV R²={cv.mean():.3f}±{cv.std():.3f}")