Model Evaluation
Choosing the right evaluation metrics and validation methods is critical to machine learning projects.
Classification Metrics
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
classification_report,
confusion_matrix,
roc_auc_score,
roc_curve,
)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Prepare data
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
# Basic metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_test, y_prob):.4f}")
# Detailed report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["Malignant", "Benign"]))
Metric Interpretation
Accuracy = (TP + TN) / Total
→ Can be misleading with class imbalance
(a model that predicts all "positive" on 99%-positive data scores 99% accuracy while learning nothing)
Precision = TP / (TP + FP)
→ "Among positive predictions, what fraction is actually positive"
→ Spam filter: don't misclassify legitimate emails as spam → Precision matters
Recall = TP / (TP + FN)
→ "Among actual positives, what fraction was correctly predicted"
→ Cancer diagnosis: can't miss actual patients → Recall matters
F1 = 2 × Precision × Recall / (Precision + Recall)
→ Harmonic mean of precision and recall
AUC-ROC: ranges from 0.5 (random) to 1.0 (perfect)
→ Evaluates overall model performance regardless of threshold
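To ground these formulas, the sketch below derives precision and recall by hand from the confusion-matrix counts of the model above, then lowers the decision threshold to show the precision/recall trade-off (the 0.3 threshold is an arbitrary illustrative choice):

# Derive the metrics by hand from the confusion-matrix counts
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f"Precision (manual): {tp / (tp + fp):.4f}")  # matches precision_score
print(f"Recall (manual): {tp / (tp + fn):.4f}")  # matches recall_score
# Lowering the threshold from the default 0.5 raises recall at the cost of precision
y_pred_low = (y_prob >= 0.3).astype(int)
print(f"Precision @0.3: {precision_score(y_test, y_pred_low):.4f}")
print(f"Recall @0.3: {recall_score(y_test, y_pred_low):.4f}")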
Confusion Matrix Visualization
def plot_confusion_matrix(y_true, y_pred, class_names=None):
cm = confusion_matrix(y_true, y_pred)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Absolute values
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
ax=axes[0], xticklabels=class_names, yticklabels=class_names)
axes[0].set_title("Confusion Matrix (Absolute)")
axes[0].set_ylabel("True Class")
axes[0].set_xlabel("Predicted Class")
# Normalized (ratios)
cm_norm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
sns.heatmap(cm_norm, annot=True, fmt=".2f", cmap="Blues",
ax=axes[1], xticklabels=class_names, yticklabels=class_names)
axes[1].set_title("Confusion Matrix (Normalized)")
axes[1].set_ylabel("True Class")
axes[1].set_xlabel("Predicted Class")
plt.tight_layout()
plt.show()
plot_confusion_matrix(y_test, y_pred, class_names=["Malignant", "Benign"])
ROC Curve
def plot_roc_curve(models_dict, X_test, y_test):
"""Compare ROC curves for multiple models"""
plt.figure(figsize=(8, 6))
for name, model in models_dict.items():
y_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)
plt.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")
plt.plot([0, 1], [0, 1], "k--", label="Random (AUC=0.5)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")
plt.title("ROC Curve Comparison")
plt.legend()
plt.grid(True)
plt.show()
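The function above is defined but not called; a minimal usage sketch, assuming a scaled logistic-regression baseline trained on the same split (the LogisticRegression pipeline is our addition for comparison, not part of the section above):

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Scale features so logistic regression converges reliably
logreg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
logreg.fit(X_train, y_train)
plot_roc_curve({"Random Forest": model, "Logistic Regression": logreg}, X_test, y_test)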
Regression Metrics
from sklearn.metrics import (
mean_absolute_error, # MAE
mean_squared_error, # MSE
root_mean_squared_error, # RMSE (requires scikit-learn >= 1.4)
r2_score, # R² (coefficient of determination)
mean_absolute_percentage_error, # MAPE
)
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
print(f"RMSE: {root_mean_squared_error(y_test, y_pred):.4f}")
print(f"R²: {r2_score(y_test, y_pred):.4f}")
print(f"MAPE: {mean_absolute_percentage_error(y_test, y_pred)*100:.2f}%")
# Residual plot
residuals = y_test - y_pred
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].scatter(y_pred, residuals, alpha=0.3)
axes[0].axhline(0, color="red", linestyle="--")
axes[0].set_xlabel("Predicted Values")
axes[0].set_ylabel("Residuals")
axes[0].set_title("Residual Plot")
axes[1].hist(residuals, bins=50, edgecolor="white")
axes[1].set_title("Residual Distribution")
plt.tight_layout()
plt.show()
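MAE and RMSE diverge most when large errors are present; a tiny sketch on synthetic residuals (the numbers are made up for illustration) shows how a single outlier inflates RMSE far more than MAE:

# Six unit errors vs. five unit errors plus one large outlier (synthetic values)
errors_clean = np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
errors_outlier = np.array([1.0, 1.0, 1.0, 1.0, 1.0, 10.0])
for name, e in [("no outlier", errors_clean), ("with outlier", errors_outlier)]:
    print(f"{name}: MAE={np.mean(np.abs(e)):.2f}, RMSE={np.sqrt(np.mean(e**2)):.2f}")
# MAE grows linearly with the outlier; RMSE grows with its square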
Advanced Cross-Validation
from sklearn.model_selection import (
StratifiedKFold,
KFold,
LeaveOneOut,
cross_validate,
learning_curve,
)
# Stratified K-Fold (maintains class ratio in classification)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results = cross_validate(
model, X, y, cv=skf,
scoring=["accuracy", "f1", "roc_auc"],
return_train_score=True,
)
print("Train accuracy:", results["train_accuracy"].mean())
print("Val accuracy:", results["test_accuracy"].mean())
# Train >> Val: overfitting, Train ≈ Val ≈ low: underfitting
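# (Sketch) LeaveOneOut, imported above, fits one model per left-out sample —
# n fits in total, so it is only practical for small datasets. Illustrated here
# on an arbitrary 100-sample slice to keep it fast:
from sklearn.model_selection import cross_val_score
loo = LeaveOneOut()
loo_scores = cross_val_score(model, X[:100], y[:100], cv=loo)
print(f"LOO accuracy: {loo_scores.mean():.4f}")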
# Learning curve (diagnose overfitting/underfitting)
train_sizes, train_scores, val_scores = learning_curve(
model, X, y, cv=5,
train_sizes=np.linspace(0.1, 1.0, 10),
scoring="accuracy",
n_jobs=-1,
)
plt.figure(figsize=(8, 5))
plt.plot(train_sizes, train_scores.mean(axis=1), label="Train accuracy")
plt.plot(train_sizes, val_scores.mean(axis=1), label="Validation accuracy")
plt.fill_between(train_sizes,
train_scores.mean(axis=1) - train_scores.std(axis=1),
train_scores.mean(axis=1) + train_scores.std(axis=1), alpha=0.1)
plt.fill_between(train_sizes,
val_scores.mean(axis=1) - val_scores.std(axis=1),
val_scores.mean(axis=1) + val_scores.std(axis=1), alpha=0.1)
plt.xlabel("Training Data Size")
plt.ylabel("Accuracy")
plt.title("Learning Curve")
plt.legend()
plt.grid(True)
plt.show()
Handling Class Imbalance
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE # pip install imbalanced-learn
from imblearn.under_sampling import RandomUnderSampler
# Re-split the classification data (X_train/y_train above still hold the regression split)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
# Automatically compute class weights
classes = np.unique(y_train)
weights = compute_class_weight("balanced", classes=classes, y=y_train)
class_weight = dict(zip(classes, weights))
print(f"Class weights: {class_weight}")
# Model with class weights
model_balanced = RandomForestClassifier(
class_weight="balanced", # or class_weight=class_weight
random_state=42,
)
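# (Sketch) Fit the weighted model and compare F1 against the unweighted one;
# the breast-cancer data is only mildly imbalanced, so expect small differences
model_balanced.fit(X_train, y_train)
print(f"F1 (unweighted): {f1_score(y_test, model.predict(X_test)):.4f}")
print(f"F1 (balanced): {f1_score(y_test, model_balanced.predict(X_test)):.4f}")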
# SMOTE — oversample minority class
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print(f"Before oversampling: {np.bincount(y_train)}")
print(f"After oversampling: {np.bincount(y_resampled)}")
Summary
| Problem | Primary Metric | Secondary Metric |
|---|---|---|
| Binary classification (balanced) | Accuracy, F1 | AUC-ROC |
| Binary classification (imbalanced) | F1, AUC-ROC | Precision, Recall |
| Multi-class classification | Macro F1 | Confusion Matrix |
| Regression | RMSE, R² | MAE, MAPE |
Choose evaluation metrics according to business goals: when missing a positive case is costly (e.g., an undetected disease), prioritize Recall; when false alarms are costly (e.g., legitimate email flagged as spam), prioritize Precision.
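The table mentions Macro F1 for multi-class problems; a minimal sketch on a three-class dataset (load_iris is our arbitrary choice of example):

from sklearn.datasets import load_iris
X_iris, y_iris = load_iris(return_X_y=True)
Xtr, Xte, ytr, yte = train_test_split(X_iris, y_iris, stratify=y_iris, random_state=42)
clf = RandomForestClassifier(random_state=42).fit(Xtr, ytr)
# average="macro" averages per-class F1 with equal weight,
# so minority classes count as much as majority classes
print(f"Macro F1: {f1_score(yte, clf.predict(Xte), average='macro'):.4f}")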