Key Algorithms — Linear/Logistic Regression, Decision Tree, Random Forest, k-NN, SVM
Compare core machine learning algorithms from principles to practical code.
Linear Regression
# Linear regression family: OLS baseline plus L1/L2 regularized variants.
from sklearn.linear_model import (
    LinearRegression,
    Ridge,       # L2 regularization
    Lasso,       # L1 regularization (feature selection effect)
    ElasticNet,  # L1 + L2 combined
)
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load California housing and hold out 20% for evaluation.
housing = fetch_california_housing()
X, y = housing.data, housing.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features — fit the scaler on training data only to avoid leakage.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Ordinary least squares baseline.
lr = LinearRegression().fit(X_train_s, y_train)
print(f"Coefficients: {lr.coef_}")
print(f"Intercept: {lr.intercept_:.4f}")
print(f"R² (train): {lr.score(X_train_s, y_train):.4f}")
print(f"R² (test): {lr.score(X_test_s, y_test):.4f}")

# Ridge: the L2 penalty shrinks coefficient magnitudes toward zero.
ridge = Ridge(alpha=1.0).fit(X_train_s, y_train)
print(f"Ridge R²: {ridge.score(X_test_s, y_test):.4f}")

# Lasso: the L1 penalty drives unneeded coefficients exactly to zero.
lasso = Lasso(alpha=0.1).fit(X_train_s, y_train)
nonzero = np.sum(lasso.coef_ != 0)
print(f"Lasso non-zero coefficients: {nonzero}/{len(lasso.coef_)}")
Logistic Regression (Classification)
# Binary classification with logistic regression on the breast-cancer dataset.
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer

# Stratified split keeps the malignant/benign class ratio intact in both sets.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Re-standardize for the new feature matrix (train-fit, test-transform).
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

lr_clf = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
lr_clf.fit(X_train_s, y_train)

# predict_proba yields one row per sample: [[P(0), P(1)], ...]
probs = lr_clf.predict_proba(X_test_s)
print(f"First sample probabilities: negative={probs[0,0]:.3f}, positive={probs[0,1]:.3f}")
Decision Tree
# Decision tree classifier on the breast-cancer data (no scaling needed for trees).
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt

# BUGFIX: `feature_names` was never defined — the data was loaded with
# return_X_y=True, which discards the names. Recover them from the dataset.
feature_names = list(load_breast_cancer().feature_names)

dt = DecisionTreeClassifier(
    max_depth=4,         # prevent overfitting
    min_samples_leaf=5,  # minimum samples in leaf node
    random_state=42,
)
dt.fit(X_train, y_train)

# Visualize tree (target encoding: 0 = malignant, 1 = benign)
plt.figure(figsize=(20, 8))
plot_tree(dt, feature_names=feature_names, class_names=["Malignant", "Benign"],
          filled=True, fontsize=8)
plt.show()

# Print as text
print(export_text(dt, feature_names=feature_names))
Random Forest
# Random forest: an ensemble of bagged, decorrelated decision trees.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Classification
rf = RandomForestClassifier(
    n_estimators=200,     # number of trees
    max_depth=10,         # maximum depth
    min_samples_split=5,
    max_features="sqrt",  # features to consider at each split
    n_jobs=-1,            # parallel processing
    random_state=42,
)
rf.fit(X_train, y_train)
print(f"RF accuracy: {rf.score(X_test, y_test):.4f}")

# OOB (Out-of-Bag) score — evaluate without additional validation
rf_oob = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
rf_oob.fit(X_train, y_train)
print(f"OOB score: {rf_oob.oob_score_:.4f}")

# Feature importance visualization
# BUGFIX: `feature_names` was never defined (data was loaded with
# return_X_y=True) — recover the names from the dataset.
feature_names = list(load_breast_cancer().feature_names)
feat_imp = pd.Series(rf.feature_importances_, index=feature_names).sort_values(ascending=False)
print(feat_imp.head(10))
k-NN (k-Nearest Neighbors)
# k-NN: choose the number of neighbors k by 5-fold cross-validation.
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import cross_val_score

# Mean CV accuracy for each candidate k in 1..30 (scaled features — k-NN
# is distance-based and therefore scale-sensitive).
k_values = range(1, 31)
cv_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k, metric="euclidean"),
        X_train_s, y_train, cv=5, scoring="accuracy",
    ).mean()
    for k in k_values
]

best_k = k_values[np.argmax(cv_scores)]
print(f"Best k: {best_k}, score: {max(cv_scores):.4f}")

# Refit on the full training set with the selected k.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_s, y_train)
print(f"Test accuracy: {knn.score(X_test_s, y_test):.4f}")
SVM (Support Vector Machine)
# Support vector classification with an RBF kernel.
from sklearn.svm import SVC, SVR

# Hyperparameters gathered in one place for readability.
svm_params = {
    "C": 1.0,               # margin-misclassification balance (higher = fewer misclassifications)
    "kernel": "rbf",        # 'linear', 'poly', 'rbf', 'sigmoid'
    "gamma": "scale",
    "probability": True,    # required for predict_proba
    "random_state": 42,
}
svm = SVC(**svm_params)

# SVMs are scale-sensitive — train on the standardized features.
svm.fit(X_train_s, y_train)
print(f"SVM accuracy: {svm.score(X_test_s, y_test):.4f}")
Algorithm Comparison
# Head-to-head comparison of six classifiers by 5-fold CV accuracy.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "k-NN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(random_state=42),
}

results = {}
for name, model in models.items():
    cv = cross_val_score(model, X_train_s, y_train, cv=5, scoring="accuracy")
    mean_acc, std_acc = cv.mean(), cv.std()
    results[name] = {"mean": mean_acc, "std": std_acc}
    print(f"{name:25s}: {mean_acc:.4f} ± {std_acc:.4f}")
Algorithm Selection Guide
Choose based on data size and characteristics:
Small data (< 10K):
Classification → SVM, Logistic Regression, k-NN
Regression → Ridge, Lasso
Medium data (10K ~ 100K):
Classification/Regression → Random Forest, Gradient Boosting
Large data (> 100K):
Classification/Regression → SGDClassifier/Regressor, XGBoost, LightGBM
Interpretability needed:
→ Logistic Regression, Decision Tree
Non-linear complex patterns:
→ Random Forest, Gradient Boosting, SVM(RBF)
Summary
| Algorithm | Type | Strengths | Weaknesses |
|---|---|---|---|
| Linear Regression | Regression | Easy to interpret, fast | Poor with non-linear relationships |
| Logistic Regression | Classification | Probability output, interpretable | Poor with non-linear patterns |
| Decision Tree | Both | Easy to interpret | Prone to overfitting |
| Random Forest | Both | Robust, feature importance | Slow, memory intensive |
| k-NN | Both | Simple, intuitive | Slow prediction, scale-sensitive |
| SVM | Both | Effective in high dimensions | Slow with large datasets |
In practice, the typical approach is to try Random Forest first, then Gradient Boosting (XGBoost/LightGBM).