Key Algorithms — Linear/Logistic Regression, Decision Tree, Random Forest, k-NN, SVM
Compare core machine learning algorithms from principles to practical code.
Linear Regression
# Linear regression family: OLS baseline plus L1/L2 regularized variants.
from sklearn.linear_model import (
    LinearRegression,
    Ridge,       # L2 regularization
    Lasso,       # L1 regularization (feature selection effect)
    ElasticNet,  # L1 + L2 combined
)
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load California housing and hold out 20% for evaluation.
housing = fetch_california_housing()
X, y = housing.data, housing.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features — fit the scaler on training data only to avoid leakage.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Ordinary least squares baseline.
lr = LinearRegression().fit(X_train_s, y_train)
print(f"Coefficients: {lr.coef_}")
print(f"Intercept: {lr.intercept_:.4f}")
print(f"R² (train): {lr.score(X_train_s, y_train):.4f}")
print(f"R² (test): {lr.score(X_test_s, y_test):.4f}")

# Ridge: the L2 penalty shrinks coefficient magnitudes toward zero.
ridge = Ridge(alpha=1.0).fit(X_train_s, y_train)
print(f"Ridge R²: {ridge.score(X_test_s, y_test):.4f}")

# Lasso: the L1 penalty drives unneeded coefficients exactly to zero.
lasso = Lasso(alpha=0.1).fit(X_train_s, y_train)
nonzero = np.sum(lasso.coef_ != 0)
print(f"Lasso non-zero coefficients: {nonzero}/{len(lasso.coef_)}")
Logistic Regression (Classification)
# Binary classification with logistic regression on the breast-cancer dataset.
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer

# Stratified split keeps the malignant/benign class ratio intact in both sets.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Re-standardize for the new feature matrix (train-fit, test-transform).
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

lr_clf = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
lr_clf.fit(X_train_s, y_train)

# predict_proba yields one row per sample: [[P(0), P(1)], ...]
probs = lr_clf.predict_proba(X_test_s)
print(f"First sample probabilities: negative={probs[0,0]:.3f}, positive={probs[0,1]:.3f}")
Decision Tree
# Decision tree classifier on the breast-cancer data (no scaling needed for trees).
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt

# BUGFIX: `feature_names` was never defined — the data was loaded with
# return_X_y=True, which discards the names. Recover them from the dataset.
feature_names = list(load_breast_cancer().feature_names)

dt = DecisionTreeClassifier(
    max_depth=4,         # prevent overfitting
    min_samples_leaf=5,  # minimum samples in leaf node
    random_state=42,
)
dt.fit(X_train, y_train)

# Visualize tree (target encoding: 0 = malignant, 1 = benign)
plt.figure(figsize=(20, 8))
plot_tree(dt, feature_names=feature_names, class_names=["Malignant", "Benign"],
          filled=True, fontsize=8)
plt.show()

# Print as text
print(export_text(dt, feature_names=feature_names))
Random Forest
# Random forest: an ensemble of bagged, decorrelated decision trees.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Classification
rf = RandomForestClassifier(
    n_estimators=200,     # number of trees
    max_depth=10,         # maximum depth
    min_samples_split=5,
    max_features="sqrt",  # features to consider at each split
    n_jobs=-1,            # parallel processing
    random_state=42,
)
rf.fit(X_train, y_train)
print(f"RF accuracy: {rf.score(X_test, y_test):.4f}")

# OOB (Out-of-Bag) score — evaluate without additional validation
rf_oob = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
rf_oob.fit(X_train, y_train)
print(f"OOB score: {rf_oob.oob_score_:.4f}")

# Feature importance visualization
# BUGFIX: `feature_names` was never defined (data was loaded with
# return_X_y=True) — recover the names from the dataset.
feature_names = list(load_breast_cancer().feature_names)
feat_imp = pd.Series(rf.feature_importances_, index=feature_names).sort_values(ascending=False)
print(feat_imp.head(10))
k-NN (k-Nearest Neighbors)
# k-NN: choose the number of neighbors k by 5-fold cross-validation.
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import cross_val_score

# Mean CV accuracy for each candidate k in 1..30 (scaled features — k-NN
# is distance-based and therefore scale-sensitive).
k_values = range(1, 31)
cv_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k, metric="euclidean"),
        X_train_s, y_train, cv=5, scoring="accuracy",
    ).mean()
    for k in k_values
]

best_k = k_values[np.argmax(cv_scores)]
print(f"Best k: {best_k}, score: {max(cv_scores):.4f}")

# Refit on the full training set with the selected k.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_s, y_train)
print(f"Test accuracy: {knn.score(X_test_s, y_test):.4f}")
SVM (Support Vector Machine)
# Support vector classification with an RBF kernel.
from sklearn.svm import SVC, SVR

# Hyperparameters gathered in one place for readability.
svm_params = {
    "C": 1.0,               # margin-misclassification balance (higher = fewer misclassifications)
    "kernel": "rbf",        # 'linear', 'poly', 'rbf', 'sigmoid'
    "gamma": "scale",
    "probability": True,    # required for predict_proba
    "random_state": 42,
}
svm = SVC(**svm_params)

# SVMs are scale-sensitive — train on the standardized features.
svm.fit(X_train_s, y_train)
print(f"SVM accuracy: {svm.score(X_test_s, y_test):.4f}")
Algorithm Comparison
# Head-to-head comparison of six classifiers by 5-fold CV accuracy.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "k-NN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(random_state=42),
}

results = {}
for name, model in models.items():
    cv = cross_val_score(model, X_train_s, y_train, cv=5, scoring="accuracy")
    mean_acc, std_acc = cv.mean(), cv.std()
    results[name] = {"mean": mean_acc, "std": std_acc}
    print(f"{name:25s}: {mean_acc:.4f} ± {std_acc:.4f}")
Algorithm Selection Guide
Choose based on data size and characteristics:
Small data (< 10K):
Classification → SVM, Logistic Regression, k-NN
Regression → Ridge, Lasso
Medium data (10K ~ 100K):
Classification/Regression → Random Forest, Gradient Boosting
Large data (> 100K):
Classification/Regression → SGDClassifier/Regressor, XGBoost, LightGBM
Interpretability needed:
→ Logistic Regression, Decision Tree
Non-linear complex patterns:
→ Random Forest, Gradient Boosting, SVM(RBF)
Summary
| Algorithm | Type | Strengths | Weaknesses |
|---|---|---|---|
| Linear Regression | Regression | Easy to interpret, fast | Poor with non-linear relationships |
| Logistic Regression | Classification | Probability output, interpretable | Poor with non-linear patterns |
| Decision Tree | Both | Easy to interpret | Prone to overfitting |
| Random Forest | Both | Robust, feature importance | Slow, memory intensive |
| k-NN | Both | Simple, intuitive | Slow prediction, scale-sensitive |
| SVM | Both | Effective in high dimensions | Slow with large datasets |
In practice, the typical approach is to try Random Forest first, then Gradient Boosting (XGBoost/LightGBM).