scikit-learn Workflow
scikit-learn is the de facto standard machine learning library for Python. Its consistent estimator API covers everything from data preprocessing to model training and evaluation.
Installation
pip install scikit-learn
The Machine Learning Pipeline
Data Collection → Preprocessing → Feature Selection → Model Selection → Training → Evaluation → Tuning → Deployment
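The sections below cover each stage in detail. As a compressed preview, the middle stages (split → preprocess → train → evaluate → tune) might look like this minimal sketch on a built-in dataset:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
pipe = Pipeline([("scaler", StandardScaler()),
                 ("clf", LogisticRegression(max_iter=1000))])
search = GridSearchCV(pipe, {"clf__C": [0.1, 1, 10]}, cv=5)
search.fit(X_train, y_train)
print(f"Test accuracy: {search.score(X_test, y_test):.4f}")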
Loading Datasets
from sklearn.datasets import (
    load_iris,                 # Iris classification (150 samples, 3 classes)
    load_diabetes,             # Diabetes regression (442 samples)
    load_breast_cancer,        # Breast cancer binary classification
    fetch_california_housing,  # California housing prices
    make_classification,       # Generate synthetic classification data
    make_regression,           # Generate synthetic regression data
)
import pandas as pd
# Built-in dataset
iris = load_iris()
X, y = iris.data, iris.target
feature_names = iris.feature_names
target_names = iris.target_names
# Convert to DataFrame
df = pd.DataFrame(X, columns=feature_names)
df["target"] = y
# Generate synthetic data (make_classification was imported above)
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    random_state=42,
)
Train-Test Split
from sklearn.model_selection import train_test_split
# Basic split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,  # maintain class ratio (recommended for classification)
)
print(f"Training data: {X_train.shape}")
print(f"Test data: {X_test.shape}")
Data Preprocessing
from sklearn.preprocessing import (
    StandardScaler,  # mean 0, std 1
    MinMaxScaler,    # range [0, 1]
    RobustScaler,    # robust to outliers (median-based)
    LabelEncoder,    # integer label encoding
    OneHotEncoder,   # one-hot encoding
)
from sklearn.impute import SimpleImputer
import numpy as np
# 1. Handle missing values
imputer = SimpleImputer(strategy="mean")  # 'mean', 'median', 'most_frequent'
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)  # reuse training statistics
# 2. Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) # fit + transform
X_test_scaled = scaler.transform(X_test) # transform only (apply training stats)
# ❌ Wrong approach: fitting on test data causes data leakage
# X_test_scaled = scaler.fit_transform(X_test)
# 3. Categorical encoding
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
df = pd.DataFrame({
    "color": ["red", "blue", "green", "red"],
    "size": ["S", "M", "L", "XL"],
})
# One-hot encoding (nominal categories with no inherent order)
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded = encoder.fit_transform(df[["color"]])
# Ordinal encoding (ordered categories; pass the order explicitly)
ordinal = OrdinalEncoder(categories=[["S", "M", "L", "XL"]])
size_encoded = ordinal.fit_transform(df[["size"]])
Pipeline — Preprocessing + Model Integration
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# Integrate preprocessing + model in Pipeline
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", LogisticRegression(random_state=42)),
])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
score = pipe.score(X_test, y_test)
print(f"Accuracy: {score:.4f}")
# ColumnTransformer — process numeric/categorical features simultaneously
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
numeric_features = ["age", "salary"]
categorical_features = ["department", "city"]
preprocessor = ColumnTransformer(transformers=[
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]), numeric_features),
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]), categorical_features),
])
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression()),
])
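The full pipeline can then be fit directly on a raw DataFrame. A minimal sketch, assuming a toy DataFrame with the column names above and hypothetical values and labels:

import numpy as np
import pandas as pd
df = pd.DataFrame({
    "age": [25, 38, np.nan, 52, 46, 29],
    "salary": [40000, 62000, 55000, np.nan, 71000, 48000],
    "department": ["sales", "it", "it", "hr", np.nan, "sales"],
    "city": ["nyc", "sf", "nyc", "sf", "nyc", "sf"],
})
y_toy = [0, 1, 1, 0, 1, 0]
full_pipeline.fit(df, y_toy)  # imputation, scaling, encoding, and training in one call
print(full_pipeline.predict(df))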
Cross-Validation
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
# Stratified K-Fold cross-validation (preserves class ratios in each fold)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipe, X, y, cv=cv, scoring="accuracy")
print(f"CV scores: {scores}")
print(f"Mean: {scores.mean():.4f} ± {scores.std():.4f}")
# Evaluate multiple metrics simultaneously
from sklearn.model_selection import cross_validate
results = cross_validate(
    pipe, X, y, cv=cv,
    scoring=["accuracy", "f1_macro", "roc_auc_ovr"],
    return_train_score=True,
)
print(results)
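cross_validate returns a dict of arrays (fit/score times plus one train and one test array per metric). One way to summarize the test scores per metric:

for metric in ["accuracy", "f1_macro", "roc_auc_ovr"]:
    test_scores = results[f"test_{metric}"]
    print(f"{metric}: {test_scores.mean():.4f} ± {test_scores.std():.4f}")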
Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Grid Search (exhaustive search)
param_grid = {
    "classifier__C": [0.01, 0.1, 1, 10, 100],
    "classifier__penalty": ["l1", "l2"],
    "classifier__solver": ["liblinear"],
}
grid_search = GridSearchCV(
    pipe,
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,  # use all CPUs
    verbose=1,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")
best_model = grid_search.best_estimator_
# Randomized Search (efficient for large search spaces)
from scipy.stats import uniform, randint
param_dist = {
    "classifier__C": uniform(0.001, 100),  # scipy uniform(loc, scale) -> range [0.001, 100.001]
    "classifier__max_iter": randint(100, 1000),
}
random_search = RandomizedSearchCV(
    pipe, param_dist, n_iter=50, cv=5, random_state=42, n_jobs=-1
)
random_search.fit(X_train, y_train)
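As with GridSearchCV, the fitted search exposes best_params_, best_score_, and a refit best_estimator_; per-candidate details live in cv_results_:

print(f"Best parameters: {random_search.best_params_}")
print(f"Best CV score: {random_search.best_score_:.4f}")
cv_df = pd.DataFrame(random_search.cv_results_)  # one row per sampled candidate
print(cv_df[["mean_test_score", "std_test_score", "rank_test_score"]].head())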
Feature Importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
# Tree-based (impurity-based) feature importance
feature_names = [f"feature_{i}" for i in range(X_train.shape[1])]  # synthetic features are unnamed
importances = pd.Series(rf.feature_importances_, index=feature_names)
importances.sort_values(ascending=True).plot(kind="barh", figsize=(8, 6))
plt.title("Feature Importance")
plt.show()
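As an alternative, permutation_importance (imported above) measures how much the score drops when a single feature's values are shuffled. It is computed on held-out data and is less biased toward high-cardinality features than impurity-based importances. A minimal sketch:

result = permutation_importance(
    rf, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
)
perm = pd.Series(result.importances_mean, index=feature_names)
perm.sort_values(ascending=True).plot(kind="barh", figsize=(8, 6))
plt.title("Permutation Importance (test set)")
plt.show()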
Summary
| Step | Class/Function |
|---|---|
| Data splitting | train_test_split() |
| Scaling | StandardScaler, MinMaxScaler |
| Missing values | SimpleImputer |
| Pipeline | Pipeline, ColumnTransformer |
| Cross-validation | cross_val_score, StratifiedKFold |
| Parameter search | GridSearchCV, RandomizedSearchCV |
The core of scikit-learn is its consistent API: every estimator learns with fit(), after which transformers apply transform() and models apply predict().
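Because every estimator exposes the same interface, swapping one model for another requires no other code changes; a minimal sketch using the train/test split from earlier:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

for model in [LogisticRegression(max_iter=1000), RandomForestClassifier(), SVC()]:
    model.fit(X_train, y_train)  # same fit() everywhere
    print(type(model).__name__, model.score(X_test, y_test))  # same score() everywhere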