scikit-learn 워크플로우

scikit-learn은 Python 머신러닝의 표준 라이브러리입니다. 일관된 API로 데이터 전처리부터 모델 학습·평가까지 처리합니다.

설치

pip install scikit-learn

머신러닝 전체 흐름

데이터 수집 → 전처리 → 특성 선택 → 모델 선택 → 학습 → 평가 → 튜닝 → 배포

데이터셋 로딩

from sklearn.datasets import (
    fetch_california_housing,  # California housing prices
    load_breast_cancer,        # breast cancer binary classification
    load_diabetes,             # diabetes regression (442 samples)
    load_iris,                 # iris classification (150 samples, 3 classes)
    make_classification,       # synthetic classification data generator
    make_regression,           # synthetic regression data generator
)
import pandas as pd

# Built-in dataset: pull the arrays and their label metadata off the bunch.
iris = load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names
target_names = iris.target_names

# Tabular view of the same data, with the target appended as a column.
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

# Synthetic data. NOTE: this rebinds X and y, so from here on they refer to
# the generated 20-feature data, not to iris.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    random_state=42,
)

데이터 분할

from sklearn.model_selection import train_test_split

# Basic split (80/20): test_size=0.2 holds out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,     # preserve class ratios (recommended for classification)
)

# Report the shapes of the two resulting feature matrices.
print(f"학습 데이터: {X_train.shape}")
print(f"테스트 데이터: {X_test.shape}")

데이터 전처리

from sklearn.preprocessing import (
    StandardScaler,    # zero mean, unit standard deviation
    MinMaxScaler,      # rescales to the [0, 1] range
    RobustScaler,      # robust to outliers (median-based)
    LabelEncoder,      # integer label encoding
    OneHotEncoder,     # one-hot encoding
)
from sklearn.impute import SimpleImputer
import numpy as np

# 1. Missing values: learn the fill statistics on the training data only,
#    then reuse the fitted imputer on the test data.
imputer = SimpleImputer(strategy="mean")   # 'mean', 'median', 'most_frequent'
X_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# 2. Scaling: run on the imputed arrays so step 1 actually feeds step 2.
#    Scaling the raw arrays would throw the imputation away.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_imputed)     # fit + transform
X_test_scaled = scaler.transform(X_test_imputed)     # transform only (training statistics applied)

# ❌ Wrong: fitting on the test set causes data leakage
# X_test_scaled = scaler.fit_transform(X_test)

# 3. Categorical encoding
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

# Small demo frame. NOTE: this rebinds `df`.
df = pd.DataFrame({
    "color": ["red", "blue", "green", "red"],
    "size": ["S", "M", "L", "XL"],
})

# One-hot encoding: dense output; categories unseen at fit time are ignored
# at transform time instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded = encoder.fit_transform(df[["color"]])

Pipeline — 전처리 + 모델 통합

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Chain scaling and the classifier into a single estimator so both are
# fitted together on the training data.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("classifier", LogisticRegression(random_state=42)),
])
pipe.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = pipe.predict(X_test)
score = pipe.score(X_test, y_test)
print(f"정확도: {score:.4f}")


# ColumnTransformer: handle numeric and categorical columns side by side
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Column names are illustrative: no DataFrame with these columns is defined
# in this snippet, and full_pipeline is built here but not fitted.
numeric_features = ["age", "salary"]
categorical_features = ["department", "city"]

# Numeric branch: median imputation, then standardization.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: most-frequent imputation, then one-hot encoding.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features),
])

full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression()),
])

교차 검증

from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    cross_validate,
)

# Stratified 5-fold cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipe, X, y, scoring="accuracy", cv=cv)

print(f"CV 점수: {scores}")
print(f"평균: {scores.mean():.4f} ± {scores.std():.4f}")


# Evaluate several metrics in one pass, reusing the same folds.
metric_names = ["accuracy", "f1_macro", "roc_auc_ovr"]
results = cross_validate(
    pipe,
    X,
    y,
    cv=cv,
    scoring=metric_names,
    return_train_score=True,
)
print(results)

하이퍼파라미터 튜닝

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Grid Search (tries every combination).
# Keys use the "<step name>__<parameter>" form to reach into the pipeline's
# "classifier" step.
param_grid = {
    "classifier__C": [0.01, 0.1, 1, 10, 100],
    "classifier__penalty": ["l1", "l2"],
    "classifier__solver": ["liblinear"],
}

grid_search = GridSearchCV(
    pipe,
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,       # use all CPUs
    verbose=1,
)
grid_search.fit(X_train, y_train)

print(f"최적 파라미터: {grid_search.best_params_}")
print(f"최적 점수: {grid_search.best_score_:.4f}")
best_model = grid_search.best_estimator_


# Randomized Search (efficient for large search spaces)
from scipy.stats import loguniform, randint, uniform

# C spans several orders of magnitude, so sample it log-uniformly over
# [1e-3, 1e2]. A linear uniform(0.001, 100) would put about 90% of the draws
# above 10 and almost never try the small values.
param_dist = {
    "classifier__C": loguniform(1e-3, 1e2),
    "classifier__max_iter": randint(100, 1000),
}

random_search = RandomizedSearchCV(
    pipe, param_dist, n_iter=50, cv=5, random_state=42, n_jobs=-1
)
random_search.fit(X_train, y_train)

특성 중요도

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Tree-based feature importance.
# The labels must match the columns the forest was trained on. X_train comes
# from the synthetic 20-feature data, while `feature_names` still holds the
# 4 iris column names, so using it as the index raises a length-mismatch
# ValueError. Generate one label per fitted feature instead.
importance_labels = [f"feature_{i}" for i in range(len(rf.feature_importances_))]

importances = pd.Series(rf.feature_importances_, index=importance_labels)
importances.sort_values(ascending=True).plot(kind="barh", figsize=(8, 6))
plt.title("특성 중요도")
plt.show()

정리

| 단계 | 클래스/함수 |
| --- | --- |
| 데이터 분할 | `train_test_split()` |
| 스케일링 | `StandardScaler`, `MinMaxScaler` |
| 결측값 처리 | `SimpleImputer` |
| 파이프라인 | `Pipeline`, `ColumnTransformer` |
| 교차 검증 | `cross_val_score`, `StratifiedKFold` |
| 파라미터 탐색 | `GridSearchCV`, `RandomizedSearchCV` |

scikit-learn의 핵심은 fit() → transform() → predict()의 일관된 API입니다.

설치​

머신러닝 전체 흐름​

데이터셋 로딩​

데이터 분할​

데이터 전처리​

Pipeline — 전처리 + 모델 통합​

교차 검증​

하이퍼파라미터 튜닝​

특성 중요도​

정리​

설치

머신러닝 전체 흐름

데이터셋 로딩

데이터 분할

데이터 전처리

Pipeline — 전처리 + 모델 통합

교차 검증

하이퍼파라미터 튜닝

특성 중요도

정리