실전 프로젝트 — 공공 데이터 분석

CSV 데이터 로딩 → 정제 → 분석 → 시각화까지 전체 데이터 파이프라인을 실습합니다.

프로젝트 개요

서울시 따릉이(공공자전거) 이용 데이터 분석

목표: 대여소별 이용 패턴 분석 및 시간대·날씨 영향 파악
데이터: 서울 열린데이터광장 따릉이 이용 현황

pip install pandas numpy matplotlib seaborn requests

1단계 — 데이터 로딩

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Load the raw usage CSV; it is read with the cp949 (Korean) encoding
df = pd.read_csv("seoul_bike_2023.csv", encoding="cp949")

# First look at the frame, printed one item per line:
# (rows, columns), per-column dtypes, the first rows, and the number of
# missing values in each column
print(df.shape, df.dtypes, df.head(), df.isna().sum(), sep="\n")

# 직접 데이터 생성 (연습용)
import numpy as np
from datetime import datetime, timedelta

# Build a reproducible synthetic dataset for practice: one row per hour
# starting at 2023-01-01, with randomly drawn station, usage and weather
# columns.
np.random.seed(42)  # fixed seed so every run produces the same data
n = 10000

dates = pd.date_range("2023-01-01", periods=n, freq="h")

# Draw the station ids once and derive the names from them, so that a given
# station_id always carries the same station_name. Drawing the names from a
# second, independent random call would pair ids and names at random.
station_ids = np.random.randint(1, 51, n)  # ids 1..50 (upper bound exclusive)

df = pd.DataFrame({
    "datetime": dates,
    "station_id": station_ids,
    "station_name": [f"대여소_{i}" for i in station_ids],
    "rentals": np.random.poisson(lam=20, size=n),
    "returns": np.random.poisson(lam=19, size=n),
    "temperature": np.random.normal(15, 10, n),
    "humidity": np.random.uniform(30, 90, n),
    # Roughly 10% of the rows get an exponentially distributed rainfall
    # amount; every other row is 0.
    "rainfall": np.where(np.random.random(n) < 0.1, np.random.exponential(5, n), 0),
})

2단계 — 데이터 정제

# 1. Date/time handling: parse the timestamp and split it into calendar parts
df["datetime"] = pd.to_datetime(df["datetime"])
df["date"] = df["datetime"].dt.date
df["hour"] = df["datetime"].dt.hour
df["day_of_week"] = df["datetime"].dt.dayofweek   # 0 = Monday
df["month"] = df["datetime"].dt.month
df["is_weekend"] = df["day_of_week"] >= 5          # 5 = Saturday, 6 = Sunday

# 2. Outlier handling
# Inspect the temperature distribution, then keep only rows whose
# temperature lies within [-20, 45] (both bounds inclusive).
print(df["temperature"].describe())
# .copy() makes the filtered frame an independent object, so the column
# assignments below do not raise SettingWithCopyWarning or write into a
# temporary slice of the unfiltered frame.
df = df[df["temperature"].between(-20, 45)].copy()

# 3. Derived variables
df["net_flow"] = df["returns"] - df["rentals"]    # net inflow (positive: more returns than rentals)
df["total_activity"] = df["rentals"] + df["returns"]

# Flag rows with any recorded rainfall
df["is_rainy"] = df["rainfall"] > 0

# 계절 구분
def get_season(month):
    """Return the Korean season label for a month number.

    Months 3-5 map to "봄" (spring), 6-8 to "여름" (summer) and 9-11 to
    "가을" (autumn). Every other value, including 12, 1 and 2, falls
    through to "겨울" (winter).
    """
    season_months = (
        ((3, 4, 5), "봄"),
        ((6, 7, 8), "여름"),
        ((9, 10, 11), "가을"),
    )
    for months, label in season_months:
        if month in months:
            return label
    return "겨울"

# Label every row with its season, derived element-wise from the month column
df["season"] = df["month"].map(get_season)

# Report the shape of the cleaned frame and the total count of missing values
print(f"정제 후 데이터: {df.shape}")
print(f"결측값: {df.isna().sum().sum()}")

3단계 — 탐색적 데이터 분석 (EDA)

# 1. Average rentals by hour of day
hourly = df.groupby("hour")["rentals"].mean().reset_index()
print(hourly.nlargest(3, "rentals"))  # the three busiest hours

# 2. Average rentals by day of week
weekday_names = ["월", "화", "수", "목", "금", "토", "일"]
weekly = df.groupby("day_of_week")["rentals"].mean()
# Relabel by value (0 -> "월", ..., 6 -> "일") instead of assigning the
# seven names positionally: a positional assignment raises a length-mismatch
# error as soon as one weekday is absent from the data.
weekly.index = [weekday_names[day] for day in weekly.index]
print(weekly)

# 3. Per-season statistics
seasonal = df.groupby("season").agg(
    avg_rentals=("rentals", "mean"),
    total_rentals=("rentals", "sum"),
    avg_temp=("temperature", "mean"),
).round(1)
print(seasonal)

# 4. Weather effect: rentals on rainy vs. dry rows
print("비 오는 날 vs 맑은 날:")
print(df.groupby("is_rainy")["rentals"].agg(["mean", "median"]))

# 5. Correlation of rentals with the weather variables, strongest first
corr = df[["rentals", "temperature", "humidity", "rainfall"]].corr()
print("\n상관관계:\n", corr["rentals"].sort_values(ascending=False))

# 6. Top 10 stations by total rentals
top_stations = df.groupby("station_name")["rentals"].sum().nlargest(10)
print("\n인기 대여소 TOP 10:\n", top_stations)

4단계 — 시각화

# Matplotlib's default font has no Hangul glyphs, so Korean titles and labels
# would render as empty boxes. Pick the first Korean-capable font that is
# installed; if none is found the default font is left in place.
from matplotlib import font_manager

installed_fonts = {font.name for font in font_manager.fontManager.ttflist}
korean_font = next(
    (name for name in ("Malgun Gothic", "AppleGothic", "NanumGothic", "Noto Sans CJK KR")
     if name in installed_fonts),
    None,
)
if korean_font is not None:
    plt.rcParams["font.family"] = korean_font
# Use the ASCII hyphen for negative tick labels; many CJK fonts lack the
# Unicode minus sign.
plt.rcParams["axes.unicode_minus"] = False

# 2 x 3 dashboard of usage patterns
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle("서울 따릉이 이용 패턴 분석", fontsize=16, fontweight="bold")

# 1. Average rentals by hour of day (line + shaded area)
hourly = df.groupby("hour")["rentals"].mean()
axes[0, 0].plot(hourly.index, hourly.values, marker="o", color="steelblue", linewidth=2)
axes[0, 0].fill_between(hourly.index, hourly.values, alpha=0.3)
axes[0, 0].set_title("시간대별 평균 대여량")
axes[0, 0].set_xlabel("시간 (시)")
axes[0, 0].set_ylabel("평균 대여 건수")
# Dashed guides at 08:00 and 18:00; only the first carries the legend label
axes[0, 0].axvline(x=8, color="red", linestyle="--", alpha=0.5, label="출퇴근")
axes[0, 0].axvline(x=18, color="red", linestyle="--", alpha=0.5)
axes[0, 0].legend()

# 2. Rentals by day of week (box plot)
weekday_names = ["월", "화", "수", "목", "금", "토", "일"]
df["weekday_name"] = df["day_of_week"].map(dict(enumerate(weekday_names)))
# Ordered categorical so the boxes appear Monday..Sunday, not alphabetically
df["weekday_name"] = pd.Categorical(df["weekday_name"], categories=weekday_names, ordered=True)
# seaborn 0.13 deprecates `palette` without `hue`, so the x variable is also
# passed as hue. dodge=False keeps one full-width box per weekday, and the
# redundant legend is removed afterwards when seaborn created one.
sns.boxplot(data=df, x="weekday_name", y="rentals", hue="weekday_name",
            dodge=False, palette="Set2", ax=axes[0, 1])
weekday_legend = axes[0, 1].get_legend()
if weekday_legend is not None:
    weekday_legend.remove()
axes[0, 1].set_title("요일별 대여량 분포")
axes[0, 1].set_xlabel("요일")
axes[0, 1].set_ylabel("대여 건수")

# 3. Average rentals by season, in calendar order
season_order = ["봄", "여름", "가을", "겨울"]
season_data = df.groupby("season")["rentals"].mean().reindex(season_order)
colors = ["#90EE90", "#FF6B6B", "#DEB887", "#87CEEB"]
axes[0, 2].bar(season_data.index, season_data.values, color=colors, edgecolor="black")
axes[0, 2].set_title("계절별 평균 대여량")
axes[0, 2].set_ylabel("평균 대여 건수")

# 4. Temperature vs. rentals (scatter + linear trend line)
# Cap the sample size at the number of rows; sampling more rows than exist
# without replacement raises ValueError.
sample = df.sample(min(1000, len(df)), random_state=42)
# is_rainy is encoded as 0 (dry) / 1 (rain). The reversed colormap sends
# 0 to green and 1 to red, which is what the subplot title states; the
# fixed vmin/vmax keep that mapping even when the sample has no rainy rows.
axes[1, 0].scatter(sample["temperature"], sample["rentals"],
                   alpha=0.3, c=sample["is_rainy"].astype(int),
                   cmap="RdYlGn_r", vmin=0, vmax=1, s=10)
z = np.polyfit(sample["temperature"], sample["rentals"], 1)  # degree-1 least-squares fit
p = np.poly1d(z)
x_range = np.linspace(sample["temperature"].min(), sample["temperature"].max(), 100)
axes[1, 0].plot(x_range, p(x_range), "b--", linewidth=2, label="추세선")
axes[1, 0].set_title("기온 vs 대여량 (초록=맑음, 빨강=비)")
axes[1, 0].set_xlabel("기온 (°C)")
axes[1, 0].set_ylabel("대여 건수")
axes[1, 0].legend()

# 5. Correlation heatmap, colour scale pinned to [-1, 1]
corr_data = df[["rentals", "returns", "temperature", "humidity", "rainfall"]].corr()
sns.heatmap(corr_data, annot=True, fmt=".2f", cmap="coolwarm",
            ax=axes[1, 1], vmin=-1, vmax=1, square=True)
axes[1, 1].set_title("변수 간 상관관계")

# 6. Top 10 stations by total rentals; sorted ascending so the largest bar
# ends up at the top of the horizontal bar chart
top10 = df.groupby("station_name")["rentals"].sum().nlargest(10).sort_values()
axes[1, 2].barh(top10.index, top10.values, color="coral", edgecolor="black")
axes[1, 2].set_title("인기 대여소 TOP 10")
axes[1, 2].set_xlabel("총 대여 건수")

# Save before show(): the figure is written to disk first, then displayed
plt.tight_layout()
plt.savefig("bike_analysis.png", dpi=150, bbox_inches="tight")
plt.show()

5단계 — 인사이트 도출

# Print a short text summary of the main findings
print("=" * 50)
print("📊 따릉이 이용 패턴 분석 요약")
print("=" * 50)

# Peak hours
# Recompute the hourly means here instead of reusing the global `hourly`:
# that name is a DataFrame after the EDA step and a Series after the plotting
# step, and nlargest(3) without a column argument only works on the Series.
hourly_mean = df.groupby("hour")["rentals"].mean()
peak_hours = hourly_mean.nlargest(3).index.tolist()
print(f"\n⏰ 피크 시간대: {peak_hours}시")

# Weekday vs. weekend pattern
weekday_avg = df[~df["is_weekend"]]["rentals"].mean()
weekend_avg = df[df["is_weekend"]]["rentals"].mean()
print(f"\n📅 주중 평균: {weekday_avg:.1f}건 / 주말 평균: {weekend_avg:.1f}건")
print(f"   → {'주중' if weekday_avg > weekend_avg else '주말'} 이용이 더 많음")

# Weather effect: relative drop of the rainy-row mean against the dry-row
# mean, in percent (negative when rainy rows have the higher mean)
rainy_avg = df[df["is_rainy"]]["rentals"].mean()
clear_avg = df[~df["is_rainy"]]["rentals"].mean()
rain_impact = (clear_avg - rainy_avg) / clear_avg * 100
print(f"\n🌧️ 비 오는 날 이용량 감소율: {rain_impact:.1f}%")

# Correlation between temperature and rentals
temp_corr = df["temperature"].corr(df["rentals"])
print(f"\n🌡️ 온도-이용량 상관계수: {temp_corr:.3f}")
print(f"   → {'양의' if temp_corr > 0 else '음의'} 상관 (기온 높을수록 {'많이' if temp_corr > 0 else '적게'} 이용)")

정리

단계	주요 작업	도구
로딩	CSV/Excel/DB 읽기	`pd.read_csv()`
정제	결측값, 이상치, 파생 변수	`fillna()`, `apply()`
EDA	통계 요약, 그룹 집계	`describe()`, `groupby()`
시각화	분포, 관계, 트렌드	Matplotlib, Seaborn
인사이트	패턴 해석, 결론 도출	통계 + 도메인 지식

데이터 분석의 핵심은 질문을 명확히 정의하고, 데이터로 답을 찾아가는 과정입니다.

프로젝트 개요

1단계 — 데이터 로딩

2단계 — 데이터 정제

3단계 — 탐색적 데이터 분석 (EDA)

4단계 — 시각화

5단계 — 인사이트 도출

정리

프로젝트 개요

1단계 — 데이터 로딩

2단계 — 데이터 정제

3단계 — 탐색적 데이터 분석 (EDA)

4단계 — 시각화

5단계 — 인사이트 도출

정리