웹 크롤링
BeautifulSoup으로 HTML 파싱, Playwright로 JS 렌더링 페이지, 비동기 크롤링을 실습합니다.
설치
pip install beautifulsoup4 lxml requests
pip install playwright && playwright install chromium
pip install httpx
BeautifulSoup — HTML 파싱
import requests
from bs4 import BeautifulSoup
def fetch_page(url: str) -> BeautifulSoup:
    """Download *url* and return its HTML as an lxml-backed soup.

    Sends a polite, identifiable User-Agent and raises
    ``requests.HTTPError`` for non-2xx responses.
    """
    ua = {"User-Agent": "Mozilla/5.0 (compatible; MyCrawler/1.0)"}
    resp = requests.get(url, headers=ua, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "lxml")
# Basic selectors: fetch the demo page once and query it below.
soup = fetch_page("https://books.toscrape.com/")

# CSS selectors (the most flexible query API)
title = soup.select_one("h1").text.strip()
products = soup.select("article.product_pod")
for card in products[:5]:
    fields = (
        card.select_one("h3 a")["title"],
        card.select_one("p.price_color").text.strip(),
        # rating is encoded as the second CSS class, e.g. "star-rating Three"
        card.select_one("p.star-rating")["class"][1],
    )
    print("{} | {} | {}".format(*fields))

# find / find_all
links = soup.find_all("a", href=True)
for anchor in links:
    href = anchor["href"]
    text = anchor.text.strip()

# Attribute access with defaults
img = soup.find("img")
src = img.get("src", "")
alt = img.get("alt", "")

# Text extraction: keep only non-empty paragraphs
paragraphs = [p.text.strip() for p in soup.find_all("p") if p.text.strip()]
페이지네이션 크롤링
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin
def crawl_books(
    base_url: str = "https://books.toscrape.com/",
    max_pages: int | None = None,
    delay: float = 1.0,
) -> list[dict]:
    """Crawl the paginated book catalogue starting at *base_url*.

    Follows the "next" link until it disappears, or until *max_pages*
    pages have been fetched (``None`` means no limit — the original
    behavior). *delay* seconds are slept between page requests as
    crawling etiquette.

    Returns one dict per book with title, price, rating class and an
    absolute detail-page link.
    """
    books: list[dict] = []
    url: str | None = base_url
    pages_fetched = 0
    while url and (max_pages is None or pages_fetched < max_pages):
        print(f"크롤링: {url}")
        soup = fetch_page(url)
        pages_fetched += 1
        for article in soup.select("article.product_pod"):
            # The anchor holds both the title attribute and the relative link.
            anchor = article.select_one("h3 a")
            books.append({
                "title": anchor["title"],
                "price": article.select_one("p.price_color").text.strip(),
                # second CSS class encodes the rating, e.g. "Three"
                "rating": article.select_one("p.star-rating")["class"][1],
                "link": urljoin(url, anchor["href"]),
            })
        # Follow pagination; relative hrefs are resolved against the current page.
        next_btn = soup.select_one("li.next a")
        url = urljoin(url, next_btn["href"]) if next_btn else None
        time.sleep(delay)  # politeness delay to avoid overloading the server
    return books
# Run the full crawl and report how many books were collected.
books = crawl_books()
print("총 {}권 수집".format(len(books)))
Playwright — JavaScript 렌더링 페이지
from playwright.sync_api import sync_playwright
import time
def crawl_dynamic_page(url: str) -> list[dict]:
    """Crawl a page that requires JavaScript rendering.

    Launches headless Chromium, blocks heavy static assets, scrolls to
    trigger lazy/infinite loading, then extracts ``.product-item``
    titles and prices. Returns a list of ``{"title", "price"}`` dicts.
    """
    results: list[dict] = []
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            # Block images/fonts/css to speed up page loads.
            page.route("**/*.{png,jpg,jpeg,gif,css,woff}", lambda route: route.abort())
            page.goto(url, wait_until="networkidle")
            # Scroll down repeatedly so infinite-scroll content loads.
            for _ in range(5):
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                time.sleep(1)
            # Extract data; selectors may be absent, hence the None guards.
            for item in page.query_selector_all(".product-item"):
                title = item.query_selector(".title")
                price = item.query_selector(".price")
                results.append({
                    "title": title.inner_text() if title else "",
                    "price": price.inner_text() if price else "",
                })
        finally:
            # Fix: the original leaked the browser when any step raised —
            # close it unconditionally.
            browser.close()
    return results
# 비동기 Playwright
from playwright.async_api import async_playwright
import asyncio
async def crawl_async(url: str) -> str:
    """Fetch *url* with async headless Chromium and return the rendered HTML.

    Waits up to 10 s for a ``.content`` element to appear before reading
    the page source; raises playwright's TimeoutError otherwise.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)
            await page.wait_for_selector(".content", timeout=10000)
            return await page.content()
        finally:
            # Fix: the original skipped browser.close() when goto/wait raised.
            await browser.close()
비동기 크롤링 — httpx + asyncio
import asyncio
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import logging
# Module-wide logging: INFO level so crawl progress and failures are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class AsyncCrawler:
    """Concurrent book-page crawler.

    Concurrency is capped by a semaphore, and a per-request delay is
    slept after each successful fetch as crawling etiquette.
    """

    def __init__(self, max_concurrent: int = 10, delay: float = 0.5):
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.delay = delay
        self.session = None  # httpx.AsyncClient, bound inside crawl_pages()

    async def fetch(self, url: str) -> str | None:
        """GET *url* under the concurrency cap; None on any failure."""
        async with self.semaphore:
            try:
                resp = await self.session.get(url)
                resp.raise_for_status()
                await asyncio.sleep(self.delay)
                return resp.text
            except Exception as e:
                logger.error(f"실패: {url} — {e}")
                return None

    async def parse_book(self, html: str, base_url: str) -> dict | None:
        """Extract title/price/stock from a detail page; None if missing."""
        if not html:
            return None
        soup = BeautifulSoup(html, "lxml")
        try:
            extracted = {
                "title": soup.select_one("h1").text.strip(),
                "price": soup.select_one("p.price_color").text.strip(),
                "stock": soup.select_one("p.availability").text.strip(),
            }
        except Exception:
            # Any selector returning None raises AttributeError above.
            return None
        return extracted

    async def crawl_pages(self, urls: list[str]) -> list[dict]:
        """Fetch all *urls* concurrently and return the parsed books."""
        async with httpx.AsyncClient(
            headers={"User-Agent": "AsyncCrawler/1.0"},
            timeout=15.0,
            follow_redirects=True,
        ) as self.session:
            pages = await asyncio.gather(*(self.fetch(u) for u in urls))
            collected = []
            for page_html, page_url in zip(pages, urls):
                book = await self.parse_book(page_html, page_url)
                if book:
                    collected.append(book)
            return collected
# Usage: crawl catalogue pages 1..5 with at most 5 concurrent requests.
urls = [
    f"https://books.toscrape.com/catalogue/page-{i}.html" for i in range(1, 6)
]
crawler = AsyncCrawler(max_concurrent=5, delay=0.5)
books = asyncio.run(crawler.crawl_pages(urls))
print("수집 완료: {}권".format(len(books)))
데이터 저장
import pandas as pd
import json
# Save as CSV (utf-8-sig BOM so Excel opens non-ASCII text correctly)
df = pd.DataFrame(books)
df.to_csv("books.csv", index=False, encoding="utf-8-sig")

# Save as JSON, keeping non-ASCII characters readable
with open("books.json", "w", encoding="utf-8") as f:
    json.dump(books, f, ensure_ascii=False, indent=2)

# Save into SQLite, replacing any previous table
import sqlite3

db = sqlite3.connect("books.db")
df.to_sql("books", db, if_exists="replace", index=False)
db.close()
크롤링 윤리와 주의사항
✅ robots.txt 확인 (크롤링 허용 여부)
✅ 요청 간 지연 추가 (1초 이상 권장)
✅ User-Agent 명시
✅ 사이트 이용약관 확인
❌ 개인정보 수집 금지
❌ 과도한 요청으로 서버 부하 야기
❌ 저작권 보호 콘텐츠 무단 수집
❌ 로그인 우회 또는 접근 제한 무시
정리
| 도구 | 용도 |
|---|---|
| requests + BeautifulSoup | 정적 HTML 페이지 |
| httpx + asyncio | 대규모 비동기 크롤링 |
| Playwright | JS 렌더링 (SPA, React, Vue 등) |
| lxml | 빠른 HTML/XML 파싱 |
대부분의 크롤링은 BeautifulSoup으로 충분, JS 동적 로딩이 있으면 Playwright 사용합니다.