본문으로 건너뛰기
Advertisement

웹 크롤링

BeautifulSoup으로 HTML 파싱, Playwright로 JS 렌더링 페이지, 비동기 크롤링을 실습합니다.


설치

pip install beautifulsoup4 lxml requests
pip install playwright && playwright install chromium
pip install httpx

BeautifulSoup — HTML 파싱

import requests
from bs4 import BeautifulSoup


def fetch_page(url: str) -> BeautifulSoup:
    """Download *url* and return its parsed DOM.

    Sends a descriptive User-Agent, raises on HTTP error statuses,
    and parses the body with the lxml backend.
    """
    resp = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0 (compatible; MyCrawler/1.0)"},
        timeout=10,
    )
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "lxml")


# Basic selection on a statically rendered page.
soup = fetch_page("https://books.toscrape.com/")

# CSS selectors (the most flexible API).
title = soup.select_one("h1").text.strip()
products = soup.select("article.product_pod")

for item in products[:5]:
    name = item.select_one("h3 a")["title"]
    price = item.select_one("p.price_color").text.strip()
    rating = item.select_one("p.star-rating")["class"][1]
    print(f"{name} | {price} | {rating}")

# find / find_all equivalents.
links = soup.find_all("a", href=True)
for link in links:
    href = link["href"]
    text = link.text.strip()

# Attribute access with a default for missing attributes.
img = soup.find("img")
src = img.get("src", "")
alt = img.get("alt", "")

# Collect non-empty paragraph texts.
stripped = (p.text.strip() for p in soup.find_all("p"))
paragraphs = [t for t in stripped if t]

페이지네이션 크롤링

import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin


def crawl_books(base_url: str = "https://books.toscrape.com/",
                delay: float = 1.0) -> list[dict]:
    """Crawl every paginated listing page and collect book metadata.

    Args:
        base_url: First listing page to fetch.
        delay: Seconds to wait between page requests (crawl politeness);
            previously hard-coded to 1 second.

    Returns:
        One dict per book with ``title``, ``price``, ``rating`` and an
        absolute ``link``.
    """
    books: list[dict] = []
    url: str | None = base_url

    while url:
        print(f"크롤링: {url}")
        soup = fetch_page(url)

        # Collect book data from the current listing page.
        for article in soup.select("article.product_pod"):
            anchor = article.select_one("h3 a")
            books.append({
                "title": anchor["title"],
                "price": article.select_one("p.price_color").text.strip(),
                "rating": article.select_one("p.star-rating")["class"][1],
                # hrefs are relative; resolve against the current page URL.
                "link": urljoin(url, anchor["href"]),
            })

        # Follow the "next" button if present.
        next_btn = soup.select_one("li.next a")
        url = urljoin(url, next_btn["href"]) if next_btn else None

        # Throttle only when another request is actually coming
        # (the original slept once more after the last page).
        if url:
            time.sleep(delay)

    return books


# Run the full crawl and report how many books were collected.
books = crawl_books()
print(f"총 {len(books)}권 수집")

Playwright — JavaScript 렌더링 페이지

from playwright.sync_api import sync_playwright
import time


def crawl_dynamic_page(url: str) -> list[dict]:
    """Crawl a page whose content is rendered client-side by JavaScript."""
    scraped: list[dict] = []

    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=True)
        page = browser.new_page()

        # Abort static-asset requests to speed up page loads.
        page.route("**/*.{png,jpg,jpeg,gif,css,woff}", lambda route: route.abort())

        page.goto(url, wait_until="networkidle")

        # Scroll repeatedly to trigger lazy loading on infinite-scroll pages.
        for _ in range(5):
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            time.sleep(1)

        # Pull the rendered product cards out of the DOM.
        for card in page.query_selector_all(".product-item"):
            title_el = card.query_selector(".title")
            price_el = card.query_selector(".price")
            scraped.append({
                "title": title_el.inner_text() if title_el else "",
                "price": price_el.inner_text() if price_el else "",
            })

        browser.close()
    return scraped


# 비동기 Playwright
from playwright.async_api import async_playwright
import asyncio


async def crawl_async(url: str) -> str:
    """Fetch *url* in a headless browser and return the rendered HTML."""
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url)
        # Block until the dynamic content has actually rendered.
        await page.wait_for_selector(".content", timeout=10000)
        html = await page.content()
        await browser.close()
        return html

비동기 크롤링 — httpx + asyncio

import asyncio
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import logging

# Module-level logger so fetch failures are reported instead of silently dropped.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class AsyncCrawler:
    """Concurrent page fetcher with a politeness delay.

    A semaphore caps the number of in-flight requests; each successful
    fetch sleeps ``delay`` seconds before releasing its slot.
    """

    def __init__(self, max_concurrent: int = 10, delay: float = 0.5):
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.delay = delay
        # Bound to an httpx.AsyncClient for the duration of crawl_pages().
        self.session: httpx.AsyncClient | None = None

    async def fetch(self, url: str) -> str | None:
        """Fetch one URL; return its body, or None on any HTTP failure."""
        async with self.semaphore:
            try:
                response = await self.session.get(url)
                response.raise_for_status()
                await asyncio.sleep(self.delay)  # politeness delay per request
                return response.text
            except httpx.HTTPError as e:
                # Lazy %-formatting; the original f-string glued the URL and
                # the error together with no separator, producing unreadable logs.
                logger.error("실패: %s (%s)", url, e)
                return None

    async def parse_book(self, html: str, base_url: str) -> dict | None:
        """Extract title/price/stock from a book detail page, or None."""
        if not html:
            return None
        soup = BeautifulSoup(html, "lxml")
        try:
            return {
                "title": soup.select_one("h1").text.strip(),
                "price": soup.select_one("p.price_color").text.strip(),
                "stock": soup.select_one("p.availability").text.strip(),
            }
        except AttributeError:
            # select_one() returned None: the page lacks the expected markup.
            return None

    async def crawl_pages(self, urls: list[str]) -> list[dict]:
        """Fetch all URLs concurrently, then parse each into a book dict."""
        async with httpx.AsyncClient(
            headers={"User-Agent": "AsyncCrawler/1.0"},
            timeout=15.0,
            follow_redirects=True,
        ) as self.session:
            html_list = await asyncio.gather(*[self.fetch(url) for url in urls])

        # Parsing needs no network, so it can happen after the client closes.
        results = []
        for html, url in zip(html_list, urls):
            book = await self.parse_book(html, url)
            if book:
                results.append(book)
        return results


# Usage: crawl the first five catalogue pages concurrently.
urls = [f"https://books.toscrape.com/catalogue/page-{i}.html" for i in range(1, 6)]
crawler = AsyncCrawler(max_concurrent=5, delay=0.5)
books = asyncio.run(crawler.crawl_pages(urls))
print(f"수집 완료: {len(books)}권")

데이터 저장

import pandas as pd
import json

# CSV — utf-8-sig writes a BOM so Excel auto-detects the encoding.
frame = pd.DataFrame(books)
frame.to_csv("books.csv", index=False, encoding="utf-8-sig")

# JSON — keep non-ASCII text readable and pretty-print.
with open("books.json", "w", encoding="utf-8") as f:
    json.dump(books, f, ensure_ascii=False, indent=2)

# SQLite — the "books" table is replaced on every run.
import sqlite3

conn = sqlite3.connect("books.db")
frame.to_sql("books", conn, if_exists="replace", index=False)
conn.close()

크롤링 윤리와 주의사항

✅ robots.txt 확인 (크롤링 허용 여부)
✅ 요청 간 지연 추가 (1초 이상 권장)
✅ User-Agent 명시
✅ 사이트 이용약관 확인

❌ 개인정보 수집 금지
❌ 과도한 요청으로 서버 부하 야기
❌ 저작권 보호 콘텐츠 무단 수집
❌ 로그인 우회 또는 접근 제한 무시

정리

| 도구 | 용도 |
| --- | --- |
| requests + BeautifulSoup | 정적 HTML 페이지 |
| httpx + asyncio | 대규모 비동기 크롤링 |
| Playwright | JS 렌더링 (SPA, React, Vue 등) |
| lxml | 빠른 HTML/XML 파싱 |

대부분의 크롤링은 BeautifulSoup으로 충분하며, JS 동적 로딩이 있는 페이지에는 Playwright를 사용합니다.

Advertisement