웹 크롤링
BeautifulSoup으로 HTML 파싱, Playwright로 JS 렌더링 페이지, 비동기 크롤링을 실습합니다.
설치
pip install beautifulsoup4 lxml requests
pip install playwright && playwright install chromium
pip install httpx
BeautifulSoup — HTML 파싱
import requests
from bs4 import BeautifulSoup
def fetch_page(url: str) -> BeautifulSoup:
    """Download *url* and return its HTML as an lxml-backed soup.

    Sends a polite, identifiable User-Agent and raises
    ``requests.HTTPError`` for non-2xx responses.
    """
    ua = {"User-Agent": "Mozilla/5.0 (compatible; MyCrawler/1.0)"}
    resp = requests.get(url, headers=ua, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "lxml")
# Basic selectors: fetch the demo page once and query it below.
soup = fetch_page("https://books.toscrape.com/")

# CSS selectors (the most flexible query API)
title = soup.select_one("h1").text.strip()
products = soup.select("article.product_pod")
for card in products[:5]:
    fields = (
        card.select_one("h3 a")["title"],
        card.select_one("p.price_color").text.strip(),
        # rating is encoded as the second CSS class, e.g. "star-rating Three"
        card.select_one("p.star-rating")["class"][1],
    )
    print("{} | {} | {}".format(*fields))

# find / find_all
links = soup.find_all("a", href=True)
for anchor in links:
    href = anchor["href"]
    text = anchor.text.strip()

# Attribute access with defaults
img = soup.find("img")
src = img.get("src", "")
alt = img.get("alt", "")

# Text extraction: keep only non-empty paragraphs
paragraphs = [p.text.strip() for p in soup.find_all("p") if p.text.strip()]
페이지네이션 크롤링
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin
def crawl_books(
    base_url: str = "https://books.toscrape.com/",
    max_pages: int | None = None,
    delay: float = 1.0,
) -> list[dict]:
    """Crawl the paginated book catalogue starting at *base_url*.

    Follows the "next" link until it disappears, or until *max_pages*
    pages have been fetched (``None`` means no limit — the original
    behavior). *delay* seconds are slept between page requests as
    crawling etiquette.

    Returns one dict per book with title, price, rating class and an
    absolute detail-page link.
    """
    books: list[dict] = []
    url: str | None = base_url
    pages_fetched = 0
    while url and (max_pages is None or pages_fetched < max_pages):
        print(f"크롤링: {url}")
        soup = fetch_page(url)
        pages_fetched += 1
        for article in soup.select("article.product_pod"):
            # The anchor holds both the title attribute and the relative link.
            anchor = article.select_one("h3 a")
            books.append({
                "title": anchor["title"],
                "price": article.select_one("p.price_color").text.strip(),
                # second CSS class encodes the rating, e.g. "Three"
                "rating": article.select_one("p.star-rating")["class"][1],
                "link": urljoin(url, anchor["href"]),
            })
        # Follow pagination; relative hrefs are resolved against the current page.
        next_btn = soup.select_one("li.next a")
        url = urljoin(url, next_btn["href"]) if next_btn else None
        time.sleep(delay)  # politeness delay to avoid overloading the server
    return books
# Run the full crawl and report how many books were collected.
books = crawl_books()
print("총 {}권 수집".format(len(books)))
Playwright — JavaScript 렌더링 페이지
from playwright.sync_api import sync_playwright
import time
def crawl_dynamic_page(url: str) -> list[dict]:
    """Crawl a page that requires JavaScript rendering.

    Launches headless Chromium, blocks heavy static assets, scrolls to
    trigger lazy/infinite loading, then extracts ``.product-item``
    titles and prices. Returns a list of ``{"title", "price"}`` dicts.
    """
    results: list[dict] = []
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            # Block images/fonts/css to speed up page loads.
            page.route("**/*.{png,jpg,jpeg,gif,css,woff}", lambda route: route.abort())
            page.goto(url, wait_until="networkidle")
            # Scroll down repeatedly so infinite-scroll content loads.
            for _ in range(5):
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                time.sleep(1)
            # Extract data; selectors may be absent, hence the None guards.
            for item in page.query_selector_all(".product-item"):
                title = item.query_selector(".title")
                price = item.query_selector(".price")
                results.append({
                    "title": title.inner_text() if title else "",
                    "price": price.inner_text() if price else "",
                })
        finally:
            # Fix: the original leaked the browser when any step raised —
            # close it unconditionally.
            browser.close()
    return results
# 비동기 Playwright
from playwright.async_api import async_playwright
import asyncio
async def crawl_async(url: str) -> str:
    """Fetch *url* with async headless Chromium and return the rendered HTML.

    Waits up to 10 s for a ``.content`` element to appear before reading
    the page source; raises playwright's TimeoutError otherwise.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)
            await page.wait_for_selector(".content", timeout=10000)
            return await page.content()
        finally:
            # Fix: the original skipped browser.close() when goto/wait raised.
            await browser.close()
비동기 크롤링 — httpx + asyncio
import asyncio
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import logging
# Module-wide logging: INFO level so crawl progress and failures are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class AsyncCrawler:
    """Concurrent book-page crawler.

    Concurrency is capped by a semaphore, and a per-request delay is
    slept after each successful fetch as crawling etiquette.
    """

    def __init__(self, max_concurrent: int = 10, delay: float = 0.5):
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.delay = delay
        self.session = None  # httpx.AsyncClient, bound inside crawl_pages()

    async def fetch(self, url: str) -> str | None:
        """GET *url* under the concurrency cap; None on any failure."""
        async with self.semaphore:
            try:
                resp = await self.session.get(url)
                resp.raise_for_status()
                await asyncio.sleep(self.delay)
                return resp.text
            except Exception as e:
                logger.error(f"실패: {url} — {e}")
                return None

    async def parse_book(self, html: str, base_url: str) -> dict | None:
        """Extract title/price/stock from a detail page; None if missing."""
        if not html:
            return None
        soup = BeautifulSoup(html, "lxml")
        try:
            extracted = {
                "title": soup.select_one("h1").text.strip(),
                "price": soup.select_one("p.price_color").text.strip(),
                "stock": soup.select_one("p.availability").text.strip(),
            }
        except Exception:
            # Any selector returning None raises AttributeError above.
            return None
        return extracted

    async def crawl_pages(self, urls: list[str]) -> list[dict]:
        """Fetch all *urls* concurrently and return the parsed books."""
        async with httpx.AsyncClient(
            headers={"User-Agent": "AsyncCrawler/1.0"},
            timeout=15.0,
            follow_redirects=True,
        ) as self.session:
            pages = await asyncio.gather(*(self.fetch(u) for u in urls))
            collected = []
            for page_html, page_url in zip(pages, urls):
                book = await self.parse_book(page_html, page_url)
                if book:
                    collected.append(book)
            return collected
# Usage: crawl catalogue pages 1..5 with at most 5 concurrent requests.
urls = [
    f"https://books.toscrape.com/catalogue/page-{i}.html" for i in range(1, 6)
]
crawler = AsyncCrawler(max_concurrent=5, delay=0.5)
books = asyncio.run(crawler.crawl_pages(urls))
print("수집 완료: {}권".format(len(books)))
데이터 저장
import pandas as pd
import json
# Save as CSV (utf-8-sig BOM so Excel opens non-ASCII text correctly)
df = pd.DataFrame(books)
df.to_csv("books.csv", index=False, encoding="utf-8-sig")

# Save as JSON, keeping non-ASCII characters readable
with open("books.json", "w", encoding="utf-8") as f:
    json.dump(books, f, ensure_ascii=False, indent=2)

# Save into SQLite, replacing any previous table
import sqlite3

db = sqlite3.connect("books.db")
df.to_sql("books", db, if_exists="replace", index=False)
db.close()
크롤링 윤리와 주의사항
✅ robots.txt 확인 (크롤링 허용 여부)
✅ 요청 간 지연 추가 (1초 이상 권장)
✅ User-Agent 명시
✅ 사이트 이용약관 확인
❌ 개인정보 수집 금지
❌ 과도한 요청으로 서버 부하 야기
❌ 저작권 보호 콘텐츠 무단 수집
❌ 로그인 우회 또는 접근 제한 무시
정리
| 도구 | 용도 |
|---|---|
| requests + BeautifulSoup | 정적 HTML 페이지 |
| httpx + asyncio | 대규모 비동기 크롤링 |
| Playwright | JS 렌더링 (SPA, React, Vue 등) |
| lxml | 빠른 HTML/XML 파싱 |
대부분의 크롤링은 BeautifulSoup으로 충분, JS 동적 로딩이 있으면 Playwright 사용합니다.