Web Crawling
Practice HTML parsing with BeautifulSoup, JS-rendered pages with Playwright, and async crawling.
Installation
pip install beautifulsoup4 lxml requests
pip install playwright && playwright install chromium
pip install httpx
BeautifulSoup — HTML Parsing
import requests
from bs4 import BeautifulSoup
def fetch_page(url: str, timeout: float = 10) -> BeautifulSoup:
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Absolute URL of the page to download.
        timeout: Per-request timeout in seconds (default 10; previously
            hard-coded).

    Returns:
        The page parsed with the fast ``lxml`` parser.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    # Identify the crawler explicitly — polite, and some sites require it.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MyCrawler/1.0)"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, "lxml")
# Basic selectors
soup = fetch_page("https://books.toscrape.com/")

# CSS selectors (most flexible)
title = soup.select_one("h1").text.strip()

# Show the first five products: name, price and star rating.
for product in soup.select("article.product_pod")[:5]:
    anchor = product.select_one("h3 a")
    name = anchor["title"]
    price = product.select_one("p.price_color").text.strip()
    # The second CSS class on the rating element encodes the star count.
    rating = product.select_one("p.star-rating")["class"][1]
    print(f"{name} | {price} | {rating}")

# find / find_all — iterate every anchor that carries an href attribute.
for link in soup.find_all("a", href=True):
    href = link["href"]
    text = link.text.strip()

# Attribute access — .get() with a default avoids KeyError on missing attrs.
img = soup.find("img")
src = img.get("src", "")
alt = img.get("alt", "")

# Text extraction: keep only paragraphs with non-empty text.
paragraphs = [stripped for p in soup.find_all("p") if (stripped := p.text.strip())]
Pagination Crawling
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin
def crawl_books(base_url: str = "https://books.toscrape.com/") -> list[dict]:
    """Follow the catalogue's 'next' links and collect every listed book.

    Args:
        base_url: Start page of the catalogue.

    Returns:
        One dict per book with ``title``, ``price``, ``rating`` and an
        absolute ``link``.
    """
    books = []
    url = base_url
    while url:
        print(f"Crawling: {url}")
        soup = fetch_page(url)
        # Collect book data from the current listing page.
        for article in soup.select("article.product_pod"):
            anchor = article.select_one("h3 a")
            books.append({
                "title": anchor["title"],
                "price": article.select_one("p.price_color").text.strip(),
                "rating": article.select_one("p.star-rating")["class"][1],
                # hrefs are relative — resolve against the current page URL.
                "link": urljoin(url, anchor["href"]),
            })
        # Next page; None ends the loop after the last page.
        next_btn = soup.select_one("li.next a")
        url = urljoin(url, next_btn["href"]) if next_btn else None
        # Be polite — but only sleep when another request actually follows
        # (the original slept one extra second after the final page).
        if url:
            time.sleep(1)
    return books
# Run the paginated crawl and report how many books were gathered.
books = crawl_books()
total = len(books)
print(f"Total collected: {total} books")
Playwright — JavaScript-Rendered Pages
from playwright.sync_api import sync_playwright
import time
def crawl_dynamic_page(url: str) -> list[dict]:
    """Crawl a page that requires JavaScript rendering.

    Launches headless Chromium, scrolls to trigger lazy/infinite loading,
    then scrapes every ``.product-item`` element.

    Args:
        url: Page to render and scrape.

    Returns:
        Dicts with ``title`` and ``price`` ("" when the element is missing).
    """
    results = []
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            # Block images/styles/fonts — not needed for scraping, much faster.
            page.route("**/*.{png,jpg,jpeg,gif,css,woff}", lambda route: route.abort())
            page.goto(url, wait_until="networkidle")
            # Scroll down repeatedly for infinite-scroll pages.
            for _ in range(5):
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                # Playwright's own wait keeps the driver responsive,
                # unlike blocking the thread with time.sleep().
                page.wait_for_timeout(1000)
            # Extract data
            for item in page.query_selector_all(".product-item"):
                title = item.query_selector(".title")
                price = item.query_selector(".price")
                results.append({
                    "title": title.inner_text() if title else "",
                    "price": price.inner_text() if price else "",
                })
        finally:
            # Always release the browser, even if navigation/scraping fails.
            browser.close()
    return results
# Async Playwright
from playwright.async_api import async_playwright
import asyncio
async def crawl_async(url: str) -> str:
    """Render *url* in headless Chromium and return the full page HTML.

    Waits up to 10 s for a ``.content`` element before snapshotting, so
    JS-rendered markup is included.

    Raises:
        playwright.async_api.TimeoutError: If ``.content`` never appears.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)
            await page.wait_for_selector(".content", timeout=10000)
            return await page.content()
        finally:
            # Close the browser even when goto/wait_for_selector raises —
            # the original leaked it on any exception.
            await browser.close()
Async Crawling — httpx + asyncio
import asyncio
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import logging
# Module-level logging setup: INFO threshold, logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class AsyncCrawler:
    """Concurrent crawler with bounded concurrency and per-request delay."""

    def __init__(self, max_concurrent: int = 10, delay: float = 0.5):
        """
        Args:
            max_concurrent: Maximum number of in-flight requests.
            delay: Seconds to pause after each request (rate limiting).
        """
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.delay = delay
        # Bound inside crawl_pages(); None outside an active crawl.
        self.session: httpx.AsyncClient | None = None

    async def fetch(self, url: str) -> str | None:
        """Download *url*; return its HTML text, or None on any failure."""
        async with self.semaphore:
            try:
                response = await self.session.get(url)
                response.raise_for_status()
                # Sleep while still holding the semaphore slot so the
                # delay actually throttles the overall request rate.
                await asyncio.sleep(self.delay)
                return response.text
            except Exception as e:
                # Best-effort crawl: log and skip failed pages.
                logger.error(f"Failed: {url} — {e}")
                return None

    async def parse_book(self, html: str, base_url: str) -> dict | None:
        """Extract title/price/stock from a book detail page, or None.

        ``base_url`` is currently unused; it is kept for interface
        stability (e.g. resolving relative links later).
        """
        if not html:
            return None
        soup = BeautifulSoup(html, "lxml")
        try:
            return {
                "title": soup.select_one("h1").text.strip(),
                "price": soup.select_one("p.price_color").text.strip(),
                "stock": soup.select_one("p.availability").text.strip(),
            }
        except AttributeError:
            # select_one() returned None — the page layout didn't match.
            # (Narrowed from a blanket `except Exception`.)
            return None

    async def crawl_pages(self, urls: list[str]) -> list[dict]:
        """Fetch all *urls* concurrently and parse each into a book dict."""
        async with httpx.AsyncClient(
            headers={"User-Agent": "AsyncCrawler/1.0"},
            timeout=15.0,
            follow_redirects=True,
        ) as self.session:
            html_list = await asyncio.gather(*[self.fetch(url) for url in urls])
            results = []
            for html, url in zip(html_list, urls):
                book = await self.parse_book(html, url)
                if book:
                    results.append(book)
            return results
# Usage — crawl catalogue pages 1..5 concurrently.
page_numbers = range(1, 6)
urls = [f"https://books.toscrape.com/catalogue/page-{n}.html" for n in page_numbers]
crawler = AsyncCrawler(max_concurrent=5, delay=0.5)
books = asyncio.run(crawler.crawl_pages(urls))
print(f"Collected: {len(books)} books")
Saving Data
import pandas as pd
import json
# Save as CSV ("utf-8-sig" writes a BOM so Excel detects the encoding).
df = pd.DataFrame(books)
df.to_csv("books.csv", index=False, encoding="utf-8-sig")  # Excel-compatible

# Save as JSON (ensure_ascii=False keeps non-ASCII text human-readable).
with open("books.json", "w", encoding="utf-8") as f:
    json.dump(books, f, ensure_ascii=False, indent=2)

# Save to SQLite — close the connection in `finally` so it is released
# even if to_sql() raises (the original leaked it on error).
import sqlite3
conn = sqlite3.connect("books.db")
try:
    df.to_sql("books", conn, if_exists="replace", index=False)
finally:
    conn.close()
Crawling Ethics and Best Practices
✅ Check robots.txt (verify crawling is allowed)
✅ Add delays between requests (1 second or more recommended)
✅ Identify your crawler with a User-Agent
✅ Review the site's terms of service
❌ Do not collect personal information
❌ Do not overload servers with excessive requests
❌ Do not collect copyright-protected content without permission
❌ Do not bypass login or access restrictions
Summary
| Tool | Use Case |
|---|---|
| requests + BeautifulSoup | Static HTML pages |
| httpx + asyncio | Large-scale async crawling |
| Playwright | JS-rendered pages (SPA, React, Vue, etc.) |
| lxml | Fast HTML/XML parsing |
BeautifulSoup is sufficient for most crawling tasks. Use Playwright when dynamic JS loading is required.