Web Crawling
Practice HTML parsing with BeautifulSoup, JS-rendered pages with Playwright, and async crawling.
Installation
pip install beautifulsoup4 lxml requests
pip install playwright && playwright install chromium
pip install httpx
BeautifulSoup — HTML Parsing
import requests
from bs4 import BeautifulSoup
def fetch_page(url: str, timeout: float = 10) -> BeautifulSoup:
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Absolute URL of the page to download.
        timeout: Per-request timeout in seconds (default 10; previously
            hard-coded).

    Returns:
        The page parsed with the fast ``lxml`` parser.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    # Identify the crawler explicitly — polite, and some sites require it.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MyCrawler/1.0)"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, "lxml")
# Basic selectors
soup = fetch_page("https://books.toscrape.com/")

# CSS selectors (most flexible)
title = soup.select_one("h1").text.strip()

# Show the first five products: name, price and star rating.
for product in soup.select("article.product_pod")[:5]:
    anchor = product.select_one("h3 a")
    name = anchor["title"]
    price = product.select_one("p.price_color").text.strip()
    # The second CSS class on the rating element encodes the star count.
    rating = product.select_one("p.star-rating")["class"][1]
    print(f"{name} | {price} | {rating}")

# find / find_all — iterate every anchor that carries an href attribute.
for link in soup.find_all("a", href=True):
    href = link["href"]
    text = link.text.strip()

# Attribute access — .get() with a default avoids KeyError on missing attrs.
img = soup.find("img")
src = img.get("src", "")
alt = img.get("alt", "")

# Text extraction: keep only paragraphs with non-empty text.
paragraphs = [stripped for p in soup.find_all("p") if (stripped := p.text.strip())]
Pagination Crawling
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin
def crawl_books(base_url: str = "https://books.toscrape.com/") -> list[dict]:
    """Follow the catalogue's 'next' links and collect every listed book.

    Args:
        base_url: Start page of the catalogue.

    Returns:
        One dict per book with ``title``, ``price``, ``rating`` and an
        absolute ``link``.
    """
    books = []
    url = base_url
    while url:
        print(f"Crawling: {url}")
        soup = fetch_page(url)
        # Collect book data from the current listing page.
        for article in soup.select("article.product_pod"):
            anchor = article.select_one("h3 a")
            books.append({
                "title": anchor["title"],
                "price": article.select_one("p.price_color").text.strip(),
                "rating": article.select_one("p.star-rating")["class"][1],
                # hrefs are relative — resolve against the current page URL.
                "link": urljoin(url, anchor["href"]),
            })
        # Next page; None ends the loop after the last page.
        next_btn = soup.select_one("li.next a")
        url = urljoin(url, next_btn["href"]) if next_btn else None
        # Be polite — but only sleep when another request actually follows
        # (the original slept one extra second after the final page).
        if url:
            time.sleep(1)
    return books
# Run the paginated crawl and report how many books were gathered.
books = crawl_books()
total = len(books)
print(f"Total collected: {total} books")
Playwright — JavaScript-Rendered Pages
from playwright.sync_api import sync_playwright
import time
def crawl_dynamic_page(url: str) -> list[dict]:
    """Crawl a page that requires JavaScript rendering.

    Launches headless Chromium, scrolls to trigger lazy/infinite loading,
    then scrapes every ``.product-item`` element.

    Args:
        url: Page to render and scrape.

    Returns:
        Dicts with ``title`` and ``price`` ("" when the element is missing).
    """
    results = []
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            # Block images/styles/fonts — not needed for scraping, much faster.
            page.route("**/*.{png,jpg,jpeg,gif,css,woff}", lambda route: route.abort())
            page.goto(url, wait_until="networkidle")
            # Scroll down repeatedly for infinite-scroll pages.
            for _ in range(5):
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                # Playwright's own wait keeps the driver responsive,
                # unlike blocking the thread with time.sleep().
                page.wait_for_timeout(1000)
            # Extract data
            for item in page.query_selector_all(".product-item"):
                title = item.query_selector(".title")
                price = item.query_selector(".price")
                results.append({
                    "title": title.inner_text() if title else "",
                    "price": price.inner_text() if price else "",
                })
        finally:
            # Always release the browser, even if navigation/scraping fails.
            browser.close()
    return results
# Async Playwright
from playwright.async_api import async_playwright
import asyncio
async def crawl_async(url: str) -> str:
    """Render *url* in headless Chromium and return the full page HTML.

    Waits up to 10 s for a ``.content`` element before snapshotting, so
    JS-rendered markup is included.

    Raises:
        playwright.async_api.TimeoutError: If ``.content`` never appears.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)
            await page.wait_for_selector(".content", timeout=10000)
            return await page.content()
        finally:
            # Close the browser even when goto/wait_for_selector raises —
            # the original leaked it on any exception.
            await browser.close()
Async Crawling — httpx + asyncio
import asyncio
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import logging
# Module-level logging setup: INFO threshold, logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class AsyncCrawler:
    """Concurrent crawler with bounded concurrency and per-request delay."""

    def __init__(self, max_concurrent: int = 10, delay: float = 0.5):
        """
        Args:
            max_concurrent: Maximum number of in-flight requests.
            delay: Seconds to pause after each request (rate limiting).
        """
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.delay = delay
        # Bound inside crawl_pages(); None outside an active crawl.
        self.session: httpx.AsyncClient | None = None

    async def fetch(self, url: str) -> str | None:
        """Download *url*; return its HTML text, or None on any failure."""
        async with self.semaphore:
            try:
                response = await self.session.get(url)
                response.raise_for_status()
                # Sleep while still holding the semaphore slot so the
                # delay actually throttles the overall request rate.
                await asyncio.sleep(self.delay)
                return response.text
            except Exception as e:
                # Best-effort crawl: log and skip failed pages.
                logger.error(f"Failed: {url} — {e}")
                return None

    async def parse_book(self, html: str, base_url: str) -> dict | None:
        """Extract title/price/stock from a book detail page, or None.

        ``base_url`` is currently unused; it is kept for interface
        stability (e.g. resolving relative links later).
        """
        if not html:
            return None
        soup = BeautifulSoup(html, "lxml")
        try:
            return {
                "title": soup.select_one("h1").text.strip(),
                "price": soup.select_one("p.price_color").text.strip(),
                "stock": soup.select_one("p.availability").text.strip(),
            }
        except AttributeError:
            # select_one() returned None — the page layout didn't match.
            # (Narrowed from a blanket `except Exception`.)
            return None

    async def crawl_pages(self, urls: list[str]) -> list[dict]:
        """Fetch all *urls* concurrently and parse each into a book dict."""
        async with httpx.AsyncClient(
            headers={"User-Agent": "AsyncCrawler/1.0"},
            timeout=15.0,
            follow_redirects=True,
        ) as self.session:
            html_list = await asyncio.gather(*[self.fetch(url) for url in urls])
            results = []
            for html, url in zip(html_list, urls):
                book = await self.parse_book(html, url)
                if book:
                    results.append(book)
            return results
# Usage — crawl catalogue pages 1..5 concurrently.
page_numbers = range(1, 6)
urls = [f"https://books.toscrape.com/catalogue/page-{n}.html" for n in page_numbers]
crawler = AsyncCrawler(max_concurrent=5, delay=0.5)
books = asyncio.run(crawler.crawl_pages(urls))
print(f"Collected: {len(books)} books")
Saving Data
import pandas as pd
import json
# Save as CSV ("utf-8-sig" writes a BOM so Excel detects the encoding).
df = pd.DataFrame(books)
df.to_csv("books.csv", index=False, encoding="utf-8-sig")  # Excel-compatible

# Save as JSON (ensure_ascii=False keeps non-ASCII text human-readable).
with open("books.json", "w", encoding="utf-8") as f:
    json.dump(books, f, ensure_ascii=False, indent=2)

# Save to SQLite — close the connection in `finally` so it is released
# even if to_sql() raises (the original leaked it on error).
import sqlite3
conn = sqlite3.connect("books.db")
try:
    df.to_sql("books", conn, if_exists="replace", index=False)
finally:
    conn.close()
Crawling Ethics and Best Practices
✅ Check robots.txt (verify crawling is allowed)
✅ Add delays between requests (1 second or more recommended)
✅ Identify your crawler with a User-Agent
✅ Review the site's terms of service
❌ Do not collect personal information
❌ Do not overload servers with excessive requests
❌ Do not collect copyright-protected content without permission
❌ Do not bypass login or access restrictions
Summary
| Tool | Use Case |
|---|---|
| requests + BeautifulSoup | Static HTML pages |
| httpx + asyncio | Large-scale async crawling |
| Playwright | JS-rendered pages (SPA, React, Vue, etc.) |
| lxml | Fast HTML/XML parsing |
BeautifulSoup is sufficient for most crawling tasks. Use Playwright when dynamic JS loading is required.