Skip to main content
Advertisement

Practical Benchmarking

Measure code performance accurately with timeit and pytest-benchmark, and develop Big-O intuition.


Installation

pip install pytest pytest-benchmark

timeit — Micro Benchmarks

import timeit


def target():
    """Deterministic sample workload: sort 0..999 in descending order."""
    return sorted(range(1000), reverse=True)


def main() -> None:
    """Demonstrate the main ways to drive timeit."""
    # ── Basic usage ───────────────────────────────────────────
    # Option 1: Command line
    # python -m timeit "sum(range(1000))"
    # python -m timeit -n 10000 -r 5 "'-'.join(str(i) for i in range(100))"

    # Option 2: Inline
    t = timeit.timeit("sum(range(1000))", number=10_000)
    print(f"10000 runs: {t:.4f}s")
    print(f"Per run: {t / 10_000 * 1e6:.2f}µs")

    # Option 3: repeat (multiple measurements)
    results = timeit.repeat(
        stmt="[x**2 for x in range(100)]",
        repeat=5,        # 5 rounds
        number=10_000,   # 10000 executions per round
    )
    # The minimum of the rounds is the least noise-contaminated estimate.
    print(f"Min: {min(results):.4f}s")
    print(f"Mean: {sum(results)/len(results):.4f}s")

    # Option 4: pass function directly
    t = timeit.timeit(target, number=10_000)
    print(f"{t:.4f}s")

    # Option 5: with setup code (setup runs once and is excluded from timing)
    t = timeit.timeit(
        stmt="bisect.bisect_left(data, 500)",
        setup="import bisect; data = list(range(1000))",
        number=1_000_000,
    )
    print(f"bisect: {t:.4f}s")


# Guarded so importing this module does not trigger the timing runs.
if __name__ == "__main__":
    main()

pytest-benchmark — Test-Integrated Benchmarking

# tests/test_benchmark.py
import pytest


def bubble_sort(arr):
    """Return a sorted copy of *arr* using bubble sort — O(n²) comparisons."""
    arr = arr.copy()  # never mutate the caller's list
    n = len(arr)
    for i in range(n):
        # After pass i, the last i elements are already in final position.
        for j in range(0, n - i - 1):
            if arr[j] > arr[j + 1]:
                arr[j], arr[j + 1] = arr[j + 1], arr[j]
    return arr


def insertion_sort(arr):
    """Return a sorted copy of *arr* using insertion sort — O(n²) worst case."""
    arr = arr.copy()  # never mutate the caller's list
    for i in range(1, len(arr)):
        key = arr[i]
        j = i - 1
        # Shift larger elements right until key's slot is found.
        while j >= 0 and arr[j] > key:
            arr[j + 1] = arr[j]
            j -= 1
        arr[j + 1] = key
    return arr


@pytest.fixture
def sample_data():
    """500 pseudo-random ints in [0, 1000], shared by the benchmark tests."""
    import random
    return [random.randint(0, 1000) for _ in range(500)]


def test_bubble_sort(benchmark, sample_data):
    """Benchmark bubble_sort and verify it actually sorts."""
    result = benchmark(bubble_sort, sample_data)
    assert result == sorted(sample_data)


def test_insertion_sort(benchmark, sample_data):
    """Benchmark insertion_sort and verify it actually sorts.

    Uses the shared sample_data fixture (like the sibling tests) so all
    three benchmarks measure the same input instead of a fresh random list.
    """
    result = benchmark(insertion_sort, sample_data)
    assert result == sorted(sample_data)


def test_builtin_sort(benchmark, sample_data):
    """Benchmark the built-in sorted() as the baseline to beat."""
    result = benchmark(sorted, sample_data)
    assert result == sorted(sample_data)


# pytest tests/ --benchmark-only
# pytest tests/ --benchmark-compare # compare with previous results
# pytest tests/ --benchmark-save=baseline # save results
# pytest tests/ --benchmark-histogram # save histogram

Measuring Big-O Empirically

import timeit
import random


# ── O(n) vs O(n log n) vs O(n²) ─────────────────────────
def measure_complexity():
sizes = [100, 500, 1000, 5000, 10000]

print(f"{'Size':>8} {'O(n) list.count':>16} {'O(n log n) sorted':>18} {'O(n²) bubble':>14}")
print("-" * 60)

for n in sizes:
data = [random.randint(0, n) for _ in range(n)]

# O(n): linear scan
t_linear = timeit.timeit(lambda: data.count(n // 2), number=100)

# O(n log n): sorting
t_nlogn = timeit.timeit(lambda: sorted(data), number=100)

# O(n²): bubble sort
def bubble(arr):
arr = arr[:]
for i in range(len(arr)):
for j in range(len(arr) - i - 1):
if arr[j] > arr[j+1]:
arr[j], arr[j+1] = arr[j+1], arr[j]
t_n2 = timeit.timeit(lambda: bubble(data), number=10)

print(f"{n:>8} {t_linear*1000:>14.2f}ms {t_nlogn*1000:>16.2f}ms {t_n2*1000:>12.2f}ms")


measure_complexity()

# ── Search complexity comparison ──────────────────────────
import bisect


def compare_search(n: int = 100_000):
    """Compare membership-test cost: list O(n), set O(1), bisect O(log n).

    Prints each timing plus the speedup relative to the linear list scan.
    """
    data_list = list(range(n))
    data_set = set(data_list)
    target = n - 1  # last element: worst case for the linear scan

    t_list = timeit.timeit(lambda: target in data_list, number=10_000)
    t_set = timeit.timeit(lambda: target in data_set, number=10_000)
    t_bisect = timeit.timeit(
        lambda: bisect.bisect_left(data_list, target) < len(data_list),
        number=10_000,
    )

    print(f"list O(n): {t_list:.4f}s")
    print(f"set O(1): {t_set:.6f}s ({t_list/t_set:.0f}x faster)")
    print(f"bisect O(logn): {t_bisect:.6f}s ({t_list/t_bisect:.0f}x faster)")


# Guarded: 10k linear scans of a 100k-element list must not run on import.
if __name__ == "__main__":
    compare_search()

Memory-Profile-Based Benchmarking

from memory_profiler import memory_usage
import tracemalloc


def benchmark_memory(func, *args, **kwargs):
    """Run *func* under memory_profiler and report base/peak memory in MB.

    Returns whatever *func* returns.
    """
    # With retval=True, memory_usage returns a (timeline, return_value)
    # 2-tuple — unpack it directly.  (The original sliced it like a flat
    # list, which left memory_timeline as a 1-tuple wrapping the list and
    # broke the max()/formatting below.)
    memory_timeline, result = memory_usage(
        (func, args, kwargs), interval=0.01, retval=True
    )

    peak_mem = max(memory_timeline) if memory_timeline else 0
    base_mem = memory_timeline[0] if memory_timeline else 0

    print(f"Base memory: {base_mem:.1f} MB")
    print(f"Peak memory: {peak_mem:.1f} MB")
    print(f"Increase: {peak_mem - base_mem:.1f} MB")
    return result


# tracemalloc — track allocation locations
def trace_allocations(func, *args):
tracemalloc.start()
snapshot1 = tracemalloc.take_snapshot()

result = func(*args)

snapshot2 = tracemalloc.take_snapshot()
top_stats = snapshot2.compare_to(snapshot1, "lineno")

print("\n=== Top 5 Memory Allocations ===")
for stat in top_stats[:5]:
print(f" {stat}")

tracemalloc.stop()
return result

Performance Regression Tests

# Use performance thresholds to prevent regressions
import pytest
import time


def fast_function(n: int) -> int:
    """Return the sum 0 + 1 + ... + (n - 1)."""
    return sum(range(n))


def test_performance_budget():
    """Performance budget: must complete within 100ms."""
    # perf_counter is monotonic and high-resolution — right tool for elapsed time.
    start = time.perf_counter()
    result = fast_function(1_000_000)
    elapsed = time.perf_counter() - start

    assert result == 499_999_500_000
    assert elapsed < 0.1, f"Performance budget exceeded: {elapsed:.3f}s > 0.1s"


# pytest-benchmark with thresholds
def test_with_benchmark(benchmark):
result = benchmark.pedantic(
fast_function,
args=(1_000_000,),
rounds=10,
warmup_rounds=2,
)
assert result == 499_999_500_000
# benchmark.stats.mean < 0.01 # mean must be under 10ms

Summary

| Tool | Use Case | Notes |
|------|----------|-------|
| timeit | Simple expressions/functions | Standard library, accurate CPU time |
| pytest-benchmark | Test-integrated | Statistics, comparison, CI integration |
| memory_profiler | Memory consumption | Tracks in MB |
| tracemalloc | Memory allocation source | Standard library |

Benchmarking principles:

  1. Include warmup runs (to amortize cache effects — and JIT compilation on interpreters that have one)
  2. Enough iterations for statistical significance
  3. Compare under identical conditions (CPU clock, background processes)
  4. Use input data that resembles real workloads
Advertisement