Skip to main content

Practical Benchmarking

Measure code performance accurately with timeit and pytest-benchmark, and develop Big-O intuition.


Installation

pip install pytest pytest-benchmark

timeit — Micro Benchmarks

import timeit


# ── Basic usage ─────────────────────────────────────────────
# Option 1: Command line
# python -m timeit "sum(range(1000))"
# python -m timeit -n 10000 -r 5 "'-'.join(str(i) for i in range(100))"

# Option 2: Inline — total wall time for `number` executions of the statement.
t = timeit.timeit("sum(range(1000))", number=10_000)
print(f"10000 runs: {t:.4f}s")
print(f"Per run: {t / 10_000 * 1e6:.2f}µs")

# Option 3: repeat (multiple measurements) — take min() as the stable figure;
# the other rounds mostly measure system noise.
results = timeit.repeat(
    stmt="[x**2 for x in range(100)]",
    repeat=5,        # 5 rounds
    number=10_000,   # 10000 executions per round
)
print(f"Min: {min(results):.4f}s")
print(f"Mean: {sum(results)/len(results):.4f}s")


# Option 4: pass function directly (avoids string compilation/namespace issues)
def target():
    """Return the integers 0..999 in descending order."""
    return sorted(range(1000), reverse=True)


t = timeit.timeit(target, number=10_000)
print(f"{t:.4f}s")

# Option 5: with setup code — setup runs once and is excluded from the timing.
t = timeit.timeit(
    stmt="bisect.bisect_left(data, 500)",
    setup="import bisect; data = list(range(1000))",
    number=1_000_000,
)
print(f"bisect: {t:.4f}s")

pytest-benchmark — Test-Integrated Benchmarking

# tests/test_benchmark.py
import pytest


def bubble_sort(arr):
    """Return a sorted copy of *arr* using bubble sort (O(n^2))."""
    items = arr.copy()
    size = len(items)
    for done in range(size):
        # After `done` passes, the last `done` slots hold their final values.
        for idx in range(size - done - 1):
            if items[idx] > items[idx + 1]:
                items[idx], items[idx + 1] = items[idx + 1], items[idx]
    return items


def insertion_sort(arr):
    """Return a sorted copy of *arr* using insertion sort (O(n^2))."""
    out = arr.copy()
    for pos in range(1, len(out)):
        current = out[pos]
        scan = pos
        # Shift larger elements right until `current`'s slot is found.
        while scan > 0 and out[scan - 1] > current:
            out[scan] = out[scan - 1]
            scan -= 1
        out[scan] = current
    return out


@pytest.fixture
def sample_data():
    """Shared benchmark input: 500 pseudo-random ints in [0, 1000]."""
    import random

    data = [random.randint(0, 1000) for _ in range(500)]
    return data


def test_bubble_sort(benchmark, sample_data):
    """Benchmark bubble_sort and verify the output is actually sorted."""
    expected = sorted(sample_data)
    outcome = benchmark(bubble_sort, sample_data)
    assert outcome == expected


def test_insertion_sort(benchmark, sample_data):
    """Benchmark insertion_sort on the shared fixture data.

    Uses the sample_data fixture like the other sort benchmarks so all
    three are measured against identically-generated input.
    """
    result = benchmark(insertion_sort, sample_data)
    assert result == sorted(sample_data)


def test_builtin_sort(benchmark, sample_data):
    """Baseline benchmark: the C-implemented builtin sorted()."""
    expected = sorted(sample_data)
    outcome = benchmark(sorted, sample_data)
    assert outcome == expected


# pytest tests/ --benchmark-only
# pytest tests/ --benchmark-compare # compare with previous results
# pytest tests/ --benchmark-save=baseline # save results
# pytest tests/ --benchmark-histogram # save histogram

Measuring Big-O Empirically

import timeit
import random


# ── O(n) vs O(n log n) vs O(n²) ─────────────────────────────
def measure_complexity(sizes=None):
    """Print a table contrasting O(n), O(n log n), and O(n^2) growth.

    Args:
        sizes: iterable of input sizes to measure; defaults to
            [100, 500, 1000, 5000, 10000] (backward compatible with the
            original zero-argument call).
    """
    if sizes is None:
        sizes = [100, 500, 1000, 5000, 10000]

    print(f"{'Size':>8} {'O(n) list.count':>16} {'O(n log n) sorted':>18} {'O(n²) bubble':>14}")
    print("-" * 60)

    # O(n²) reference implementation — defined once, hoisted out of the loop.
    def bubble(arr):
        work = arr[:]
        for i in range(len(work)):
            for j in range(len(work) - i - 1):
                if work[j] > work[j + 1]:
                    work[j], work[j + 1] = work[j + 1], work[j]

    for n in sizes:
        data = [random.randint(0, n) for _ in range(n)]

        # O(n): linear scan
        t_linear = timeit.timeit(lambda: data.count(n // 2), number=100)

        # O(n log n): sorting
        t_nlogn = timeit.timeit(lambda: sorted(data), number=100)

        # O(n²): bubble sort — fewer iterations because it is slow by design
        t_n2 = timeit.timeit(lambda: bubble(data), number=10)

        print(f"{n:>8} {t_linear*1000:>14.2f}ms {t_nlogn*1000:>16.2f}ms {t_n2*1000:>12.2f}ms")


measure_complexity()

# ── Search complexity comparison ────────────────────────────
import bisect

def compare_search(n: int = 100_000):
    """Compare membership-test cost: list O(n), set O(1), bisect O(log n).

    Args:
        n: number of elements in the searched collections.
    """
    data_list = list(range(n))
    data_set = set(data_list)
    target = n - 1  # worst case for the linear list scan

    t_list = timeit.timeit(lambda: target in data_list, number=10_000)
    t_set = timeit.timeit(lambda: target in data_set, number=10_000)

    def bisect_contains():
        # bisect_left only locates an insertion point; a real membership
        # test must also verify the element at that index equals the target.
        i = bisect.bisect_left(data_list, target)
        return i < len(data_list) and data_list[i] == target

    t_bisect = timeit.timeit(bisect_contains, number=10_000)

    print(f"list O(n): {t_list:.4f}s")
    print(f"set O(1): {t_set:.6f}s ({t_list/t_set:.0f}x faster)")
    print(f"bisect O(logn): {t_bisect:.6f}s ({t_list/t_bisect:.0f}x faster)")


compare_search()

Memory-Profile-Based Benchmarking

from memory_profiler import memory_usage
import tracemalloc


def benchmark_memory(func, *args, **kwargs):
    """Run func(*args, **kwargs) and report its memory footprint in MB.

    Returns the function's return value.
    """
    # With retval=True, memory_usage returns a 2-tuple (timeline, result),
    # not a flat list — slicing it as a list would leave memory_timeline
    # as a 1-tuple containing the whole timeline and break the formatting.
    memory_timeline, result = memory_usage(
        (func, args, kwargs), interval=0.01, retval=True
    )

    peak_mem = max(memory_timeline) if memory_timeline else 0
    base_mem = memory_timeline[0] if memory_timeline else 0

    print(f"Base memory: {base_mem:.1f} MB")
    print(f"Peak memory: {peak_mem:.1f} MB")
    print(f"Increase: {peak_mem - base_mem:.1f} MB")
    return result


# tracemalloc โ€” track allocation locations
def trace_allocations(func, *args):
    """Run func(*args) under tracemalloc and print the top 5 allocation sites.

    Returns func's result. Tracing is always stopped, even if func raises —
    otherwise tracemalloc would keep slowing the whole process.
    """
    tracemalloc.start()
    try:
        snapshot1 = tracemalloc.take_snapshot()
        result = func(*args)
        snapshot2 = tracemalloc.take_snapshot()
    finally:
        tracemalloc.stop()

    # Snapshots are plain data; comparing them after stop() is fine.
    top_stats = snapshot2.compare_to(snapshot1, "lineno")

    print("\n=== Top 5 Memory Allocations ===")
    for stat in top_stats[:5]:
        print(f" {stat}")

    return result

Performance Regression Tests

# Use performance thresholds to prevent regressions
import pytest
import time


def fast_function(n: int) -> int:
    """Return the sum of the integers in [0, n)."""
    values = range(n)
    return sum(values)


def test_performance_budget():
    """Regression guard: fast_function(1e6) must finish within 100 ms."""
    t0 = time.perf_counter()
    result = fast_function(1_000_000)
    elapsed = time.perf_counter() - t0

    assert result == 499_999_500_000
    assert elapsed < 0.1, f"Performance budget exceeded: {elapsed:.3f}s > 0.1s"


# pytest-benchmark with thresholds
def test_with_benchmark(benchmark):
    """Benchmark fast_function with explicit rounds/warmup via pedantic mode."""
    outcome = benchmark.pedantic(
        fast_function,
        args=(1_000_000,),
        warmup_rounds=2,
        rounds=10,
    )
    assert outcome == 499_999_500_000
    # Threshold idea: benchmark.stats.mean < 0.01 (mean must be under 10ms)

Summary

| Tool | Use Case | Notes |
|------|----------|-------|
| timeit | Simple expressions/functions | Standard library, accurate CPU time |
| pytest-benchmark | Test-integrated | Statistics, comparison, CI integration |
| memory_profiler | Memory consumption | Tracks in MB |
| tracemalloc | Memory allocation source | Standard library |

Benchmarking principles:

  1. Include warmup runs (JIT and cache effects)
  2. Enough iterations for statistical significance
  3. Compare under identical conditions (CPU clock, background processes)
  4. Use input data that resembles real workloads