
Benchmarking & Profiling — testing.B and pprof

Go has benchmarking and profiling tools built into its standard library. They provide a complete workflow for measuring code performance and finding bottlenecks.


testing.B — Benchmark Basics

Benchmark functions start with Benchmark and take *testing.B as a parameter.

// string_bench_test.go
package main

import (
"strings"
"testing"
)

// Compare string concatenation methods
// BenchmarkStringConcat measures naive += concatenation. Each append
// allocates a brand-new string, so building N pieces is quadratic.
func BenchmarkStringConcat(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		result := ""
		for n := 0; n < 100; n++ {
			result = result + "hello"
		}
	}
}

// BenchmarkStringBuilder measures strings.Builder, which appends into a
// growable internal buffer instead of reallocating on every write.
func BenchmarkStringBuilder(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		var builder strings.Builder
		for n := 0; n < 100; n++ {
			builder.WriteString("hello")
		}
		_ = builder.String()
	}
}

// BenchmarkStringJoin measures strings.Join over a pre-built slice.
// The slice is constructed once, before the timer is reset, so only the
// Join call is measured.
func BenchmarkStringJoin(b *testing.B) {
	parts := make([]string, 100)
	for i := 0; i < len(parts); i++ {
		parts[i] = "hello"
	}

	b.ResetTimer() // setup above is excluded from the measurement
	for iter := 0; iter < b.N; iter++ {
		_ = strings.Join(parts, "")
	}
}

Benchmark Execution Commands

# Run all benchmarks
go test -bench=. ./...

# Run specific pattern only
go test -bench=BenchmarkString ./...

# Run benchmarks with unit tests
go test -bench=. -run=TestXxx ./...

# Skip unit tests, run benchmarks only
go test -bench=. -run=^$ ./...

# Control run duration / iteration count
go test -bench=. -benchtime=5s ./... # 5 seconds
go test -bench=. -benchtime=1000x ./... # 1000 iterations

# Include memory allocation info
go test -bench=. -benchmem ./...

# Multi-core benchmarks
go test -bench=. -cpu=1,2,4,8 ./...

Reading Benchmark Results

BenchmarkStringConcat-8      9381    124589 ns/op    5616 B/op    99 allocs/op
BenchmarkStringBuilder-8 532780 2251 ns/op 2688 B/op 7 allocs/op
BenchmarkStringJoin-8 659847 1820 ns/op 2048 B/op 2 allocs/op
Column         Meaning
-8             GOMAXPROCS (CPU count)
9381           Iterations (b.N)
124589 ns/op   Nanoseconds per operation
5616 B/op      Bytes allocated per operation (-benchmem)
99 allocs/op   Heap allocations per operation (-benchmem)

Table-Driven Benchmarks

// BenchmarkSearch runs linearSearch as a sub-benchmark for several
// input sizes via b.Run, producing one result line per size.
func BenchmarkSearch(b *testing.B) {
	for _, n := range []int{10, 100, 1000, 10000} {
		b.Run(fmt.Sprintf("size=%d", n), func(b *testing.B) {
			input := make([]int, n)
			for i := 0; i < n; i++ {
				input[i] = i
			}
			needle := n / 2 // worst case is ~n/2 comparisons

			b.ResetTimer() // exclude slice construction from timing
			for iter := 0; iter < b.N; iter++ {
				linearSearch(input, needle)
			}
		})
	}
}

// linearSearch returns the index of the first occurrence of target in
// data, or -1 when target is absent. O(len(data)).
func linearSearch(data []int, target int) int {
	for idx := 0; idx < len(data); idx++ {
		if data[idx] == target {
			return idx
		}
	}
	return -1
}
BenchmarkSearch/size=10-8       100000000     11.2 ns/op
BenchmarkSearch/size=100-8 20000000 63.4 ns/op
BenchmarkSearch/size=1000-8 2000000 612.3 ns/op
BenchmarkSearch/size=10000-8 200000 6092.1 ns/op

b.ResetTimer / b.StopTimer / b.StartTimer

// BenchmarkWithSetup builds its fixture once, then resets the timer so
// that only process() contributes to the measured time.
func BenchmarkWithSetup(b *testing.B) {
	// Fixture construction must not count toward the measurement.
	fixture := generateLargeData(10000)

	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		process(fixture)
	}
}

// BenchmarkWithPerIterSetup needs a fresh copy of the input on every
// iteration (mutate destroys it), so the timer is paused around the copy
// and only the mutate call is measured.
func BenchmarkWithPerIterSetup(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		b.StopTimer()
		fresh := copyData(input) // untimed per-iteration setup
		b.StartTimer()

		mutate(fresh)
	}
}

Memory Allocation Optimization Benchmarks

// map_bench_test.go
package main

import "testing"

// Pre-allocation vs dynamic growth
// BenchmarkMapNoPre inserts 1000 keys into a map created without a size
// hint, forcing incremental bucket growth during insertion.
func BenchmarkMapNoPre(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		table := make(map[int]int)
		for k := 0; k < 1000; k++ {
			table[k] = k
		}
	}
}

// BenchmarkMapWithPre inserts 1000 keys into a map sized up front, so
// no rehashing/growth occurs during insertion.
func BenchmarkMapWithPre(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		table := make(map[int]int, 1000) // capacity hint matches final size
		for k := 0; k < 1000; k++ {
			table[k] = k
		}
	}
}

// Slice growth comparison
// BenchmarkSliceAppendNoPre appends 1000 elements starting from an
// empty slice, so append must repeatedly grow and copy the backing array.
func BenchmarkSliceAppendNoPre(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		var out []int
		for v := 0; v < 1000; v++ {
			out = append(out, v)
		}
	}
}

// BenchmarkSliceAppendWithPre appends 1000 elements into a slice whose
// capacity was reserved up front — a single allocation, no regrowth.
func BenchmarkSliceAppendWithPre(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		out := make([]int, 0, 1000) // capacity reserved in advance
		for v := 0; v < 1000; v++ {
			out = append(out, v)
		}
	}
}
BenchmarkMapNoPre-8         5000    312451 ns/op    86516 B/op    65 allocs/op
BenchmarkMapWithPre-8 8000 148230 ns/op 41040 B/op 8 allocs/op
BenchmarkSliceAppendNoPre-8 300000 4521 ns/op 25208 B/op 12 allocs/op
BenchmarkSliceAppendWithPre-8 500000 2312 ns/op 8192 B/op 1 allocs/op

pprof — CPU & Memory Profiling

Generating Profiles from Benchmarks

# CPU profile
go test -bench=. -cpuprofile=cpu.prof ./...

# Memory profile
go test -bench=. -memprofile=mem.prof ./...

# Both
go test -bench=. -cpuprofile=cpu.prof -memprofile=mem.prof ./...

Interactive pprof Analysis

# Analyze CPU profile
go tool pprof cpu.prof

# Key commands
(pprof) top # top CPU-consuming functions
(pprof) top -cum # top cumulative CPU usage
(pprof) list FuncName # line-by-line analysis of function
(pprof) web # visualize graph in browser
(pprof) png # save as PNG image

# Memory profile
go tool pprof -alloc_space mem.prof # total allocated memory
go tool pprof -inuse_space mem.prof # currently in-use memory

Integrate pprof in HTTP Server

package main

import (
"log"
"net/http"
_ "net/http/pprof" // automatically registers /debug/pprof/ handlers
)

// main starts the pprof endpoint on :6060 in the background, then runs
// the real application.
func main() {
	// The blank import of net/http/pprof registers its handlers on the
	// default mux, so ListenAndServe with a nil handler serves /debug/pprof/.
	go func() {
		// ListenAndServe only returns on failure, and its error is
		// always non-nil, so this logs exactly when the server exits.
		if err := http.ListenAndServe(":6060", nil); err != nil {
			log.Println(err)
		}
	}()

	// Real application code
	startServer()
}
# Collect 30 seconds of CPU profile from running server
go tool pprof 'http://localhost:6060/debug/pprof/profile?seconds=30'

# Heap memory profile
go tool pprof http://localhost:6060/debug/pprof/heap

# Goroutine dump
curl 'http://localhost:6060/debug/pprof/goroutine?debug=1'

# Real-time web UI (Go 1.10+)
go tool pprof -http=:8080 'http://localhost:6060/debug/pprof/profile?seconds=30'

trace — Execution Tracing

# Collect trace
go test -bench=. -trace=trace.out ./...

# Analyze trace (in browser)
go tool trace trace.out

The trace tool shows:

  • Goroutine creation/termination/wait timeline
  • Garbage collection (GC) events and duration
  • Processor (P) workload distribution
  • Network/syscall wait time

benchstat — Statistical Comparison

Compare two benchmark results statistically.

go install golang.org/x/perf/cmd/benchstat@latest

# Measure before changes
go test -bench=. -count=10 ./... > before.txt

# Measure after changes
go test -bench=. -count=10 ./... > after.txt

# Statistical comparison (includes p-value)
benchstat before.txt after.txt
name              old time/op    new time/op    delta
StringConcat-8 124µs ± 1% 2.3µs ± 2% -98.15% (p=0.000 n=10+10)

name old alloc/op new alloc/op delta
StringConcat-8 5.62kB ± 0% 2.69kB ± 0% -52.14% (p=0.000 n=10+10)
  • p=0.000: statistically significant difference (p < 0.05)
  • -98.15%: 98% performance improvement

Real-World Benchmark Example — JSON Serialization

package main

import (
"encoding/json"
"testing"

"github.com/bytedance/sonic"
gojson "github.com/goccy/go-json" // package name is "json"; alias avoids the stdlib clash
)

// User is the payload serialized by the three JSON benchmarks below.
// The struct tags fix the JSON key names; all three libraries honor them.
type User struct {
ID int `json:"id"`
Name string `json:"name"`
Email string `json:"email"`
Age int `json:"age"`
}

var testUser = User{ID: 1, Name: "Kim Golang", Email: "go@example.com", Age: 30}

// BenchmarkJSONMarshalStd measures the standard library encoding/json.
func BenchmarkJSONMarshalStd(b *testing.B) {
	b.ReportAllocs() // report allocations even without -benchmem
	for iter := 0; iter < b.N; iter++ {
		if _, err := json.Marshal(testUser); err != nil {
			b.Fatal(err)
		}
	}
}

// BenchmarkJSONMarshalGoJson measures goccy/go-json, a drop-in
// replacement for encoding/json.
func BenchmarkJSONMarshalGoJson(b *testing.B) {
	b.ReportAllocs() // report allocations even without -benchmem
	for iter := 0; iter < b.N; iter++ {
		if _, err := gojson.Marshal(testUser); err != nil {
			b.Fatal(err)
		}
	}
}

// BenchmarkJSONMarshalSonic measures bytedance/sonic, a JIT-based JSON
// library with the same Marshal signature as encoding/json.
func BenchmarkJSONMarshalSonic(b *testing.B) {
	b.ReportAllocs() // report allocations even without -benchmem
	for iter := 0; iter < b.N; iter++ {
		if _, err := sonic.Marshal(testUser); err != nil {
			b.Fatal(err)
		}
	}
}

Benchmark Writing Rules & Anti-Patterns

// ❌ Compiler optimization can make results meaningless
// BenchmarkBad: deliberately flawed example — the result is never
// observed outside the loop, so nothing forces the compiler to keep
// the compute call.
func BenchmarkBad(b *testing.B) {
for i := 0; i < b.N; i++ {
result := compute(42) // compiler might remove unused result
_ = result // this is not enough
}
}

// ✅ Use sink variable to prevent optimization
var globalSink int

// BenchmarkGood keeps each result in a local, then publishes the final
// value to a package-level variable; that observable write prevents the
// compiler from eliminating the measured work.
func BenchmarkGood(b *testing.B) {
	var local int
	for iter := 0; iter < b.N; iter++ {
		local = compute(42)
	}
	globalSink = local // observable side effect defeats dead-code elimination
}

Important Notes

// ❌ Putting measured code outside b.N loop
// BenchmarkWrong: deliberately flawed example — the expensive call runs
// once before the loop, so the timed region measures b.N no-op
// assignments instead of the computation.
func BenchmarkWrong(b *testing.B) {
result := expensiveCompute() // only runs once — benchmark meaningless
_ = result
for i := 0; i < b.N; i++ {
_ = result
}
}

// ✅ Correct structure
// BenchmarkRight calls the measured function inside the b.N loop, so
// every timed iteration performs the work being benchmarked.
func BenchmarkRight(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		_ = expensiveCompute() // runs once per iteration
	}
}

Key Summary

Tool               Purpose
go test -bench=.   Run benchmarks
-benchmem          Measure memory allocation
-cpuprofile=f      Save CPU profile
-memprofile=f      Save memory profile
go tool pprof      Analyze profiles
go tool trace      Analyze execution traces
benchstat          Statistical comparison
  • b.N is determined by Go runtime — never set manually
  • Use b.ResetTimer() to exclude setup time
  • Use -benchmem to identify if memory allocation is the bottleneck
  • Use benchstat to verify statistical significance with p-values