Benchmarking & Profiling — testing.B and pprof
Go has benchmarking and profiling tools built into its standard library. They provide a complete workflow for measuring code performance and finding bottlenecks.
testing.B — Benchmark Basics
Benchmark functions start with Benchmark and take *testing.B as a parameter.
// string_bench_test.go
package main
import (
"strings"
"testing"
)
// Compare string concatenation methods
// BenchmarkStringConcat measures naive += concatenation, which copies
// the entire string and reallocates on every append (quadratic cost).
func BenchmarkStringConcat(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		var result string
		for n := 0; n < 100; n++ {
			result += "hello"
		}
	}
}
// BenchmarkStringBuilder measures strings.Builder, which grows its
// buffer amortized and avoids the repeated copying of naive +=.
func BenchmarkStringBuilder(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		builder := strings.Builder{}
		for n := 0; n < 100; n++ {
			builder.WriteString("hello")
		}
		_ = builder.String()
	}
}
// BenchmarkStringJoin measures strings.Join over a prebuilt slice.
// The slice is constructed once, outside the timed region.
func BenchmarkStringJoin(b *testing.B) {
	words := make([]string, 100)
	for idx := range words {
		words[idx] = "hello"
	}
	b.ResetTimer() // everything above is setup; exclude it from timing
	for iter := 0; iter < b.N; iter++ {
		_ = strings.Join(words, "")
	}
}
Benchmark Execution Commands
# Run all benchmarks
go test -bench=. ./...
# Run specific pattern only
go test -bench=BenchmarkString ./...
# Run benchmarks with unit tests
go test -bench=. -run=TestXxx ./...   # benchmarks plus unit tests matching TestXxx
# Skip unit tests, run benchmarks only
go test -bench=. -run=^$ ./...
# Fixed iterations
go test -bench=. -benchtime=5s ./... # 5 seconds
go test -bench=. -benchtime=1000x ./... # 1000 iterations
# Include memory allocation info
go test -bench=. -benchmem ./...
# Multi-core benchmarks
go test -bench=. -cpu=1,2,4,8 ./...
Reading Benchmark Results
BenchmarkStringConcat-8 9381 124589 ns/op 5616 B/op 99 allocs/op
BenchmarkStringBuilder-8 532780 2251 ns/op 2688 B/op 7 allocs/op
BenchmarkStringJoin-8 659847 1820 ns/op 2048 B/op 2 allocs/op
| Column | Meaning |
|---|---|
| -8 | GOMAXPROCS (CPU count) |
| 9381 | Iterations (b.N) |
| 124589 ns/op | Nanoseconds per operation |
| 5616 B/op | Bytes allocated per operation (-benchmem) |
| 99 allocs/op | Heap allocations per operation (-benchmem) |
Table-Driven Benchmarks
// BenchmarkSearch runs linearSearch as one sub-benchmark per input
// size, producing a separate result line for each size.
func BenchmarkSearch(b *testing.B) {
	for _, n := range []int{10, 100, 1000, 10000} {
		b.Run(fmt.Sprintf("size=%d", n), func(b *testing.B) {
			data := make([]int, n)
			for idx := range data {
				data[idx] = idx
			}
			target := n / 2 // worst case is ~n/2 probes on average
			b.ResetTimer()  // slice construction is setup, not workload
			for iter := 0; iter < b.N; iter++ {
				linearSearch(data, target)
			}
		})
	}
}
// linearSearch returns the index of the first element equal to target,
// or -1 when target is absent. O(n) scan.
func linearSearch(data []int, target int) int {
	for idx := 0; idx < len(data); idx++ {
		if data[idx] == target {
			return idx
		}
	}
	return -1
}
BenchmarkSearch/size=10-8 100000000 11.2 ns/op
BenchmarkSearch/size=100-8 20000000 63.4 ns/op
BenchmarkSearch/size=1000-8 2000000 612.3 ns/op
BenchmarkSearch/size=10000-8 200000 6092.1 ns/op
b.ResetTimer / b.StopTimer / b.StartTimer
// BenchmarkWithSetup builds its fixture once, then resets the timer so
// that only process() contributes to the measurement.
func BenchmarkWithSetup(b *testing.B) {
	fixture := generateLargeData(10000) // one-time setup, must not be timed
	b.ResetTimer()                      // measurement effectively starts here
	for iter := 0; iter < b.N; iter++ {
		process(fixture)
	}
}
// BenchmarkWithPerIterSetup pauses the timer around per-iteration
// setup so only mutate() is measured each pass.
func BenchmarkWithPerIterSetup(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		b.StopTimer()
		fresh := copyData(input) // untimed: each iteration mutates its own copy
		b.StartTimer()
		mutate(fresh)
	}
}
Memory Allocation Optimization Benchmarks
// map_bench_test.go
package main
import "testing"
// Pre-allocation vs dynamic growth
// BenchmarkMapNoPre grows a map from zero capacity, forcing repeated
// bucket reallocation as the 1000 entries are inserted.
func BenchmarkMapNoPre(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		table := map[int]int{}
		for k := 0; k < 1000; k++ {
			table[k] = k
		}
	}
}
// BenchmarkMapWithPre sizes the map up front so inserts never trigger
// incremental bucket growth.
func BenchmarkMapWithPre(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		table := make(map[int]int, 1000) // capacity hint avoids rehashing
		for k := 0; k < 1000; k++ {
			table[k] = k
		}
	}
}
// Slice growth comparison
// BenchmarkSliceAppendNoPre appends into a slice with no capacity,
// paying for repeated backing-array growth and copies.
func BenchmarkSliceAppendNoPre(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		var items []int
		for v := 0; v < 1000; v++ {
			items = append(items, v)
		}
	}
}
// BenchmarkSliceAppendWithPre reserves the full capacity up front so
// every append lands in the existing backing array — one allocation.
func BenchmarkSliceAppendWithPre(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		items := make([]int, 0, 1000) // capacity known in advance
		for v := 0; v < 1000; v++ {
			items = append(items, v)
		}
	}
}
BenchmarkMapNoPre-8 5000 312451 ns/op 86516 B/op 65 allocs/op
BenchmarkMapWithPre-8 8000 148230 ns/op 41040 B/op 8 allocs/op
BenchmarkSliceAppendNoPre-8 300000 4521 ns/op 25208 B/op 12 allocs/op
BenchmarkSliceAppendWithPre-8 500000 2312 ns/op 8192 B/op 1 allocs/op
pprof — CPU & Memory Profiling
Generating Profiles from Benchmarks
# CPU profile
go test -bench=. -cpuprofile=cpu.prof ./...
# Memory profile
go test -bench=. -memprofile=mem.prof ./...
# Both
go test -bench=. -cpuprofile=cpu.prof -memprofile=mem.prof ./...
Interactive pprof Analysis
# Analyze CPU profile
go tool pprof cpu.prof
# Key commands
(pprof) top # top CPU-consuming functions
(pprof) top -cum # top cumulative CPU usage
(pprof) list FuncName # line-by-line analysis of function
(pprof) web # visualize graph in browser
(pprof) png # save as PNG image
# Memory profile
go tool pprof -alloc_space mem.prof # total allocated memory
go tool pprof -inuse_space mem.prof # currently in-use memory
Integrate pprof in HTTP Server
package main
import (
"log"
"net/http"
_ "net/http/pprof" // automatically registers /debug/pprof/ handlers
)
// main starts the pprof debug server on a side port, then runs the
// real application. The blank import of net/http/pprof registered the
// /debug/pprof/ handlers on http.DefaultServeMux.
func main() {
	go func() {
		// nil handler => DefaultServeMux, where pprof registered itself
		if err := http.ListenAndServe(":6060", nil); err != nil {
			log.Println(err)
		}
	}()
	// Real application code
	startServer()
}
# Collect 30 seconds of CPU profile from a running server
# (quote URLs containing "?" so the shell does not treat it as a glob)
go tool pprof 'http://localhost:6060/debug/pprof/profile?seconds=30'
# Heap memory profile
go tool pprof http://localhost:6060/debug/pprof/heap
# Goroutine dump
curl 'http://localhost:6060/debug/pprof/goroutine?debug=1'
# Real-time web UI (Go 1.10+)
go tool pprof -http=:8080 'http://localhost:6060/debug/pprof/profile?seconds=30'
trace — Execution Tracing
# Collect trace
go test -bench=. -trace=trace.out ./...
# Analyze trace (in browser)
go tool trace trace.out
The trace tool shows:
- Goroutine creation/termination/wait timeline
- Garbage collection (GC) events and duration
- Processor (P) workload distribution
- Network/syscall wait time
benchstat — Statistical Comparison
Compare two benchmark results statistically.
go install golang.org/x/perf/cmd/benchstat@latest
# Measure before changes
go test -bench=. -count=10 ./... > before.txt
# Measure after changes
go test -bench=. -count=10 ./... > after.txt
# Statistical comparison (includes p-value)
benchstat before.txt after.txt
name old time/op new time/op delta
StringConcat-8 124µs ± 1% 2.3µs ± 2% -98.15% (p=0.000 n=10+10)
name old alloc/op new alloc/op delta
StringConcat-8 5.62kB ± 0% 2.69kB ± 0% -52.14% (p=0.000 n=10+10)
p=0.000: the difference is statistically significant (p < 0.05). -98.15%: roughly a 98% reduction in time per operation.
Real-World Benchmark Example — JSON Serialization
package main
import (
"encoding/json"
"testing"
"github.com/bytedance/sonic"
"github.com/goccy/go-json"
)
// User is the payload every JSON marshal benchmark below serializes.
type User struct {
ID int `json:"id"`
Name string `json:"name"`
Email string `json:"email"`
Age int `json:"age"`
}
// testUser is the shared fixture marshaled by each benchmark variant,
// so all three libraries are compared on identical input.
var testUser = User{ID: 1, Name: "Kim Golang", Email: "go@example.com", Age: 30}
// BenchmarkJSONMarshalStd measures the standard library encoder on the
// shared testUser fixture.
func BenchmarkJSONMarshalStd(b *testing.B) {
	b.ReportAllocs() // always emit B/op and allocs/op for this benchmark
	for iter := 0; iter < b.N; iter++ {
		if _, err := json.Marshal(testUser); err != nil {
			b.Fatal(err)
		}
	}
}
// BenchmarkJSONMarshalGoJson measures goccy/go-json, an encoding/json
// drop-in replacement, on the shared testUser fixture.
func BenchmarkJSONMarshalGoJson(b *testing.B) {
	b.ReportAllocs() // always emit B/op and allocs/op for this benchmark
	for iter := 0; iter < b.N; iter++ {
		if _, err := gojson.Marshal(testUser); err != nil {
			b.Fatal(err)
		}
	}
}
// BenchmarkJSONMarshalSonic measures bytedance/sonic (JIT-based JSON
// library) on the shared testUser fixture.
func BenchmarkJSONMarshalSonic(b *testing.B) {
	b.ReportAllocs() // always emit B/op and allocs/op for this benchmark
	for iter := 0; iter < b.N; iter++ {
		if _, err := sonic.Marshal(testUser); err != nil {
			b.Fatal(err)
		}
	}
}
Benchmark Writing Rules & Anti-Patterns
// ❌ Compiler optimization can make results meaningless
// BenchmarkBad: a blank assignment does not count as a real use, so
// dead-code elimination may drop the compute call and measure nothing.
func BenchmarkBad(b *testing.B) {
for i := 0; i < b.N; i++ {
result := compute(42) // compiler might remove unused result
_ = result // this is not enough
}
}
// ✅ Use sink variable to prevent optimization
// globalSink receives the final result; a package-level write cannot be
// proven dead by the compiler, so compute must actually execute.
var globalSink int
func BenchmarkGood(b *testing.B) {
var sink int
for i := 0; i < b.N; i++ {
sink = compute(42)
}
globalSink = sink // export to global to prevent optimization
}
Important Notes
// ❌ Putting measured code outside b.N loop
// BenchmarkWrong: the expensive work happens once, before the loop, so
// the timed region only re-reads a cached value — nothing is measured.
func BenchmarkWrong(b *testing.B) {
result := expensiveCompute() // only runs once — benchmark meaningless
_ = result
for i := 0; i < b.N; i++ {
_ = result // copies a precomputed value; does no real work
}
}
// ✅ Correct structure
// BenchmarkRight performs the measured work inside the b.N loop, so
// every timed iteration executes the computation.
func BenchmarkRight(b *testing.B) {
for i := 0; i < b.N; i++ {
_ = expensiveCompute() // runs each iteration
}
}
Key Summary
| Tool | Purpose |
|---|---|
| go test -bench=. | Run benchmarks |
| -benchmem | Measure memory allocation |
| -cpuprofile=f | Save CPU profile |
| -memprofile=f | Save memory profile |
| go tool pprof | Analyze profiles |
| go tool trace | Analyze execution traces |
| benchstat | Statistical comparison |
- `b.N` is determined by the Go runtime — never set it manually
- Use `b.ResetTimer()` to exclude setup time
- Use `-benchmem` to identify whether memory allocation is the bottleneck
- Use `benchstat` to verify statistical significance with p-values