Benchmarking & Profiling — testing.B and pprof
Go has benchmarking and profiling tools built into its standard library. They provide a complete workflow for measuring code performance and finding bottlenecks.
testing.B — Benchmark Basics
Benchmark functions start with Benchmark and take *testing.B as a parameter.
// string_bench_test.go
package main
import (
"strings"
"testing"
)
// Compare string concatenation methods
// BenchmarkStringConcat measures naive += concatenation, which copies
// the entire string and reallocates on every append (quadratic cost).
func BenchmarkStringConcat(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		var result string
		for n := 0; n < 100; n++ {
			result += "hello"
		}
	}
}
// BenchmarkStringBuilder measures strings.Builder, which grows its
// buffer amortized and avoids the repeated copying of naive +=.
func BenchmarkStringBuilder(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		builder := strings.Builder{}
		for n := 0; n < 100; n++ {
			builder.WriteString("hello")
		}
		_ = builder.String()
	}
}
// BenchmarkStringJoin measures strings.Join over a prebuilt slice.
// The slice is constructed once, outside the timed region.
func BenchmarkStringJoin(b *testing.B) {
	words := make([]string, 100)
	for idx := range words {
		words[idx] = "hello"
	}
	b.ResetTimer() // everything above is setup; exclude it from timing
	for iter := 0; iter < b.N; iter++ {
		_ = strings.Join(words, "")
	}
}
Benchmark Execution Commands
# Run all benchmarks
go test -bench=. ./...
# Run specific pattern only
go test -bench=BenchmarkString ./...
# Run benchmarks with unit tests
go test -bench=. -run=TestXxx ./...   # benchmarks plus unit tests matching TestXxx
# Skip unit tests, run benchmarks only
go test -bench=. -run=^$ ./...
# Fixed iterations
go test -bench=. -benchtime=5s ./... # 5 seconds
go test -bench=. -benchtime=1000x ./... # 1000 iterations
# Include memory allocation info
go test -bench=. -benchmem ./...
# Multi-core benchmarks
go test -bench=. -cpu=1,2,4,8 ./...
Reading Benchmark Results
BenchmarkStringConcat-8 9381 124589 ns/op 5616 B/op 99 allocs/op
BenchmarkStringBuilder-8 532780 2251 ns/op 2688 B/op 7 allocs/op
BenchmarkStringJoin-8 659847 1820 ns/op 2048 B/op 2 allocs/op
| Column | Meaning |
|---|---|
| -8 | GOMAXPROCS (CPU count) |
| 9381 | Iterations (b.N) |
| 124589 ns/op | Nanoseconds per operation |
| 5616 B/op | Bytes allocated per operation (-benchmem) |
| 99 allocs/op | Heap allocations per operation (-benchmem) |
Table-Driven Benchmarks
// BenchmarkSearch runs linearSearch as one sub-benchmark per input
// size, producing a separate result line for each size.
func BenchmarkSearch(b *testing.B) {
	for _, n := range []int{10, 100, 1000, 10000} {
		b.Run(fmt.Sprintf("size=%d", n), func(b *testing.B) {
			data := make([]int, n)
			for idx := range data {
				data[idx] = idx
			}
			target := n / 2 // worst case is ~n/2 probes on average
			b.ResetTimer()  // slice construction is setup, not workload
			for iter := 0; iter < b.N; iter++ {
				linearSearch(data, target)
			}
		})
	}
}
// linearSearch returns the index of the first element equal to target,
// or -1 when target is absent. O(n) scan.
func linearSearch(data []int, target int) int {
	for idx := 0; idx < len(data); idx++ {
		if data[idx] == target {
			return idx
		}
	}
	return -1
}
BenchmarkSearch/size=10-8 100000000 11.2 ns/op
BenchmarkSearch/size=100-8 20000000 63.4 ns/op
BenchmarkSearch/size=1000-8 2000000 612.3 ns/op
BenchmarkSearch/size=10000-8 200000 6092.1 ns/op
b.ResetTimer / b.StopTimer / b.StartTimer
// BenchmarkWithSetup builds its fixture once, then resets the timer so
// that only process() contributes to the measurement.
func BenchmarkWithSetup(b *testing.B) {
	fixture := generateLargeData(10000) // one-time setup, must not be timed
	b.ResetTimer()                      // measurement effectively starts here
	for iter := 0; iter < b.N; iter++ {
		process(fixture)
	}
}
// BenchmarkWithPerIterSetup pauses the timer around per-iteration
// setup so only mutate() is measured each pass.
func BenchmarkWithPerIterSetup(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		b.StopTimer()
		fresh := copyData(input) // untimed: each iteration mutates its own copy
		b.StartTimer()
		mutate(fresh)
	}
}
Memory Allocation Optimization Benchmarks
// map_bench_test.go
package main
import "testing"
// Pre-allocation vs dynamic growth
// BenchmarkMapNoPre grows a map from zero capacity, forcing repeated
// bucket reallocation as the 1000 entries are inserted.
func BenchmarkMapNoPre(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		table := map[int]int{}
		for k := 0; k < 1000; k++ {
			table[k] = k
		}
	}
}
// BenchmarkMapWithPre sizes the map up front so inserts never trigger
// incremental bucket growth.
func BenchmarkMapWithPre(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		table := make(map[int]int, 1000) // capacity hint avoids rehashing
		for k := 0; k < 1000; k++ {
			table[k] = k
		}
	}
}
// Slice growth comparison
// BenchmarkSliceAppendNoPre appends into a slice with no capacity,
// paying for repeated backing-array growth and copies.
func BenchmarkSliceAppendNoPre(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		var items []int
		for v := 0; v < 1000; v++ {
			items = append(items, v)
		}
	}
}
// BenchmarkSliceAppendWithPre reserves the full capacity up front so
// every append lands in the existing backing array — one allocation.
func BenchmarkSliceAppendWithPre(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		items := make([]int, 0, 1000) // capacity known in advance
		for v := 0; v < 1000; v++ {
			items = append(items, v)
		}
	}
}
BenchmarkMapNoPre-8 5000 312451 ns/op 86516 B/op 65 allocs/op
BenchmarkMapWithPre-8 8000 148230 ns/op 41040 B/op 8 allocs/op
BenchmarkSliceAppendNoPre-8 300000 4521 ns/op 25208 B/op 12 allocs/op
BenchmarkSliceAppendWithPre-8 500000 2312 ns/op 8192 B/op 1 allocs/op
pprof — CPU & Memory Profiling
Generating Profiles from Benchmarks
# CPU profile
go test -bench=. -cpuprofile=cpu.prof ./...
# Memory profile
go test -bench=. -memprofile=mem.prof ./...
# Both
go test -bench=. -cpuprofile=cpu.prof -memprofile=mem.prof ./...
Interactive pprof Analysis
# Analyze CPU profile
go tool pprof cpu.prof
# Key commands
(pprof) top # top CPU-consuming functions
(pprof) top -cum # top cumulative CPU usage
(pprof) list FuncName # line-by-line analysis of function
(pprof) web # visualize graph in browser
(pprof) png # save as PNG image
# Memory profile
go tool pprof -alloc_space mem.prof # total allocated memory
go tool pprof -inuse_space mem.prof # currently in-use memory
Integrate pprof in HTTP Server
package main
import (
"log"
"net/http"
_ "net/http/pprof" // automatically registers /debug/pprof/ handlers
)
// main starts the pprof debug server on a side port, then runs the
// real application. The blank import of net/http/pprof registered the
// /debug/pprof/ handlers on http.DefaultServeMux.
func main() {
	go func() {
		// nil handler => DefaultServeMux, where pprof registered itself
		if err := http.ListenAndServe(":6060", nil); err != nil {
			log.Println(err)
		}
	}()
	// Real application code
	startServer()
}
# Collect 30 seconds of CPU profile from a running server
# (quote URLs containing "?" so the shell does not treat it as a glob)
go tool pprof 'http://localhost:6060/debug/pprof/profile?seconds=30'
# Heap memory profile
go tool pprof http://localhost:6060/debug/pprof/heap
# Goroutine dump
curl 'http://localhost:6060/debug/pprof/goroutine?debug=1'
# Real-time web UI (Go 1.10+)
go tool pprof -http=:8080 'http://localhost:6060/debug/pprof/profile?seconds=30'
trace — Execution Tracing
# Collect trace
go test -bench=. -trace=trace.out ./...
# Analyze trace (in browser)
go tool trace trace.out
The trace tool shows:
- Goroutine creation/termination/wait timeline
- Garbage collection (GC) events and duration
- Processor (P) workload distribution
- Network/syscall wait time
benchstat — Statistical Comparison
Compare two benchmark results statistically.
go install golang.org/x/perf/cmd/benchstat@latest
# Measure before changes
go test -bench=. -count=10 ./... > before.txt
# Measure after changes
go test -bench=. -count=10 ./... > after.txt
# Statistical comparison (includes p-value)
benchstat before.txt after.txt
name old time/op new time/op delta
StringConcat-8 124µs ± 1% 2.3µs ± 2% -98.15% (p=0.000 n=10+10)
name old alloc/op new alloc/op delta
StringConcat-8 5.62kB ± 0% 2.69kB ± 0% -52.14% (p=0.000 n=10+10)
p=0.000: the difference is statistically significant (p < 0.05). -98.15%: roughly a 98% reduction in time per operation.
Real-World Benchmark Example — JSON Serialization
package main
import (
"encoding/json"
"testing"
"github.com/bytedance/sonic"
"github.com/goccy/go-json"
)
// User is the payload every JSON marshal benchmark below serializes.
type User struct {
ID int `json:"id"`
Name string `json:"name"`
Email string `json:"email"`
Age int `json:"age"`
}
// testUser is the shared fixture marshaled by each benchmark variant,
// so all three libraries are compared on identical input.
var testUser = User{ID: 1, Name: "Kim Golang", Email: "go@example.com", Age: 30}
// BenchmarkJSONMarshalStd measures the standard library encoder on the
// shared testUser fixture.
func BenchmarkJSONMarshalStd(b *testing.B) {
	b.ReportAllocs() // always emit B/op and allocs/op for this benchmark
	for iter := 0; iter < b.N; iter++ {
		if _, err := json.Marshal(testUser); err != nil {
			b.Fatal(err)
		}
	}
}
// BenchmarkJSONMarshalGoJson measures goccy/go-json, an encoding/json
// drop-in replacement, on the shared testUser fixture.
func BenchmarkJSONMarshalGoJson(b *testing.B) {
	b.ReportAllocs() // always emit B/op and allocs/op for this benchmark
	for iter := 0; iter < b.N; iter++ {
		if _, err := gojson.Marshal(testUser); err != nil {
			b.Fatal(err)
		}
	}
}
// BenchmarkJSONMarshalSonic measures bytedance/sonic (JIT-based JSON
// library) on the shared testUser fixture.
func BenchmarkJSONMarshalSonic(b *testing.B) {
	b.ReportAllocs() // always emit B/op and allocs/op for this benchmark
	for iter := 0; iter < b.N; iter++ {
		if _, err := sonic.Marshal(testUser); err != nil {
			b.Fatal(err)
		}
	}
}
Benchmark Writing Rules & Anti-Patterns
// ❌ Compiler optimization can make results meaningless
// BenchmarkBad: a blank assignment does not count as a real use, so
// dead-code elimination may drop the compute call and measure nothing.
func BenchmarkBad(b *testing.B) {
for i := 0; i < b.N; i++ {
result := compute(42) // compiler might remove unused result
_ = result // this is not enough
}
}
// ✅ Use sink variable to prevent optimization
// globalSink receives the final result; a package-level write cannot be
// proven dead by the compiler, so compute must actually execute.
var globalSink int
func BenchmarkGood(b *testing.B) {
var sink int
for i := 0; i < b.N; i++ {
sink = compute(42)
}
globalSink = sink // export to global to prevent optimization
}
Important Notes
// ❌ Putting measured code outside b.N loop
// BenchmarkWrong: the expensive work happens once, before the loop, so
// the timed region only re-reads a cached value — nothing is measured.
func BenchmarkWrong(b *testing.B) {
result := expensiveCompute() // only runs once — benchmark meaningless
_ = result
for i := 0; i < b.N; i++ {
_ = result // copies a precomputed value; does no real work
}
}
// ✅ Correct structure
// BenchmarkRight performs the measured work inside the b.N loop, so
// every timed iteration executes the computation.
func BenchmarkRight(b *testing.B) {
for i := 0; i < b.N; i++ {
_ = expensiveCompute() // runs each iteration
}
}
Key Summary
| Tool | Purpose |
|---|---|
| go test -bench=. | Run benchmarks |
| -benchmem | Measure memory allocation |
| -cpuprofile=f | Save CPU profile |
| -memprofile=f | Save memory profile |
| go tool pprof | Analyze profiles |
| go tool trace | Analyze execution traces |
| benchstat | Statistical comparison |
- `b.N` is determined by the Go runtime — never set it manually
- Use `b.ResetTimer()` to exclude setup time
- Use `-benchmem` to identify whether memory allocation is the bottleneck
- Use `benchstat` to verify statistical significance with p-values