Go Performance Optimization — Profiling and Tuning
Go has a built-in garbage collector, goroutine scheduler, and memory allocator. Optimizing by guesswork without proper profiling often backfires. The key is: "Measure → Analyze → Optimize."
pprof Profiling
HTTP Server Profiling Endpoint
// main.go — Integrate pprof into production apps
package main
import (
"log"
"net/http"
_ "net/http/pprof" // Register handlers as side effect
"time"
)
func main() {
// Expose profiling server on separate port (internal access only)
go func() {
log.Println("pprof server: :6060")
log.Fatal(http.ListenAndServe(":6060", nil))
}()
// Actual application server
mux := http.NewServeMux()
mux.HandleFunc("/", expensiveHandler)
log.Fatal(http.ListenAndServe(":8080", mux))
}
func expensiveHandler(w http.ResponseWriter, r *http.Request) {
// Simulate CPU-intensive work
sum := 0
for i := 0; i < 10_000_000; i++ {
sum += i
}
time.Sleep(10 * time.Millisecond) // Simulate I/O wait
w.Write([]byte("OK"))
}
# Collect CPU profile for 30 seconds
go tool pprof 'http://localhost:6060/debug/pprof/profile?seconds=30'   # quote the URL so the shell does not glob-expand '?'
# Memory profile
go tool pprof http://localhost:6060/debug/pprof/heap
# Goroutine stack dump
curl 'http://localhost:6060/debug/pprof/goroutine?debug=2'   # quote the URL so the shell does not glob-expand '?'
# Mutex contention analysis
go tool pprof http://localhost:6060/debug/pprof/mutex
# Blocking operation analysis
go tool pprof http://localhost:6060/debug/pprof/block
Programmatic Profiling
// profiling.go
package main
import (
"os"
"runtime"
"runtime/pprof"
"runtime/trace"
)
// profileCPU writes a CPU profile of fn's execution to filename.
// Profiling stops (and the file is closed) before the function returns.
func profileCPU(filename string, fn func()) error {
	out, err := os.Create(filename)
	if err != nil {
		return err
	}
	defer out.Close()
	if err := pprof.StartCPUProfile(out); err != nil {
		return err
	}
	// LIFO defers: profiling stops first, then the file is closed.
	defer pprof.StopCPUProfile()
	fn() // run the workload under the profiler
	return nil
}
// profileMemory dumps a heap profile of the current live set to filename.
func profileMemory(filename string) error {
	// Force a collection first so the snapshot reflects only live objects.
	runtime.GC()
	out, err := os.Create(filename)
	if err != nil {
		return err
	}
	defer out.Close()
	return pprof.WriteHeapProfile(out)
}
// traceExecution records a runtime execution trace of fn to filename,
// viewable afterwards with `go tool trace`.
func traceExecution(filename string, fn func()) error {
	out, err := os.Create(filename)
	if err != nil {
		return err
	}
	defer out.Close()
	if err := trace.Start(out); err != nil {
		return err
	}
	defer trace.Stop() // stop tracing before the file is closed
	fn()
	return nil
}
// main demonstrates the three profiling helpers. Unlike the original,
// each helper's error is checked instead of silently discarded — a
// failed profile write would otherwise go unnoticed.
func main() {
	// CPU profiling
	if err := profileCPU("cpu.pprof", func() {
		heavyComputation()
	}); err != nil {
		panic(err)
	}

	// Memory profiling
	manyAllocations()
	if err := profileMemory("mem.pprof"); err != nil {
		panic(err)
	}

	// Execution tracing
	if err := traceExecution("trace.out", func() {
		concurrentWork()
	}); err != nil {
		panic(err)
	}
}
# Interactive pprof analysis
go tool pprof cpu.pprof
# pprof commands
(pprof) top10 # Top 10 functions
(pprof) web # View graph in browser (requires graphviz)
(pprof) list main. # Show source for functions in package
(pprof) tree # Call tree
(pprof) png > graph.png # Save as image
# Execution trace analysis
go tool trace trace.out
Memory Optimization
Reduce Heap Allocations
package main
import (
	"bytes"
	"fmt"
	"strings"
	"sync"
)
// Bad: allocate a fresh zero-capacity slice on each call.
// processRequestsBad uppercases every item; because result starts with
// no capacity, append repeatedly reallocates and copies as it grows.
func processRequestsBad(items []string) []string {
	out := []string{} // zero capacity: every growth step may reallocate
	for _, s := range items {
		out = append(out, strings.ToUpper(s))
	}
	return out
}
// Good: pre-specify capacity.
// processRequestsGood uppercases every item into a slice whose backing
// array is allocated exactly once up front.
func processRequestsGood(items []string) []string {
	out := make([]string, 0, len(items)) // single allocation, no regrowth
	for _, s := range items {
		out = append(out, strings.ToUpper(s))
	}
	return out
}
// sync.Pool — reuse frequently allocated/deallocated objects.
//
// NOTE: a *bytes.Buffer is pooled here instead of the original
// *strings.Builder, because strings.Builder.Reset discards its backing
// array entirely — pooling one keeps the struct but none of its
// capacity, defeating the point. bytes.Buffer.Reset retains capacity
// for the next user.
var bufferPool = sync.Pool{
	New: func() interface{} {
		return &bytes.Buffer{}
	},
}

// buildStringWithPool concatenates parts using a pooled scratch buffer.
func buildStringWithPool(parts []string) string {
	buf := bufferPool.Get().(*bytes.Buffer)
	buf.Reset() // clear previous contents, keep capacity
	defer bufferPool.Put(buf)
	for _, p := range parts {
		buf.WriteString(p)
	}
	// Buffer.String copies, so returning it and reusing buf is safe.
	return buf.String()
}
// Value type vs pointer type.
type SmallStruct struct {
	X, Y int
}

// Small structs pass by value: the copy is two machine words, often
// cheaper than a pointer dereference and it keeps the value off the heap.
// processSmall returns the sum of both coordinates.
func processSmall(s SmallStruct) int {
	return s.Y + s.X
}
type LargeStruct struct {
	Data [1024]byte
	Meta string
}

// Large structs pass by pointer to avoid copying ~1KB per call.
// processLarge reports the size of the fixed data payload.
func processLarge(s *LargeStruct) int {
	return len(s.Data)
}
// main exercises the pre-sized-slice and pooled-buffer helpers.
func main() {
	words := []string{"hello", "world", "go"}
	fmt.Println(processRequestsGood(words))

	pieces := []string{"foo", "bar", "baz"}
	fmt.Println(buildStringWithPool(pieces))
}
Escape Analysis
// escape_test.go
package main
import "testing"
// Cases where values escape to the heap vs stay on the stack.

// stackAlloc returns a constant by value; no pointer leaves the
// function, so escape analysis keeps the local on the stack.
func stackAlloc() int {
	v := 42 // stack-allocated: never referenced from outside
	return v
}
// heapAlloc returns the address of a local, which forces escape
// analysis to move it to the heap (the stack frame dies at return).
func heapAlloc() *int {
	v := 42 // moved to heap: its address outlives the call
	return &v
}
// BenchmarkStackAlloc measures the non-escaping path; expect 0 allocs/op
// under `go test -bench=. -benchmem`.
func BenchmarkStackAlloc(b *testing.B) {
	for n := 0; n < b.N; n++ {
		_ = stackAlloc()
	}
}

// BenchmarkHeapAlloc measures the escaping path; expect 1 alloc/op,
// visible when compared against BenchmarkStackAlloc with benchstat.
func BenchmarkHeapAlloc(b *testing.B) {
	for n := 0; n < b.N; n++ {
		_ = heapAlloc()
	}
}
# Show escape analysis output
go build -gcflags="-m -m" ./...
# Output example:
# ./main.go:15:6: moved to heap: x ← escapes to heap
# ./main.go:10:6: x does not escape ← stays on stack
CPU Optimization
Compiler Inlining
package main
import "fmt"
// Small, simple functions are inlined automatically. (The inliner works
// on an AST-node cost budget — roughly 80 nodes — not a fixed
// "instruction count"; the original "~10 instructions" figure was wrong.)
func add(a, b int) int {
	return a + b // inlined at call sites
}
// Go has no "force inline" directive. The original tagged this function
// //go:nosplit and called it an inlining hint — that is incorrect:
// go:nosplit only omits the stack-growth check in the prologue and has
// nothing to do with inlining. The only inlining directive is
// //go:noinline, which *prevents* inlining. Small functions like this
// one are inlined automatically anyway, so the directive is removed.
func fastPath(n int) int {
	return n * n
}
// Recursion disqualifies a function from inlining (as do closures and
// a few other constructs).
// recursive computes n! (factorial); any n <= 1 yields 1.
func recursive(n int) int {
	if n > 1 {
		return n * recursive(n-1)
	}
	return 1
}
// Check inlining decisions with:
//   go build -gcflags="-m" main.go
// e.g. ./main.go:8:6: can inline add
func main() {
	sum := add(1, 2)
	fmt.Println(sum)
	fmt.Println(fastPath(5))
}
SIMD and Assembly (Advanced)
// sum_amd64.go — Connect assembly implementation
package compute
// SumSliceASM sums data using a hand-written assembly routine.
// The declaration has no Go body on purpose: the implementation lives in
// sum_amd64.s in the same package (a bodiless func is only legal when
// the body is supplied elsewhere, e.g. by an assembly file).
func SumSliceASM(data []float32) float32
// SumSlice is the pure-Go fallback for platforms without the assembly
// implementation. It returns the sum of all elements, left to right;
// a nil or empty slice sums to 0.
func SumSlice(data []float32) float32 {
	var total float32
	for i := range data {
		total += data[i]
	}
	return total
}
Cache-Friendly Data Structures
package main
import "fmt"
// Bad: AoS (Array of Structs) — causes cache misses.
type ParticleAoS struct {
	X, Y, Z    float32 // Position
	VX, VY, VZ float32 // Velocity
	Mass       float32
}

// updatePositionsAoS integrates positions forward by one time step dt.
// Every field of each struct is pulled through the cache even though
// only position and velocity are touched.
func updatePositionsAoS(particles []ParticleAoS, dt float32) {
	for i := range particles {
		p := &particles[i]
		p.X += p.VX * dt
		p.Y += p.VY * dt
		p.Z += p.VZ * dt
	}
}
// Good: SoA (Struct of Arrays) — cache efficient.
type ParticlesSoA struct {
	X, Y, Z    []float32 // Position arrays
	VX, VY, VZ []float32 // Velocity arrays
	Mass       []float32
}

// updatePositionsSoA integrates positions forward by one time step dt.
// Each component array is scanned sequentially, so cache lines carry
// only the data actually being updated.
func updatePositionsSoA(p *ParticlesSoA, dt float32) {
	x, y, z := p.X, p.Y, p.Z
	vx, vy, vz := p.VX, p.VY, p.VZ
	for i := range x {
		x[i] += vx[i] * dt
		y[i] += vy[i] * dt
		z[i] += vz[i] * dt
	}
}
// main runs one update step over 10,000 particles in both layouts.
func main() {
	const n = 10000

	// AoS layout
	aos := make([]ParticleAoS, n)
	updatePositionsAoS(aos, 0.016)

	// SoA layout (faster)
	soa := &ParticlesSoA{
		X: make([]float32, n), Y: make([]float32, n), Z: make([]float32, n),
		VX: make([]float32, n), VY: make([]float32, n), VZ: make([]float32, n),
	}
	updatePositionsSoA(soa, 0.016)

	fmt.Println("Update complete")
}
Goroutine and Concurrency Optimization
Worker Pool Pattern
package main
import (
"fmt"
"sync"
)
// Prevent unbounded goroutine creation: use a worker pool.
// WorkerPool runs submitted tasks on a fixed set of goroutines.
type WorkerPool struct {
	tasks chan func()
	wg    sync.WaitGroup
}

// NewWorkerPool starts `workers` goroutines that drain the task queue.
// The queue is buffered at 10x the worker count so Submit rarely blocks.
func NewWorkerPool(workers int) *WorkerPool {
	p := &WorkerPool{tasks: make(chan func(), workers*10)}
	p.wg.Add(workers)
	for w := 0; w < workers; w++ {
		go func() {
			defer p.wg.Done()
			for task := range p.tasks {
				task()
			}
		}()
	}
	return p
}

// Submit enqueues a task; it blocks while the queue is full.
// Calling Submit after Close panics (send on a closed channel).
func (p *WorkerPool) Submit(task func()) {
	p.tasks <- task
}

// Close stops accepting tasks and waits until the workers have
// finished everything already queued.
func (p *WorkerPool) Close() {
	close(p.tasks)
	p.wg.Wait()
}
// main fans 100 squaring tasks out over a 4-worker pool and prints a
// sample of the results.
func main() {
	pool := NewWorkerPool(4) // Tune to CPU cores

	results := make([]int, 100)
	var mu sync.Mutex
	for i := 0; i < 100; i++ {
		i := i // capture loop variable (required before Go 1.22)
		pool.Submit(func() {
			sq := i * i
			mu.Lock()
			results[i] = sq
			mu.Unlock()
		})
	}
	pool.Close()
	fmt.Printf("Sample results: %v\n", results[:5])
}
Channel vs Mutex Selection
package main
import (
"fmt"
"sync"
"sync/atomic"
)
// atomic — simple counter (fastest).
// AtomicCounter uses the typed atomic.Int64 (Go 1.19+, already required
// by this file's use of debug.SetMemoryLimit) instead of the bare
// atomic.AddInt64/LoadInt64 functions: it cannot be accessed
// non-atomically by mistake and is guaranteed correctly aligned on
// 32-bit platforms.
type AtomicCounter struct {
	value atomic.Int64
}

// Increment adds one to the counter; safe for concurrent use.
func (c *AtomicCounter) Increment() {
	c.value.Add(1)
}

// Get returns the current count; safe for concurrent use.
func (c *AtomicCounter) Get() int64 {
	return c.value.Load()
}
// sync.Mutex — protect complex state.
// SafeMap is a mutex-guarded string→int map whose zero value is ready
// to use: the underlying map is created lazily on first Set. (The
// original version had no initialization anywhere, so Set on a
// zero-value SafeMap panicked with "assignment to entry in nil map".)
type SafeMap struct {
	mu   sync.RWMutex
	data map[string]int
}

// Set stores val under key, allocating the map on first use.
func (m *SafeMap) Set(key string, val int) {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.data == nil {
		m.data = make(map[string]int)
	}
	m.data[key] = val
}

// Get returns the value for key and whether it was present.
// Reading from a nil map is safe, so no lazy init is needed here.
func (m *SafeMap) Get(key string) (int, bool) {
	m.mu.RLock() // Read lock so concurrent readers proceed in parallel
	defer m.mu.RUnlock()
	v, ok := m.data[key]
	return v, ok
}
// sync.Map — suited to read-heavy workloads with mostly-disjoint key
// sets; for anything else, a plain map guarded by a mutex is usually
// simpler and faster.
var globalCache sync.Map

// cacheGet looks up key, reporting whether it was present.
func cacheGet(key string) (interface{}, bool) {
	v, ok := globalCache.Load(key)
	return v, ok
}

// cacheSet stores val under key.
func cacheSet(key string, val interface{}) {
	globalCache.Store(key, val)
}
// main hammers the atomic counter from 1000 goroutines and verifies the
// final count by printing it.
func main() {
	counter := &AtomicCounter{}

	var wg sync.WaitGroup
	wg.Add(1000)
	for i := 0; i < 1000; i++ {
		go func() {
			defer wg.Done()
			counter.Increment()
		}()
	}
	wg.Wait()
	fmt.Printf("Counter: %d\n", counter.Get()) // 1000
}
Garbage Collector Tuning
package main
import (
"fmt"
"os"
"runtime"
"runtime/debug"
)
// init applies GC tuning before main runs. These are process-wide
// settings; in real services prefer the GOGC / GOMEMLIMIT environment
// variables so operators can tune without a rebuild.
func init() {
	// GOGC: GC trigger threshold (default 100 = trigger when heap grows 100%).
	//   Higher → less frequent GC (saves CPU, uses more memory)
	//   Lower  → more frequent GC (better memory efficiency, more CPU)
	// Environment equivalents:
	//   GOGC=200 ./app  → run GC less often
	//   GOGC=off ./app  → disable GC entirely (short-lived batch jobs only)
	debug.SetGCPercent(200) // Reduce GC frequency if memory is available

	// Go 1.19+: soft memory limit (GOMEMLIMIT=512MiB ./app is equivalent).
	debug.SetMemoryLimit(512 << 20) // 512 MiB
}
// printMemStats prints a few headline numbers from runtime.MemStats.
// Note ReadMemStats briefly stops the world, so avoid calling it in
// hot paths.
func printMemStats() {
	var stats runtime.MemStats
	runtime.ReadMemStats(&stats)
	fmt.Printf("Heap in use: %d MB\n", stats.HeapInuse/1024/1024)
	fmt.Printf("Total heap allocated: %d MB\n", stats.TotalAlloc/1024/1024)
	fmt.Printf("GC runs: %d\n", stats.NumGC)
	fmt.Printf("GC pause time (total): %d ms\n", stats.PauseTotalNs/1e6)
}
// main demonstrates forcing a collection, releasing a large temporary
// allocation, and inspecting the runtime's tuning knobs.
func main() {
	// Force GC
	runtime.GC()
	printMemStats()

	// Manual GC hint (after releasing large temporary data)
	scratch := make([]byte, 100*1024*1024) // 100MB temporary allocation
	_ = scratch
	scratch = nil
	runtime.GC() // Request immediate cleanup
	printMemStats()

	// Check CPU/memory limit environment variables
	fmt.Printf("GOMAXPROCS: %d\n", runtime.GOMAXPROCS(0))
	fmt.Printf("GOGC: %s\n", os.Getenv("GOGC"))
}
Performance Optimization Cheat Sheet
// 1. String concatenation: Builder > + operator
// Bad
s := ""
for _, word := range words {
s += word + " " // New string allocation each time
}
// Good
var sb strings.Builder
sb.Grow(totalSize) // Pre-reserve capacity
for _, word := range words {
sb.WriteString(word)
sb.WriteByte(' ')
}
s := sb.String()
// 2. Slice append: pre-specify capacity
result := make([]T, 0, expectedSize)
// 3. Map initialization: capacity hint
m := make(map[string]int, expectedSize)
// 4. Struct field ordering: minimize padding
// Bad (24 bytes)
type BadStruct struct {
A bool // 1 byte + 7 padding
B int64 // 8 bytes
C bool // 1 byte + 7 padding
}
// Good (16 bytes)
type GoodStruct struct {
B int64 // 8 bytes
A bool // 1 byte
C bool // 1 byte + 6 padding
}
// 5. Minimize interface boxing
// Bad: boxing in loop
for _, v := range items {
process(interface{}(v)) // Boxing on each iteration
}
// Good: process concrete type directly
for _, v := range items {
processTyped(v)
}
Performance Measurement Tools
# 1. Run benchmarks
go test -bench=. -benchmem -count=5 ./...
# 2. CPU profile
go test -bench=. -cpuprofile=cpu.pprof ./...
go tool pprof cpu.pprof
# 3. Memory profile
go test -bench=. -memprofile=mem.pprof ./...
go tool pprof -alloc_space mem.pprof
# 4. Execution trace
go test -bench=. -trace=trace.out ./...
go tool trace trace.out
# 5. Escape analysis
go build -gcflags="-m" ./...
# 6. Assembly output
go build -gcflags="-S" ./...
# 7. Disable optimizations (debugging)
go build -gcflags="-N -l" ./...
# 8. benchstat — statistical comparison
go install golang.org/x/perf/cmd/benchstat@latest
go test -bench=. -count=10 > old.txt
# After code change
go test -bench=. -count=10 > new.txt
benchstat old.txt new.txt
Key Takeaways
| Optimization Area | Technique | Effect |
|---|---|---|
| Memory allocation | Specify capacity in make | Prevent reallocations |
| Object reuse | sync.Pool | Reduce GC pressure |
| String building | strings.Builder | Reduce copy operations |
| Concurrency | Worker pools | Control goroutine overhead |
| Counters | sync/atomic | Remove lock overhead |
| Cache efficiency | SoA layout | Improve CPU cache hit rate |
| GC tuning | GOGC, GOMEMLIMIT | Balance GC frequency/memory |
- Measure first: Profile with pprof to find bottlenecks, then optimize
- Beware micro-optimization: Making a 1% function 10x faster only improves overall by 0.9%
- Balance readability vs performance: Don't increase code complexity without clear gains