Go Performance Optimization — Profiling and Tuning
Go has a built-in garbage collector, goroutine scheduler, and memory allocator. Optimizing by guesswork without proper profiling often backfires. The key is: "Measure → Analyze → Optimize."
pprof Profiling
HTTP Server Profiling Endpoint
// main.go — Integrate pprof into production apps
package main
import (
"log"
"net/http"
_ "net/http/pprof" // Register handlers as side effect
"time"
)
func main() {
// Expose profiling server on separate port (internal access only)
go func() {
log.Println("pprof server: :6060")
log.Fatal(http.ListenAndServe(":6060", nil))
}()
// Actual application server
mux := http.NewServeMux()
mux.HandleFunc("/", expensiveHandler)
log.Fatal(http.ListenAndServe(":8080", mux))
}
func expensiveHandler(w http.ResponseWriter, r *http.Request) {
// Simulate CPU-intensive work
sum := 0
for i := 0; i < 10_000_000; i++ {
sum += i
}
time.Sleep(10 * time.Millisecond) // Simulate I/O wait
w.Write([]byte("OK"))
}
# Collect CPU profile for 30 seconds
go tool pprof 'http://localhost:6060/debug/pprof/profile?seconds=30'   # quote the URL so the shell does not glob-expand '?'
# Memory profile
go tool pprof http://localhost:6060/debug/pprof/heap
# Goroutine stack dump
curl 'http://localhost:6060/debug/pprof/goroutine?debug=2'   # quote the URL so the shell does not glob-expand '?'
# Mutex contention analysis
go tool pprof http://localhost:6060/debug/pprof/mutex
# Blocking operation analysis
go tool pprof http://localhost:6060/debug/pprof/block
Programmatic Profiling
// profiling.go
package main
import (
"os"
"runtime"
"runtime/pprof"
"runtime/trace"
)
// profileCPU writes a CPU profile of fn's execution to filename.
// Profiling stops (and the file is closed) before the function returns.
func profileCPU(filename string, fn func()) error {
	out, err := os.Create(filename)
	if err != nil {
		return err
	}
	defer out.Close()
	if err := pprof.StartCPUProfile(out); err != nil {
		return err
	}
	// LIFO defers: profiling stops first, then the file is closed.
	defer pprof.StopCPUProfile()
	fn() // run the workload under the profiler
	return nil
}
// profileMemory dumps a heap profile of the current live set to filename.
func profileMemory(filename string) error {
	// Force a collection first so the snapshot reflects only live objects.
	runtime.GC()
	out, err := os.Create(filename)
	if err != nil {
		return err
	}
	defer out.Close()
	return pprof.WriteHeapProfile(out)
}
// traceExecution records a runtime execution trace of fn to filename,
// viewable afterwards with `go tool trace`.
func traceExecution(filename string, fn func()) error {
	out, err := os.Create(filename)
	if err != nil {
		return err
	}
	defer out.Close()
	if err := trace.Start(out); err != nil {
		return err
	}
	defer trace.Stop() // stop tracing before the file is closed
	fn()
	return nil
}
// main demonstrates the three profiling helpers. Unlike the original,
// each helper's error is checked instead of silently discarded — a
// failed profile write would otherwise go unnoticed.
func main() {
	// CPU profiling
	if err := profileCPU("cpu.pprof", func() {
		heavyComputation()
	}); err != nil {
		panic(err)
	}

	// Memory profiling
	manyAllocations()
	if err := profileMemory("mem.pprof"); err != nil {
		panic(err)
	}

	// Execution tracing
	if err := traceExecution("trace.out", func() {
		concurrentWork()
	}); err != nil {
		panic(err)
	}
}
# Interactive pprof analysis
go tool pprof cpu.pprof
# pprof commands
(pprof) top10 # Top 10 functions
(pprof) web # View graph in browser (requires graphviz)
(pprof) list main. # Show source for functions in package
(pprof) tree # Call tree
(pprof) png > graph.png # Save as image
# Execution trace analysis
go tool trace trace.out
Memory Optimization
Reduce Heap Allocations
package main
import (
	"bytes"
	"fmt"
	"strings"
	"sync"
)
// Bad: allocate a fresh zero-capacity slice on each call.
// processRequestsBad uppercases every item; because result starts with
// no capacity, append repeatedly reallocates and copies as it grows.
func processRequestsBad(items []string) []string {
	out := []string{} // zero capacity: every growth step may reallocate
	for _, s := range items {
		out = append(out, strings.ToUpper(s))
	}
	return out
}
// Good: pre-specify capacity.
// processRequestsGood uppercases every item into a slice whose backing
// array is allocated exactly once up front.
func processRequestsGood(items []string) []string {
	out := make([]string, 0, len(items)) // single allocation, no regrowth
	for _, s := range items {
		out = append(out, strings.ToUpper(s))
	}
	return out
}
// sync.Pool — reuse frequently allocated/deallocated objects.
//
// NOTE: a *bytes.Buffer is pooled here instead of the original
// *strings.Builder, because strings.Builder.Reset discards its backing
// array entirely — pooling one keeps the struct but none of its
// capacity, defeating the point. bytes.Buffer.Reset retains capacity
// for the next user.
var bufferPool = sync.Pool{
	New: func() interface{} {
		return &bytes.Buffer{}
	},
}

// buildStringWithPool concatenates parts using a pooled scratch buffer.
func buildStringWithPool(parts []string) string {
	buf := bufferPool.Get().(*bytes.Buffer)
	buf.Reset() // clear previous contents, keep capacity
	defer bufferPool.Put(buf)
	for _, p := range parts {
		buf.WriteString(p)
	}
	// Buffer.String copies, so returning it and reusing buf is safe.
	return buf.String()
}
// Value type vs pointer type.
type SmallStruct struct {
	X, Y int
}

// Small structs pass by value: the copy is two machine words, often
// cheaper than a pointer dereference and it keeps the value off the heap.
// processSmall returns the sum of both coordinates.
func processSmall(s SmallStruct) int {
	return s.Y + s.X
}
type LargeStruct struct {
	Data [1024]byte
	Meta string
}

// Large structs pass by pointer to avoid copying ~1KB per call.
// processLarge reports the size of the fixed data payload.
func processLarge(s *LargeStruct) int {
	return len(s.Data)
}
// main exercises the pre-sized-slice and pooled-buffer helpers.
func main() {
	words := []string{"hello", "world", "go"}
	fmt.Println(processRequestsGood(words))

	pieces := []string{"foo", "bar", "baz"}
	fmt.Println(buildStringWithPool(pieces))
}
Escape Analysis
// escape_test.go
package main
import "testing"
// Cases where values escape to the heap vs stay on the stack.

// stackAlloc returns a constant by value; no pointer leaves the
// function, so escape analysis keeps the local on the stack.
func stackAlloc() int {
	v := 42 // stack-allocated: never referenced from outside
	return v
}
// heapAlloc returns the address of a local, which forces escape
// analysis to move it to the heap (the stack frame dies at return).
func heapAlloc() *int {
	v := 42 // moved to heap: its address outlives the call
	return &v
}
// BenchmarkStackAlloc measures the non-escaping path; expect 0 allocs/op
// under `go test -bench=. -benchmem`.
func BenchmarkStackAlloc(b *testing.B) {
	for n := 0; n < b.N; n++ {
		_ = stackAlloc()
	}
}

// BenchmarkHeapAlloc measures the escaping path; expect 1 alloc/op,
// visible when compared against BenchmarkStackAlloc with benchstat.
func BenchmarkHeapAlloc(b *testing.B) {
	for n := 0; n < b.N; n++ {
		_ = heapAlloc()
	}
}
# Show escape analysis output
go build -gcflags="-m -m" ./...
# Output example:
# ./main.go:15:6: moved to heap: x ← escapes to heap
# ./main.go:10:6: x does not escape ← stays on stack
CPU Optimization
Compiler Inlining
package main
import "fmt"
// Small, simple functions are inlined automatically. (The inliner works
// on an AST-node cost budget — roughly 80 nodes — not a fixed
// "instruction count"; the original "~10 instructions" figure was wrong.)
func add(a, b int) int {
	return a + b // inlined at call sites
}
// Go has no "force inline" directive. The original tagged this function
// //go:nosplit and called it an inlining hint — that is incorrect:
// go:nosplit only omits the stack-growth check in the prologue and has
// nothing to do with inlining. The only inlining directive is
// //go:noinline, which *prevents* inlining. Small functions like this
// one are inlined automatically anyway, so the directive is removed.
func fastPath(n int) int {
	return n * n
}
// Recursion disqualifies a function from inlining (as do closures and
// a few other constructs).
// recursive computes n! (factorial); any n <= 1 yields 1.
func recursive(n int) int {
	if n > 1 {
		return n * recursive(n-1)
	}
	return 1
}
// Check inlining decisions with:
//   go build -gcflags="-m" main.go
// e.g. ./main.go:8:6: can inline add
func main() {
	sum := add(1, 2)
	fmt.Println(sum)
	fmt.Println(fastPath(5))
}
SIMD and Assembly (Advanced)
// sum_amd64.go — Connect assembly implementation
package compute
// SumSliceASM sums data using a hand-written assembly routine.
// The declaration has no Go body on purpose: the implementation lives in
// sum_amd64.s in the same package (a bodiless func is only legal when
// the body is supplied elsewhere, e.g. by an assembly file).
func SumSliceASM(data []float32) float32
// SumSlice is the pure-Go fallback for platforms without the assembly
// implementation. It returns the sum of all elements, left to right;
// a nil or empty slice sums to 0.
func SumSlice(data []float32) float32 {
	var total float32
	for i := range data {
		total += data[i]
	}
	return total
}
Cache-Friendly Data Structures
package main
import "fmt"
// Bad: AoS (Array of Structs) — causes cache misses.
type ParticleAoS struct {
	X, Y, Z    float32 // Position
	VX, VY, VZ float32 // Velocity
	Mass       float32
}

// updatePositionsAoS integrates positions forward by one time step dt.
// Every field of each struct is pulled through the cache even though
// only position and velocity are touched.
func updatePositionsAoS(particles []ParticleAoS, dt float32) {
	for i := range particles {
		p := &particles[i]
		p.X += p.VX * dt
		p.Y += p.VY * dt
		p.Z += p.VZ * dt
	}
}
// Good: SoA (Struct of Arrays) — cache efficient.
type ParticlesSoA struct {
	X, Y, Z    []float32 // Position arrays
	VX, VY, VZ []float32 // Velocity arrays
	Mass       []float32
}

// updatePositionsSoA integrates positions forward by one time step dt.
// Each component array is scanned sequentially, so cache lines carry
// only the data actually being updated.
func updatePositionsSoA(p *ParticlesSoA, dt float32) {
	x, y, z := p.X, p.Y, p.Z
	vx, vy, vz := p.VX, p.VY, p.VZ
	for i := range x {
		x[i] += vx[i] * dt
		y[i] += vy[i] * dt
		z[i] += vz[i] * dt
	}
}
// main runs one update step over 10,000 particles in both layouts.
func main() {
	const n = 10000

	// AoS layout
	aos := make([]ParticleAoS, n)
	updatePositionsAoS(aos, 0.016)

	// SoA layout (faster)
	soa := &ParticlesSoA{
		X: make([]float32, n), Y: make([]float32, n), Z: make([]float32, n),
		VX: make([]float32, n), VY: make([]float32, n), VZ: make([]float32, n),
	}
	updatePositionsSoA(soa, 0.016)

	fmt.Println("Update complete")
}
Goroutine and Concurrency Optimization
Worker Pool Pattern
package main
import (
"fmt"
"sync"
)
// Prevent unbounded goroutine creation: use a worker pool.
// WorkerPool runs submitted tasks on a fixed set of goroutines.
type WorkerPool struct {
	tasks chan func()
	wg    sync.WaitGroup
}

// NewWorkerPool starts `workers` goroutines that drain the task queue.
// The queue is buffered at 10x the worker count so Submit rarely blocks.
func NewWorkerPool(workers int) *WorkerPool {
	p := &WorkerPool{tasks: make(chan func(), workers*10)}
	p.wg.Add(workers)
	for w := 0; w < workers; w++ {
		go func() {
			defer p.wg.Done()
			for task := range p.tasks {
				task()
			}
		}()
	}
	return p
}

// Submit enqueues a task; it blocks while the queue is full.
// Calling Submit after Close panics (send on a closed channel).
func (p *WorkerPool) Submit(task func()) {
	p.tasks <- task
}

// Close stops accepting tasks and waits until the workers have
// finished everything already queued.
func (p *WorkerPool) Close() {
	close(p.tasks)
	p.wg.Wait()
}
// main fans 100 squaring tasks out over a 4-worker pool and prints a
// sample of the results.
func main() {
	pool := NewWorkerPool(4) // Tune to CPU cores

	results := make([]int, 100)
	var mu sync.Mutex
	for i := 0; i < 100; i++ {
		i := i // capture loop variable (required before Go 1.22)
		pool.Submit(func() {
			sq := i * i
			mu.Lock()
			results[i] = sq
			mu.Unlock()
		})
	}
	pool.Close()
	fmt.Printf("Sample results: %v\n", results[:5])
}
Channel vs Mutex Selection
package main
import (
"fmt"
"sync"
"sync/atomic"
)
// atomic — simple counter (fastest).
// AtomicCounter uses the typed atomic.Int64 (Go 1.19+, already required
// by this file's use of debug.SetMemoryLimit) instead of the bare
// atomic.AddInt64/LoadInt64 functions: it cannot be accessed
// non-atomically by mistake and is guaranteed correctly aligned on
// 32-bit platforms.
type AtomicCounter struct {
	value atomic.Int64
}

// Increment adds one to the counter; safe for concurrent use.
func (c *AtomicCounter) Increment() {
	c.value.Add(1)
}

// Get returns the current count; safe for concurrent use.
func (c *AtomicCounter) Get() int64 {
	return c.value.Load()
}
// sync.Mutex — protect complex state.
// SafeMap is a mutex-guarded string→int map whose zero value is ready
// to use: the underlying map is created lazily on first Set. (The
// original version had no initialization anywhere, so Set on a
// zero-value SafeMap panicked with "assignment to entry in nil map".)
type SafeMap struct {
	mu   sync.RWMutex
	data map[string]int
}

// Set stores val under key, allocating the map on first use.
func (m *SafeMap) Set(key string, val int) {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.data == nil {
		m.data = make(map[string]int)
	}
	m.data[key] = val
}

// Get returns the value for key and whether it was present.
// Reading from a nil map is safe, so no lazy init is needed here.
func (m *SafeMap) Get(key string) (int, bool) {
	m.mu.RLock() // Read lock so concurrent readers proceed in parallel
	defer m.mu.RUnlock()
	v, ok := m.data[key]
	return v, ok
}
// sync.Map — suited to read-heavy workloads with mostly-disjoint key
// sets; for anything else, a plain map guarded by a mutex is usually
// simpler and faster.
var globalCache sync.Map

// cacheGet looks up key, reporting whether it was present.
func cacheGet(key string) (interface{}, bool) {
	v, ok := globalCache.Load(key)
	return v, ok
}

// cacheSet stores val under key.
func cacheSet(key string, val interface{}) {
	globalCache.Store(key, val)
}
// main hammers the atomic counter from 1000 goroutines and verifies the
// final count by printing it.
func main() {
	counter := &AtomicCounter{}

	var wg sync.WaitGroup
	wg.Add(1000)
	for i := 0; i < 1000; i++ {
		go func() {
			defer wg.Done()
			counter.Increment()
		}()
	}
	wg.Wait()
	fmt.Printf("Counter: %d\n", counter.Get()) // 1000
}
Garbage Collector Tuning
package main
import (
"fmt"
"os"
"runtime"
"runtime/debug"
)
// init applies GC tuning before main runs. These are process-wide
// settings; in real services prefer the GOGC / GOMEMLIMIT environment
// variables so operators can tune without a rebuild.
func init() {
	// GOGC: GC trigger threshold (default 100 = trigger when heap grows 100%).
	//   Higher → less frequent GC (saves CPU, uses more memory)
	//   Lower  → more frequent GC (better memory efficiency, more CPU)
	// Environment equivalents:
	//   GOGC=200 ./app  → run GC less often
	//   GOGC=off ./app  → disable GC entirely (short-lived batch jobs only)
	debug.SetGCPercent(200) // Reduce GC frequency if memory is available

	// Go 1.19+: soft memory limit (GOMEMLIMIT=512MiB ./app is equivalent).
	debug.SetMemoryLimit(512 << 20) // 512 MiB
}
// printMemStats prints a few headline numbers from runtime.MemStats.
// Note ReadMemStats briefly stops the world, so avoid calling it in
// hot paths.
func printMemStats() {
	var stats runtime.MemStats
	runtime.ReadMemStats(&stats)
	fmt.Printf("Heap in use: %d MB\n", stats.HeapInuse/1024/1024)
	fmt.Printf("Total heap allocated: %d MB\n", stats.TotalAlloc/1024/1024)
	fmt.Printf("GC runs: %d\n", stats.NumGC)
	fmt.Printf("GC pause time (total): %d ms\n", stats.PauseTotalNs/1e6)
}
// main demonstrates forcing a collection, releasing a large temporary
// allocation, and inspecting the runtime's tuning knobs.
func main() {
	// Force GC
	runtime.GC()
	printMemStats()

	// Manual GC hint (after releasing large temporary data)
	scratch := make([]byte, 100*1024*1024) // 100MB temporary allocation
	_ = scratch
	scratch = nil
	runtime.GC() // Request immediate cleanup
	printMemStats()

	// Check CPU/memory limit environment variables
	fmt.Printf("GOMAXPROCS: %d\n", runtime.GOMAXPROCS(0))
	fmt.Printf("GOGC: %s\n", os.Getenv("GOGC"))
}
Performance Optimization Cheat Sheet
// 1. String concatenation: Builder > + operator
// Bad
s := ""
for _, word := range words {
s += word + " " // New string allocation each time
}
// Good
var sb strings.Builder
sb.Grow(totalSize) // Pre-reserve capacity
for _, word := range words {
sb.WriteString(word)
sb.WriteByte(' ')
}
s := sb.String()
// 2. Slice append: pre-specify capacity
result := make([]T, 0, expectedSize)
// 3. Map initialization: capacity hint
m := make(map[string]int, expectedSize)
// 4. Struct field ordering: minimize padding
// Bad (24 bytes)
type BadStruct struct {
A bool // 1 byte + 7 padding
B int64 // 8 bytes
C bool // 1 byte + 7 padding
}
// Good (16 bytes)
type GoodStruct struct {
B int64 // 8 bytes
A bool // 1 byte
C bool // 1 byte + 6 padding
}
// 5. Minimize interface boxing
// Bad: boxing in loop
for _, v := range items {
process(interface{}(v)) // Boxing on each iteration
}
// Good: process concrete type directly
for _, v := range items {
processTyped(v)
}
Performance Measurement Tools
# 1. Run benchmarks
go test -bench=. -benchmem -count=5 ./...
# 2. CPU profile
go test -bench=. -cpuprofile=cpu.pprof ./...
go tool pprof cpu.pprof
# 3. Memory profile
go test -bench=. -memprofile=mem.pprof ./...
go tool pprof -alloc_space mem.pprof
# 4. Execution trace
go test -bench=. -trace=trace.out ./...
go tool trace trace.out
# 5. Escape analysis
go build -gcflags="-m" ./...
# 6. Assembly output
go build -gcflags="-S" ./...
# 7. Disable optimizations (debugging)
go build -gcflags="-N -l" ./...
# 8. benchstat — statistical comparison
go install golang.org/x/perf/cmd/benchstat@latest
go test -bench=. -count=10 > old.txt
# After code change
go test -bench=. -count=10 > new.txt
benchstat old.txt new.txt
Key Takeaways
| Optimization Area | Technique | Effect |
|---|---|---|
| Memory allocation | Specify capacity in make | Prevent reallocations |
| Object reuse | sync.Pool | Reduce GC pressure |
| String building | strings.Builder | Reduce copy operations |
| Concurrency | Worker pools | Control goroutine overhead |
| Counters | sync/atomic | Remove lock overhead |
| Cache efficiency | SoA layout | Improve CPU cache hit rate |
| GC tuning | GOGC, GOMEMLIMIT | Balance GC frequency/memory |
- Measure first: Profile with pprof to find bottlenecks, then optimize
- Beware micro-optimization: Making a 1% function 10x faster only improves overall by 0.9%
- Balance readability vs performance: Don't increase code complexity without clear gains