Skip to main content

Go Performance Optimization — Profiling and Tuning

Go has a built-in garbage collector, goroutine scheduler, and memory allocator. Optimizing by guesswork without proper profiling often backfires. The key is: "Measure → Analyze → Optimize."


pprof Profiling

HTTP Server Profiling Endpoint

// main.go — Integrate pprof into production apps
package main

import (
"log"
"net/http"
_ "net/http/pprof" // Register handlers as side effect
"time"
)

func main() {
	// Run the pprof endpoint on its own port; expose it only internally.
	const pprofAddr = ":6060"
	go func() {
		log.Println("pprof server: :6060")
		log.Fatal(http.ListenAndServe(pprofAddr, nil))
	}()

	// The real application listens on :8080.
	router := http.NewServeMux()
	router.HandleFunc("/", expensiveHandler)
	log.Fatal(http.ListenAndServe(":8080", router))
}

func expensiveHandler(w http.ResponseWriter, r *http.Request) {
// Simulate CPU-intensive work
sum := 0
for i := 0; i < 10_000_000; i++ {
sum += i
}
time.Sleep(10 * time.Millisecond) // Simulate I/O wait
w.Write([]byte("OK"))
}
# Collect CPU profile for 30 seconds
go tool pprof "http://localhost:6060/debug/pprof/profile?seconds=30"   # quote the URL: ? is a shell glob character

# Memory profile
go tool pprof http://localhost:6060/debug/pprof/heap

# Goroutine stack dump
curl "http://localhost:6060/debug/pprof/goroutine?debug=2"   # quote the URL: ? is a shell glob character

# Mutex contention analysis
go tool pprof http://localhost:6060/debug/pprof/mutex

# Blocking operation analysis
go tool pprof http://localhost:6060/debug/pprof/block

Programmatic Profiling

// profiling.go
package main

import (
"os"
"runtime"
"runtime/pprof"
"runtime/trace"
)

// profileCPU records a CPU profile of fn's execution into filename.
// The profile is stopped (and flushed) before the file is closed,
// because deferred calls run in LIFO order.
func profileCPU(filename string, fn func()) error {
	out, err := os.Create(filename)
	if err != nil {
		return err
	}
	defer out.Close()

	if err = pprof.StartCPUProfile(out); err != nil {
		return err
	}
	defer pprof.StopCPUProfile()

	// Run the workload while sampling is active.
	fn()
	return nil
}

// profileMemory writes a heap profile to filename. It forces a collection
// first so the snapshot reflects only live objects, not garbage that
// happens to be awaiting the next GC cycle.
func profileMemory(filename string) error {
	runtime.GC()

	out, err := os.Create(filename)
	if err != nil {
		return err
	}
	defer out.Close()
	return pprof.WriteHeapProfile(out)
}

// traceExecution captures a runtime execution trace of fn into filename,
// suitable for inspection with `go tool trace`.
func traceExecution(filename string, fn func()) error {
	out, err := os.Create(filename)
	if err != nil {
		return err
	}
	defer out.Close()

	if err = trace.Start(out); err != nil {
		return err
	}
	defer trace.Stop()

	// Everything fn does (goroutines, GC, syscalls) lands in the trace.
	fn()
	return nil
}

// main demonstrates the three profiling helpers in sequence.
// Fix: the original discarded the error returned by every helper, so a
// failure to create or write a profile file went completely unnoticed.
func main() {
	// CPU profiling
	if err := profileCPU("cpu.pprof", func() { heavyComputation() }); err != nil {
		panic(err)
	}

	// Memory profiling
	manyAllocations()
	if err := profileMemory("mem.pprof"); err != nil {
		panic(err)
	}

	// Execution tracing
	if err := traceExecution("trace.out", func() { concurrentWork() }); err != nil {
		panic(err)
	}
}
# Interactive pprof analysis
go tool pprof cpu.pprof

# pprof commands
(pprof) top10 # Top 10 functions
(pprof) web # View graph in browser (requires graphviz)
(pprof) list main. # Show source for functions in package
(pprof) tree # Call tree
(pprof) png > graph.png # Save as image

# Execution trace analysis
go tool trace trace.out

Memory Optimization

Reduce Heap Allocations

package main

import (
	"bytes"
	"fmt"
	"strings"
	"sync"
)

// Bad: allocates a fresh slice with no capacity hint on every call,
// so append must repeatedly grow and copy the backing array.
func processRequestsBad(items []string) []string {
	out := []string{} // heap allocation per call, capacity 0
	for i := 0; i < len(items); i++ {
		out = append(out, strings.ToUpper(items[i]))
	}
	return out
}

// Good: the output length is known up front, so allocate exactly once.
func processRequestsGood(items []string) []string {
	upper := make([]string, len(items)) // single allocation, final size
	for i, s := range items {
		upper[i] = strings.ToUpper(s)
	}
	return upper
}

// sync.Pool — reuse frequently allocated/deallocated objects.
//
// Fix: the original pooled *strings.Builder, but strings.Builder.Reset()
// drops its backing buffer entirely, so a pooled Builder never actually
// reuses memory — the pool was pure overhead. bytes.Buffer keeps its
// capacity across Reset, which is the whole point of pooling.
var bufferPool = sync.Pool{
	New: func() interface{} {
		return new(bytes.Buffer)
	},
}

// buildStringWithPool concatenates parts using a pooled scratch buffer,
// avoiding a fresh buffer allocation (and regrowth) on every call.
func buildStringWithPool(parts []string) string {
	buf := bufferPool.Get().(*bytes.Buffer)
	buf.Reset() // the pool may hand back a buffer with stale contents
	defer bufferPool.Put(buf)

	for _, p := range parts {
		buf.WriteString(p)
	}
	return buf.String()
}

// Value type vs pointer type
type SmallStruct struct {
	X, Y int
}

// Small structs are cheap to pass by value: the two-word copy can be
// faster than chasing a pointer and defeats no compiler optimizations.
func processSmall(s SmallStruct) int {
	sum := s.X
	sum += s.Y
	return sum
}

type LargeStruct struct {
	Data [1024]byte
	Meta string
}

// Large structs travel by pointer: passing by value would copy ~1KB
// on every call.
func processLarge(s *LargeStruct) int {
	size := len(s.Data)
	return size
}

func main() {
	fmt.Println(processRequestsGood([]string{"hello", "world", "go"}))
	fmt.Println(buildStringWithPool([]string{"foo", "bar", "baz"}))
}

Escape Analysis

// escape_test.go
package main

import "testing"

// Cases where values escape to heap vs stay on stack
func stackAlloc() int {
	local := 42 // returned by copy, so the value never leaves the stack
	return local
}

func heapAlloc() *int {
	boxed := 42 // its address is returned, so escape analysis moves it to the heap
	return &boxed
}

// Sinks for benchmark results. Fix: the original discarded each result
// with `_ =`, which lets the compiler inline and dead-code-eliminate the
// call under test, producing a meaningless benchmark. Storing into a
// package-level variable forces the result to be materialized.
var (
	sinkInt int
	sinkPtr *int
)

func BenchmarkStackAlloc(b *testing.B) {
	for i := 0; i < b.N; i++ {
		sinkInt = stackAlloc()
	}
}

func BenchmarkHeapAlloc(b *testing.B) {
	for i := 0; i < b.N; i++ {
		sinkPtr = heapAlloc()
	}
}
# Show escape analysis output
go build -gcflags="-m -m" ./...

# Output example:
# ./main.go:15:6: moved to heap: x ← escapes to heap
# ./main.go:10:6: x does not escape ← stays on stack

CPU Optimization

Compiler Inlining

package main

import "fmt"

// Small functions are automatically inlined (~10 instructions or less)
func add(a, b int) int {
	result := a + b // call sites receive this expression directly
	return result
}

// fastPath returns n squared.
//
// Fix: the original carried //go:nosplit labeled as a "force inlining
// hint". That is wrong on both counts: //go:nosplit only omits the
// stack-growth check preamble and has nothing to do with inlining, and
// Go has no force-inline directive at all (//go:noinline exists only to
// *prevent* inlining). Small leaf functions like this are inlined
// automatically; verify with go build -gcflags="-m".
func fastPath(n int) int {
	return n * n
}

// Recursion prevents inlining: the compiler cannot inline a function
// into itself. (Factorial, shown here purely as a recursion example.)
func recursive(n int) int {
	if n > 1 {
		return n * recursive(n-1)
	}
	return 1
}

// Check inlining status
// go build -gcflags="-m" main.go
// ./main.go:8:6: can inline add

func main() {
	sum := add(1, 2)
	squared := fastPath(5)
	fmt.Println(sum)
	fmt.Println(squared)
}

SIMD and Assembly (Advanced)

// sum_amd64.go — Connect assembly implementation
package compute

// SumSliceASM returns the sum of data's elements.
// A Go function declaration without a body links against an assembly
// implementation — here provided in sum_amd64.s (e.g. a SIMD loop).
// Building this file without the .s file present is a link error.
func SumSliceASM(data []float32) float32

// Pure Go fallback, used on platforms without the assembly version.
func SumSlice(data []float32) float32 {
	total := float32(0)
	for i := range data {
		total += data[i]
	}
	return total
}

Cache-Friendly Data Structures

package main

import "fmt"

// Bad: AoS (Array of Structs) — every position update drags the whole
// struct (velocity, mass and all) through the cache line by line, even
// though only six of the seven fields are touched.
type ParticleAoS struct {
	X, Y, Z    float32 // Position
	VX, VY, VZ float32 // Velocity
	Mass       float32
}

// updatePositionsAoS advances each particle's position by velocity*dt.
func updatePositionsAoS(particles []ParticleAoS, dt float32) {
	for i := range particles {
		p := &particles[i] // index once; avoid copying the struct
		p.X += p.VX * dt
		p.Y += p.VY * dt
		p.Z += p.VZ * dt
	}
}

// Good: SoA (Struct of Arrays) — each component lives in its own
// contiguous array, so the update streams sequentially through memory.
type ParticlesSoA struct {
	X, Y, Z    []float32 // Position arrays
	VX, VY, VZ []float32 // Velocity arrays
	Mass       []float32
}

// updatePositionsSoA advances all positions by velocity*dt.
func updatePositionsSoA(p *ParticlesSoA, dt float32) {
	for i := 0; i < len(p.X); i++ {
		p.X[i] += p.VX[i] * dt
		p.Y[i] += p.VY[i] * dt
		p.Z[i] += p.VZ[i] * dt
	}
}

func main() {
	// AoS approach
	aos := make([]ParticleAoS, 10000)
	updatePositionsAoS(aos, 0.016)

	// SoA approach (faster)
	const count = 10000
	soa := &ParticlesSoA{
		X:  make([]float32, count),
		Y:  make([]float32, count),
		Z:  make([]float32, count),
		VX: make([]float32, count),
		VY: make([]float32, count),
		VZ: make([]float32, count),
	}
	updatePositionsSoA(soa, 0.016)
	fmt.Println("Update complete")
}

Goroutine and Concurrency Optimization

Worker Pool Pattern

package main

import (
"fmt"
"sync"
)

// WorkerPool bounds concurrency: a fixed set of goroutines drains a
// shared task queue instead of spawning one goroutine per task.
type WorkerPool struct {
	tasks chan func()
	wg    sync.WaitGroup
}

// NewWorkerPool starts the given number of worker goroutines. The task
// queue is buffered (10 slots per worker) so Submit rarely blocks.
func NewWorkerPool(workers int) *WorkerPool {
	p := &WorkerPool{tasks: make(chan func(), workers*10)}
	p.wg.Add(workers)
	for w := 0; w < workers; w++ {
		go func() {
			defer p.wg.Done()
			for job := range p.tasks {
				job()
			}
		}()
	}
	return p
}

// Submit enqueues a task; it blocks when the queue is full.
func (p *WorkerPool) Submit(task func()) {
	p.tasks <- task
}

// Close stops accepting tasks and waits for in-flight work to drain.
func (p *WorkerPool) Close() {
	close(p.tasks)
	p.wg.Wait()
}

func main() {
	pool := NewWorkerPool(4) // Tune to CPU cores

	results := make([]int, 100)
	var mu sync.Mutex

	for i := 0; i < 100; i++ {
		n := i // per-iteration copy (required before Go 1.22 loop semantics)
		pool.Submit(func() {
			sq := n * n
			mu.Lock()
			results[n] = sq
			mu.Unlock()
		})
	}

	pool.Close()
	fmt.Printf("Sample results: %v\n", results[:5])
}

Channel vs Mutex Selection

package main

import (
"fmt"
"sync"
"sync/atomic"
)

// atomic — simple counter (fastest).
// Modernized to the typed atomic.Int64 (Go 1.19+, which this article
// already assumes for GOMEMLIMIT): the type makes accidental non-atomic
// access impossible and guarantees 64-bit alignment on 32-bit platforms,
// where a plain int64 field plus atomic.AddInt64 can panic if misaligned.
type AtomicCounter struct {
	value atomic.Int64
}

// Increment atomically adds one to the counter.
func (c *AtomicCounter) Increment() {
	c.value.Add(1)
}

// Get atomically reads the current count.
func (c *AtomicCounter) Get() int64 {
	return c.value.Load()
}

// sync.Mutex — protect complex state.
//
// Fix: the original never initialized `data` and provided no constructor,
// so calling Set on a zero-value SafeMap panicked ("assignment to entry
// in nil map"). Set now lazily initializes the map, making the zero
// value usable — the idiomatic Go contract for a type like this.
type SafeMap struct {
	mu   sync.RWMutex
	data map[string]int
}

// Set stores val under key. Safe for concurrent use; the zero value of
// SafeMap works without any explicit initialization.
func (m *SafeMap) Set(key string, val int) {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.data == nil {
		m.data = make(map[string]int)
	}
	m.data[key] = val
}

// Get returns the value for key and whether it was present.
// Reading a nil map is safe, so no init check is needed here.
func (m *SafeMap) Get(key string) (int, bool) {
	m.mu.RLock() // read lock allows parallel readers
	defer m.mu.RUnlock()
	v, ok := m.data[key]
	return v, ok
}

// sync.Map — suited to read-heavy, write-rarely workloads.
var globalCache sync.Map

// cacheGet looks up key; the second result reports presence.
func cacheGet(key string) (interface{}, bool) {
	v, ok := globalCache.Load(key)
	return v, ok
}

// cacheSet stores val under key, replacing any existing entry.
func cacheSet(key string, val interface{}) {
	globalCache.Store(key, val)
}

func main() {
	counter := &AtomicCounter{}
	var wg sync.WaitGroup
	wg.Add(1000) // register all goroutines up front
	for i := 0; i < 1000; i++ {
		go func() {
			counter.Increment()
			wg.Done()
		}()
	}
	wg.Wait()
	fmt.Printf("Counter: %d\n", counter.Get()) // 1000
}

Garbage Collector Tuning

package main

import (
"fmt"
"os"
"runtime"
"runtime/debug"
)

// init applies GC tuning before main runs.
//
// NOTE(review): this mutates process-wide runtime settings as an import
// side effect; in production code prefer doing it explicitly at the top
// of main so the configuration is visible at the call site.
func init() {
	// GOGC: GC trigger threshold (default 100 = trigger when heap grows 100%)
	// Higher = less frequent GC (saves CPU, increases memory)
	// Lower = more frequent GC (better memory efficiency, more CPU)
	//
	// Set via environment variable:
	//   GOGC=200 ./app → Run GC less often (save CPU, use more memory)
	//   GOGC=off ./app → Disable GC (batch jobs only)

	// In-code equivalent of GOGC; returns the previous value (ignored here).
	debug.SetGCPercent(200) // Reduce GC frequency if memory available

	// Go 1.19+: soft memory limit, same knob as the GOMEMLIMIT env var.
	// GOMEMLIMIT=512MiB ./app
	debug.SetMemoryLimit(512 * 1024 * 1024) // 512MB
}

// printMemStats dumps a few headline numbers from runtime.MemStats.
// ReadMemStats stops the world briefly, so avoid calling it in hot paths.
func printMemStats() {
	var stats runtime.MemStats
	runtime.ReadMemStats(&stats)

	const mb = 1024 * 1024
	fmt.Printf("Heap in use: %d MB\n", stats.HeapInuse/mb)
	fmt.Printf("Total heap allocated: %d MB\n", stats.TotalAlloc/mb)
	fmt.Printf("GC runs: %d\n", stats.NumGC)
	fmt.Printf("GC pause time (total): %d ms\n", stats.PauseTotalNs/1e6)
}

func main() {
	// Force a collection so the first report reflects a settled heap.
	runtime.GC()
	printMemStats()

	// Allocate a large temporary, drop the reference, then ask the GC
	// to reclaim it immediately instead of waiting for the next cycle.
	scratch := make([]byte, 100*1024*1024) // 100MB temporary allocation
	_ = scratch
	scratch = nil
	runtime.GC()
	printMemStats()

	// Show the effective runtime configuration.
	fmt.Printf("GOMAXPROCS: %d\n", runtime.GOMAXPROCS(0))
	fmt.Printf("GOGC: %s\n", os.Getenv("GOGC"))
}

Performance Optimization Cheat Sheet

// 1. String concatenation: Builder > + operator
// Bad
s := ""
for _, word := range words {
s += word + " " // New string allocation each time
}

// Good
var sb strings.Builder
sb.Grow(totalSize) // Pre-reserve capacity
for _, word := range words {
sb.WriteString(word)
sb.WriteByte(' ')
}
s := sb.String()

// 2. Slice append: pre-specify capacity
result := make([]T, 0, expectedSize)

// 3. Map initialization: capacity hint
m := make(map[string]int, expectedSize)

// 4. Struct field ordering: minimize padding
// Bad (24 bytes)
type BadStruct struct {
A bool // 1 byte + 7 padding
B int64 // 8 bytes
C bool // 1 byte + 7 padding
}

// Good (16 bytes)
type GoodStruct struct {
B int64 // 8 bytes
A bool // 1 byte
C bool // 1 byte + 6 padding
}

// 5. Minimize interface boxing
// Bad: boxing in loop
for _, v := range items {
process(interface{}(v)) // Boxing on each iteration
}

// Good: process concrete type directly
for _, v := range items {
processTyped(v)
}

Performance Measurement Tools

# 1. Run benchmarks
go test -bench=. -benchmem -count=5 ./...

# 2. CPU profile
go test -bench=. -cpuprofile=cpu.pprof ./...
go tool pprof cpu.pprof

# 3. Memory profile
go test -bench=. -memprofile=mem.pprof ./...
go tool pprof -alloc_space mem.pprof

# 4. Execution trace
go test -bench=. -trace=trace.out ./...
go tool trace trace.out

# 5. Escape analysis
go build -gcflags="-m" ./...

# 6. Assembly output
go build -gcflags="-S" ./...

# 7. Disable optimizations (debugging)
go build -gcflags="-N -l" ./...

# 8. benchstat — statistical comparison
go install golang.org/x/perf/cmd/benchstat@latest
go test -bench=. -count=10 > old.txt
# After code change
go test -bench=. -count=10 > new.txt
benchstat old.txt new.txt

Key Takeaways

| Optimization Area | Technique | Effect |
|---|---|---|
| Memory allocation | Specify capacity in `make` | Prevent reallocations |
| Object reuse | `sync.Pool` | Reduce GC pressure |
| String building | `strings.Builder` | Reduce copy operations |
| Concurrency | Worker pools | Control goroutine overhead |
| Counters | `sync/atomic` | Remove lock overhead |
| Cache efficiency | SoA layout | Improve CPU cache hit rate |
| GC tuning | `GOGC`, `GOMEMLIMIT` | Balance GC frequency/memory |
  • Measure first: Profile with pprof to find bottlenecks, then optimize
  • Beware micro-optimization: Making a 1% function 10x faster only improves overall by 0.9%
  • Balance readability vs performance: Don't increase code complexity without clear gains