Go 성능 최적화 — 프로파일링과 튜닝

Go는 가비지 컬렉터, 고루틴 스케줄러, 메모리 할당기를 런타임에 내장하고 있어 코드만 보고는 실제 병목 지점을 예측하기 어렵습니다. 프로파일링 없이 추측으로 최적화하면 오히려 역효과가 날 수 있으므로 "측정 → 분석 → 최적화" 순서를 지켜야 합니다.

pprof 프로파일링

HTTP 서버 프로파일링 엔드포인트

// main.go — 프로덕션 앱에 pprof 통합
package main

import (
    "log"
    "net/http"
    _ "net/http/pprof" // 사이드 이펙트로 핸들러 등록
    "time"
)

// main starts two HTTP servers: the pprof debug endpoints on a loopback-only
// port, and the application itself on :8080.
func main() {
    // The pprof handlers are registered on http.DefaultServeMux by the blank
    // import of net/http/pprof. Bind them to the loopback interface so the
    // profiling endpoints are not reachable from other hosts.
    go func() {
        log.Println("pprof 서버: localhost:6060")
        log.Fatal(http.ListenAndServe("localhost:6060", nil))
    }()

    // The application uses its own mux, so the pprof handlers on the default
    // mux are never served on the public port.
    mux := http.NewServeMux()
    mux.HandleFunc("/", expensiveHandler)
    log.Fatal(http.ListenAndServe(":8080", mux))
}

// expensiveHandler simulates a request that burns CPU in a tight loop, then
// waits 10ms as a stand-in for I/O, and finally replies with the body "OK".
func expensiveHandler(w http.ResponseWriter, r *http.Request) {
    const iterations = 10_000_000

    // CPU-bound part: this loop is what shows up in a CPU profile.
    var total int
    for n := 0; n < iterations; n++ {
        total += n
    }

    // Simulated I/O wait: sleeping goroutines consume no CPU samples.
    time.Sleep(10 * time.Millisecond)

    w.Write([]byte("OK"))
}

# Collect a 30-second CPU profile (URL is quoted so the shell does not treat "?" as a glob)
go tool pprof "http://localhost:6060/debug/pprof/profile?seconds=30"

# Heap (memory) profile
go tool pprof http://localhost:6060/debug/pprof/heap

# Dump the stacks of all goroutines
curl "http://localhost:6060/debug/pprof/goroutine?debug=2"

# Mutex contention profile (empty unless the program calls runtime.SetMutexProfileFraction with a value > 0)
go tool pprof http://localhost:6060/debug/pprof/mutex

# Blocking-operation profile (empty unless the program calls runtime.SetBlockProfileRate with a value > 0)
go tool pprof http://localhost:6060/debug/pprof/block

프로그래밍 방식 프로파일링

// profiling.go
package main

import (
    "log"
    "os"
    "runtime"
    "runtime/pprof"
    "runtime/trace"
)

// profileCPU records a CPU profile of fn into the file named filename.
//
// It returns the error from creating the file or from starting the profiler;
// in either case fn is not run. On success fn runs once, the profiler is
// stopped and the file is closed (in that order) before nil is returned.
func profileCPU(filename string, fn func()) error {
    out, createErr := os.Create(filename)
    if createErr != nil {
        return createErr
    }
    defer out.Close()

    startErr := pprof.StartCPUProfile(out)
    if startErr != nil {
        return startErr
    }
    // Deferred after Close, so it runs first and the profile is written
    // before the file is closed.
    defer pprof.StopCPUProfile()

    fn()
    return nil
}

// profileMemory writes a heap profile to the file named filename.
//
// A garbage collection is forced first so the profile reflects the current
// live heap. It returns the error from creating the file or from writing the
// profile, and nil on success.
func profileMemory(filename string) error {
    runtime.GC()

    out, err := os.Create(filename)
    if err != nil {
        return err
    }
    defer out.Close()

    if err := pprof.WriteHeapProfile(out); err != nil {
        return err
    }
    return nil
}

// traceExecution records a runtime execution trace of fn into the file named
// filename.
//
// It returns the error from creating the file or from starting the tracer;
// in either case fn is not run. On success fn runs once, tracing is stopped
// and the file is closed before nil is returned.
func traceExecution(filename string, fn func()) error {
    out, createErr := os.Create(filename)
    if createErr != nil {
        return createErr
    }
    defer out.Close()

    startErr := trace.Start(out)
    if startErr != nil {
        return startErr
    }
    defer trace.Stop()

    fn()
    return nil
}

// main runs the three profiling helpers in sequence and aborts with a log
// message as soon as one of them fails, instead of silently continuing with a
// missing or truncated profile.
func main() {
    // CPU profiling
    if err := profileCPU("cpu.pprof", func() {
        heavyComputation()
    }); err != nil {
        log.Fatalf("cpu profile: %v", err)
    }

    // Memory profiling: allocate first, then snapshot the heap.
    manyAllocations()
    if err := profileMemory("mem.pprof"); err != nil {
        log.Fatalf("heap profile: %v", err)
    }

    // Execution tracing
    if err := traceExecution("trace.out", func() {
        concurrentWork()
    }); err != nil {
        log.Fatalf("execution trace: %v", err)
    }
}

# Interactive pprof analysis
go tool pprof cpu.pprof

# pprof commands
(pprof) top10          # top 10 functions
(pprof) web            # view the graph in a browser (requires graphviz)
(pprof) list main.     # show source for functions matching "main."
(pprof) tree           # call tree
(pprof) png > graph.png # save as an image

# Analyze the execution trace
go tool trace trace.out

메모리 최적화

힙 할당 줄이기

package main

import (
    "fmt"
    "strings"
    "sync"
)

// processRequestsBad returns the upper-cased copy of every element of items.
//
// Anti-pattern, kept for comparison: the result slice starts with no capacity,
// so append has to grow and copy it repeatedly as elements are added.
func processRequestsBad(items []string) []string {
    upper := make([]string, 0) // no capacity hint: append reallocates as it grows
    for i := 0; i < len(items); i++ {
        upper = append(upper, strings.ToUpper(items[i]))
    }
    return upper
}

// processRequestsGood returns the upper-cased copy of every element of items.
//
// The result slice is created with capacity len(items), so the appends below
// never need to reallocate.
func processRequestsGood(items []string) []string {
    n := len(items)
    upper := make([]string, 0, n) // capacity reserved up front
    for i := 0; i < n; i++ {
        upper = append(upper, strings.ToUpper(items[i]))
    }
    return upper
}

// bufferPool recycles byte-slice scratch buffers between calls.
//
// The pool stores *[]byte rather than *strings.Builder: Builder.Reset drops
// the builder's backing array, so a pooled Builder would reuse nothing.
var bufferPool = sync.Pool{
    New: func() interface{} {
        buf := make([]byte, 0, 64)
        return &buf
    },
}

// maxPooledBufferCap bounds the capacity of buffers returned to bufferPool so
// that one unusually large request does not pin a large allocation.
const maxPooledBufferCap = 64 << 10

// buildStringWithPool concatenates parts in order and returns the result.
// It returns "" when parts is empty. The scratch space used while building
// comes from bufferPool and is handed back before returning.
func buildStringWithPool(parts []string) string {
    bp := bufferPool.Get().(*[]byte)
    buf := (*bp)[:0] // keep the capacity, discard old contents

    for _, p := range parts {
        buf = append(buf, p...)
    }

    // The string conversion copies, so the result never aliases the pooled
    // buffer that a later call will overwrite.
    joined := string(buf)

    if cap(buf) <= maxPooledBufferCap {
        *bp = buf // remember any growth for the next user
        bufferPool.Put(bp)
    }
    return joined
}

// 값 타입 vs 포인터 타입
// SmallStruct is a two-word value used to illustrate pass-by-value.
type SmallStruct struct {
    X int
    Y int
}

// processSmall returns s.X + s.Y. The struct is taken by value: for a type
// this small, copying it can be cheaper than dereferencing a pointer.
func processSmall(s SmallStruct) int {
    sum := s.X + s.Y
    return sum
}

// LargeStruct carries a 1 KiB inline array, making it expensive to copy.
type LargeStruct struct {
    Data [1024]byte
    Meta string
}

// processLarge returns the length of s.Data. The struct is taken by pointer
// so the 1 KiB array is not copied on every call.
func processLarge(s *LargeStruct) int {
    size := len(s.Data)
    return size
}

// main demonstrates the pre-sized slice helper and the pooled string builder
// by printing the result of each.
func main() {
    words := []string{"hello", "world", "go"}
    upper := processRequestsGood(words)
    fmt.Println(upper)

    joined := buildStringWithPool([]string{"foo", "bar", "baz"})
    fmt.Println(joined)
}

이스케이프 분석

// escape_test.go
package main

import "testing"

// 힙으로 이스케이프되는 경우 vs 스택에 머무는 경우
// stackAlloc returns 42 by value. Only a copy of the local leaves the
// function, so nothing forces the variable itself onto the heap.
func stackAlloc() int {
    var x int = 42
    return x
}

// heapAlloc returns a pointer to a fresh int holding 42. Because the address
// of the local outlives the call, the variable escapes to the heap.
func heapAlloc() *int {
    x := 42
    ptr := &x
    return ptr
}

// stackAllocSink receives the benchmark result so the compiler cannot treat
// the stackAlloc calls as dead code and remove them.
var stackAllocSink int

// BenchmarkStackAlloc measures stackAlloc and reports allocations per
// operation, so the result is comparable with BenchmarkHeapAlloc without
// needing -benchmem.
func BenchmarkStackAlloc(b *testing.B) {
    b.ReportAllocs()
    var last int
    for i := 0; i < b.N; i++ {
        last = stackAlloc()
    }
    stackAllocSink = last
}

// heapAllocSink receives every pointer returned during the benchmark. Storing
// it in a package-level variable keeps the allocation observable even if
// heapAlloc is inlined; discarding the result would let the compiler keep the
// value on the stack and measure nothing.
var heapAllocSink *int

// BenchmarkHeapAlloc measures heapAlloc and reports allocations per operation.
func BenchmarkHeapAlloc(b *testing.B) {
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        heapAllocSink = heapAlloc()
    }
}

# Print the compiler's escape-analysis decisions
go build -gcflags="-m -m" ./...

# Example output:
# ./main.go:15:6: moved to heap: x  ← escapes to the heap
# ./main.go:10:6: x does not escape ← stays on the stack

CPU 최적화

컴파일러 인라이닝 활용

package main

import "fmt"

// add returns a + b. Functions this small fit within the compiler's inlining
// cost budget; `go build -gcflags="-m"` reports the decision.
func add(a, b int) int {
    sum := a + b
    return sum
}

// fastPath returns n squared.
//
// Go has no directive that forces inlining. The previous //go:nosplit
// annotation only suppresses the stack-overflow check in the function
// prologue and has nothing to do with inlining, so it has been removed; a
// leaf function this small is an inlining candidate without any annotation.
func fastPath(n int) int {
    return n * n
}

// recursive returns n! for n > 1 and 1 for every n <= 1. It is the example of
// a function the compiler does not inline, because it calls itself.
func recursive(n int) int {
    if n > 1 {
        return n * recursive(n-1)
    }
    return 1
}

// 인라인 여부 확인
// go build -gcflags="-m" main.go
// ./main.go:8:6: can inline add

// main prints the results of the two small example functions.
func main() {
    sum := add(1, 2)
    fmt.Println(sum)

    square := fastPath(5)
    fmt.Println(square)
}

SIMD 및 어셈블리 활용 (고급)

// sum_amd64.go — 어셈블리 구현과 연결
package compute

// SumSliceASM is declared without a Go body; the implementation is expected
// to be provided in assembly (sum_amd64.s).
// NOTE(review): presumably returns the sum of data, like SumSlice — confirm
// against the assembly source.
func SumSliceASM(data []float32) float32

// SumSlice is the pure-Go fallback: it adds the elements of data from first
// to last in float32 arithmetic and returns the total (0 for an empty slice).
func SumSlice(data []float32) float32 {
    total := float32(0)
    for i := 0; i < len(data); i++ {
        total += data[i]
    }
    return total
}

캐시 친화적 데이터 구조

package main

import "fmt"

// ParticleAoS is one particle in the array-of-structs layout: position,
// velocity and mass are stored side by side for each particle.
type ParticleAoS struct {
    X, Y, Z    float32 // position
    VX, VY, VZ float32 // velocity
    Mass       float32
}

// updatePositionsAoS advances every particle's position by velocity * dt,
// in place. Each iteration walks over a whole particle record, including the
// Mass field that this update never reads.
func updatePositionsAoS(particles []ParticleAoS, dt float32) {
    for idx := 0; idx < len(particles); idx++ {
        pt := &particles[idx]
        pt.X += pt.VX * dt
        pt.Y += pt.VY * dt
        pt.Z += pt.VZ * dt
    }
}

// ParticlesSoA stores particles in the struct-of-arrays layout: one slice per
// component, with index i across all slices describing particle i.
type ParticlesSoA struct {
    X, Y, Z    []float32 // positions
    VX, VY, VZ []float32 // velocities
    Mass       []float32
}

// updatePositionsSoA advances every position by velocity * dt, in place.
// It iterates over len(p.X) elements and indexes the other five position and
// velocity slices with the same index, so they must be at least that long.
// Only the slices the update needs are touched; Mass is never read.
func updatePositionsSoA(p *ParticlesSoA, dt float32) {
    for i := 0; i < len(p.X); i++ {
        p.X[i] = p.X[i] + p.VX[i]*dt
        p.Y[i] = p.Y[i] + p.VY[i]*dt
        p.Z[i] = p.Z[i] + p.VZ[i]*dt
    }
}

// main runs one position update over 10000 particles in each layout and then
// prints a completion message.
func main() {
    const (
        count = 10000
        dt    = 0.016
    )

    // Array-of-structs layout.
    aos := make([]ParticleAoS, count)
    updatePositionsAoS(aos, dt)

    // Struct-of-arrays layout; Mass is left nil because the update never
    // reads it.
    axis := func() []float32 { return make([]float32, count) }
    soa := &ParticlesSoA{
        X: axis(), Y: axis(), Z: axis(),
        VX: axis(), VY: axis(), VZ: axis(),
    }
    updatePositionsSoA(soa, dt)

    fmt.Println("업데이트 완료")
}

고루틴과 동시성 최적화

고루틴 풀 패턴

package main

import (
    "fmt"
    "sync"
)

// WorkerPool runs submitted tasks on a fixed number of goroutines, which
// avoids spawning one goroutine per task.
type WorkerPool struct {
    tasks chan func()    // queue of pending tasks; closed by Close
    wg    sync.WaitGroup // tracks the worker goroutines
}

// NewWorkerPool starts a pool with the given number of worker goroutines and
// a task queue of workers*10 slots.
//
// A workers value below 1 is treated as 1: with zero workers nothing would
// ever drain the queue and Submit would block forever, and a negative value
// would make the queue allocation panic.
func NewWorkerPool(workers int) *WorkerPool {
    if workers < 1 {
        workers = 1
    }
    pool := &WorkerPool{
        tasks: make(chan func(), workers*10),
    }
    pool.wg.Add(workers)
    for i := 0; i < workers; i++ {
        go pool.run()
    }
    return pool
}

// run executes tasks until the queue is closed and drained.
func (p *WorkerPool) run() {
    defer p.wg.Done()
    for task := range p.tasks {
        task()
    }
}

// Submit enqueues task. It blocks while the queue is full, and panics if
// called after Close because the queue channel is closed by then.
func (p *WorkerPool) Submit(task func()) {
    p.tasks <- task
}

// Close stops accepting tasks and blocks until every queued task has run and
// all workers have exited. It must be called at most once.
func (p *WorkerPool) Close() {
    close(p.tasks)
    p.wg.Wait()
}

// main squares the numbers 0..99 on a four-worker pool and prints the first
// five results once every task has finished.
func main() {
    const taskCount = 100

    pool := NewWorkerPool(4) // size this to the number of CPU cores

    var (
        mu      sync.Mutex
        results = make([]int, taskCount)
    )

    for i := 0; i < taskCount; i++ {
        i := i // per-iteration copy for the closure below
        pool.Submit(func() {
            square := i * i
            mu.Lock()
            defer mu.Unlock()
            results[i] = square
        })
    }

    // Close waits for all queued tasks, so results is complete afterwards.
    pool.Close()
    fmt.Printf("결과 샘플: %v\n", results[:5])
}

채널 vs 뮤텍스 선택 기준

package main

import (
    "fmt"
    "sync"
    "sync/atomic"
)

// AtomicCounter is a counter that can be incremented and read from multiple
// goroutines without a lock. The zero value is ready to use and starts at 0.
type AtomicCounter struct {
    value atomic.Int64
}

// Increment atomically adds one to the counter.
func (c *AtomicCounter) Increment() {
    c.value.Add(1)
}

// Get atomically loads and returns the current count.
func (c *AtomicCounter) Get() int64 {
    return c.value.Load()
}

// SafeMap is a string-to-int map guarded by a read/write mutex. The zero
// value is ready to use: the underlying map is created on the first Set.
type SafeMap struct {
    mu   sync.RWMutex
    data map[string]int
}

// Set stores val under key, replacing any previous value.
func (m *SafeMap) Set(key string, val int) {
    m.mu.Lock()
    defer m.mu.Unlock()
    // No constructor exists, so data may still be nil here; writing to a nil
    // map would panic.
    if m.data == nil {
        m.data = make(map[string]int)
    }
    m.data[key] = val
}

// Get returns the value stored under key and whether it was present. It takes
// only the read lock, so concurrent Gets do not block each other.
func (m *SafeMap) Get(key string) (int, bool) {
    m.mu.RLock()
    defer m.mu.RUnlock()
    v, ok := m.data[key] // reading a nil map is safe and reports ok == false
    return v, ok
}

// globalCache is a process-wide concurrent cache. sync.Map is used here for
// the read-dominated case, where entries are written rarely and read often.
var globalCache sync.Map

// cacheGet returns the value stored under key and whether it was present.
func cacheGet(key string) (interface{}, bool) {
    value, found := globalCache.Load(key)
    return value, found
}

// cacheSet stores val under key, replacing any existing entry.
func cacheSet(key string, val interface{}) {
    globalCache.Store(key, val)
}

// main increments one AtomicCounter from 1000 goroutines, waits for all of
// them, and prints the final count.
func main() {
    const goroutines = 1000

    counter := new(AtomicCounter)

    var wg sync.WaitGroup
    wg.Add(goroutines)
    for n := 0; n < goroutines; n++ {
        go func() {
            defer wg.Done()
            counter.Increment()
        }()
    }
    wg.Wait()

    fmt.Printf("카운터: %d\n", counter.Get()) // 1000
}

가비지 컬렉터 튜닝

package main

import (
    "fmt"
    "os"
    "runtime"
    "runtime/debug"
)

// init applies the GC settings for this example at program start.
func init() {
    const (
        // GOGC is the GC trigger threshold (default 100: collect when the
        // heap has grown by 100%). Higher values mean fewer collections and
        // more memory use; lower values mean more collections and less memory.
        //
        // The same knob is available as an environment variable:
        //   GOGC=200 ./app   → collect less often (less CPU, more memory)
        //   GOGC=off ./app   → disable the GC (batch jobs only)
        gcPercent = 200

        // Go 1.19+: soft memory limit, also settable as GOMEMLIMIT=512MiB ./app
        memoryLimitBytes = 512 << 20 // 512 MiB
    )

    // Set both values from code.
    debug.SetGCPercent(gcPercent)
    debug.SetMemoryLimit(memoryLimitBytes)
}

// printMemStats reads the runtime memory statistics and prints the in-use
// heap, the cumulative bytes allocated, the number of completed GC cycles and
// the total GC pause time. Sizes are integer-divided down to 1024*1024-byte
// units and the pause time to milliseconds.
func printMemStats() {
    const (
        bytesPerMB    = 1024 * 1024
        nanosPerMilli = 1e6
    )

    stats := new(runtime.MemStats)
    runtime.ReadMemStats(stats)

    fmt.Printf("힙 사용: %d MB\n", stats.HeapInuse/bytesPerMB)
    fmt.Printf("힙 할당 총계: %d MB\n", stats.TotalAlloc/bytesPerMB)
    fmt.Printf("GC 횟수: %d\n", stats.NumGC)
    fmt.Printf("GC 일시정지(전체): %d ms\n", stats.PauseTotalNs/nanosPerMilli)
}

// main prints memory statistics before and after allocating and dropping a
// 100 MB buffer, then prints the effective GOMAXPROCS and the GOGC
// environment variable.
func main() {
    // Force a collection so the first report starts from a clean heap.
    runtime.GC()
    printMemStats()

    // Allocate a large temporary buffer, drop the only reference, and ask
    // the GC to reclaim it right away.
    scratch := make([]byte, 100*1024*1024) // 100MB temporary allocation
    _ = scratch
    scratch = nil
    runtime.GC()
    printMemStats()

    // GOMAXPROCS(0) queries the current value without changing it.
    procs := runtime.GOMAXPROCS(0)
    fmt.Printf("GOMAXPROCS: %d\n", procs)

    gogc := os.Getenv("GOGC")
    fmt.Printf("GOGC: %s\n", gogc)
}

성능 최적화 치트시트

// NOTE(review): the fragments below are illustrative; words, totalSize, T and
// expectedSize are placeholders that are not defined in this snippet.

// 1. String concatenation: Builder > + operator
// Bad
s := ""
for _, word := range words {
    s += word + " " // allocates a new string on every iteration
}

// Good
var sb strings.Builder
sb.Grow(totalSize) // reserve capacity up front
for _, word := range words {
    sb.WriteString(word)
    sb.WriteByte(' ')
}
s := sb.String()

// 2. Slice append: specify the capacity up front
result := make([]T, 0, expectedSize)

// 3. Map initialization: capacity hint
m := make(map[string]int, expectedSize)

// 4. Struct field ordering: minimize padding
// Bad (24 bytes when int64 is 8-byte aligned): each bool is followed by
// padding so that the next field, or the end of the struct, stays aligned.
type BadStruct struct {
    A bool    // 1 byte + 7 padding
    B int64   // 8 bytes
    C bool    // 1 byte + 7 padding
}

// Good (16 bytes when int64 is 8-byte aligned): the widest field comes first
// and the two bools share a single padded tail.
type GoodStruct struct {
    B int64   // 8 bytes
    A bool    // 1 byte
    C bool    // 1 byte + 6 padding
}

// 5. Minimize interface boxing
// NOTE(review): items, process and processTyped are placeholders that are not
// defined in this snippet.
// Bad: converts each value to an interface inside the loop
for _, v := range items {
    process(interface{}(v)) // boxed on every iteration
}

// Good: work on the concrete type directly
for _, v := range items {
    processTyped(v)
}

성능 측정 도구 요약

# 1. Run benchmarks
go test -bench=. -benchmem -count=5 ./...

# 2. CPU profile
go test -bench=. -cpuprofile=cpu.pprof ./...
go tool pprof cpu.pprof

# 3. Memory profile
go test -bench=. -memprofile=mem.pprof ./...
go tool pprof -alloc_space mem.pprof

# 4. Execution trace
go test -bench=. -trace=trace.out ./...
go tool trace trace.out

# 5. Escape analysis
go build -gcflags="-m" ./...

# 6. Assembly output
go build -gcflags="-S" ./...

# 7. Disable optimizations and inlining (for debugging)
go build -gcflags="-N -l" ./...

# 8. benchstat — statistical comparison
go install golang.org/x/perf/cmd/benchstat@latest
go test -bench=. -count=10 > old.txt
# after changing the code
go test -bench=. -count=10 > new.txt
benchstat old.txt new.txt

핵심 정리

최적화 영역	기법	효과
메모리 할당	`make`에 용량 지정	재할당 방지
객체 재사용	`sync.Pool`	GC 압력 감소
문자열 빌드	`strings.Builder`	복사 횟수 감소
동시성	워커 풀	고루틴 오버헤드 제어
카운터	`sync/atomic`	락 오버헤드 제거
캐시 효율	SoA 레이아웃	CPU 캐시 히트율 향상
GC 튜닝	`GOGC`, `GOMEMLIMIT`	GC 빈도/메모리 균형

측정 먼저: pprof로 병목을 찾고 나서 최적화
마이크로 최적화 주의: 전체 지연 중 1%인 함수를 10배 빠르게 해도 0.9% 개선
가독성 vs 성능 균형: 명백한 개선이 아니면 코드 복잡도 증가 지양

pprof 프로파일링

HTTP 서버 프로파일링 엔드포인트

프로그래밍 방식 프로파일링

메모리 최적화

힙 할당 줄이기

이스케이프 분석

CPU 최적화

컴파일러 인라이닝 활용

SIMD 및 어셈블리 활용 (고급)

캐시 친화적 데이터 구조

고루틴과 동시성 최적화

고루틴 풀 패턴

채널 vs 뮤텍스 선택 기준

가비지 컬렉터 튜닝

성능 최적화 치트시트

성능 측정 도구 요약

핵심 정리

pprof 프로파일링

HTTP 서버 프로파일링 엔드포인트

프로그래밍 방식 프로파일링

메모리 최적화

힙 할당 줄이기

이스케이프 분석

CPU 최적화

컴파일러 인라이닝 활용

SIMD 및 어셈블리 활용 (고급)

캐시 친화적 데이터 구조

고루틴과 동시성 최적화

고루틴 풀 패턴

채널 vs 뮤텍스 선택 기준

가비지 컬렉터 튜닝

성능 최적화 치트시트

성능 측정 도구 요약

핵심 정리