벤치마크 & 프로파일링 — testing.B와 pprof

Go는 표준 라이브러리에 벤치마크와 프로파일링 도구가 내장되어 있습니다. 코드 성능을 측정하고 병목을 찾는 완전한 워크플로를 제공합니다.

testing.B — 벤치마크 기초

벤치마크 함수는 Benchmark로 시작하고 *testing.B를 매개변수로 받습니다.

// string_bench_test.go
package main

import (
    "strings"
    "testing"
)

// BenchmarkStringConcat is the += variant of the string-concatenation
// comparison: every iteration starts from an empty string and appends
// "hello" to it 100 times.
func BenchmarkStringConcat(b *testing.B) {
    const repeats = 100
    for n := 0; n < b.N; n++ {
        var joined string
        for k := 0; k < repeats; k++ {
            joined = joined + "hello"
        }
    }
}

// BenchmarkStringBuilder is the strings.Builder variant of the
// string-concatenation comparison: every iteration writes "hello" into a
// fresh builder 100 times and then materializes the string.
func BenchmarkStringBuilder(b *testing.B) {
    const repeats = 100
    for n := 0; n < b.N; n++ {
        builder := strings.Builder{}
        for k := repeats; k > 0; k-- {
            builder.WriteString("hello")
        }
        _ = builder.String()
    }
}

// BenchmarkStringJoin is the strings.Join variant of the string-concatenation
// comparison: a 100-element slice of "hello" is prepared once up front, and
// each iteration joins it with an empty separator.
func BenchmarkStringJoin(b *testing.B) {
    const count = 100
    parts := make([]string, 0, count)
    for len(parts) < count {
        parts = append(parts, "hello")
    }

    // Discard the time spent building parts so only the joins are measured.
    b.ResetTimer()
    for n := 0; n < b.N; n++ {
        _ = strings.Join(parts, "")
    }
}

벤치마크 실행 명령어

# 모든 벤치마크 실행
go test -bench=. ./...

# 특정 패턴 벤치마크만
go test -bench=BenchmarkString ./...

# 벤치마크 + 단위 테스트 함께
go test -bench=. -run=TestXxx ./...

# 단위 테스트 건너뛰고 벤치마크만
go test -bench=. -run=^$ ./...

# 실행 시간 또는 반복 횟수 지정
go test -bench=. -benchtime=5s ./...    # 5초간
go test -bench=. -benchtime=1000x ./...  # 1000회

# 메모리 할당 정보 포함
go test -bench=. -benchmem ./...

# 멀티 코어 벤치마크
go test -bench=. -cpu=1,2,4,8 ./...

실행 결과 읽기

BenchmarkStringConcat-8      9381    124589 ns/op    5616 B/op    99 allocs/op
BenchmarkStringBuilder-8   532780     2251 ns/op    2688 B/op     7 allocs/op
BenchmarkStringJoin-8      659847     1820 ns/op    2048 B/op     2 allocs/op

컬럼	의미
`-8`	GOMAXPROCS (사용 CPU 수)
`9381`	반복 횟수 (b.N)
`124589 ns/op`	1회당 나노초
`5616 B/op`	1회당 메모리 할당 바이트 (-benchmem)
`99 allocs/op`	1회당 힙 할당 횟수 (-benchmem)

테이블 주도 벤치마크

// BenchmarkSearch runs linearSearch as one sub-benchmark per input size.
// Each sub-benchmark fills a slice with 0..size-1 and looks up the middle
// value, size/2.
func BenchmarkSearch(b *testing.B) {
    for _, size := range []int{10, 100, 1000, 10000} {
        name := fmt.Sprintf("size=%d", size)
        b.Run(name, func(b *testing.B) {
            haystack := make([]int, 0, size)
            for v := 0; v < size; v++ {
                haystack = append(haystack, v)
            }
            needle := size / 2

            // Exclude the slice construction above from the measurement.
            b.ResetTimer()
            for n := 0; n < b.N; n++ {
                linearSearch(haystack, needle)
            }
        })
    }
}

// linearSearch scans data from the front and returns the index of the first
// element equal to target, or -1 when no element matches.
func linearSearch(data []int, target int) int {
    for idx := 0; idx < len(data); idx++ {
        if data[idx] != target {
            continue
        }
        return idx
    }
    return -1
}

BenchmarkSearch/size=10-8       100000000     11.2 ns/op
BenchmarkSearch/size=100-8       20000000     63.4 ns/op
BenchmarkSearch/size=1000-8       2000000    612.3 ns/op
BenchmarkSearch/size=10000-8       200000   6092.1 ns/op

b.ResetTimer / b.StopTimer / b.StartTimer

// BenchmarkWithSetup times process over a data set that is generated once,
// before the timer is reset.
func BenchmarkWithSetup(b *testing.B) {
    // One-off setup; it must not count toward the measured time.
    payload := generateLargeData(10000)

    // Zero the timer so the setup above is excluded.
    b.ResetTimer()
    for n := 0; n < b.N; n++ {
        process(payload)
    }
}

// BenchmarkWithPerIterSetup times mutate on a fresh copy of input for every
// iteration, pausing the timer while the copy is made.
func BenchmarkWithPerIterSetup(b *testing.B) {
    for n := 0; n < b.N; n++ {
        // Prepare this iteration's data with the timer paused.
        b.StopTimer()
        fresh := copyData(input)
        b.StartTimer()

        mutate(fresh)
    }
}

메모리 할당 최적화 벤치마크

// map_bench_test.go
package main

import "testing"

// BenchmarkMapNoPre is the "no pre-allocation" side of the map comparison:
// each iteration inserts 1000 keys into a map created without a size hint.
func BenchmarkMapNoPre(b *testing.B) {
    const entries = 1000
    for n := 0; n < b.N; n++ {
        table := map[int]int{}
        for k := 0; k < entries; k++ {
            table[k] = k
        }
    }
}

// BenchmarkMapWithPre is the pre-sized side of the map comparison: each
// iteration inserts 1000 keys into a map created with a size hint of 1000.
func BenchmarkMapWithPre(b *testing.B) {
    const entries = 1000
    for n := 0; n < b.N; n++ {
        table := make(map[int]int, entries) // size hint supplied up front
        for k := 0; k < entries; k++ {
            table[k] = k
        }
    }
}

// BenchmarkSliceAppendNoPre is the "no pre-allocation" side of the slice
// growth comparison: each iteration appends 1000 ints to a slice that starts
// empty with zero capacity.
func BenchmarkSliceAppendNoPre(b *testing.B) {
    const entries = 1000
    for n := 0; n < b.N; n++ {
        nums := make([]int, 0)
        for k := 0; k < entries; k++ {
            nums = append(nums, k)
        }
    }
}

// BenchmarkSliceAppendWithPre is the pre-sized side of the slice growth
// comparison: each iteration appends 1000 ints to a slice created with
// capacity 1000.
func BenchmarkSliceAppendWithPre(b *testing.B) {
    const entries = 1000
    for n := 0; n < b.N; n++ {
        nums := make([]int, 0, entries) // capacity reserved up front
        for k := 0; k < entries; k++ {
            nums = append(nums, k)
        }
    }
}

BenchmarkMapNoPre-8         5000    312451 ns/op    86516 B/op    65 allocs/op
BenchmarkMapWithPre-8       8000    148230 ns/op    41040 B/op     8 allocs/op
BenchmarkSliceAppendNoPre-8    300000    4521 ns/op    25208 B/op    12 allocs/op
BenchmarkSliceAppendWithPre-8  500000    2312 ns/op     8192 B/op     1 allocs/op

pprof — CPU & 메모리 프로파일링

벤치마크에서 프로파일 생성

# CPU 프로파일
go test -bench=. -cpuprofile=cpu.prof ./...

# 메모리 프로파일
go test -bench=. -memprofile=mem.prof ./...

# 둘 다
go test -bench=. -cpuprofile=cpu.prof -memprofile=mem.prof ./...

pprof 인터랙티브 분석

# CPU 프로파일 분석
go tool pprof cpu.prof

# 주요 명령어
(pprof) top           # CPU 사용량 상위 함수
(pprof) top -cum      # 누적 CPU 사용량 상위
(pprof) list FuncName # 특정 함수 라인별 분석
(pprof) web           # 브라우저에서 그래프로 시각화
(pprof) png           # PNG 이미지로 저장

# 메모리 프로파일
go tool pprof -alloc_space mem.prof  # 할당된 전체 메모리
go tool pprof -inuse_space mem.prof  # 현재 사용 중인 메모리

HTTP 서버에 pprof 통합

package main

import (
    "log"
    "net/http"
    _ "net/http/pprof" // 자동으로 /debug/pprof/ 핸들러 등록
)

// main starts the pprof debug server in the background and then runs the
// application.
//
// The blank import of net/http/pprof registers the /debug/pprof/ handlers on
// http.DefaultServeMux, which is what a nil handler serves.
func main() {
    // Profiling server on port 6060. Bind to localhost only: the pprof
    // endpoints expose internals of the process and should not be reachable
    // from other hosts.
    go func() {
        log.Println(http.ListenAndServe("localhost:6060", nil))
    }()

    // Actual application code.
    startServer()
}

# 실행 중인 서버에서 30초간 CPU 프로파일 수집
go tool pprof http://localhost:6060/debug/pprof/profile?seconds=30

# 힙 메모리 프로파일
go tool pprof http://localhost:6060/debug/pprof/heap

# 고루틴 덤프
curl http://localhost:6060/debug/pprof/goroutine?debug=1

# 실시간 웹 UI (Go 1.10+)
go tool pprof -http=:8080 http://localhost:6060/debug/pprof/profile?seconds=30

trace — 실행 추적

# 트레이스 수집
go test -bench=. -trace=trace.out ./...

# 트레이스 분석 (브라우저)
go tool trace trace.out

트레이스 도구로 확인할 수 있는 정보:

고루틴 생성/종료/대기 타임라인
GC(가비지 컬렉션) 발생 시점 및 소요 시간
프로세서(P)별 작업 분배
네트워크/시스템 콜 대기 시간

benchstat — 통계 비교

두 벤치마크 결과를 통계적으로 비교합니다.

go install golang.org/x/perf/cmd/benchstat@latest

# 기존 코드 측정
go test -bench=. -count=10 ./... > before.txt

# 코드 변경 후 측정
go test -bench=. -count=10 ./... > after.txt

# 통계 비교 (p값 포함)
benchstat before.txt after.txt

name              old time/op    new time/op    delta
StringConcat-8     124µs ± 1%     2.3µs ± 2%  -98.15%  (p=0.000 n=10+10)

name              old alloc/op   new alloc/op   delta
StringConcat-8    5.62kB ± 0%    2.69kB ± 0%  -52.14%  (p=0.000 n=10+10)

p=0.000: 통계적으로 유의미한 차이 (p < 0.05)
-98.15%: 1회당 실행 시간이 약 98% 감소 (약 54배 빨라짐)

실전 벤치마크 예시 — JSON 직렬화

package main

import (
    "encoding/json"
    "testing"

    "github.com/bytedance/sonic"
    gojson "github.com/goccy/go-json"
)

// User is the sample payload serialized by the JSON benchmarks. Each field
// carries a json tag that sets its lowercase key name.
type User struct {
    ID    int    `json:"id"`
    Name  string `json:"name"`
    Email string `json:"email"`
    Age   int    `json:"age"`
}

// testUser is the fixed value passed to Marshal by the JSON benchmarks.
var testUser = User{ID: 1, Name: "김고랭", Email: "go@example.com", Age: 30}

// BenchmarkJSONMarshalStd marshals testUser with encoding/json on every
// iteration, reporting allocations and aborting on the first error.
func BenchmarkJSONMarshalStd(b *testing.B) {
    b.ReportAllocs()
    for n := 0; n < b.N; n++ {
        if _, err := json.Marshal(testUser); err != nil {
            b.Fatal(err)
        }
    }
}

// BenchmarkJSONMarshalGoJson marshals testUser with gojson.Marshal on every
// iteration, reporting allocations and aborting on the first error.
func BenchmarkJSONMarshalGoJson(b *testing.B) {
    b.ReportAllocs()
    for n := 0; n < b.N; n++ {
        if _, err := gojson.Marshal(testUser); err != nil {
            b.Fatal(err)
        }
    }
}

// BenchmarkJSONMarshalSonic marshals testUser with sonic.Marshal on every
// iteration, reporting allocations and aborting on the first error.
func BenchmarkJSONMarshalSonic(b *testing.B) {
    b.ReportAllocs()
    for n := 0; n < b.N; n++ {
        if _, err := sonic.Marshal(testUser); err != nil {
            b.Fatal(err)
        }
    }
}

벤치마크 작성 규칙 & 안티패턴

// ❌ The compiler may optimize the work away, making the result meaningless.
//
// BenchmarkBad calls compute(42) on every iteration but only assigns the
// result to the blank identifier.
func BenchmarkBad(b *testing.B) {
    for i := 0; i < b.N; i++ {
        result := compute(42) // if the result is unused, the compiler may remove the call
        _ = result            // this alone is not enough to prevent that
    }
}

// ✅ Use a sink variable to keep the compiler from optimizing the call away.
// globalSink is the package-level destination for the benchmark result.
var globalSink int

// BenchmarkGood stores each compute(42) result in a local sink and publishes
// the final value to globalSink after the loop.
func BenchmarkGood(b *testing.B) {
    var sink int
    for i := 0; i < b.N; i++ {
        sink = compute(42)
    }
    globalSink = sink // export to the package-level variable
}

주의사항

// ❌ The logic under test sits outside the b.N loop.
//
// BenchmarkWrong calls expensiveCompute once before the loop; the loop body
// only discards the already-computed result.
func BenchmarkWrong(b *testing.B) {
    result := expensiveCompute() // runs only once, so the benchmark is meaningless
    _ = result
    for i := 0; i < b.N; i++ {
        _ = result
    }
}

// ✅ Correct structure.
//
// BenchmarkRight calls expensiveCompute inside the b.N loop.
func BenchmarkRight(b *testing.B) {
    for i := 0; i < b.N; i++ {
        _ = expensiveCompute() // runs on every iteration
    }
}

핵심 정리

도구	용도
`go test -bench=.`	벤치마크 실행
`-benchmem`	메모리 할당 측정
`-cpuprofile=f`	CPU 프로파일 저장
`-memprofile=f`	메모리 프로파일 저장
`go tool pprof`	프로파일 분석
`go tool trace`	실행 추적 분석
`benchstat`	통계적 비교

b.N은 testing 패키지가 자동으로 조정 — 직접 설정 금지
b.ResetTimer()로 셋업 시간 제외
-benchmem으로 메모리 할당이 성능 병목인지 확인
benchstat으로 p값 기반 통계적 유의성 검증

testing.B — 벤치마크 기초​

벤치마크 실행 명령어​

실행 결과 읽기​

테이블 주도 벤치마크​

b.ResetTimer / b.StopTimer / b.StartTimer​

메모리 할당 최적화 벤치마크​

pprof — CPU & 메모리 프로파일링​

벤치마크에서 프로파일 생성​

pprof 인터랙티브 분석​

HTTP 서버에 pprof 통합​

trace — 실행 추적​

benchstat — 통계 비교​

실전 벤치마크 예시 — JSON 직렬화​

벤치마크 작성 규칙 & 안티패턴​

주의사항​

핵심 정리​

testing.B — 벤치마크 기초

벤치마크 실행 명령어

실행 결과 읽기

테이블 주도 벤치마크

b.ResetTimer / b.StopTimer / b.StartTimer

메모리 할당 최적화 벤치마크

pprof — CPU & 메모리 프로파일링

벤치마크에서 프로파일 생성

pprof 인터랙티브 분석

HTTP 서버에 pprof 통합

trace — 실행 추적

benchstat — 통계 비교

실전 벤치마크 예시 — JSON 직렬화

벤치마크 작성 규칙 & 안티패턴

주의사항

핵심 정리