Hello Metal 다시 보기

같은 훈련, 다른 군복

기억해: Metal은 GPU printf 없음. 그래서 Apple Silicon의 configurable-battalion 버전은 thread별 정체성을 buffer에 쓰고 CPU가 print. 미션은 CUDA 버전이랑 동일.

CUDA랑 다른 점 둘이 느껴질 거:

'Block size'가 두 곳에 살 수 있어 — 커널 안에 hard-coded 상수로 박아 두는 경우랑 host의 threadsPerThreadgroup. 그럴 땐 둘이 일치해야 하고, mismatch여도 Metal이 항상 알려주는 건 아냐. 아래 커널은 built-in threads_per_threadgroup을 읽어서 이 문제를 피해.
Host driver가 commandBuffer.waitUntilCompleted()로 명시적 대기; Metal의 cudaDeviceSynchronize야.

Code

ant_battalion.metal — 커널은 ID 쓰고 host가 print·metal

#include <metal_stdlib>
using namespace metal;

// Each thread records its own identity: the entry at the thread's flat index
// (bid * threads_per_block + tid) receives (threadgroup index, index within
// that threadgroup). The threadgroup width comes from the launch itself via
// the threads_per_threadgroup built-in, not from a compiled-in constant.
kernel void ant_battalion(
    device uint2 *out        [[buffer(0)]],
    uint tid                 [[thread_index_in_threadgroup]],
    uint bid                 [[threadgroup_position_in_grid]],
    uint threads_per_block   [[threads_per_threadgroup]])
{
    const uint2 identity = uint2(bid, tid);
    out[bid * threads_per_block + tid] = identity;
}

ant_battalion.swift — 가변 launch + 검증·swift

import Metal
import Foundation

/// Failures that can occur while setting up or running the `ant_battalion` launch.
private enum AntBattalionError: Error, CustomStringConvertible {
    case noDevice
    case kernelNotFound(String)
    case launchTooLarge(blocks: Int, threads: Int)
    case threadgroupTooLarge(requested: Int, limit: Int)
    case allocationFailed(bytes: Int)
    case commandSetupFailed(String)

    var description: String {
        switch self {
        case .noDevice:
            return "no Metal device available"
        case .kernelNotFound(let name):
            return "kernel '\(name)' not found in the Metal library"
        case .launchTooLarge(let blocks, let threads):
            return "launch of \(blocks) x \(threads) threads is too large to index"
        case .threadgroupTooLarge(let requested, let limit):
            return "threads_per_block \(requested) exceeds the pipeline limit of \(limit)"
        case .allocationFailed(let bytes):
            return "could not allocate a \(bytes)-byte output buffer"
        case .commandSetupFailed(let what):
            return "could not create \(what)"
        }
    }
}

/// Launches the `ant_battalion` kernel with the geometry given on the command
/// line and prints the (block, slot) pair each thread wrote.
///
/// Usage: `ant_battalion <blocks> <threads_per_block>`; both values must be
/// positive integers. Prints the usage line and exits with status 1 otherwise.
///
/// - Throws: `AntBattalionError` for setup failures, or the underlying Metal
///   error if loading the library, building the pipeline, or executing the
///   command buffer fails.
func main() throws {
    let args = CommandLine.arguments
    // Zero or negative sizes would yield an empty buffer or an invalid MTLSize,
    // so they are rejected together with non-numeric input.
    guard args.count == 3,
          let blocks = Int(args[1]), blocks > 0,
          let threads = Int(args[2]), threads > 0 else {
        print("Usage: ant_battalion <blocks> <threads_per_block>")
        exit(1)
    }

    let kernelName = "ant_battalion"
    guard let device = MTLCreateSystemDefaultDevice() else {
        throw AntBattalionError.noDevice
    }
    let lib = try device.makeLibrary(URL: URL(fileURLWithPath: "ant_battalion.metallib"))
    guard let fn = lib.makeFunction(name: kernelName) else {
        throw AntBattalionError.kernelNotFound(kernelName)
    }
    let pipe = try device.makeComputePipelineState(function: fn)

    // The requested threadgroup width must fit what this pipeline supports.
    let limit = pipe.maxTotalThreadsPerThreadgroup
    guard threads <= limit else {
        throw AntBattalionError.threadgroupTooLarge(requested: threads, limit: limit)
    }

    // Guard the size arithmetic: Int multiplication traps on overflow, and the
    // kernel computes its output index in 32-bit unsigned arithmetic.
    let (total, totalOverflow) = blocks.multipliedReportingOverflow(by: threads)
    guard !totalOverflow, total <= Int(UInt32.max) else {
        throw AntBattalionError.launchTooLarge(blocks: blocks, threads: threads)
    }
    let (byteCount, bytesOverflow) =
        total.multipliedReportingOverflow(by: MemoryLayout<SIMD2<UInt32>>.stride)
    guard !bytesOverflow else {
        throw AntBattalionError.launchTooLarge(blocks: blocks, threads: threads)
    }
    guard let buf = device.makeBuffer(length: byteCount, options: .storageModeShared) else {
        throw AntBattalionError.allocationFailed(bytes: byteCount)
    }

    guard let queue = device.makeCommandQueue() else {
        throw AntBattalionError.commandSetupFailed("command queue")
    }
    guard let commandBuffer = queue.makeCommandBuffer() else {
        throw AntBattalionError.commandSetupFailed("command buffer")
    }
    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
        throw AntBattalionError.commandSetupFailed("compute command encoder")
    }

    encoder.setComputePipelineState(pipe)
    encoder.setBuffer(buf, offset: 0, index: 0)
    encoder.dispatchThreadgroups(
        MTLSize(width: blocks, height: 1, depth: 1),
        threadsPerThreadgroup: MTLSize(width: threads, height: 1, depth: 1))
    encoder.endEncoding()

    // Block until the GPU is done, then surface any execution failure instead
    // of printing whatever happens to be in the buffer.
    commandBuffer.commit()
    commandBuffer.waitUntilCompleted()
    if let gpuError = commandBuffer.error {
        throw gpuError
    }

    let base = buf.contents().bindMemory(to: SIMD2<UInt32>.self, capacity: total)
    let ants = UnsafeBufferPointer(start: base, count: total)
    for (i, ant) in ants.enumerated() {
        print("Ant \(i): block \(ant.x), slot \(ant.y)")
    }
}

// Script entry point: run the driver; on any thrown error, report it and
// exit with a nonzero status.
do {
    try main()
} catch {
    print("Error: \(error)")
    exit(1)
}

Build + run·bash

# Create the output directory for all build artifacts.
mkdir -p build
# Compile the kernel source into an intermediate .air object.
xcrun -sdk macosx metal -c ant_battalion.metal -o build/ant.air
# Package the .air object into the .metallib file the host loads at runtime.
xcrun -sdk macosx metallib build/ant.air -o build/ant_battalion.metallib
# Build the Swift host driver, linking the Metal and Foundation frameworks.
xcrun -sdk macosx swiftc ant_battalion.swift \
      -framework Metal -framework Foundation \
      -o build/ant_battalion

# The host opens "ant_battalion.metallib" by relative path, so run from build/.
cd build && ./ant_battalion 4 32   # 128 ants
         ./ant_battalion 32 64    # 2048 ants — output is in 'CPU read' order

Exercise

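./ant_battalion 4 32 돌리고 128 line이 index 순으로 나오는지 확인. 그 다음 host에서 blocks=4, threads=64로 바꾸되 커널은 그대로 — 다시 실행. output buffer는 여전히 entry 128개 (원래 count 기준 할당), 근데 일부 block ID만 보일 거 — launch geometry가 256 thread로 바뀌었으니까. 이게 callout에서 경고하는 silent-mismatch 실패 모드. built-in의 threads_per_threadgroup 읽으면 fix. 참고: 위 listing은 이미 threads_per_threadgroup을 built-in으로 읽고 buffer도 blocks × threads로 할당하니까, 이 실패 모드를 재현하려면 먼저 커널의 threads_per_block을 상수 32로 hard-code하고 buffer 할당을 128 entry로 고정해야 해.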

같은 훈련, 다른 군복

Code

External links

Exercise

Progress

댓글 0