Vector Addition: CUDA & Metal

두 언어, 같은 커널 — 나란히 읽어

이게 lesson 1에서 이미 쓴 커널인데 이제 양쪽 다 있는 완전한 프로그램 안에 넣은 거. micro-pattern (원소당 thread 하나, if (idx < N)로 bounds-check)이 conv 커널, attention mask, position encoding — 원소별 compute가 병렬 하드웨어 만나는 모든 곳에서 보이는 거야.

Code

CUDA — 완전한 vector_add.cu·cuda

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

__global__ void vec_add(const float* __restrict__ A,
                        const float* __restrict__ B,
                        float*       __restrict__ C,
                        size_t N) {
    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) C[i] = A[i] + B[i];
}

int main() {
    const size_t N = 1 << 24;
    std::vector<float> hA(N, 1.5f), hB(N, 2.5f), hC(N, 0.f);

    float *dA, *dB, *dC;
    cudaMalloc(&dA, N*sizeof(float));
    cudaMalloc(&dB, N*sizeof(float));
    cudaMalloc(&dC, N*sizeof(float));

    cudaMemcpy(dA, hA.data(), N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB.data(), N*sizeof(float), cudaMemcpyHostToDevice);

    int threads = 256;
    int blocks  = (N + threads - 1) / threads;
    vec_add<<<blocks, threads>>>(dA, dB, dC, N);
    cudaDeviceSynchronize();

    cudaMemcpy(hC.data(), dC, N*sizeof(float), cudaMemcpyDeviceToHost);
    printf("hC[0] = %.2f, hC[N-1] = %.2f\n", hC[0], hC[N-1]);

    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    return 0;
}

Metal — vec_add.metal·metal

#include <metal_stdlib>
using namespace metal;

kernel void vec_add(const device float *A [[buffer(0)]],
                    const device float *B [[buffer(1)]],
                    device       float *C [[buffer(2)]],
                    constant     uint  &N [[buffer(3)]],
                    uint gid              [[thread_position_in_grid]]) {
    if (gid < N) C[gid] = A[gid] + B[gid];
}

Metal host — vec_add_host.swift·swift

import Metal
import Foundation

let device = MTLCreateSystemDefaultDevice()!
let n: UInt32 = 1 << 24
let bytes = Int(n) * MemoryLayout<Float>.stride

let A = device.makeBuffer(length: bytes, options: .storageModeShared)!
let B = device.makeBuffer(length: bytes, options: .storageModeShared)!
let C = device.makeBuffer(length: bytes, options: .storageModeShared)!
let pA = A.contents().bindMemory(to: Float.self, capacity: Int(n))
let pB = B.contents().bindMemory(to: Float.self, capacity: Int(n))
let pC = C.contents().bindMemory(to: Float.self, capacity: Int(n))
for i in 0..<Int(n) { pA[i] = 1.5; pB[i] = 2.5 }

let lib = try device.makeLibrary(URL: URL(fileURLWithPath: "vec_add.metallib"))
let pipe = try device.makeComputePipelineState(function: lib.makeFunction(name: "vec_add")!)

var nVar = n
let q = device.makeCommandQueue()!
let cb = q.makeCommandBuffer()!
let e  = cb.makeComputeCommandEncoder()!
e.setComputePipelineState(pipe)
e.setBuffer(A, offset: 0, index: 0)
e.setBuffer(B, offset: 0, index: 1)
e.setBuffer(C, offset: 0, index: 2)
e.setBytes(&nVar, length: MemoryLayout<UInt32>.size, index: 3)
let tg = MTLSize(width: 256, height: 1, depth: 1)
let grid = MTLSize(width: (Int(n) + 255) / 256, height: 1, depth: 1)
e.dispatchThreadgroups(grid, threadsPerThreadgroup: tg)
e.endEncoding()
cb.commit(); cb.waitUntilCompleted()

print("pC[0] = \(pC[0]), pC[n-1] = \(pC[Int(n)-1])")

Exercise

쓸 수 있는 하드웨어에서 두 버전 다 빌드, 둘 다 실행, 비교. line 수 주목: CUDA는 malloc/memcpy 주변 boilerplate 더 많음, Metal은 encoder/pipeline setup 주변 boilerplate 더 많음. 순 비슷 (~50 line 각각). track5_vector_add/ 폴더에 두 파일 다 저장; 다음 두 레슨에서 진화시킬 거.

두 언어, 같은 커널 — 나란히 읽어

Code

External links

Exercise

Progress

댓글 0