Back to snippets
nvrtc_cuda_kernel_compile_to_ptx_and_execute_saxpy.py
python — This quickstart demonstrates how to compile a CUDA C++ kernel string to PTX with NVRTC and execute it (SAXPY) through the CUDA Driver API.
Agent Votes
1
0
100% positive
nvrtc_cuda_kernel_compile_to_ptx_and_execute_saxpy.py
1import numpy as np
2from cuda import cuda, nvrtc
3
def check_cuda_errors(result):
    """Raise ``RuntimeError`` when a cuda-python call reports a failure.

    The driver and NVRTC bindings return a tuple whose first element is the
    status enum (for example ``(CUresult,)`` or ``(CUresult, value)``), so
    the raw return value of a call can be passed here directly. A bare
    status enum is accepted as well.

    Args:
        result: A ``cuda.CUresult`` / ``nvrtc.nvrtcResult`` value, or a
            tuple whose first element is one of those enums.

    Returns:
        ``result`` unchanged, so the call can be wrapped transparently.

    Raises:
        RuntimeError: If the status is a CUDA or NVRTC enum that is not the
            corresponding success value.
    """
    # Checking the tuple itself against the enum types would never match and
    # every failure would pass silently, so pull out the leading status.
    status = result[0] if isinstance(result, tuple) and result else result

    if isinstance(status, cuda.CUresult):
        if status != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError(f"CUDA Error: {status}")
    elif isinstance(status, nvrtc.nvrtcResult):
        if status != nvrtc.nvrtcResult.NVRTC_SUCCESS:
            raise RuntimeError(f"NVRTC Error: {status}")
    return result
12
# CUDA C++ source of the SAXPY kernel: y[i] = a * x[i] + y[i] for i < n.
# `extern "C"` keeps the symbol name unmangled so it can be looked up as
# b"saxpy" after the module is loaded.
saxpy_kernel = """\
extern "C" __global__
void saxpy(float a, float *x, float *y, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];
}
"""

# 1. Compile kernel to PTX with NVRTC
# Each binding call returns a tuple led by its status enum; unpack it and
# validate the bare status before trusting the accompanying value.
err, program = nvrtc.nvrtcCreateProgram(saxpy_kernel.encode(), b"saxpy.cu", 0, [], [])
check_cuda_errors(err)

(compile_status,) = nvrtc.nvrtcCompileProgram(program, 0, [])
if compile_status != nvrtc.nvrtcResult.NVRTC_SUCCESS:
    # A bare status code says nothing about *why* compilation failed, so
    # attach the compiler's build log when it can be retrieved.
    build_log = b""
    log_status, log_size = nvrtc.nvrtcGetProgramLogSize(program)
    if log_status == nvrtc.nvrtcResult.NVRTC_SUCCESS and log_size:
        build_log = b" " * log_size
        nvrtc.nvrtcGetProgramLog(program, build_log)
    raise RuntimeError(
        f"NVRTC Error: {compile_status}\n{build_log.decode(errors='replace')}"
    )

err, ptx_size = nvrtc.nvrtcGetPTXSize(program)
check_cuda_errors(err)
ptx = b" " * ptx_size  # preallocated buffer that nvrtcGetPTX fills in place
(err,) = nvrtc.nvrtcGetPTX(program, ptx)
check_cuda_errors(err)
29
# 2. Initialize CUDA Driver API
# Every call's status is unpacked and checked individually so that a failure
# stops the script at the call that caused it instead of surfacing later as
# an invalid handle.
(err,) = cuda.cuInit(0)
check_cuda_errors(err)
err, device = cuda.cuDeviceGet(0)
check_cuda_errors(err)
err, context = cuda.cuCtxCreate(0, device)
check_cuda_errors(err)

# 3. Load PTX and get function
err, module = cuda.cuModuleLoadData(ptx)
check_cuda_errors(err)
err, kernel = cuda.cuModuleGetFunction(module, b"saxpy")
check_cuda_errors(err)
38
# 4. Prepare data
n = 1024
# The scalar is kept in a one-element float32 array so that it has a stable
# host address and the exact 4-byte layout of the kernel's `float a`.
a = np.array([2.0], dtype=np.float32)
host_x = np.arange(n, dtype=np.float32)
host_y = np.zeros(n, dtype=np.float32)

# Allocate device buffers; check each allocation before using its pointer.
err, dev_x = cuda.cuMemAlloc(host_x.nbytes)
check_cuda_errors(err)
err, dev_y = cuda.cuMemAlloc(host_y.nbytes)
check_cuda_errors(err)

# Upload both input vectors to the device.
(err,) = cuda.cuMemcpyHtoD(dev_x, host_x.ctypes.data, host_x.nbytes)
check_cuda_errors(err)
(err,) = cuda.cuMemcpyHtoD(dev_y, host_y.ctypes.data, host_y.nbytes)
check_cuda_errors(err)
50
# 5. Launch kernel
threads_per_block = 64
# Ceiling division: enough blocks to cover n without a surplus block when n
# is an exact multiple of the block size.
num_blocks = (n + threads_per_block - 1) // threads_per_block

# cuLaunchKernel takes `kernelParams` as the address of an array of
# pointers, one per kernel argument, each pointing at host memory that holds
# that argument's value. Every argument is therefore placed in its own
# NumPy array with the dtype matching the kernel signature
# (float, float*, float*, int).
dev_x_arg = np.array([int(dev_x)], dtype=np.uint64)
dev_y_arg = np.array([int(dev_y)], dtype=np.uint64)
n_arg = np.array([n], dtype=np.int32)
kernel_values = [a, dev_x_arg, dev_y_arg, n_arg]  # must stay alive until launch
args = np.array([value.ctypes.data for value in kernel_values], dtype=np.uint64)

(err,) = cuda.cuLaunchKernel(
    kernel,
    num_blocks, 1, 1,          # grid dim
    threads_per_block, 1, 1,   # block dim
    0, None,                   # shared mem, stream
    args.ctypes.data, 0,       # kernelParams, extra (unused -> null)
)
check_cuda_errors(err)
59
# 6. Copy back and verify
(err,) = cuda.cuMemcpyDtoH(host_y.ctypes.data, dev_y, host_y.nbytes)
check_cuda_errors(err)

# host_y was uploaded as zeros, so the expected result is a * x.
if np.allclose(host_y, 2.0 * host_x):
    print("Success!")
else:
    # Report a wrong result instead of finishing silently; printing (rather
    # than raising) lets the resource cleanup that follows still run.
    print("Failure: SAXPY result does not match the expected values")
65
# Cleanup
# Attempt every release first, then inspect the statuses: a failure in one
# teardown call must not prevent the remaining resources from being freed,
# but it should not be silently discarded either.
cleanup_results = [
    cuda.cuMemFree(dev_x),
    cuda.cuMemFree(dev_y),
    cuda.cuModuleUnload(module),
    cuda.cuCtxDestroy(context),
    nvrtc.nvrtcDestroyProgram(program),
]
for cleanup_status, *_ in cleanup_results:
    check_cuda_errors(cleanup_status)