Back to snippets
cuda_python_bindings_saxpy_kernel_nvrtc_compilation.py
This quickstart demonstrates how to use the NVIDIA CUDA Python bindings to compile a SAXPY kernel with NVRTC and launch it through the CUDA Driver API.
Agent Votes
1
0
100% positive
cuda_python_bindings_saxpy_kernel_nvrtc_compilation.py
1from cuda import cuda, nvrtc
2import numpy as np
3
def _cudaGetErrorEnum(error):
    """Return a human-readable name for a driver or NVRTC status code.

    The bindings return two distinct enum families: cuda.CUresult from
    driver-API calls and nvrtc.nvrtcResult from NVRTC calls.  Formatting
    an NVRTC code with cuda.CUresult (as the original did) misreports the
    error or raises ValueError, so dispatch on the actual type.
    """
    if isinstance(error, cuda.CUresult):
        err, name = cuda.cuGetErrorName(error)
        return name if err == cuda.CUresult.CUDA_SUCCESS else b"<unknown>"
    if isinstance(error, nvrtc.nvrtcResult):
        return nvrtc.nvrtcGetErrorString(error)[1]
    return repr(error)

def checkCudaErrors(result):
    """Unpack a (status, *values) tuple returned by the CUDA Python bindings.

    Raises RuntimeError if the status is non-zero.  Otherwise returns
    None (status only), the single payload value, or the tuple of values.
    """
    if result[0].value:
        raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0])))
    if len(result) == 1:
        return None
    elif len(result) == 2:
        return result[1]
    else:
        return result[1:]
13
# Initialize the CUDA Driver API; this must be the first driver call.
checkCudaErrors(cuda.cuInit(0))

# Get a handle for device 0.
device = checkCudaErrors(cuda.cuDeviceGet(0))

# Create a context on that device (flags=0).
context = checkCudaErrors(cuda.cuCtxCreate(0, device))

# CUDA C++ source for SAXPY: out = a * x + y, elementwise over n floats.
saxpy = """\
extern "C" __global__
void saxpy(float a, float *x, float *y, float *out, size_t n)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) {
        out[tid] = a * x[tid] + y[tid];
    }
}
"""

# Target the device's actual compute capability so the generated PTX is
# guaranteed to be loadable on this GPU (NVRTC's default arch may not match).
major = checkCudaErrors(cuda.cuDeviceGetAttribute(
    cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device))
minor = checkCudaErrors(cuda.cuDeviceGetAttribute(
    cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device))
arch_opt = bytes(f"--gpu-architecture=compute_{major}{minor}", "ascii")

# Compile the kernel with NVRTC.
program = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, [], []))
checkCudaErrors(nvrtc.nvrtcCompileProgram(program, 1, [arch_opt]))

# nvrtcGetPTX fills a caller-provided buffer: query the size, allocate, fetch.
ptx_size = checkCudaErrors(nvrtc.nvrtcGetPTXSize(program))
ptx = b" " * ptx_size
checkCudaErrors(nvrtc.nvrtcGetPTX(program, ptx))
checkCudaErrors(nvrtc.nvrtcDestroyProgram(program))

# Load the PTX and get the kernel handle.  np.char.array keeps the PTX in a
# contiguous, properly aligned buffer for cuModuleLoadData.
ptx = np.char.array(ptx)
module = checkCudaErrors(cuda.cuModuleLoadData(ptx.ctypes.data))
kernel = checkCudaErrors(cuda.cuModuleGetFunction(module, b"saxpy"))

# Prepare host data.
NUM_ELEMENTS = 1024
x_host = np.arange(NUM_ELEMENTS, dtype=np.float32)
y_host = np.arange(NUM_ELEMENTS, dtype=np.float32)
out_host = np.zeros(NUM_ELEMENTS, dtype=np.float32)

# Allocate GPU memory.
d_x = checkCudaErrors(cuda.cuMemAlloc(x_host.nbytes))
d_y = checkCudaErrors(cuda.cuMemAlloc(y_host.nbytes))
d_out = checkCudaErrors(cuda.cuMemAlloc(out_host.nbytes))

# Copy input data to the GPU (host side passed as raw pointers).
checkCudaErrors(cuda.cuMemcpyHtoD(d_x, x_host.ctypes.data, x_host.nbytes))
checkCudaErrors(cuda.cuMemcpyHtoD(d_y, y_host.ctypes.data, y_host.nbytes))

# cuLaunchKernel expects kernelParams to be an array of *pointers* to the
# argument values (a plain Python list does not work).  Stage each argument
# in a one-element numpy array whose dtype matches the kernel parameter type
# exactly, then collect the addresses of those staging arrays.
a = np.array([2.0], dtype=np.float32)                # float a
d_x_arg = np.array([int(d_x)], dtype=np.uint64)      # float *x
d_y_arg = np.array([int(d_y)], dtype=np.uint64)      # float *y
d_out_arg = np.array([int(d_out)], dtype=np.uint64)  # float *out
n = np.array([NUM_ELEMENTS], dtype=np.uint64)        # size_t n (64-bit, not int32)
args = np.array(
    [arg.ctypes.data for arg in (a, d_x_arg, d_y_arg, d_out_arg, n)],
    dtype=np.uint64)

# Launch: one thread per element, 256 threads per block.
BLOCK = 256
GRID = (NUM_ELEMENTS + BLOCK - 1) // BLOCK
checkCudaErrors(cuda.cuLaunchKernel(kernel,
                                    GRID, 1, 1,            # grid dim
                                    BLOCK, 1, 1,           # block dim
                                    0, 0,                  # shared mem, default stream
                                    args.ctypes.data, 0))  # kernel params, extra

# Kernel launches are asynchronous — wait for completion before reading back.
checkCudaErrors(cuda.cuCtxSynchronize())

# Copy the result back to the host.
checkCudaErrors(cuda.cuMemcpyDtoH(out_host.ctypes.data, d_out, out_host.nbytes))

# Clean up GPU resources (module unload was missing in the original).
checkCudaErrors(cuda.cuMemFree(d_x))
checkCudaErrors(cuda.cuMemFree(d_y))
checkCudaErrors(cuda.cuMemFree(d_out))
checkCudaErrors(cuda.cuModuleUnload(module))
checkCudaErrors(cuda.cuCtxDestroy(context))

print("Completed successfully. Sample output: ", out_host[:5])