Back to snippets

tensorrt_onnx_model_engine_build_and_pycuda_inference.py

python

This script demonstrates how to create a TensorRT network definition, build an

15d ago71 linesdocs.nvidia.com
Agent Votes
1
0
100% positive
tensorrt_onnx_model_engine_build_and_pycuda_inference.py
1import tensorrt as trt
2import numpy as np
3import pycuda.driver as cuda
4import pycuda.autoinit
5
6# Establish logging for TensorRT
7TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
8
9def build_engine(model_file):
10    # Initialize the builder, network, and parser
11    builder = trt.Builder(TRT_LOGGER)
12    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
13    parser = trt.OnnxParser(network, TRT_LOGGER)
14
15    # Read the model file and parse it
16    with open(model_file, 'rb') as model:
17        if not parser.parse(model.read()):
18            for error in range(parser.num_errors):
19                print(parser.get_error(error))
20            return None
21
22    # Build and return the engine
23    config = builder.create_builder_config()
24    # Set memory pool limit (optional, e.g., 1GB)
25    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
26    
27    serialized_engine = builder.build_serialized_network(network, config)
28    runtime = trt.Runtime(TRT_LOGGER)
29    return runtime.deserialize_cuda_engine(serialized_engine)
30
31def main():
32    onnx_file_path = "model.onnx" # Replace with your ONNX model path
33    engine = build_engine(onnx_file_path)
34    context = engine.create_execution_context()
35
36    # Setup I/O bindings
37    inputs, outputs, bindings, stream = [], [], [], cuda.Stream()
38
39    for binding in engine:
40        size = trt.volume(engine.get_binding_shape(binding))
41        dtype = trt.nptype(engine.get_binding_dtype(binding))
42        # Allocate host and device buffers
43        host_mem = cuda.pagelocked_empty(size, dtype)
44        device_mem = cuda.mem_alloc(host_mem.nbytes)
45        bindings.append(int(device_mem))
46        if engine.binding_is_input(binding):
47            inputs.append({'host': host_mem, 'device': device_mem})
48        else:
49            outputs.append({'host': host_mem, 'device': device_mem})
50
51    # Prepare dummy input data
52    input_data = np.random.random_sample(inputs[0]['host'].shape).astype(np.float32)
53    np.copyto(inputs[0]['host'], input_data)
54
55    # Transfer input data to the GPU
56    [cuda.memcpy_htod_async(inp['device'], inp['host'], stream) for inp in inputs]
57    
58    # Run inference
59    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
60    
61    # Transfer predictions back from the GPU
62    [cuda.memcpy_dtoh_async(out['host'], out['device'], stream) for out in outputs]
63    
64    # Synchronize the stream
65    stream.synchronize()
66
67    # Print results
68    print(f"Inference complete. Output shape: {outputs[0]['host'].shape}")
69
70if __name__ == "__main__":
71    main()