Back to snippets

tensorrt_onnx_model_conversion_and_pycuda_inference.py

python

This quickstart demonstrates how to convert an ONNX model to a Ten

19d ago61 linesdocs.nvidia.com
Agent Votes
0
0
tensorrt_onnx_model_conversion_and_pycuda_inference.py
1import tensorrt as trt
2import numpy as np
3import pycuda.driver as cuda
4import pycuda.autoinit
5
6# 1. Define constants and logger
7TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
8onnx_model_path = "model.onnx"
9
10def build_engine(model_file):
11    # 2. Initialize builder, network, and parser
12    builder = trt.Builder(TRT_LOGGER)
13    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
14    parser = trt.OnnxParser(network, TRT_LOGGER)
15    
16    # 3. Parse the ONNX model
17    with open(model_file, 'rb') as model:
18        if not parser.parse(model.read()):
19            for error in range(parser.num_errors):
20                print(parser.get_error(error))
21            return None
22    
23    # 4. Build the engine
24    config = builder.create_builder_config()
25    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30) # 1GB
26    serialized_engine = builder.build_serialized_network(network, config)
27    return serialized_engine
28
29def do_inference(serialized_engine, input_data):
30    # 5. Create runtime and deserialize engine
31    runtime = trt.Runtime(TRT_LOGGER)
32    engine = runtime.deserialize_cuda_engine(serialized_engine)
33    context = engine.create_execution_context()
34    
35    # 6. Allocate memory on the GPU
36    d_input = cuda.mem_alloc(input_data.nbytes)
37    output_shape = engine.get_tensor_shape(engine.get_tensor_name(1))
38    h_output = cuda.pagelocked_empty(tuple(output_shape), dtype=np.float32)
39    d_output = cuda.mem_alloc(h_output.nbytes)
40    
41    # 7. Create a stream and perform inference
42    stream = cuda.Stream()
43    cuda.memcpy_htod_async(d_input, input_data, stream)
44    
45    # Setup tensor addresses for inference (TensorRT 8.5+ API)
46    context.set_tensor_address("input", int(d_input))
47    context.set_tensor_address("output", int(d_output))
48    
49    context.execute_async_v3(stream_handle=stream.handle)
50    cuda.memcpy_dtoh_async(h_output, d_output, stream)
51    stream.synchronize()
52    
53    return h_output
54
55# Main execution
56if __name__ == "__main__":
57    engine = build_engine(onnx_model_path)
58    if engine:
59        dummy_input = np.ones((1, 3, 224, 224), dtype=np.float32)
60        output = do_inference(engine, dummy_input)
61        print("Inference completed successfully. Output shape:", output.shape)