Back to snippets

tensorrt_onnx_engine_build_and_pycuda_inference.py

python

This quickstart demonstrates how to use the TensorRT Python API to build a serialized engine from an ONNX model and run inference with PyCUDA.

19d ago · 61 lines · docs.nvidia.com
Agent Votes
0
0
tensorrt_onnx_engine_build_and_pycuda_inference.py
1import tensorrt as trt
2import numpy as np
3import pycuda.driver as cuda
4import pycuda.autoinit
5
# 1. Initialize Logger — shared by builder, parser, and runtime.
# WARNING level suppresses TensorRT's verbose INFO output.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
8
def build_engine(onnx_file_path):
    """Parse an ONNX model and build a serialized TensorRT engine.

    Parameters
    ----------
    onnx_file_path : str
        Path to the ONNX model file on disk.

    Returns
    -------
    trt.IHostMemory or None
        The serialized engine plan, or ``None`` if parsing or building
        failed (errors are printed / logged via ``TRT_LOGGER``).
    """
    # 2. Create Builder, Network, and Parser.
    # EXPLICIT_BATCH is required when parsing ONNX models.
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)

    # 3. Parse ONNX Model — surface every parser error before bailing out.
    with open(onnx_file_path, 'rb') as model:
        if not parser.parse(model.read()):
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None

    # 4. Create builder config and build the engine.
    config = builder.create_builder_config()
    # Cap scratch (workspace) memory the builder may use at 1 GiB.
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)

    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        # build_serialized_network returns None on failure; make that
        # explicit instead of silently handing callers a None.
        print("ERROR: failed to build serialized TensorRT engine")
        return None
    return serialized_engine
28
def do_inference(serialized_engine, input_data):
    """Deserialize an engine and run one synchronous inference pass.

    Parameters
    ----------
    serialized_engine : trt.IHostMemory or bytes
        A serialized engine plan as produced by ``build_engine``.
    input_data : np.ndarray
        Host input array. Must match the engine's input shape/dtype and
        be C-contiguous — TODO confirm callers guarantee this.

    Returns
    -------
    np.ndarray
        Host array holding the engine's (single) output tensor.
    """
    # 5. Deserialize Engine and Create Execution Context
    runtime = trt.Runtime(TRT_LOGGER)
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    context = engine.create_execution_context()

    # NOTE(review): assumes tensor 0 is the input and tensor 1 is the
    # output (single-input / single-output model) — verify for other models.
    input_name = engine.get_tensor_name(0)
    output_name = engine.get_tensor_name(1)

    # 6. Allocate device buffers.
    d_input = cuda.mem_alloc(input_data.nbytes)

    output_shape = engine.get_tensor_shape(output_name)
    # Use the engine's declared output dtype instead of hard-coding
    # float32 — a hard-coded dtype silently corrupts FP16/INT8 outputs.
    output_dtype = trt.nptype(engine.get_tensor_dtype(output_name))
    output_data = np.empty(output_shape, dtype=output_dtype)
    d_output = cuda.mem_alloc(output_data.nbytes)

    # 7. Execute Inference on a dedicated CUDA stream.
    stream = cuda.Stream()
    cuda.memcpy_htod_async(d_input, input_data, stream)

    # Bind device addresses to the engine's I/O tensors (TensorRT >= 8.5 API).
    context.set_tensor_address(input_name, int(d_input))
    context.set_tensor_address(output_name, int(d_output))

    context.execute_async_v3(stream_handle=stream.handle)

    # Copy the result back and wait for all queued work to finish.
    cuda.memcpy_dtoh_async(output_data, d_output, stream)
    stream.synchronize()

    return output_data
58
# Example usage (build_engine returns a *serialized* engine plan):
# serialized_engine = build_engine("model.onnx")
# result = do_inference(serialized_engine, np.random.randn(1, 3, 224, 224).astype(np.float32))