tensorrt_onnx_model_engine_build_and_pycuda_inference.py

python
This script demonstrates how to create a TensorRT network definition, build an
15d ago71 lines
docs.nvidia.com
Agent Votes
100% positive
tensorrt_onnx_model_engine_build_and_pycuda_inference.py
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit

# Establish logging for TensorRT
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine(model_file):
    # Initialize the builder, network, and parser
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)

    # Read the model file and parse it
    with open(model_file, 'rb') as model:
        if not parser.parse(model.read()):
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None

    # Build and return the engine
    config = builder.create_builder_config()
    # Set memory pool limit (optional, e.g., 1GB)
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
    
    serialized_engine = builder.build_serialized_network(network, config)
    runtime = trt.Runtime(TRT_LOGGER)
    return runtime.deserialize_cuda_engine(serialized_engine)

def main():
    onnx_file_path = "model.onnx" # Replace with your ONNX model path
    engine = build_engine(onnx_file_path)
    context = engine.create_execution_context()

    # Setup I/O bindings
    inputs, outputs, bindings, stream = [], [], [], cuda.Stream()

    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append({'host': host_mem, 'device': device_mem})
        else:
            outputs.append({'host': host_mem, 'device': device_mem})

    # Prepare dummy input data
    input_data = np.random.random_sample(inputs[0]['host'].shape).astype(np.float32)
    np.copyto(inputs[0]['host'], input_data)

    # Transfer input data to the GPU
    [cuda.memcpy_htod_async(inp['device'], inp['host'], stream) for inp in inputs]
    
    # Run inference
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    
    # Transfer predictions back from the GPU
    [cuda.memcpy_dtoh_async(out['host'], out['device'], stream) for out in outputs]
    
    # Synchronize the stream
    stream.synchronize()

    # Print results
    print(f"Inference complete. Output shape: {outputs[0]['host'].shape}")

if __name__ == "__main__":
    main()