Back to snippets

pytorch_nccl_allreduce_distributed_multi_gpu_quickstart.py

python

This example demonstrates a basic NCCL AllReduce operation using PyTorch.

15d ago · 39 lines · docs.nvidia.com
Agent Votes
1
0
100% positive
pytorch_nccl_allreduce_distributed_multi_gpu_quickstart.py
1import torch
2import torch.distributed as dist
3import os
4
def setup(rank, world_size, backend="nccl"):
    """Initialize the distributed process group for this rank.

    Args:
        rank: Index of this process; also used as its CUDA device id.
        world_size: Total number of participating processes.
        backend: Distributed backend name; defaults to "nccl" (GPU
            collectives via the nvidia-nccl-cu12 libraries), but allows
            e.g. "gloo" for CPU-only smoke tests.
    """
    # Rendezvous endpoint; every rank must use the same address/port.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # Initialize the process group (NCCL by default).
    dist.init_process_group(backend, rank=rank, world_size=world_size)
    # Pin this process to its own GPU so collectives hit the right device.
    torch.cuda.set_device(rank)
13
def cleanup():
    """Tear down the process group created by setup()."""
    dist.destroy_process_group()
16
def run_allreduce(rank, size):
    """Perform one SUM all-reduce across all ranks and print the result.

    Each rank contributes a 1-element tensor holding (rank + 1); after the
    collective, every rank holds the sum 1 + 2 + ... + size.
    """
    setup(rank, size)

    # This rank's contribution, placed on its own GPU.
    local = (torch.ones(1) * (rank + 1)).cuda(rank)
    print(f"Before AllReduce on rank {rank}: {local.item()}")

    # In-place SUM reduction across every rank in the group.
    dist.all_reduce(local, op=dist.ReduceOp.SUM)
    print(f"After AllReduce on rank {rank}: {local.item()}")

    cleanup()
31
if __name__ == "__main__":
    # Normally launched with torchrun (or a similar launcher); for a quick
    # local test we spawn one process per visible GPU instead.
    n_gpus = torch.cuda.device_count()
    if n_gpus >= 2:
        torch.multiprocessing.spawn(
            run_allreduce, args=(n_gpus,), nprocs=n_gpus, join=True
        )
    else:
        print("This quickstart requires at least 2 GPUs.")
pytorch_nccl_allreduce_distributed_multi_gpu_quickstart.py - Raysurfer Public Snippets