CLI DDP fixes (#135)

This commit is contained in:
Glenn Jocher
2023-01-02 19:55:04 +01:00
committed by GitHub
parent 8f3cd52844
commit c5c86a3acd
2 changed files with 5 additions and 4 deletions

View File

@ -29,10 +29,11 @@ WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))
@contextmanager
def torch_distributed_zero_first(local_rank: int):
    """Context manager that lets each local master run the enclosed block first.

    Used as ``with torch_distributed_zero_first(rank): ...``. Ranks other than
    -1 and 0 block on a barrier before entering the body; rank 0 runs the body
    immediately and hits its barrier afterwards, which releases the waiting
    ranks. Rank -1 never touches a barrier (presumably the non-distributed
    case -- confirm against callers).

    Both barriers are skipped when no default process group has been
    initialized, so the context manager is a no-op outside of DDP runs instead
    of raising 'Default process group has not been initialized'.

    Args:
        local_rank: Rank of the calling process on this node.
    """
    # Query once so the pre- and post-yield barriers make the same decision.
    initialized = torch.distributed.is_initialized()
    if initialized and local_rank not in {-1, 0}:
        # Non-master ranks wait here until rank 0 reaches its barrier below.
        dist.barrier(device_ids=[local_rank])
    yield
    if initialized and local_rank == 0:
        # Local master is done with the body: release the waiting ranks.
        dist.barrier(device_ids=[0])