CLI DDP fixes (#135)
@@ -29,10 +29,11 @@ WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))
 @contextmanager
 def torch_distributed_zero_first(local_rank: int):
     # Decorator to make all processes in distributed training wait for each local_master to do something
-    if local_rank not in {-1, 0}:
+    initialized = torch.distributed.is_initialized()  # prevent 'Default process group has not been initialized' errors
+    if initialized and local_rank not in {-1, 0}:
         dist.barrier(device_ids=[local_rank])
     yield
-    if local_rank == 0:
+    if initialized and local_rank == 0:
         dist.barrier(device_ids=[0])
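For context, a minimal usage sketch (not part of this commit) of how a zero_first-style context manager is typically employed in DDP training: the local master performs a one-time setup step while the other ranks wait at the barrier. The cache_labels helper and the LOCAL_RANK environment variable read below are illustrative assumptions; the point is that with no initialized process group (a plain CLI run), is_initialized() returns False and both barriers are skipped, which is the behavior this guard enables.

# Minimal sketch, assuming a single-node DDP launch where the launcher sets LOCAL_RANK
# (defaults to -1 for a plain, non-distributed CLI run).
import os
from contextlib import contextmanager

import torch
import torch.distributed as dist

LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))


@contextmanager
def torch_distributed_zero_first(local_rank: int):
    # Make all processes in distributed training wait for the local master to act first
    initialized = torch.distributed.is_initialized()  # skip barriers when no process group exists
    if initialized and local_rank not in {-1, 0}:
        dist.barrier(device_ids=[local_rank])
    yield
    if initialized and local_rank == 0:
        dist.barrier(device_ids=[0])


def cache_labels():
    # Hypothetical one-time setup step (e.g. downloading or caching a dataset)
    return list(range(10))


# Local rank 0 runs the setup first; other ranks wait at the barrier, then reuse the cache.
# In a plain CLI run, is_initialized() is False, so both barriers are skipped.
with torch_distributed_zero_first(LOCAL_RANK):
    labels = cache_labels()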