How can I get the effect of "python -m some-module" from inside Python code?
I recently learned that a command like the one below runs a training script through the torch.distributed.launch module:
"python -m torch.distributed.launch --nproc_per_node=2 SOME_TRAINING_SCRIPT.py"
========
I would like to get the same behaviour by simply executing python SOME_TRAINING_SCRIPT.py (instead of python -m torch.distributed.launch --nproc_per_node=2 SOME_TRAINING_SCRIPT.py). What needs to be added to SOME_TRAINING_SCRIPT.py so that it is run by torch.distributed.launch in this way?
The Python file looks like this:
import torch, os
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
# Sizes for the toy model and the synthetic dataset.
input_size, output_size = 5, 2
batch_size, data_size = 30, 90

# GPU selection and rendezvous address, written into the environment before
# the process group is created.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "1,2",
        "MASTER_ADDR": "127.0.0.1",
        "MASTER_PORT": "29500",
    }
)
# NOTE(review): no rank/world_size is passed here, so they are presumably
# supplied by the launcher through the environment -- confirm.
torch.distributed.init_process_group(backend="gloo")

# The process rank doubles as the CUDA device index for this process.
local_rank = torch.distributed.get_rank()
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
print("local_rank = ", local_rank)
class RandomDataset(Dataset):
    """Map-style dataset of random samples held in a single tensor.

    The whole dataset is generated once with ``torch.randn`` as a
    ``(length, size)`` tensor and moved to ``device``; indexing returns
    one row of that tensor.

    Args:
        size: Number of features in each sample.
        length: Number of samples in the dataset.
        device: Device the data tensor is moved to. Defaults to ``'cuda'``
            (the behaviour of the original hard-coded version), but any
            value accepted by ``Tensor.to`` works, e.g. ``'cpu'``.
    """

    def __init__(self, size, length, device='cuda'):
        self.len = length
        self.data = torch.randn(length, size).to(device)

    def __getitem__(self, index):
        """Return the sample stored at ``index``."""
        return self.data[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len
# Build the synthetic dataset and a loader that draws its indices from a
# DistributedSampler over that dataset.
dataset = RandomDataset(size=input_size, length=data_size)
rand_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    sampler=DistributedSampler(dataset),
)
class Model(nn.Module):
    """Single linear layer that reports tensor sizes on every forward pass."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        """Apply the linear layer to ``input``, print both sizes, and return the result."""
        result = self.fc(input)
        print(
            " In Model: input size", input.size(),
            "output size", result.size(),
        )
        return result
# Instantiate the model and place it on this process's device.
model = Model(input_size, output_size).to(device)

# NOTE(review): the original indentation was lost; both the message and the
# DistributedDataParallel wrapping are assumed to belong to the `if` body --
# confirm.
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.parallel.DistributedDataParallel(
        model,
        device_ids=[local_rank],
        output_device=local_rank,
    )
# Run one forward pass per batch yielded by the loader.
for data in rand_loader:
    # The original if/else on torch.cuda.is_available() assigned `data` in
    # both branches, so the check had no effect. Move the batch to the
    # model's device explicitly instead; this is a no-op when the batch is
    # already on that device.
    input_var = data.to(device)
    output = model(input_var)
    print(
        "Outside: input size", input_var.size(),
        "output_size", output.size(),
    )