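"""Benchmark pipeline-parallel training of a GPT-2-style language model.

The model is built as an ``nn.Sequential`` so it can be partitioned across
GPUs with ``torch.distributed.pipeline.sync.Pipe`` and driven by a single
process over RPC. The argparse flags at the bottom of the file control the
number of microbatch chunks, batch size, decoder layers, checkpointing
strategy, and device count.
"""
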
import argparse
import math
import os
import time

from benchmark_dataset import BenchmarkLMDataset, collate_sentences_lm

import torch
import torch.nn as nn
from torch.distributed import rpc

from torch.distributed.pipeline.sync import Pipe
from torch.distributed.pipeline.sync.utils import partition_model
from torch.optim import Adam
from torch.utils.data import DataLoader


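# Human-readable byte counts for the peak-memory report in train(), e.g.
# sizeof_fmt(3 * 1024**3) -> "3.00GiB".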
def sizeof_fmt(num, suffix="B"):
    for unit in ["", "Ki", "Mi", "Gi", "Ti"]:
        if abs(num) < 1024.0:
            return f"{num:3.2f}{unit}{suffix}"
        num /= 1024.0
    return f"{num:3.2f}Pi{suffix}"


def init_random_seed(seed: int):
    import numpy

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    numpy.random.seed(seed)


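# Bumped on every decoder-layer forward pass (see TransformerDecoderLayer);
# with Pipe, each batch triggers one forward call per layer per microbatch.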
iteration_count = 0


class EmbeddingLayer(nn.Embedding):
    def __init__(self, ntoken, ninp, initrange):
        super().__init__(ntoken, ninp)
        self.ninp = ninp
        nn.init.uniform_(self.weight, -initrange, initrange)

    def forward(self, src):
        # Scale embeddings by sqrt(d_model), as in "Attention Is All You Need".
        return super().forward(src) * math.sqrt(self.ninp)


class PositionalEncodingLayer(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Standard sinusoidal positional encoding:
        #   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
        #   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Shape (max_len, 1, d_model) so it broadcasts over the batch dimension.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[: x.size(0), :]
        return self.dropout(x)


class TransformerDecoderLayer(nn.TransformerEncoderLayer):
    """Though this class inherits from torch.nn.TransformerEncoderLayer,
    it functions as a decoder in this model."""

    def __init__(self, ninp, nhead, nhid, dropout):
        super().__init__(ninp, nhead, nhid, dropout)
        self.src_mask = None

    def forward(self, src):
        global iteration_count
        iteration_count += 1

        # Lazily (re)build the causal mask whenever the sequence length changes.
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        return super().forward(src, self.src_mask)


class LinearLayer(nn.Linear):
    def __init__(self, ninp, ntoken, initrange):
        super().__init__(ninp, ntoken)
        nn.init.zeros_(self.bias)
        nn.init.uniform_(self.weight, -initrange, initrange)


class TransformerLMSequential(nn.Sequential):
    """A small language model based on the design of GPT-2 using nn.Sequential
    for compatibility with Pipe"""

    def __init__(self, ntokens, ninp, nhead, nhid, dropout, initrange, ndecoder):
        layers = [
            EmbeddingLayer(ntokens, ninp, initrange),
            PositionalEncodingLayer(ninp, dropout),
        ]
        for _ in range(ndecoder):
            layers.append(TransformerDecoderLayer(ninp, nhead, nhid, dropout))

        layers.append(LinearLayer(ninp, ntokens, initrange))
        super().__init__(*layers)


def make_model(args, device, ntokens):
    ninp = 2048  # embedding dimension
    nhid = 2048  # dimension of the feedforward network in the decoder layers
    nhead = 32  # number of heads in the multi-head attention models
    dropout = 0
    initrange = 0.1
    ndecoder = args.num_decoder_layers

    model = TransformerLMSequential(
        ntokens, ninp, nhead, nhid, dropout, initrange, ndecoder
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    lr = 0.01  # learning rate

    # Return an optimizer factory rather than an optimizer: the caller builds
    # the Adam instance only after the model has been partitioned and wrapped
    # in Pipe, so the optimizer sees parameters on their final devices.
    def make_adam(model):
        return Adam(model.parameters(), lr=lr)

    optimizer = make_adam

    return model, criterion, optimizer


def train(lm_dataloader, model, criterion, optimizer, vocab_size, args):
    model.train()

    total_loss = 0.0
    start_time = time.time()
    word_counter = 0

    # Build the optimizer from the factory now that the model is wrapped in Pipe.
    optimizer = optimizer(model)

    # Inputs go to the first pipeline stage and targets to the last; fall back
    # to the current device when the model exposes no device list.
    def get_first_device(model):
        if model.devices:
            return model.devices[0]
        else:
            return torch.cuda.current_device()

    def get_last_device(model):
        if model.devices:
            return model.devices[-1]
        else:
            return torch.cuda.current_device()

    print(
        f"Number of parameters for model: {sum(p.numel() for p in model.parameters())}"
    )
    for i, batch in enumerate(lm_dataloader):
        if args.max_batch and i > args.max_batch:
            break
        optimizer.zero_grad()
        try:
            tmp = batch["input"].to(get_first_device(model))
            # Pipe's forward returns an RRef; local_value() extracts the tensor.
            output = model(tmp).local_value()
        except Exception as e:
            raise RuntimeError(
                f"training failed on {torch.distributed.get_rank()}"
            ) from e

        target = batch["target"].to(get_last_device(model))
        output = output.to(target.device)

        loss = criterion(output.view(-1, vocab_size), target.view(-1))
        loss.backward()
        del target
        del output

        torch.nn.utils.clip_grad_value_(model.parameters(), 0.05)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 1
        word_counter += batch["ntokens"]
        if i % log_interval == 0 and i > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print(
                f"| batch {i:5d} | wps {word_counter / elapsed:5.2f} | loss {cur_loss:5.2f} | ppl {math.exp(cur_loss):8.2f}"
            )
            word_counter = 0
            total_loss = 0
            start_time = time.time()

    print("Peak memory usage for GPUs: ", end="")
    for i in range(len(model.devices)):
        print(
            f"cuda:{i}: {sizeof_fmt(torch.cuda.memory_stats(i)['allocated_bytes.all.peak'])}, ",
            end="",
        )
    print()


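# Split num_layers across num_devices as evenly as possible, front-loading
# any remainder, e.g. generate_balance(4, 10) -> [3, 3, 2, 2].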
def generate_balance(num_devices, num_layers):
    balance = []
    layers_assigned = 0
    for i in range(num_devices):
        x = (num_layers - layers_assigned) / (num_devices - i)
        if x.is_integer():
            balance.append(int(x))
            layers_assigned += x
        else:
            balance.append(math.ceil(x))
            layers_assigned += math.ceil(x)
    return balance


def make_model_and_data(args):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    vocab_size = 10000
    model, criterion, optimizer = make_model(args, device, vocab_size)
    lm_dataset = BenchmarkLMDataset()
    lm_dataloader = DataLoader(
        lm_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=0,
        collate_fn=collate_sentences_lm,
    )
    return {
        "model": model,
        "criterion": criterion,
        "optimizer": optimizer,
        "data": lm_dataloader,
        "vocab_size": vocab_size,
    }


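# End-to-end flow: initialize RPC, build the model and data, compute a layer
# balance, partition the nn.Sequential across devices, wrap it in Pipe, train.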
def bench_single_process(args):
    os.environ.update({"MASTER_ADDR": args.host})
    os.environ.update({"MASTER_PORT": "10638"})

    # Single-process benchmark: one RPC worker hosts every pipeline stage.
    rpc.init_rpc(
        "worker",
        rank=0,
        world_size=1,
    )

    num_devices = torch.cuda.device_count() if torch.cuda.is_available() else 1
    num_devices = min(args.num_devices, num_devices)
    assert num_devices > 0
    init_random_seed(0)

    blob = make_model_and_data(args)
    model = blob["model"]

    # Split the model across devices, then process each batch as args.chunks
    # microbatches through the pipeline.
    balance = generate_balance(num_devices, len(model))
    model = partition_model(model, balance)
    p = Pipe(model, chunks=args.chunks, checkpoint=args.checkpoint)
    del model
    del blob["model"]

    train(
        blob["data"], p, blob["criterion"], blob["optimizer"], blob["vocab_size"], args
    )


parser = argparse.ArgumentParser(description="benchmark")
|
|
parser.add_argument("--host", "-o", type=str, default="localhost", help="hostname")
|
|
parser.add_argument(
|
|
"--chunks", type=int, default=4, help="number of microbatches per batch"
|
|
)
|
|
parser.add_argument("--batch-size", type=int, default=8, help="size of a batch")
|
|
parser.add_argument("--max-batch", type=int, default=10, help="Max number of batches")
|
|
parser.add_argument(
|
|
"--num-decoder-layers",
|
|
type=int,
|
|
default=10,
|
|
help="Number of decoder layers in the model",
|
|
)
|
|
parser.add_argument(
|
|
"--checkpoint",
|
|
default="except_last",
|
|
choices=["always", "except_last", "never"],
|
|
help="Checkpointing strategy for pipe",
|
|
)
|
|
parser.add_argument(
|
|
"--num-devices", type=int, default=4, help="Number of GPU devices to use"
|
|
)
|
|
|
|
if __name__ == "__main__":
|
|
args = parser.parse_args()
|
|
print(f"Running benchmark with args: {args}")
|
|
bench_single_process(args)
|
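
# Example invocation (script name is a placeholder; assumes 4 visible GPUs):
#   python <this_script>.py --num-devices 4 --chunks 8 --checkpoint except_last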