# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
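
# Example: launch the multi-GPU training loop from nlp_example.py on a remote
# Runhouse cluster rather than on the local machine.
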
import argparse

import runhouse as rh
import torch
from nlp_example import training_function

from accelerate.utils import PrepareForLaunch, patch_environment
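
# NOTE: training_function is the BERT-on-GLUE training loop defined in
# nlp_example.py, which lives alongside this script in accelerate/examples.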


def launch_train(*args):
    num_processes = torch.cuda.device_count()
    print(f"Device count: {num_processes}")
    # args arrives as (config, train_args). patch_environment temporarily sets
    # the distributed env vars (WORLD_SIZE, MASTER_ADDR, MASTER_PORT, ...) that
    # each spawned worker reads on startup.
    with patch_environment(
        world_size=num_processes, master_addr="127.0.0.1", master_port="29500", mixed_precision=args[1].mixed_precision
    ):
        launcher = PrepareForLaunch(training_function, distributed_type="MULTI_GPU")
        torch.multiprocessing.start_processes(launcher, args=args, nprocs=num_processes, start_method="spawn")
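
# A sketch of an equivalent launcher (not part of the original example):
# Accelerate's notebook_launcher bundles the same env patching and process
# spawning into a single call, so launch_train's body could be written as
#
#     from accelerate import notebook_launcher
#     notebook_launcher(training_function, args=args, num_processes=num_processes, mixed_precision=args[1].mixed_precision)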


if __name__ == "__main__":
    # Refer to https://runhouse-docs.readthedocs-hosted.com/en/main/rh_primitives/cluster.html#hardware-setup
    # for cloud access setup instructions (if using on-demand hardware), and for API specifications.

    # on-demand GPU
    # gpu = rh.cluster(name="rh-cluster", instance_type="V100:1", provider="cheapest", use_spot=False)  # single GPU
    gpu = rh.cluster(name="rh-cluster", instance_type="V100:4", provider="cheapest", use_spot=False)  # multi GPU
    gpu.up_if_not()

    # on-prem GPU
    # gpu = rh.cluster(
    #     ips=["ip_addr"], ssh_creds={"ssh_user": "<username>", "ssh_private_key": "<key_path>"}, name="rh-cluster"
    # )
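
    # Optional sanity check before launching (gpu.run executes shell commands
    # on the cluster, as in the README alternative at the bottom):
    # gpu.run(["nvidia-smi"])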

    # Set up remote function
    reqs = [
        "pip:./",
        "transformers",
        "datasets",
        "evaluate",
        "tqdm",
        "scipy",
        "scikit-learn",
        "tensorboard",
        "torch --upgrade --extra-index-url https://download.pytorch.org/whl/cu117",
    ]
    launch_train_gpu = rh.function(fn=launch_train, system=gpu, reqs=reqs, name="train_bert_glue")
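
    # rh.function sends launch_train to the cluster and installs reqs there;
    # "pip:./" pip-installs the local repo itself so nlp_example is available
    # remotely.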

    # Define train args/config, run train function
    train_args = argparse.Namespace(cpu=False, mixed_precision="fp16")
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    launch_train_gpu(config, train_args, stream_logs=True)
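
    # The positional args land in launch_train's *args and are forwarded to
    # training_function(config, args), matching nlp_example.py's signature;
    # stream_logs=True streams the remote stdout back to this terminal.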

    # Alternatively, we can just run as instructed in the README (but only because there's already a wrapper CLI):
    # gpu.install_packages(reqs)
    # gpu.run(["accelerate launch --multi_gpu accelerate/examples/nlp_example.py"])
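
    # To stop paying for an on-demand cluster when done (a sketch; assumes your
    # runhouse version exposes Cluster.teardown):
    # gpu.teardown()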