accelerate/examples/multigpu_remote_launcher.py

# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

import runhouse as rh
import torch
from nlp_example import training_function

from accelerate.utils import PrepareForLaunch, patch_environment


def launch_train(*args):
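    # args[0] is the hyperparameter config dict and args[1] is the argparse.Namespace
    # built under __main__ below; both are forwarded to every spawned training process.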
    num_processes = torch.cuda.device_count()
    print(f"Device count: {num_processes}")
    with patch_environment(
        world_size=num_processes, master_addr="127.0.0.1", master_port="29500", mixed_precision=args[1].mixed_precision
    ):
        launcher = PrepareForLaunch(training_function, distributed_type="MULTI_GPU")
        torch.multiprocessing.start_processes(launcher, args=args, nprocs=num_processes, start_method="spawn")


if __name__ == "__main__":
    # Refer to https://runhouse-docs.readthedocs-hosted.com/en/main/rh_primitives/cluster.html#hardware-setup
    # for cloud access setup instructions (if using on-demand hardware), and for API specifications.

    # on-demand GPU
    # gpu = rh.cluster(name="rh-cluster", instance_type="V100:1", provider="cheapest", use_spot=False)  # single GPU
    gpu = rh.cluster(name="rh-cluster", instance_type="V100:4", provider="cheapest", use_spot=False)  # multi GPU
    gpu.up_if_not()
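    # (up_if_not() only provisions a new instance when the cluster isn't already up.)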
    # on-prem GPU
    # gpu = rh.cluster(
    #     ips=["<ip_addr>"], ssh_creds={"ssh_user": "<username>", "ssh_private_key": "<key_path>"}, name="rh-cluster"
    # )
    # Set up remote function
    reqs = [
        "pip:./",
        "transformers",
        "datasets",
        "evaluate",
        "tqdm",
        "scipy",
        "scikit-learn",
        "tensorboard",
        "torch --upgrade --extra-index-url https://download.pytorch.org/whl/cu117",
    ]
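    # rh.function sends launch_train to the cluster; "pip:./" syncs this local repo
    # onto the remote machine as a pip-installable package alongside the other reqs.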
    launch_train_gpu = rh.function(fn=launch_train, system=gpu, reqs=reqs, name="train_bert_glue")
    # Define train args/config, run train function
    train_args = argparse.Namespace(cpu=False, mixed_precision="fp16")
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    launch_train_gpu(config, train_args, stream_logs=True)
    # Alternatively, we can just run as instructed in the README (but only because there's already a wrapper CLI):
    # gpu.install_packages(reqs)
    # gpu.run(["accelerate launch --multi_gpu accelerate/examples/nlp_example.py"])
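    # Optional cleanup sketch (assumption: the Runhouse version this example targets
    # exposes teardown() on on-demand clusters; check your installed version first):
    # gpu.teardown()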