mirror of
https://github.com/huggingface/accelerate.git
synced 2025-10-20 18:13:46 +08:00
Introduce a pod-config command (#802)
* Add in ability to configure pod and start CLI commands * Further tests, add a help * Added tests and cleaned up! * Fix weird missing parts * More tests + install accelerate with flag * Unused pod_config_file * Test with multiple commands * Update src/accelerate/commands/config/cluster.py Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> * Clarity during printing Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Make public names for readability * Fix test expected outputs and refactor response * Fix ref errors Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@ -19,6 +19,7 @@ from argparse import ArgumentParser
|
||||
from accelerate.commands.config import config_command_parser
|
||||
from accelerate.commands.env import env_command_parser
|
||||
from accelerate.commands.launch import launch_command_parser
|
||||
from accelerate.commands.pod import pod_command_parser
|
||||
from accelerate.commands.test import test_command_parser
|
||||
|
||||
|
||||
@ -28,9 +29,10 @@ def main():
|
||||
|
||||
# Register commands
|
||||
config_command_parser(subparsers=subparsers)
|
||||
launch_command_parser(subparsers=subparsers)
|
||||
test_command_parser(subparsers=subparsers)
|
||||
env_command_parser(subparsers=subparsers)
|
||||
launch_command_parser(subparsers=subparsers)
|
||||
pod_command_parser(subparsers=subparsers)
|
||||
test_command_parser(subparsers=subparsers)
|
||||
|
||||
# Let's go
|
||||
args = parser.parse_args()
|
||||
|
@ -14,6 +14,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
|
||||
from ...utils import ComputeEnvironment, DistributedType, is_deepspeed_available, is_transformers_available
|
||||
from ...utils.constants import (
|
||||
DEEPSPEED_MULTINODE_LAUNCHERS,
|
||||
@ -41,6 +43,10 @@ def get_cluster_input():
|
||||
main_process_port = None
|
||||
rdzv_backend = "static"
|
||||
same_network = True
|
||||
tpu_name = None
|
||||
tpu_zone = None
|
||||
commands = None
|
||||
command_file = None
|
||||
if distributed_type in [DistributedType.MULTI_GPU, DistributedType.MULTI_CPU]:
|
||||
num_machines = _ask_field(
|
||||
"How many different machines will you use (use more than 1 for multi-node training)? [1]: ",
|
||||
@ -341,6 +347,50 @@ def get_cluster_input():
|
||||
"What is the name of the function in your script that should be launched in all parallel scripts? [main]: ",
|
||||
default="main",
|
||||
)
|
||||
use_cluster = _ask_field(
|
||||
"Are you using a TPU cluster? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
if use_cluster:
|
||||
tpu_name = _ask_field(
|
||||
"What is the name of your TPU cluster? ",
|
||||
default=None,
|
||||
error_message="Please enter the name of your TPU cluster.",
|
||||
)
|
||||
tpu_zone = _ask_field(
|
||||
"What is the zone of your TPU cluster? ",
|
||||
default=None,
|
||||
error_message="Please enter the zone of your TPU cluster.",
|
||||
)
|
||||
run_commands = _ask_field(
|
||||
"Do you have code you wish to run on startup in each pod? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
if run_commands:
|
||||
use_command_file = _ask_field(
|
||||
"Is this code located in a bash script? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
if use_command_file:
|
||||
command_file = _ask_field(
|
||||
"What is the path to your bash script? ",
|
||||
default=None,
|
||||
error_message="Please enter the path to your bash script.",
|
||||
)
|
||||
command_file = os.path.abspath(command_file)
|
||||
else:
|
||||
commands = _ask_field(
|
||||
"What commands do you wish to run on startup in each pod? ",
|
||||
default=None,
|
||||
error_message="Please enter the commands you wish to run on startup in each pod as a single string.",
|
||||
)
|
||||
|
||||
else:
|
||||
main_training_function = "main"
|
||||
|
||||
@ -408,4 +458,8 @@ def get_cluster_input():
|
||||
use_cpu=use_cpu,
|
||||
rdzv_backend=rdzv_backend,
|
||||
same_network=same_network,
|
||||
tpu_name=tpu_name,
|
||||
tpu_zone=tpu_zone,
|
||||
commands=commands,
|
||||
command_file=command_file,
|
||||
)
|
||||
|
@ -18,7 +18,7 @@ import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Optional, Union
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import yaml
|
||||
|
||||
@ -151,6 +151,12 @@ class ClusterConfig(BaseConfig):
|
||||
# args for TPU
|
||||
downcast_bf16: bool = False
|
||||
|
||||
# args for TPU pods
|
||||
tpu_name: str = None
|
||||
tpu_zone: str = None
|
||||
command_file: str = None
|
||||
command: List[str] = None
|
||||
|
||||
def __post_init__(self):
|
||||
if self.deepspeed_config is None:
|
||||
self.deepspeed_config = {}
|
||||
|
@ -1,3 +1,19 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import platform
|
||||
|
152
src/accelerate/commands/pod.py
Normal file
152
src/accelerate/commands/pod.py
Normal file
@ -0,0 +1,152 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from accelerate.commands.config.config_args import default_config_file, load_config_from_file
|
||||
from packaging.version import Version, parse
|
||||
|
||||
|
||||
_description = "Run commands across a pod of TPU VMs for initial setup before running `accelerate launch`. Will also install Accelerate on the pod."


def pod_command_parser(subparsers=None):
    """
    Create the argument parser for the `pod-config` command.

    Args:
        subparsers: optional sub-parser collection (the object returned by `ArgumentParser.add_subparsers`). When
            given, the command is registered on it under the name `pod-config` and `pod_launcher` is set as its
            `func` default. When omitted, a standalone `argparse.ArgumentParser` is built instead.

    Returns:
        The parser holding every `pod-config` option.
    """
    standalone = subparsers is None
    if standalone:
        parser = argparse.ArgumentParser("Accelerate pod-config command", description=_description)
    else:
        parser = subparsers.add_parser("pod-config", description=_description)

    # One (flag, keyword-arguments) entry per option; they are registered in this exact order.
    option_table = (
        (
            "--config_file",
            {"type": str, "default": None, "help": "Path to the config file to use for accelerate."},
        ),
        (
            "--pod_config_file",
            {"type": str, "default": None, "help": "Path to the config file to use for the pod."},
        ),
        (
            "--command_file",
            {
                "default": None,
                "help": "The path to the file containing the commands to run on the pod on startup.",
            },
        ),
        (
            "--command",
            {
                # `append` + `nargs="+"` yields one inner list per occurrence of the flag.
                "action": "append",
                "nargs": "+",
                "help": "A command to run on the pod. If not specified, will use the command specified in the command file.",
            },
        ),
        (
            "--tpu_name",
            {
                "default": None,
                "help": "The name of the TPU to use. If not specified, will use the TPU specified in the config file.",
            },
        ),
        (
            "--tpu_zone",
            {
                "default": None,
                "help": "The zone of the TPU to use. If not specified, will use the zone specified in the config file.",
            },
        ),
        (
            "--install_accelerate",
            {
                "action": "store_true",
                "help": "Whether to install accelerate on the pod. Defaults to False.",
            },
        ),
        (
            "--accelerate_version",
            {
                "default": "latest",
                "help": "The version of accelerate to install on the pod. If not specified, will use the latest pypi version. Specify 'dev' to install from GitHub.",
            },
        ),
        (
            "--debug",
            {
                "action": "store_true",
                "help": "If set, will print the command that would be run instead of running it.",
            },
        ),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    if not standalone:
        parser.set_defaults(func=pod_launcher)
    return parser
|
||||
|
||||
|
||||
def pod_launcher(args):
    """
    Assemble the `gcloud compute tpus tpu-vm ssh` call that runs the startup commands on all workers of a TPU pod,
    then run it (or only print it when `args.debug` is set).

    `command_file`, `command`, `tpu_name` and `tpu_zone` that were not given on the command line are taken from the
    accelerate config file, when `args.config_file` is set or the default config file exists.

    Args:
        args (`argparse.Namespace`): the parsed arguments, as produced by the parser of `pod_command_parser`. The
            namespace is updated in place (`command` ends up as the final shell string).

    Raises:
        ValueError: if no command and no command file are available, or if the TPU name or zone is missing.
        subprocess.CalledProcessError: if the `gcloud` command exits with a non-zero status.
    """
    defaults = None

    # Get the default from the config file if it exists.
    if args.config_file is not None or os.path.isfile(default_config_file):
        defaults = load_config_from_file(args.config_file)
        if not args.command_file and defaults.command_file is not None and not args.command:
            args.command_file = defaults.command_file
        if not args.command and defaults.command is not None:
            args.command = defaults.command
        if not args.tpu_name:
            args.tpu_name = defaults.tpu_name
        if not args.tpu_zone:
            args.tpu_zone = defaults.tpu_zone
    if args.accelerate_version == "dev":
        args.accelerate_version = "git+https://github.com/huggingface/accelerate.git"
    elif args.accelerate_version == "latest":
        args.accelerate_version = "accelerate -U"
    elif isinstance(parse(args.accelerate_version), Version):
        args.accelerate_version = f"accelerate=={args.accelerate_version}"

    if not args.command_file and not args.command:
        raise ValueError("You must specify either a command file or a command to run on the pod.")

    # Fail early with a clear message: a `None` name/zone would otherwise surface as a `TypeError` when the gcloud
    # argument list is joined for printing or handed to `subprocess`.
    if not args.tpu_name or not args.tpu_zone:
        raise ValueError(
            "You must specify both a TPU name and a TPU zone, either on the command line or in the config file."
        )

    if args.command_file:
        with open(args.command_file, "r") as f:
            args.command = [f.read().splitlines()]

    # Normalize to a flat list of strings. The CLI yields a list of lists (`action="append"` + `nargs="+"`), while a
    # config file may hold a flat list of strings or a single string; iterating a string as if it were a list would
    # split the command into single characters.
    if isinstance(args.command, str):
        args.command = [args.command]
    flat_commands = []
    for entry in args.command:
        if isinstance(entry, str):
            flat_commands.append(entry)
        else:
            flat_commands.extend(entry)
    args.command = flat_commands

    # Default to the shared folder and install accelerate
    new_cmd = ["cd /usr/share"]
    if args.install_accelerate:
        new_cmd += [f"pip install {args.accelerate_version}"]
    new_cmd += args.command
    args.command = "; ".join(new_cmd)

    # Then send it to gcloud
    # Eventually try to use google-api-core to do this instead of subprocess
    cmd = [
        "gcloud",
        "compute",
        "tpus",
        "tpu-vm",
        "ssh",
        args.tpu_name,
        "--zone",
        args.tpu_zone,
        "--command",
        args.command,
        "--worker",
        "all",
    ]
    if args.debug:
        print(f"Running {' '.join(cmd)}")
        return
    # `check=True` so that a failing gcloud call raises instead of being reported as a success below.
    subprocess.run(cmd, check=True)
    print("Successfully setup pod.")
|
||||
|
||||
|
||||
def main():
    """Standalone entry point: parse the `pod-config` arguments from the command line and launch."""
    cli_args = pod_command_parser().parse_args()
    pod_launcher(cli_args)
|
@ -21,6 +21,7 @@ import torch
|
||||
|
||||
import accelerate
|
||||
from accelerate.test_utils import execute_subprocess_async
|
||||
from accelerate.test_utils.testing import run_command
|
||||
|
||||
|
||||
class AccelerateLauncherTester(unittest.TestCase):
|
||||
@ -63,3 +64,151 @@ class AccelerateLauncherTester(unittest.TestCase):
|
||||
execute_subprocess_async(
|
||||
self.base_cmd + ["--config_file", str(config), self.test_file_path], env=os.environ.copy()
|
||||
)
|
||||
|
||||
|
||||
class PodConfigTester(unittest.TestCase):
    """
    Test case for verifying the `accelerate pod-config` CLI passes the right `gcloud` command.

    Every test runs the CLI with `--debug` and compares the printed command with the expected one.
    """

    tpu_name = "test-tpu"
    tpu_zone = "us-central1-a"
    command = "ls"
    cmd = ["accelerate", "pod-config"]
    base_output = "cd /usr/share"
    command_file = "tests/test_samples/test_command_file.sh"
    gcloud = "Running gcloud compute tpus tpu-vm ssh"

    @staticmethod
    def clean_output(output):
        # Glue the captured pieces together and drop trailing whitespace.
        joined = "".join(output)
        return joined.rstrip()

    def _debug_output(self, *cli_args):
        # Run `accelerate pod-config <cli_args> --debug` and return its cleaned stdout.
        full_cmd = self.cmd + list(cli_args) + ["--debug"]
        return self.clean_output(run_command(full_cmd, return_stdout=True))

    def test_base(self):
        printed = self._debug_output(
            "--command", self.command, "--tpu_zone", self.tpu_zone, "--tpu_name", self.tpu_name
        )
        self.assertEqual(
            printed,
            f"{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; ls --worker all",
        )

    def test_base_backward_compatibility(self):
        printed = self._debug_output(
            "--config_file",
            "tests/test_configs/0_12_0.yaml",
            "--command",
            self.command,
            "--tpu_zone",
            self.tpu_zone,
            "--tpu_name",
            self.tpu_name,
        )
        self.assertEqual(
            printed,
            f"{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; ls --worker all",
        )

    def test_with_config_file(self):
        printed = self._debug_output("--config_file", "tests/test_configs/latest.yaml")
        self.assertEqual(
            printed,
            f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; echo "hello world"; echo "this is a second command" --worker all',
        )

    def test_with_config_file_and_command(self):
        printed = self._debug_output("--config_file", "tests/test_configs/latest.yaml", "--command", self.command)
        self.assertEqual(
            printed,
            f"{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; ls --worker all",
        )

    def test_with_config_file_and_multiple_command(self):
        printed = self._debug_output(
            "--config_file",
            "tests/test_configs/latest.yaml",
            "--command",
            self.command,
            "--command",
            'echo "Hello World"',
        )
        self.assertEqual(
            printed,
            f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; ls; echo "Hello World" --worker all',
        )

    def test_with_config_file_and_command_file(self):
        printed = self._debug_output(
            "--config_file", "tests/test_configs/latest.yaml", "--command_file", self.command_file
        )
        self.assertEqual(
            printed,
            f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; echo "hello world"; echo "this is a second command" --worker all',
        )

    def test_with_config_file_and_command_file_backward_compatibility(self):
        printed = self._debug_output(
            "--config_file",
            "tests/test_configs/0_12_0.yaml",
            "--command_file",
            self.command_file,
            "--tpu_zone",
            self.tpu_zone,
            "--tpu_name",
            self.tpu_name,
        )
        self.assertEqual(
            printed,
            f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; echo "hello world"; echo "this is a second command" --worker all',
        )

    def test_accelerate_install(self):
        printed = self._debug_output("--config_file", "tests/test_configs/latest.yaml", "--install_accelerate")
        self.assertEqual(
            printed,
            f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; pip install accelerate -U; echo "hello world"; echo "this is a second command" --worker all',
        )

    def test_accelerate_install_version(self):
        printed = self._debug_output(
            "--config_file",
            "tests/test_configs/latest.yaml",
            "--install_accelerate",
            "--accelerate_version",
            "12.0.0",
        )
        self.assertEqual(
            printed,
            f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; pip install accelerate==12.0.0; echo "hello world"; echo "this is a second command" --worker all',
        )
|
||||
|
@ -15,3 +15,7 @@ num_processes: 1
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
use_cpu: false
|
||||
tpu_name: 'test-tpu'
|
||||
tpu_zone: 'us-central1-a'
|
||||
command: null
|
||||
command_file: tests/test_samples/test_command_file.sh
|
2
tests/test_samples/test_command_file.sh
Normal file
2
tests/test_samples/test_command_file.sh
Normal file
@ -0,0 +1,2 @@
|
||||
echo "hello world"
|
||||
echo "this is a second command"
|
Reference in New Issue
Block a user