Introduce a pod-config command (#802)

* Add in ability to configure pod and start CLI commands

* Further tests, add a help

* Added tests and cleaned up!

* Fix weird missing parts

* More tests + install accelerate with flag

* Unused pod_config_file

* Test with multiple commands

* Update src/accelerate/commands/config/cluster.py

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>

* Clarity during printing

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Make public names for readability

* Fix test expected outputs and refactor response

* Fix ref errors

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
Zachary Mueller
2022-11-01 10:00:48 -04:00
committed by GitHub
parent c4c444a158
commit b816e258a9
8 changed files with 388 additions and 3 deletions

View File

@ -19,6 +19,7 @@ from argparse import ArgumentParser
from accelerate.commands.config import config_command_parser
from accelerate.commands.env import env_command_parser
from accelerate.commands.launch import launch_command_parser
from accelerate.commands.pod import pod_command_parser
from accelerate.commands.test import test_command_parser
@ -28,9 +29,10 @@ def main():
# Register commands
config_command_parser(subparsers=subparsers)
launch_command_parser(subparsers=subparsers)
test_command_parser(subparsers=subparsers)
env_command_parser(subparsers=subparsers)
launch_command_parser(subparsers=subparsers)
pod_command_parser(subparsers=subparsers)
test_command_parser(subparsers=subparsers)
# Let's go
args = parser.parse_args()

View File

@ -14,6 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from ...utils import ComputeEnvironment, DistributedType, is_deepspeed_available, is_transformers_available
from ...utils.constants import (
DEEPSPEED_MULTINODE_LAUNCHERS,
@ -41,6 +43,10 @@ def get_cluster_input():
main_process_port = None
rdzv_backend = "static"
same_network = True
tpu_name = None
tpu_zone = None
commands = None
command_file = None
if distributed_type in [DistributedType.MULTI_GPU, DistributedType.MULTI_CPU]:
num_machines = _ask_field(
"How many different machines will you use (use more than 1 for multi-node training)? [1]: ",
@ -341,6 +347,50 @@ def get_cluster_input():
"What is the name of the function in your script that should be launched in all parallel scripts? [main]: ",
default="main",
)
use_cluster = _ask_field(
"Are you using a TPU cluster? [yes/NO]: ",
_convert_yes_no_to_bool,
default=False,
error_message="Please enter yes or no.",
)
if use_cluster:
tpu_name = _ask_field(
"What is the name of your TPU cluster? ",
default=None,
error_message="Please enter the name of your TPU cluster.",
)
tpu_zone = _ask_field(
"What is the zone of your TPU cluster? ",
default=None,
error_message="Please enter the zone of your TPU cluster.",
)
run_commands = _ask_field(
"Do you have code you wish to run on startup in each pod? [yes/NO]: ",
_convert_yes_no_to_bool,
default=False,
error_message="Please enter yes or no.",
)
if run_commands:
use_command_file = _ask_field(
"Is this code located in a bash script? [yes/NO]: ",
_convert_yes_no_to_bool,
default=False,
error_message="Please enter yes or no.",
)
if use_command_file:
command_file = _ask_field(
"What is the path to your bash script? ",
default=None,
error_message="Please enter the path to your bash script.",
)
command_file = os.path.abspath(command_file)
else:
commands = _ask_field(
"What commands do you wish to run on startup in each pod? ",
default=None,
error_message="Please enter the commands you wish to run on startup in each pod as a single string.",
)
else:
main_training_function = "main"
@ -408,4 +458,8 @@ def get_cluster_input():
use_cpu=use_cpu,
rdzv_backend=rdzv_backend,
same_network=same_network,
tpu_name=tpu_name,
tpu_zone=tpu_zone,
commands=commands,
command_file=command_file,
)

View File

@ -18,7 +18,7 @@ import json
import os
from dataclasses import dataclass
from enum import Enum
from typing import Optional, Union
from typing import List, Optional, Union
import yaml
@ -151,6 +151,12 @@ class ClusterConfig(BaseConfig):
# args for TPU
downcast_bf16: bool = False
# args for TPU pods
tpu_name: str = None
tpu_zone: str = None
command_file: str = None
command: List[str] = None
def __post_init__(self):
if self.deepspeed_config is None:
self.deepspeed_config = {}

View File

@ -1,3 +1,19 @@
#!/usr/bin/env python
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import platform

View File

@ -0,0 +1,152 @@
#!/usr/bin/env python
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import subprocess
from accelerate.commands.config.config_args import default_config_file, load_config_from_file
from packaging.version import Version, parse
_description = "Run commands across a pod of TPU VMs for initial setup before running `accelerate launch`. Will also install Accelerate on the pod."


def pod_command_parser(subparsers=None):
    """
    Build the argument parser for the `pod-config` command.

    Args:
        subparsers: optional sub-parser collection (as returned by `ArgumentParser.add_subparsers`). When given,
            the command is registered on it under the name `pod-config` and `pod_launcher` is stored as the `func`
            default. When omitted, a standalone `argparse.ArgumentParser` is created instead.

    Returns:
        The configured parser.
    """
    standalone = subparsers is None
    if standalone:
        parser = argparse.ArgumentParser("Accelerate pod-config command", description=_description)
    else:
        parser = subparsers.add_parser("pod-config", description=_description)

    # (flag, keyword arguments for `add_argument`), kept in the order they should show up in `--help`.
    option_table = (
        (
            "--config_file",
            {"type": str, "default": None, "help": "Path to the config file to use for accelerate."},
        ),
        (
            "--pod_config_file",
            {"type": str, "default": None, "help": "Path to the config file to use for the pod."},
        ),
        (
            "--command_file",
            {
                "default": None,
                "help": "The path to the file containing the commands to run on the pod on startup.",
            },
        ),
        (
            "--command",
            {
                # Each occurrence of the flag contributes one list, so the parsed value is a list of lists.
                "action": "append",
                "nargs": "+",
                "help": "A command to run on the pod. If not specified, will use the command specified in the command file.",
            },
        ),
        (
            "--tpu_name",
            {
                "default": None,
                "help": "The name of the TPU to use. If not specified, will use the TPU specified in the config file.",
            },
        ),
        (
            "--tpu_zone",
            {
                "default": None,
                "help": "The zone of the TPU to use. If not specified, will use the zone specified in the config file.",
            },
        ),
        (
            "--install_accelerate",
            {
                "action": "store_true",
                "help": "Whether to install accelerate on the pod. Defaults to False.",
            },
        ),
        (
            "--accelerate_version",
            {
                "default": "latest",
                "help": "The version of accelerate to install on the pod. If not specified, will use the latest pypi version. Specify 'dev' to install from GitHub.",
            },
        ),
        (
            "--debug",
            {
                "action": "store_true",
                "help": "If set, will print the command that would be run instead of running it.",
            },
        ),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    # Only a sub-command needs a dispatch target; the standalone parser is driven by `main`.
    if not standalone:
        parser.set_defaults(func=pod_launcher)
    return parser
def pod_launcher(args):
    """
    Assemble the startup command for a TPU pod and send it to every worker through `gcloud`.

    Values not given on the command line (`command_file`, `command`, `tpu_name`, `tpu_zone`) are filled in from the
    Accelerate config file when `--config_file` is passed or the default config file exists.

    Args:
        args: namespace produced by `pod_command_parser`. `args.command`, `args.command_file`, `args.tpu_name`,
            `args.tpu_zone` and `args.accelerate_version` may be overwritten in place.

    Raises:
        ValueError: if neither a command nor a command file is available, or if the TPU name or zone is missing.
        subprocess.CalledProcessError: if `gcloud` exits with a non-zero status.
    """
    # Fill in anything missing from the CLI with the values stored in the config file, if there is one.
    if args.config_file is not None or os.path.isfile(default_config_file):
        defaults = load_config_from_file(args.config_file)
        if not args.command_file and defaults.command_file is not None and not args.command:
            args.command_file = defaults.command_file
        if not args.command and defaults.command is not None:
            args.command = defaults.command
        if not args.tpu_name:
            args.tpu_name = defaults.tpu_name
        if not args.tpu_zone:
            args.tpu_zone = defaults.tpu_zone

    # Translate the requested version into the argument handed to `pip install`.
    if args.accelerate_version == "dev":
        args.accelerate_version = "git+https://github.com/huggingface/accelerate.git"
    elif args.accelerate_version == "latest":
        args.accelerate_version = "accelerate -U"
    elif isinstance(parse(args.accelerate_version), Version):
        args.accelerate_version = f"accelerate=={args.accelerate_version}"

    if not args.command_file and not args.command:
        raise ValueError("You must specify either a command file or a command to run on the pod.")
    # Fail early with a clear message instead of a TypeError when building the gcloud command below.
    if not args.tpu_name or not args.tpu_zone:
        raise ValueError(
            "You must specify both a TPU name and a TPU zone, either as arguments or in the config file."
        )

    if args.command_file:
        # A command file always takes precedence; every line of it is one command.
        with open(args.command_file, "r") as f:
            args.command = f.read().splitlines()
    elif isinstance(args.command, str):
        # A single command string (e.g. from a config file) must not be iterated character by character.
        args.command = [args.command]

    # `--command` (action="append", nargs="+") yields a list of lists, while the config file and the command
    # file yield a flat list of strings. Flatten only the nested entries and keep plain strings whole.
    commands = []
    for entry in args.command:
        if isinstance(entry, str):
            commands.append(entry)
        else:
            commands.extend(entry)

    # Default to the shared folder and install accelerate
    new_cmd = ["cd /usr/share"]
    if args.install_accelerate:
        new_cmd.append(f"pip install {args.accelerate_version}")
    new_cmd.extend(commands)
    args.command = "; ".join(new_cmd)

    # Then send it to gcloud
    # Eventually try to use google-api-core to do this instead of subprocess
    cmd = [
        "gcloud",
        "compute",
        "tpus",
        "tpu-vm",
        "ssh",
        args.tpu_name,
        "--zone",
        args.tpu_zone,
        "--command",
        args.command,
        "--worker",
        "all",
    ]
    if args.debug:
        print(f"Running {' '.join(cmd)}")
        return
    # check=True so a failing gcloud call is not reported as a success below.
    subprocess.run(cmd, check=True)
    print("Successfully setup pod.")
def main():
    """Standalone entry point: parse the command line with the pod-config parser and run the launcher."""
    pod_launcher(pod_command_parser().parse_args())

View File

@ -21,6 +21,7 @@ import torch
import accelerate
from accelerate.test_utils import execute_subprocess_async
from accelerate.test_utils.testing import run_command
class AccelerateLauncherTester(unittest.TestCase):
@ -63,3 +64,151 @@ class AccelerateLauncherTester(unittest.TestCase):
execute_subprocess_async(
self.base_cmd + ["--config_file", str(config), self.test_file_path], env=os.environ.copy()
)
class PodConfigTester(unittest.TestCase):
    """
    Checks that the `accelerate pod-config` CLI builds the expected `gcloud` command.

    Every test passes `--debug`, so the command is printed instead of being run, and the printed text is compared
    with the expected string.
    """

    tpu_name = "test-tpu"
    tpu_zone = "us-central1-a"
    command = "ls"
    cmd = ["accelerate", "pod-config"]
    base_output = "cd /usr/share"
    command_file = "tests/test_samples/test_command_file.sh"
    gcloud = "Running gcloud compute tpus tpu-vm ssh"

    @staticmethod
    def clean_output(output):
        # Glue the captured pieces together and drop trailing whitespace.
        joined = "".join(output)
        return joined.rstrip()

    def test_base(self):
        # Everything comes from the command line.
        cli_args = ["--command", self.command, "--tpu_zone", self.tpu_zone, "--tpu_name", self.tpu_name, "--debug"]
        raw = run_command(self.cmd + cli_args, return_stdout=True)
        expected = f"{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; ls --worker all"
        self.assertEqual(self.clean_output(raw), expected)

    def test_base_backward_compatibility(self):
        # Same as `test_base`, but with the 0.12.0 config file passed as well.
        cli_args = [
            "--config_file",
            "tests/test_configs/0_12_0.yaml",
            "--command",
            self.command,
            "--tpu_zone",
            self.tpu_zone,
            "--tpu_name",
            self.tpu_name,
            "--debug",
        ]
        raw = run_command(self.cmd + cli_args, return_stdout=True)
        expected = f"{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; ls --worker all"
        self.assertEqual(self.clean_output(raw), expected)

    def test_with_config_file(self):
        # TPU name, zone and command file all come from the config file.
        cli_args = ["--config_file", "tests/test_configs/latest.yaml", "--debug"]
        raw = run_command(self.cmd + cli_args, return_stdout=True)
        expected = f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; echo "hello world"; echo "this is a second command" --worker all'
        self.assertEqual(self.clean_output(raw), expected)

    def test_with_config_file_and_command(self):
        # A command given on the command line wins over the config file's command file.
        cli_args = ["--config_file", "tests/test_configs/latest.yaml", "--command", self.command, "--debug"]
        raw = run_command(self.cmd + cli_args, return_stdout=True)
        expected = f"{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; ls --worker all"
        self.assertEqual(self.clean_output(raw), expected)

    def test_with_config_file_and_multiple_command(self):
        # Repeating `--command` chains the commands in order.
        cli_args = [
            "--config_file",
            "tests/test_configs/latest.yaml",
            "--command",
            self.command,
            "--command",
            'echo "Hello World"',
            "--debug",
        ]
        raw = run_command(self.cmd + cli_args, return_stdout=True)
        expected = f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; ls; echo "Hello World" --worker all'
        self.assertEqual(self.clean_output(raw), expected)

    def test_with_config_file_and_command_file(self):
        # Command file passed explicitly on top of the config file.
        cli_args = ["--config_file", "tests/test_configs/latest.yaml", "--command_file", self.command_file, "--debug"]
        raw = run_command(self.cmd + cli_args, return_stdout=True)
        expected = f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; echo "hello world"; echo "this is a second command" --worker all'
        self.assertEqual(self.clean_output(raw), expected)

    def test_with_config_file_and_command_file_backward_compatibility(self):
        # 0.12.0 config file, with the command file and TPU details given on the command line.
        cli_args = [
            "--config_file",
            "tests/test_configs/0_12_0.yaml",
            "--command_file",
            self.command_file,
            "--tpu_zone",
            self.tpu_zone,
            "--tpu_name",
            self.tpu_name,
            "--debug",
        ]
        raw = run_command(self.cmd + cli_args, return_stdout=True)
        expected = f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; echo "hello world"; echo "this is a second command" --worker all'
        self.assertEqual(self.clean_output(raw), expected)

    def test_accelerate_install(self):
        # `--install_accelerate` without a version adds the default pip install step.
        cli_args = ["--config_file", "tests/test_configs/latest.yaml", "--install_accelerate", "--debug"]
        raw = run_command(self.cmd + cli_args, return_stdout=True)
        expected = f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; pip install accelerate -U; echo "hello world"; echo "this is a second command" --worker all'
        self.assertEqual(self.clean_output(raw), expected)

    def test_accelerate_install_version(self):
        # An explicit version is pinned with `==`.
        cli_args = [
            "--config_file",
            "tests/test_configs/latest.yaml",
            "--install_accelerate",
            "--accelerate_version",
            "12.0.0",
            "--debug",
        ]
        raw = run_command(self.cmd + cli_args, return_stdout=True)
        expected = f'{self.gcloud} test-tpu --zone us-central1-a --command {self.base_output}; pip install accelerate==12.0.0; echo "hello world"; echo "this is a second command" --worker all'
        self.assertEqual(self.clean_output(raw), expected)

View File

@ -15,3 +15,7 @@ num_processes: 1
rdzv_backend: static
same_network: true
use_cpu: false
tpu_name: 'test-tpu'
tpu_zone: 'us-central1-a'
command: null
command_file: tests/test_samples/test_command_file.sh

View File

@ -0,0 +1,2 @@
echo "hello world"
echo "this is a second command"