Compare commits

...

12 Commits

Author SHA1 Message Date
7f48cb52e7 bug 2024-05-15 16:46:44 +02:00
e33aba7371 testing 2024-05-15 16:45:38 +02:00
068d586938 tied_param 2024-05-15 16:30:30 +02:00
76043b402f tied map 2024-05-15 16:17:17 +02:00
3126992054 more 2024-05-15 15:55:46 +02:00
656e15e4f8 more 2024-05-15 15:41:34 +02:00
1b21f9a630 test 2024-05-15 15:32:04 +02:00
f592aad8df test 2024-05-15 15:21:51 +02:00
b69239577f more debug 2024-05-15 15:11:27 +02:00
f973f0d5f9 debug 2024-05-15 14:34:22 +02:00
56580b40c5 only run big modeling test 2024-05-15 14:04:07 +02:00
30ac26cf33 debug tests 2024-05-15 13:59:52 +02:00
5 changed files with 232 additions and 207 deletions

View File

@ -44,186 +44,186 @@ jobs:
source activate accelerate
make test
- name: Run examples on GPUs
working-directory: accelerate
if: always()
run: |
source activate accelerate
pip uninstall comet_ml -y
make test_examples
# - name: Run examples on GPUs
# working-directory: accelerate
# if: always()
# run: |
# source activate accelerate
# pip uninstall comet_ml -y
# make test_examples
- name: Generate Report
working-directory: accelerate
if: always()
run: |
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
# - name: Generate Report
# working-directory: accelerate
# if: always()
# run: |
# pip install slack_sdk tabulate
# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run_deepspeed_tests_single_gpu:
runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
env:
CUDA_VISIBLE_DEVICES: "0"
TEST_TYPE: "single_gpu_deepspeed"
container:
image: huggingface/accelerate:gpu-deepspeed-nightly
options: --gpus all --shm-size "16gb"
defaults:
run:
shell: bash
steps:
- name: Update clone & pip install
run: |
source activate accelerate
git clone https://github.com/huggingface/accelerate;
cd accelerate;
git checkout ${{ github.sha }};
pip install -e . --no-deps
pip install pytest-reportlog tabulate
# run_deepspeed_tests_single_gpu:
# runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
# env:
# CUDA_VISIBLE_DEVICES: "0"
# TEST_TYPE: "single_gpu_deepspeed"
# container:
# image: huggingface/accelerate:gpu-deepspeed-nightly
# options: --gpus all --shm-size "16gb"
# defaults:
# run:
# shell: bash
# steps:
# - name: Update clone & pip install
# run: |
# source activate accelerate
# git clone https://github.com/huggingface/accelerate;
# cd accelerate;
# git checkout ${{ github.sha }};
# pip install -e . --no-deps
# pip install pytest-reportlog tabulate
- name: Show installed libraries
run: |
source activate accelerate;
pip freeze
# - name: Show installed libraries
# run: |
# source activate accelerate;
# pip freeze
- name: Run test on GPUs
working-directory: accelerate
run: |
source activate accelerate
make test_deepspeed
# - name: Run test on GPUs
# working-directory: accelerate
# run: |
# source activate accelerate
# make test_deepspeed
- name: Run Integration tests on GPUs
working-directory: accelerate
if: always()
run: |
source activate accelerate
make test_integrations
# - name: Run Integration tests on GPUs
# working-directory: accelerate
# if: always()
# run: |
# source activate accelerate
# make test_integrations
- name: Run examples on GPUs
working-directory: accelerate
if: always()
run: |
source activate accelerate
pip uninstall comet_ml -y
make test_examples
# - name: Run examples on GPUs
# working-directory: accelerate
# if: always()
# run: |
# source activate accelerate
# pip uninstall comet_ml -y
# make test_examples
- name: Generate Report
working-directory: accelerate
if: always()
run: |
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
# - name: Generate Report
# working-directory: accelerate
# if: always()
# run: |
# pip install slack_sdk tabulate
# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run_core_tests_multi_gpu:
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
env:
CUDA_VISIBLE_DEVICES: "0,1"
TEST_TYPE: "multi_gpu"
container:
image: huggingface/accelerate:gpu-nightly
options: --gpus all --shm-size "16gb"
defaults:
run:
shell: bash
steps:
- name: Update clone
run: |
source activate accelerate
git clone https://github.com/huggingface/accelerate;
cd accelerate;
git checkout ${{ github.sha }};
pip install -e . --no-deps
pip install pytest-reportlog tabulate
# run_core_tests_multi_gpu:
# runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
# env:
# CUDA_VISIBLE_DEVICES: "0,1"
# TEST_TYPE: "multi_gpu"
# container:
# image: huggingface/accelerate:gpu-nightly
# options: --gpus all --shm-size "16gb"
# defaults:
# run:
# shell: bash
# steps:
# - name: Update clone
# run: |
# source activate accelerate
# git clone https://github.com/huggingface/accelerate;
# cd accelerate;
# git checkout ${{ github.sha }};
# pip install -e . --no-deps
# pip install pytest-reportlog tabulate
- name: Show installed libraries
run: |
source activate accelerate;
pip freeze
# - name: Show installed libraries
# run: |
# source activate accelerate;
# pip freeze
- name: Run core and big modeling tests on GPUs
working-directory: accelerate
run: |
source activate accelerate
make test_core
make test_big_modeling
make test_cli
# - name: Run core and big modeling tests on GPUs
# working-directory: accelerate
# run: |
# source activate accelerate
# make test_core
# make test_big_modeling
# make test_cli
- name: Run Integration tests on GPUs
working-directory: accelerate
if: always()
run: |
source activate accelerate
make test_integrations
# - name: Run Integration tests on GPUs
# working-directory: accelerate
# if: always()
# run: |
# source activate accelerate
# make test_integrations
- name: Run examples on GPUs
working-directory: accelerate
if: always()
run: |
source activate accelerate
pip uninstall comet_ml -y
make test_examples
# - name: Run examples on GPUs
# working-directory: accelerate
# if: always()
# run: |
# source activate accelerate
# pip uninstall comet_ml -y
# make test_examples
- name: Generate Report
working-directory: accelerate
if: always()
run: |
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
# - name: Generate Report
# working-directory: accelerate
# if: always()
# run: |
# pip install slack_sdk tabulate
# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run_deepspeed_tests_multi_gpu:
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
env:
CUDA_VISIBLE_DEVICES: "0,1"
TEST_TYPE: "multi_gpu_deepspeed"
container:
image: huggingface/accelerate:gpu-deepspeed-nightly
options: --gpus all --shm-size "16gb"
defaults:
run:
shell: bash
steps:
- name: Update clone
run: |
source activate accelerate
git clone https://github.com/huggingface/accelerate;
cd accelerate;
git checkout ${{ github.sha }};
pip install -e . --no-deps
pip install pytest-reportlog tabulate
# run_deepspeed_tests_multi_gpu:
# runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
# env:
# CUDA_VISIBLE_DEVICES: "0,1"
# TEST_TYPE: "multi_gpu_deepspeed"
# container:
# image: huggingface/accelerate:gpu-deepspeed-nightly
# options: --gpus all --shm-size "16gb"
# defaults:
# run:
# shell: bash
# steps:
# - name: Update clone
# run: |
# source activate accelerate
# git clone https://github.com/huggingface/accelerate;
# cd accelerate;
# git checkout ${{ github.sha }};
# pip install -e . --no-deps
# pip install pytest-reportlog tabulate
- name: Show installed libraries
run: |
source activate accelerate;
pip freeze
# - name: Show installed libraries
# run: |
# source activate accelerate;
# pip freeze
- name: Run DeepSpeed tests
working-directory: accelerate
run: |
source activate accelerate
make test_deepspeed
# - name: Run DeepSpeed tests
# working-directory: accelerate
# run: |
# source activate accelerate
# make test_deepspeed
- name: Run Integration tests on GPUs
working-directory: accelerate
if: always()
run: |
source activate accelerate
make test_integrations
# - name: Run Integration tests on GPUs
# working-directory: accelerate
# if: always()
# run: |
# source activate accelerate
# make test_integrations
- name: Run examples on GPUs
working-directory: accelerate
if: always()
run: |
source activate accelerate
pip uninstall comet_ml -y
make test_examples
# - name: Run examples on GPUs
# working-directory: accelerate
# if: always()
# run: |
# source activate accelerate
# pip uninstall comet_ml -y
# make test_examples
- name: Generate Report
working-directory: accelerate
if: always()
run: |
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
# - name: Generate Report
# working-directory: accelerate
# if: always()
# run: |
# pip install slack_sdk tabulate
# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run-integration-tests:
if: always()
uses: ./.github/workflows/self_hosted_integration_tests.yml
# run-integration-tests:
# if: always()
# uses: ./.github/workflows/self_hosted_integration_tests.yml

View File

@ -42,11 +42,7 @@ test_fsdp:
# Since the new version of pytest will *change* how things are collected, we need `deepspeed` to
# run after test_core and test_cli
test:
$(MAKE) test_core
$(MAKE) test_cli
$(MAKE) test_big_modeling
$(MAKE) test_deepspeed
$(MAKE) test_fsdp
test_examples:
python -m pytest -s -v ./tests/test_examples.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_examples.log",)

View File

@ -397,6 +397,7 @@ def dispatch_model(
weights_map = OffloadedWeightsLoader(
state_dict=state_dict, save_folder=save_folder, index=offload_index, device=device
)
print(weights_map)
else:
weights_map = None
@ -415,7 +416,6 @@ def dispatch_model(
# Note: To handle the disk offloading case, we can not simply use weights_map[param_name].data_ptr() as the reference pointer,
# as we have no guarantee that safetensors' `file.get_tensor()` will always give the same pointer.
attach_align_device_hook_on_blocks(
model,
execution_device=execution_device,

View File

@ -340,6 +340,11 @@ def set_module_tensor_to_device(
and value.data_ptr() in tied_params_map
and device in tied_params_map[value.data_ptr()]
):
print("using value from tied_params_map value")
print(tensor_name)
print(value)
print(value.data_ptr())
print(tied_params_map[value.data_ptr()][device])
module._parameters[tensor_name] = tied_params_map[value.data_ptr()][device]
return
elif (
@ -347,6 +352,11 @@ def set_module_tensor_to_device(
and old_value.data_ptr() in tied_params_map
and device in tied_params_map[old_value.data_ptr()]
):
print("using value from tied_params_map old_value")
print(tensor_name)
print(value)
print(old_value.data_ptr())
print(tied_params_map[old_value.data_ptr()][device])
module._parameters[tensor_name] = tied_params_map[old_value.data_ptr()][device]
return
@ -466,6 +476,8 @@ def set_module_tensor_to_device(
and device not in tied_params_map[old_value.data_ptr()]
):
tied_params_map[old_value.data_ptr()][device] = new_value
print("tied_map updated 1 ")
print(tied_params_map)
elif (
value is not None
and tied_params_map is not None
@ -473,6 +485,8 @@ def set_module_tensor_to_device(
and device not in tied_params_map[value.data_ptr()]
):
tied_params_map[value.data_ptr()][device] = new_value
print("tied_map updated 2")
print(tied_params_map)
def named_module_tensors(

View File

@ -145,6 +145,50 @@ class ModelWithUnusedSubModulesForTest(nn.Module):
return self.linear4(self.linear3(self.batchnorm(self.linear2(self.linear1(x)))))
# To test dispatch with tied weights
class SubModule(torch.nn.Module):
def __init__(self, ref_to_parameter):
super().__init__()
self.parameter = ref_to_parameter
def forward(self, x):
return x + torch.max(self.parameter)
class LinearModuleAndSubModule(torch.nn.Linear):
def __init__(self, in_features, out_features, name):
super().__init__(in_features, out_features, bias=False)
print("init weights")
self.name = name
self.weight_submodule = SubModule(self.weight)
self.weight_submodule2 = SubModule(self.weight)
self.weight_submodule3 = SubModule(self.weight)
self.weight_submodule4 = SubModule(self.weight)
def forward(self, x):
print("weight")
print(self.weight)
print("name")
print(self.name)
a = torch.nn.functional.linear(self.weight_submodule(x), self.weight)
b = torch.nn.functional.linear(self.weight_submodule2(x), self.weight)
c = torch.nn.functional.linear(self.weight_submodule3(x), self.weight)
d = torch.nn.functional.linear(self.weight_submodule4(x), self.weight)
return a + b + c + d
class ModelWithSubmodules(torch.nn.Module):
def __init__(self):
super().__init__()
self.module1 = LinearModuleAndSubModule(5000, 5000, "1")
self.module2 = LinearModuleAndSubModule(5000, 5000, "2")
def forward(self, x):
a = self.module1(x)
b = self.module2(x)
return a + b
class BigModelingTester(unittest.TestCase):
def test_init_empty_weights(self):
# base use
@ -490,42 +534,8 @@ class BigModelingTester(unittest.TestCase):
torch.cuda.empty_cache() # Needed in case we run several tests in a row.
class SubModule(torch.nn.Module):
def __init__(self, ref_to_parameter):
super().__init__()
self.parameter = ref_to_parameter
def forward(self, x):
return x + torch.max(self.parameter)
class LinearModuleAndSubModule(torch.nn.Linear):
def __init__(self, in_features, out_features):
super().__init__(in_features, out_features, bias=False)
self.weight_submodule = SubModule(self.weight)
self.weight_submodule2 = SubModule(self.weight)
self.weight_submodule3 = SubModule(self.weight)
self.weight_submodule4 = SubModule(self.weight)
def forward(self, x):
a = torch.nn.functional.linear(self.weight_submodule(x), self.weight)
b = torch.nn.functional.linear(self.weight_submodule2(x), self.weight)
c = torch.nn.functional.linear(self.weight_submodule3(x), self.weight)
d = torch.nn.functional.linear(self.weight_submodule4(x), self.weight)
return a + b + c + d
class ModelWithSubmodules(torch.nn.Module):
def __init__(self):
super().__init__()
self.compute = LinearModuleAndSubModule(5000, 5000)
self.compute1 = LinearModuleAndSubModule(5000, 5000)
def forward(self, x):
a = self.compute(x)
b = self.compute1(x)
return a + b
# We should need only 2 * 5000 * 5000 * 32 // 8 * 1e-6 = 200 MB on the device 0 for the whole model forward, and not 600 MB.
device_map = {"compute": 0, "compute1": "disk"}
device_map = {"module1": 0, "module2": "disk"}
model = ModelWithSubmodules()
@ -545,7 +555,13 @@ class BigModelingTester(unittest.TestCase):
free_memory_bytes_before_dispatch = torch.cuda.mem_get_info("cuda:0")[0]
with TemporaryDirectory() as tmp_dir:
print("before dispatch")
print(model.module1.weight)
print(model.module2.weight)
dispatch_model(model, device_map, offload_dir=tmp_dir)
print("after dispatch")
print(model.module1.weight)
print(model.module2.weight)
free_memory_bytes_after_dispatch = torch.cuda.mem_get_info("cuda:0")[0]
assert (free_memory_bytes_after_dispatch - free_memory_bytes_before_dispatch) * 1e-6 < 130
@ -559,7 +575,6 @@ class BigModelingTester(unittest.TestCase):
)
except Exception as e:
raise e
assert torch.allclose(expected, output.cpu(), atol=1e-5)
torch.cuda.empty_cache()
@ -568,16 +583,16 @@ class BigModelingTester(unittest.TestCase):
# Check that we have no more references on GPU for the offloaded tied weight.
n_non_empty = 0
for pointer, pointer_dict in model.compute1.weight_submodule._hf_hook.tied_params_map.items():
for pointer, pointer_dict in model.module1.weight_submodule._hf_hook.tied_params_map.items():
if len(pointer_dict) > 0:
n_non_empty += 1
assert n_non_empty == 1 # `compute` layer one.
assert n_non_empty == 1 # `module1` layer one.
n_non_empty = 0
for pointer, pointer_dict in model.compute1._hf_hook.tied_params_map.items():
for pointer, pointer_dict in model.module1._hf_hook.tied_params_map.items():
if len(pointer_dict) > 0:
n_non_empty += 1
assert n_non_empty == 1 # `compute` layer one.
assert n_non_empty == 1 # `module1` layer one.
assert (free_memory_bytes_after_infer - free_memory_bytes_after_dispatch) * 1e-6 < 130