mirror of
				https://github.com/huggingface/transformers.git
				synced 2025-11-04 20:14:36 +08:00 
			
		
		
		
	Compare commits
	
		
			39 Commits
		
	
	
		
			v4.42.1
			...
			run-move-i
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 2b143945bf | |||
| 4b4c68157c | |||
| 33412d3759 | |||
| fa451d4604 | |||
| 3107a96b6c | |||
| 89f4ebd4e3 | |||
| 7b6098c34a | |||
| 10d3b77aa7 | |||
| 8ace6bd21a | |||
| 5773b33b5f | |||
| 72fd103a12 | |||
| d50051a215 | |||
| 080fc2fa90 | |||
| b756acedc7 | |||
| bb0a0259c6 | |||
| 310ceb1e10 | |||
| be38218dbe | |||
| b8fcf6143b | |||
| 615ac14ea5 | |||
| bd95ee271c | |||
| b4c4cf7c95 | |||
| 7cc7cbb5fa | |||
| 80d27756f6 | |||
| 656c41113e | |||
| c80cfd1aed | |||
| bc7a6ae4a6 | |||
| f4d8c833f1 | |||
| 566847461d | |||
| e4e245b16f | |||
| b2e167265d | |||
| de497d4f2e | |||
| f8afb0abc0 | |||
| b30adec90f | |||
| 026f53c4d2 | |||
| 190b83f7f4 | |||
| 582fbde0a3 | |||
| 7c0b4bb9af | |||
| 9453fa2538 | |||
| b04903bcd8 | 
							
								
								
									
										22
									
								
								.github/workflows/self-scheduled.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										22
									
								
								.github/workflows/self-scheduled.yml
									
									
									
									
										vendored
									
									
								
							@ -8,11 +8,9 @@ name: Self-hosted runner (scheduled)
 | 
			
		||||
 | 
			
		||||
on:
 | 
			
		||||
  repository_dispatch:
 | 
			
		||||
  schedule:
 | 
			
		||||
    - cron: "17 2 * * *"
 | 
			
		||||
  push:
 | 
			
		||||
    branches:
 | 
			
		||||
      - run_scheduled_ci*
 | 
			
		||||
      - run-move-integrations
 | 
			
		||||
 | 
			
		||||
env:
 | 
			
		||||
  HF_HOME: /mnt/cache
 | 
			
		||||
@ -43,7 +41,7 @@ jobs:
 | 
			
		||||
    strategy:
 | 
			
		||||
      matrix:
 | 
			
		||||
        machine_type: [single-gpu, multi-gpu]
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
 | 
			
		||||
    container:
 | 
			
		||||
      image: huggingface/transformers-all-latest-gpu
 | 
			
		||||
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 | 
			
		||||
@ -58,7 +56,7 @@ jobs:
 | 
			
		||||
    strategy:
 | 
			
		||||
      matrix:
 | 
			
		||||
        machine_type: [single-gpu, multi-gpu]
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
 | 
			
		||||
    container:
 | 
			
		||||
      image: huggingface/transformers-all-latest-gpu
 | 
			
		||||
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 | 
			
		||||
@ -85,7 +83,7 @@ jobs:
 | 
			
		||||
        name: Identify models to test
 | 
			
		||||
        working-directory: /transformers/tests
 | 
			
		||||
        run: |
 | 
			
		||||
          echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
 | 
			
		||||
          echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2[:10] + d1; print(d)')" >> $GITHUB_OUTPUT
 | 
			
		||||
 | 
			
		||||
      - name: NVIDIA-SMI
 | 
			
		||||
        run: |
 | 
			
		||||
@ -98,7 +96,7 @@ jobs:
 | 
			
		||||
      matrix:
 | 
			
		||||
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
 | 
			
		||||
        machine_type: [single-gpu]
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
 | 
			
		||||
    container:
 | 
			
		||||
      image: huggingface/transformers-all-latest-gpu
 | 
			
		||||
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 | 
			
		||||
@ -159,7 +157,7 @@ jobs:
 | 
			
		||||
      matrix:
 | 
			
		||||
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
 | 
			
		||||
        machine_type: [multi-gpu]
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
 | 
			
		||||
    container:
 | 
			
		||||
      image: huggingface/transformers-all-latest-gpu
 | 
			
		||||
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 | 
			
		||||
@ -219,7 +217,7 @@ jobs:
 | 
			
		||||
      fail-fast: false
 | 
			
		||||
      matrix:
 | 
			
		||||
        machine_type: [single-gpu]
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
 | 
			
		||||
    container:
 | 
			
		||||
      image: huggingface/transformers-all-latest-gpu
 | 
			
		||||
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 | 
			
		||||
@ -270,7 +268,7 @@ jobs:
 | 
			
		||||
      fail-fast: false
 | 
			
		||||
      matrix:
 | 
			
		||||
        machine_type: [single-gpu, multi-gpu]
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
 | 
			
		||||
    container:
 | 
			
		||||
      image: huggingface/transformers-pytorch-gpu
 | 
			
		||||
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 | 
			
		||||
@ -320,7 +318,7 @@ jobs:
 | 
			
		||||
      fail-fast: false
 | 
			
		||||
      matrix:
 | 
			
		||||
        machine_type: [single-gpu, multi-gpu]
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
 | 
			
		||||
    container:
 | 
			
		||||
      image: huggingface/transformers-tensorflow-gpu
 | 
			
		||||
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 | 
			
		||||
@ -371,7 +369,7 @@ jobs:
 | 
			
		||||
      fail-fast: false
 | 
			
		||||
      matrix:
 | 
			
		||||
        machine_type: [single-gpu, multi-gpu]
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
 | 
			
		||||
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
 | 
			
		||||
    needs: setup
 | 
			
		||||
    container:
 | 
			
		||||
      image: huggingface/transformers-pytorch-deepspeed-latest-gpu
 | 
			
		||||
 | 
			
		||||
@ -2065,20 +2065,20 @@ In this case you usually need to raise the value of `initial_scale_power`. Setti
 | 
			
		||||
 | 
			
		||||
## Non-Trainer Deepspeed Integration
 | 
			
		||||
 | 
			
		||||
The [`~deepspeed.HfDeepSpeedConfig`] is used to integrate Deepspeed into the 🤗 Transformers core
 | 
			
		||||
The [`~integrations.HfDeepSpeedConfig`] is used to integrate Deepspeed into the 🤗 Transformers core
 | 
			
		||||
functionality, when [`Trainer`] is not used. The only thing that it does is handling Deepspeed ZeRO-3 param gathering and automatically splitting the model onto multiple gpus during `from_pretrained` call. Everything else you have to do by yourself.
 | 
			
		||||
 | 
			
		||||
When using [`Trainer`] everything is automatically taken care of.
 | 
			
		||||
 | 
			
		||||
When not using [`Trainer`], to efficiently deploy DeepSpeed ZeRO-3, you must instantiate the
 | 
			
		||||
[`~deepspeed.HfDeepSpeedConfig`] object before instantiating the model and keep that object alive.
 | 
			
		||||
[`~integrations.HfDeepSpeedConfig`] object before instantiating the model and keep that object alive.
 | 
			
		||||
 | 
			
		||||
If you're using Deepspeed ZeRO-1 or ZeRO-2 you don't need to use `HfDeepSpeedConfig` at all.
 | 
			
		||||
 | 
			
		||||
For example for a pretrained model:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
from transformers.deepspeed import HfDeepSpeedConfig
 | 
			
		||||
from transformers.integrations import HfDeepSpeedConfig
 | 
			
		||||
from transformers import AutoModel
 | 
			
		||||
import deepspeed
 | 
			
		||||
 | 
			
		||||
@ -2092,7 +2092,7 @@ engine = deepspeed.initialize(model=model, config_params=ds_config, ...)
 | 
			
		||||
or for non-pretrained model:
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
from transformers.deepspeed import HfDeepSpeedConfig
 | 
			
		||||
from transformers.integrations import HfDeepSpeedConfig
 | 
			
		||||
from transformers import AutoModel, AutoConfig
 | 
			
		||||
import deepspeed
 | 
			
		||||
 | 
			
		||||
@ -2108,7 +2108,7 @@ Please note that if you're not using the [`Trainer`] integration, you're complet
 | 
			
		||||
 | 
			
		||||
## HfDeepSpeedConfig
 | 
			
		||||
 | 
			
		||||
[[autodoc]] deepspeed.HfDeepSpeedConfig
 | 
			
		||||
[[autodoc]] integrations.HfDeepSpeedConfig
 | 
			
		||||
    - all
 | 
			
		||||
 | 
			
		||||
### Custom DeepSpeed ZeRO Inference
 | 
			
		||||
@ -2161,7 +2161,7 @@ Make sure to:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
 | 
			
		||||
from transformers.deepspeed import HfDeepSpeedConfig
 | 
			
		||||
from transformers.integrations import HfDeepSpeedConfig
 | 
			
		||||
import deepspeed
 | 
			
		||||
import os
 | 
			
		||||
import torch
 | 
			
		||||
 | 
			
		||||
@ -32,7 +32,7 @@ from copy import deepcopy  # noqa
 | 
			
		||||
 | 
			
		||||
from parameterized import parameterized  # noqa
 | 
			
		||||
from transformers import TrainingArguments, is_torch_available  # noqa
 | 
			
		||||
from transformers.deepspeed import is_deepspeed_available  # noqa
 | 
			
		||||
from transformers.integrations.deepspeed import is_deepspeed_available  # noqa
 | 
			
		||||
from transformers.file_utils import WEIGHTS_NAME  # noqa
 | 
			
		||||
from transformers.testing_utils import (  # noqa
 | 
			
		||||
    CaptureLogger,
 | 
			
		||||
 | 
			
		||||
@ -94,6 +94,7 @@ _import_structure = {
 | 
			
		||||
    "data.metrics": [],
 | 
			
		||||
    "data.processors": [],
 | 
			
		||||
    "debug_utils": [],
 | 
			
		||||
    "deepspeed": [],
 | 
			
		||||
    "dependency_versions_check": [],
 | 
			
		||||
    "dependency_versions_table": [],
 | 
			
		||||
    "dynamic_module_utils": [],
 | 
			
		||||
@ -115,8 +116,6 @@ _import_structure = {
 | 
			
		||||
        "is_tensorboard_available",
 | 
			
		||||
        "is_wandb_available",
 | 
			
		||||
    ],
 | 
			
		||||
    "lib_integrations": [],
 | 
			
		||||
    "lib_integrations.peft": [],
 | 
			
		||||
    "modelcard": ["ModelCard"],
 | 
			
		||||
    "modeling_tf_pytorch_utils": [
 | 
			
		||||
        "convert_tf_weight_name_to_pt_weight_name",
 | 
			
		||||
@ -745,7 +744,6 @@ _import_structure = {
 | 
			
		||||
        "is_vision_available",
 | 
			
		||||
        "logging",
 | 
			
		||||
    ],
 | 
			
		||||
    "utils.bitsandbytes": [],
 | 
			
		||||
    "utils.quantization_config": ["BitsAndBytesConfig", "GPTQConfig"],
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -1002,7 +1000,6 @@ else:
 | 
			
		||||
        "TextDataset",
 | 
			
		||||
        "TextDatasetForNextSentencePrediction",
 | 
			
		||||
    ]
 | 
			
		||||
    _import_structure["deepspeed"] = []
 | 
			
		||||
    _import_structure["generation"].extend(
 | 
			
		||||
        [
 | 
			
		||||
            "BeamScorer",
 | 
			
		||||
 | 
			
		||||
@ -12,378 +12,29 @@
 | 
			
		||||
# See the License for the specific language governing permissions and
 | 
			
		||||
# limitations under the License.
 | 
			
		||||
"""
 | 
			
		||||
Integration with Deepspeed
 | 
			
		||||
Integration with Deepspeed - kept for backward compatiblity, if you plan to make any edit, make sure to modify the file
 | 
			
		||||
in `integrations/deepspeed` instead.
 | 
			
		||||
 | 
			
		||||
Check: https://github.com/huggingface/transformers/pull/25599
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
import importlib.util
 | 
			
		||||
import weakref
 | 
			
		||||
from functools import partialmethod
 | 
			
		||||
 | 
			
		||||
from .dependency_versions_check import dep_version_check
 | 
			
		||||
from .utils import is_accelerate_available, is_torch_available, logging
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if is_torch_available():
 | 
			
		||||
    import torch
 | 
			
		||||
 | 
			
		||||
logger = logging.get_logger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_deepspeed_available():
 | 
			
		||||
    return importlib.util.find_spec("deepspeed") is not None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if is_accelerate_available() and is_deepspeed_available():
 | 
			
		||||
    from accelerate.utils.deepspeed import HfDeepSpeedConfig as DeepSpeedConfig
 | 
			
		||||
else:
 | 
			
		||||
    # Inherits from a dummy `object` if accelerate is not available, so that python succeeds to import this file.
 | 
			
		||||
    # Deepspeed glue code will never inherit this dummy object as it checks if accelerate is available.
 | 
			
		||||
    from builtins import object as DeepSpeedConfig
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class HfDeepSpeedConfig(DeepSpeedConfig):
 | 
			
		||||
    """
 | 
			
		||||
    This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.
 | 
			
		||||
 | 
			
		||||
    A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
 | 
			
		||||
    things like the Trainer object is not available (e.g. `from_pretrained` and `_get_resized_embeddings`). Therefore
 | 
			
		||||
    it's important that this object remains alive while the program is still running.
 | 
			
		||||
 | 
			
		||||
    [`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to sync the configuration
 | 
			
		||||
    with values of [`TrainingArguments`] by replacing special placeholder values: `"auto"`. Without this special logic
 | 
			
		||||
    the DeepSpeed configuration is not modified in any way.
 | 
			
		||||
 | 
			
		||||
    Args:
 | 
			
		||||
        config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.
 | 
			
		||||
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def __init__(self, config_file_or_dict):
 | 
			
		||||
        # set global weakref object
 | 
			
		||||
        set_hf_deepspeed_config(self)
 | 
			
		||||
        dep_version_check("accelerate")
 | 
			
		||||
        dep_version_check("deepspeed")
 | 
			
		||||
        super().__init__(config_file_or_dict)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
 | 
			
		||||
    """
 | 
			
		||||
    The `HfTrainerDeepSpeedConfig` object is meant to be created during `TrainingArguments` object creation and has the
 | 
			
		||||
    same lifespan as the latter.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def __init__(self, config_file_or_dict):
 | 
			
		||||
        super().__init__(config_file_or_dict)
 | 
			
		||||
        self._dtype = None
 | 
			
		||||
        self.mismatches = []
 | 
			
		||||
 | 
			
		||||
    def dtype(self):
 | 
			
		||||
        if self._dtype is None:
 | 
			
		||||
            raise ValueError("trainer_config_process() wasn't called yet to tell dtype")
 | 
			
		||||
        return self._dtype
 | 
			
		||||
 | 
			
		||||
    def is_auto(self, ds_key_long):
 | 
			
		||||
        val = self.get_value(ds_key_long)
 | 
			
		||||
        if val is None:
 | 
			
		||||
            return False
 | 
			
		||||
        else:
 | 
			
		||||
            return val == "auto"
 | 
			
		||||
 | 
			
		||||
    def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True):
 | 
			
		||||
        """
 | 
			
		||||
        A utility method that massages the config file and can optionally verify that the values match.
 | 
			
		||||
 | 
			
		||||
        1. Replace "auto" values with `TrainingArguments` value.
 | 
			
		||||
 | 
			
		||||
        2. If it wasn't "auto" and `must_match` is true, then check that DS config matches Trainer
 | 
			
		||||
        config values and if mismatched add the entry to `self.mismatched` - will assert during
 | 
			
		||||
        `trainer_config_finalize` for one or more mismatches.
 | 
			
		||||
 | 
			
		||||
        """
 | 
			
		||||
        config, ds_key = self.find_config_node(ds_key_long)
 | 
			
		||||
        if config is None:
 | 
			
		||||
            return
 | 
			
		||||
 | 
			
		||||
        if config.get(ds_key) == "auto":
 | 
			
		||||
            config[ds_key] = hf_val
 | 
			
		||||
            return
 | 
			
		||||
 | 
			
		||||
        if not must_match:
 | 
			
		||||
            return
 | 
			
		||||
 | 
			
		||||
        ds_val = config.get(ds_key)
 | 
			
		||||
        if ds_val is not None and ds_val != hf_val:
 | 
			
		||||
            self.mismatches.append(f"- ds {ds_key_long}={ds_val} vs hf {hf_key}={hf_val}")
 | 
			
		||||
 | 
			
		||||
    fill_only = partialmethod(fill_match, must_match=False)
 | 
			
		||||
 | 
			
		||||
    def trainer_config_process(self, args):
 | 
			
		||||
        """
 | 
			
		||||
        Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
 | 
			
		||||
        creation.
 | 
			
		||||
        """
 | 
			
		||||
        # DeepSpeed does:
 | 
			
		||||
        # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps
 | 
			
		||||
        train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps
 | 
			
		||||
        self.fill_match(
 | 
			
		||||
            "train_micro_batch_size_per_gpu", args.per_device_train_batch_size, "per_device_train_batch_size"
 | 
			
		||||
        )
 | 
			
		||||
        self.fill_match("gradient_accumulation_steps", args.gradient_accumulation_steps, "gradient_accumulation_steps")
 | 
			
		||||
        self.fill_match("train_batch_size", train_batch_size, "train_batch_size (calculated)")
 | 
			
		||||
        self.fill_match("gradient_clipping", args.max_grad_norm, "max_grad_norm")
 | 
			
		||||
 | 
			
		||||
        self.fill_match("optimizer.params.lr", args.learning_rate, "learning_rate")
 | 
			
		||||
        self.fill_match("optimizer.params.betas", [args.adam_beta1, args.adam_beta2], "adam_beta1+adam_beta2")
 | 
			
		||||
        self.fill_match("optimizer.params.eps", args.adam_epsilon, "adam_epsilon")
 | 
			
		||||
        self.fill_match("optimizer.params.weight_decay", args.weight_decay, "weight_decay")
 | 
			
		||||
 | 
			
		||||
        self.fill_only("scheduler.params.warmup_min_lr", 0)  # not a trainer arg
 | 
			
		||||
        self.fill_match("scheduler.params.warmup_max_lr", args.learning_rate, "learning_rate")
 | 
			
		||||
        # total_num_steps - will get set in trainer_config_finalize
 | 
			
		||||
 | 
			
		||||
        # fp16
 | 
			
		||||
        if args.fp16 or args.fp16_full_eval:
 | 
			
		||||
            fp16_backend = "apex" if args.fp16_backend == "apex" else "amp"
 | 
			
		||||
        else:
 | 
			
		||||
            fp16_backend = None
 | 
			
		||||
 | 
			
		||||
        if args.save_on_each_node:
 | 
			
		||||
            # deepspeed uses shared storage by default. Let's override this setting if save_on_each_node == True
 | 
			
		||||
            self.config["checkpoint"] = self.config.get("checkpoint", {})
 | 
			
		||||
            self.config["checkpoint"]["use_node_local_storage"] = args.save_on_each_node
 | 
			
		||||
 | 
			
		||||
        # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set
 | 
			
		||||
        # any here unless the user did the work
 | 
			
		||||
        self.fill_match(
 | 
			
		||||
            "fp16.enabled",
 | 
			
		||||
            ((args.fp16 or args.fp16_full_eval) and fp16_backend == "amp"),
 | 
			
		||||
            "fp16|fp16_full_eval+fp16_backend(amp)",
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any
 | 
			
		||||
        # ZeRO features
 | 
			
		||||
        self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)")
 | 
			
		||||
        self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level")
 | 
			
		||||
 | 
			
		||||
        self.fill_match("bf16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval")
 | 
			
		||||
 | 
			
		||||
        # deepspeed's default mode is fp16 unless there is a config that says differently
 | 
			
		||||
        if self.is_true("bf16.enabled"):
 | 
			
		||||
            self._dtype = torch.bfloat16
 | 
			
		||||
        elif self.is_false("fp16.enabled"):
 | 
			
		||||
            self._dtype = torch.float32
 | 
			
		||||
        else:
 | 
			
		||||
            self._dtype = torch.float16
 | 
			
		||||
 | 
			
		||||
    def trainer_config_finalize(self, args, model, num_training_steps):
 | 
			
		||||
        """
 | 
			
		||||
        This stage is run after we have the model and know num_training_steps.
 | 
			
		||||
 | 
			
		||||
        Now we can complete the configuration process.
 | 
			
		||||
        """
 | 
			
		||||
        # zero
 | 
			
		||||
 | 
			
		||||
        # deal with config keys that use `auto` value and rely on model's hidden_size
 | 
			
		||||
        hidden_size_based_keys = [
 | 
			
		||||
            "zero_optimization.reduce_bucket_size",
 | 
			
		||||
            "zero_optimization.stage3_prefetch_bucket_size",
 | 
			
		||||
            "zero_optimization.stage3_param_persistence_threshold",
 | 
			
		||||
        ]
 | 
			
		||||
        hidden_size_auto_keys = [x for x in hidden_size_based_keys if self.is_auto(x)]
 | 
			
		||||
 | 
			
		||||
        if len(hidden_size_auto_keys) > 0:
 | 
			
		||||
            if hasattr(model.config, "hidden_size"):
 | 
			
		||||
                hidden_size = model.config.hidden_size
 | 
			
		||||
            elif hasattr(model.config, "hidden_sizes"):
 | 
			
		||||
                # if there are many hidden sizes pick the largest one
 | 
			
		||||
                hidden_size = max(model.config.hidden_sizes)
 | 
			
		||||
            else:
 | 
			
		||||
                raise ValueError(
 | 
			
		||||
                    "The model's config file has neither `hidden_size` nor `hidden_sizes` entry, "
 | 
			
		||||
                    "therefore it's not possible to automatically fill out the following `auto` entries "
 | 
			
		||||
                    f"in the DeepSpeed config file: {hidden_size_auto_keys}. You can fix that by replacing "
 | 
			
		||||
                    "`auto` values for these keys with an integer value of your choice."
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
            self.fill_only("zero_optimization.reduce_bucket_size", hidden_size * hidden_size)
 | 
			
		||||
            if self.is_zero3():
 | 
			
		||||
                # automatically assign the optimal config values based on model config
 | 
			
		||||
                self.fill_only("zero_optimization.stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size)
 | 
			
		||||
                self.fill_only("zero_optimization.stage3_param_persistence_threshold", 10 * hidden_size)
 | 
			
		||||
 | 
			
		||||
        # scheduler
 | 
			
		||||
        self.fill_match("scheduler.params.total_num_steps", num_training_steps, "num_training_steps (calculated)")
 | 
			
		||||
        self.fill_match("scheduler.params.warmup_num_steps", args.get_warmup_steps(num_training_steps), "warmup_steps")
 | 
			
		||||
 | 
			
		||||
        if len(self.mismatches) > 0:
 | 
			
		||||
            mismatches = "\n".join(self.mismatches)
 | 
			
		||||
            raise ValueError(
 | 
			
		||||
                "Please correct the following DeepSpeed config values that mismatch TrainingArguments"
 | 
			
		||||
                f" values:\n{mismatches}\nThe easiest method is to set these DeepSpeed config values to 'auto'."
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# keep the config object global to be able to access it anywhere during TrainingArguments life-cycle
 | 
			
		||||
_hf_deepspeed_config_weak_ref = None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def set_hf_deepspeed_config(hf_deepspeed_config_obj):
 | 
			
		||||
    # this is a special weakref global object to allow us to get to Deepspeed config from APIs
 | 
			
		||||
    # that don't have an easy way to get to the Deepspeed config outside of the Trainer domain.
 | 
			
		||||
    global _hf_deepspeed_config_weak_ref
 | 
			
		||||
    # will go away automatically when HfDeepSpeedConfig is destroyed (when TrainingArguments is destroyed)
 | 
			
		||||
    _hf_deepspeed_config_weak_ref = weakref.ref(hf_deepspeed_config_obj)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def unset_hf_deepspeed_config():
 | 
			
		||||
    # useful for unit tests to ensure the global state doesn't leak - call from `tearDown` method
 | 
			
		||||
    global _hf_deepspeed_config_weak_ref
 | 
			
		||||
    _hf_deepspeed_config_weak_ref = None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_deepspeed_zero3_enabled():
 | 
			
		||||
    if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None:
 | 
			
		||||
        return _hf_deepspeed_config_weak_ref().is_zero3()
 | 
			
		||||
    else:
 | 
			
		||||
        return False
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def deepspeed_config():
 | 
			
		||||
    if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None:
 | 
			
		||||
        return _hf_deepspeed_config_weak_ref().config
 | 
			
		||||
    else:
 | 
			
		||||
        return None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps, model_parameters):
 | 
			
		||||
    """
 | 
			
		||||
    A convenience wrapper that deals with optimizer and lr scheduler configuration.
 | 
			
		||||
    """
 | 
			
		||||
    from accelerate.utils import DummyOptim, DummyScheduler
 | 
			
		||||
 | 
			
		||||
    config = hf_deepspeed_config.config
 | 
			
		||||
 | 
			
		||||
    # Optimizer + Scheduler
 | 
			
		||||
    # Currently supported combos:
 | 
			
		||||
    # 1. DS scheduler + DS optimizer: Yes
 | 
			
		||||
    # 2. HF scheduler + HF optimizer: Yes
 | 
			
		||||
    # 3. DS scheduler + HF optimizer: Yes
 | 
			
		||||
    # 4. HF scheduler + DS optimizer: No
 | 
			
		||||
    #
 | 
			
		||||
    # Unless Offload is enabled in which case it's:
 | 
			
		||||
    # 1. DS scheduler + DS optimizer: Yes
 | 
			
		||||
    # 2. HF scheduler + HF optimizer: Mostly*
 | 
			
		||||
    # 3. DS scheduler + HF optimizer: Mostly*
 | 
			
		||||
    # 4. HF scheduler + DS optimizer: No
 | 
			
		||||
    #
 | 
			
		||||
    # Mostly*: All non-native DeepSpeed optimizers that have both CPU and GPU implementation should work (except LAMB)
 | 
			
		||||
 | 
			
		||||
    optimizer = None
 | 
			
		||||
    if "optimizer" in config:
 | 
			
		||||
        if args.adafactor:
 | 
			
		||||
            raise ValueError(
 | 
			
		||||
                "--adafactor was passed, but also found `optimizer` configured in the DeepSpeed config. "
 | 
			
		||||
                "Only one optimizer can be configured."
 | 
			
		||||
            )
 | 
			
		||||
        optimizer = DummyOptim(params=model_parameters)
 | 
			
		||||
    else:
 | 
			
		||||
        if hf_deepspeed_config.is_offload():
 | 
			
		||||
            logger.info(
 | 
			
		||||
                "Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the"
 | 
			
		||||
                " custom optimizer has both CPU and GPU implementation (except LAMB)"
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
 | 
			
		||||
        # But trainer uses AdamW by default.
 | 
			
		||||
        optimizer = trainer.create_optimizer()
 | 
			
		||||
        # To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer`
 | 
			
		||||
        config["zero_allow_untested_optimizer"] = True
 | 
			
		||||
 | 
			
		||||
    lr_scheduler = None
 | 
			
		||||
    if "scheduler" in config:
 | 
			
		||||
        lr_scheduler = DummyScheduler(optimizer)
 | 
			
		||||
    else:
 | 
			
		||||
        if isinstance(optimizer, DummyOptim):
 | 
			
		||||
            raise ValueError(
 | 
			
		||||
                "Found `optimizer` configured in the DeepSpeed config, but no `scheduler`. "
 | 
			
		||||
                "Please configure a scheduler in the DeepSpeed config."
 | 
			
		||||
            )
 | 
			
		||||
        lr_scheduler = trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer)
 | 
			
		||||
 | 
			
		||||
    return optimizer, lr_scheduler
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def deepspeed_init(trainer, num_training_steps, inference=False):
 | 
			
		||||
    """
 | 
			
		||||
    Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.
 | 
			
		||||
 | 
			
		||||
    If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made.
 | 
			
		||||
 | 
			
		||||
    Args:
 | 
			
		||||
        trainer: Trainer object
 | 
			
		||||
        num_training_steps: per single gpu
 | 
			
		||||
        resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load
 | 
			
		||||
        inference: launch in inference mode (no optimizer and no lr scheduler)
 | 
			
		||||
 | 
			
		||||
    Returns: optimizer, lr_scheduler
 | 
			
		||||
 | 
			
		||||
    We may use `deepspeed_init` more than once during the life of Trainer, when we do - it's a temp hack based on:
 | 
			
		||||
    https://github.com/microsoft/DeepSpeed/issues/1394#issuecomment-937405374 until Deepspeed fixes a bug where it
 | 
			
		||||
    can't resume from a checkpoint after it did some stepping https://github.com/microsoft/DeepSpeed/issues/1612
 | 
			
		||||
 | 
			
		||||
    """
 | 
			
		||||
    from deepspeed.utils import logger as ds_logger
 | 
			
		||||
 | 
			
		||||
    model = trainer.model
 | 
			
		||||
    args = trainer.args
 | 
			
		||||
 | 
			
		||||
    hf_deepspeed_config = trainer.accelerator.state.deepspeed_plugin.hf_ds_config
 | 
			
		||||
 | 
			
		||||
    # resume config update - some bits like `model` and `num_training_steps` only become available during train
 | 
			
		||||
    hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps)
 | 
			
		||||
 | 
			
		||||
    # set the Deepspeed log level consistent with the Trainer
 | 
			
		||||
    ds_logger.setLevel(args.get_process_log_level())
 | 
			
		||||
 | 
			
		||||
    if inference:
 | 
			
		||||
        # only Z3 makes sense for the inference
 | 
			
		||||
        if not hf_deepspeed_config.is_zero3():
 | 
			
		||||
            raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config")
 | 
			
		||||
 | 
			
		||||
        # in case the training config is re-used for inference
 | 
			
		||||
        hf_deepspeed_config.del_config_sub_tree("optimizer")
 | 
			
		||||
        hf_deepspeed_config.del_config_sub_tree("lr_scheduler")
 | 
			
		||||
        optimizer, lr_scheduler = None, None
 | 
			
		||||
        model_parameters = None
 | 
			
		||||
    else:
 | 
			
		||||
        trainer.optimizer = None  # important for when deepspeed_init is used as re-init
 | 
			
		||||
        model_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
 | 
			
		||||
        optimizer, lr_scheduler = deepspeed_optim_sched(
 | 
			
		||||
            trainer, hf_deepspeed_config, args, num_training_steps, model_parameters
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    # keep for quick debug:
 | 
			
		||||
    # from pprint import pprint; pprint(config)
 | 
			
		||||
 | 
			
		||||
    return optimizer, lr_scheduler
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def deepspeed_load_checkpoint(deepspeed_engine, checkpoint_path):
 | 
			
		||||
    # it's possible that the user is trying to resume from model_path, which doesn't necessarily
 | 
			
		||||
    # contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's
 | 
			
		||||
    # a resume from a checkpoint and not just a local pretrained weight. So we check here if the
 | 
			
		||||
    # path contains what looks like a deepspeed checkpoint
 | 
			
		||||
    import glob
 | 
			
		||||
 | 
			
		||||
    deepspeed_checkpoint_dirs = sorted(glob.glob(f"{checkpoint_path}/global_step*"))
 | 
			
		||||
 | 
			
		||||
    if len(deepspeed_checkpoint_dirs) > 0:
 | 
			
		||||
        logger.info(f"Attempting to resume from {checkpoint_path}")
 | 
			
		||||
        # this magically updates self.optimizer and self.lr_scheduler
 | 
			
		||||
        load_path, _ = deepspeed_engine.load_checkpoint(
 | 
			
		||||
            checkpoint_path, load_optimizer_states=True, load_lr_scheduler_states=True
 | 
			
		||||
        )
 | 
			
		||||
        if load_path is None:
 | 
			
		||||
            raise ValueError(f"[deepspeed] failed to resume from checkpoint {checkpoint_path}")
 | 
			
		||||
    else:
 | 
			
		||||
        raise ValueError(f"Can't find a valid checkpoint at {checkpoint_path}")
 | 
			
		||||
import warnings
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
warnings.warn(
 | 
			
		||||
    "transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations",
 | 
			
		||||
    FutureWarning,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
# Backward compatibility imports, to make sure all those objects can be found in integrations/deepspeed
 | 
			
		||||
from .integrations.deepspeed import (  # noqa
 | 
			
		||||
    HfDeepSpeedConfig,
 | 
			
		||||
    HfTrainerDeepSpeedConfig,
 | 
			
		||||
    deepspeed_config,
 | 
			
		||||
    deepspeed_init,
 | 
			
		||||
    deepspeed_load_checkpoint,
 | 
			
		||||
    deepspeed_optim_sched,
 | 
			
		||||
    is_deepspeed_available,
 | 
			
		||||
    is_deepspeed_zero3_enabled,
 | 
			
		||||
    set_hf_deepspeed_config,
 | 
			
		||||
    unset_hf_deepspeed_config,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
@ -24,7 +24,7 @@ import torch
 | 
			
		||||
import torch.distributed as dist
 | 
			
		||||
from torch import nn
 | 
			
		||||
 | 
			
		||||
from ..deepspeed import is_deepspeed_zero3_enabled
 | 
			
		||||
from ..integrations.deepspeed import is_deepspeed_zero3_enabled
 | 
			
		||||
from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput
 | 
			
		||||
from ..models.auto import (
 | 
			
		||||
    MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										71
									
								
								src/transformers/integrations/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										71
									
								
								src/transformers/integrations/__init__.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,71 @@
 | 
			
		||||
# Copyright 2023 The HuggingFace Team. All rights reserved.
 | 
			
		||||
#
 | 
			
		||||
# Licensed under the Apache License, Version 2.0 (the "License");
 | 
			
		||||
# you may not use this file except in compliance with the License.
 | 
			
		||||
# You may obtain a copy of the License at
 | 
			
		||||
#
 | 
			
		||||
#     http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
#
 | 
			
		||||
# Unless required by applicable law or agreed to in writing, software
 | 
			
		||||
# distributed under the License is distributed on an "AS IS" BASIS,
 | 
			
		||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
			
		||||
# See the License for the specific language governing permissions and
 | 
			
		||||
# limitations under the License.
 | 
			
		||||
from .bitsandbytes import (
 | 
			
		||||
    get_keys_to_not_convert,
 | 
			
		||||
    replace_8bit_linear,
 | 
			
		||||
    replace_with_bnb_linear,
 | 
			
		||||
    set_module_8bit_tensor_to_device,
 | 
			
		||||
    set_module_quantized_tensor_to_device,
 | 
			
		||||
)
 | 
			
		||||
from .deepspeed import (
 | 
			
		||||
    HfDeepSpeedConfig,
 | 
			
		||||
    HfTrainerDeepSpeedConfig,
 | 
			
		||||
    deepspeed_config,
 | 
			
		||||
    deepspeed_init,
 | 
			
		||||
    deepspeed_load_checkpoint,
 | 
			
		||||
    deepspeed_optim_sched,
 | 
			
		||||
    is_deepspeed_available,
 | 
			
		||||
    is_deepspeed_zero3_enabled,
 | 
			
		||||
    set_hf_deepspeed_config,
 | 
			
		||||
    unset_hf_deepspeed_config,
 | 
			
		||||
)
 | 
			
		||||
from .integration_utils import (
 | 
			
		||||
    INTEGRATION_TO_CALLBACK,
 | 
			
		||||
    AzureMLCallback,
 | 
			
		||||
    ClearMLCallback,
 | 
			
		||||
    CodeCarbonCallback,
 | 
			
		||||
    CometCallback,
 | 
			
		||||
    DagsHubCallback,
 | 
			
		||||
    FlyteCallback,
 | 
			
		||||
    MLflowCallback,
 | 
			
		||||
    NeptuneCallback,
 | 
			
		||||
    NeptuneMissingConfiguration,
 | 
			
		||||
    TensorBoardCallback,
 | 
			
		||||
    WandbCallback,
 | 
			
		||||
    get_available_reporting_integrations,
 | 
			
		||||
    get_reporting_integration_callbacks,
 | 
			
		||||
    hp_params,
 | 
			
		||||
    is_azureml_available,
 | 
			
		||||
    is_clearml_available,
 | 
			
		||||
    is_codecarbon_available,
 | 
			
		||||
    is_comet_available,
 | 
			
		||||
    is_dagshub_available,
 | 
			
		||||
    is_fairscale_available,
 | 
			
		||||
    is_flyte_deck_standard_available,
 | 
			
		||||
    is_flytekit_available,
 | 
			
		||||
    is_mlflow_available,
 | 
			
		||||
    is_neptune_available,
 | 
			
		||||
    is_optuna_available,
 | 
			
		||||
    is_ray_available,
 | 
			
		||||
    is_ray_tune_available,
 | 
			
		||||
    is_sigopt_available,
 | 
			
		||||
    is_tensorboard_available,
 | 
			
		||||
    is_wandb_available,
 | 
			
		||||
    rewrite_logs,
 | 
			
		||||
    run_hp_search_optuna,
 | 
			
		||||
    run_hp_search_ray,
 | 
			
		||||
    run_hp_search_sigopt,
 | 
			
		||||
    run_hp_search_wandb,
 | 
			
		||||
)
 | 
			
		||||
from .peft import PeftAdapterMixin
 | 
			
		||||
							
								
								
									
										290
									
								
								src/transformers/integrations/bitsandbytes.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										290
									
								
								src/transformers/integrations/bitsandbytes.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,290 @@
 | 
			
		||||
import importlib.metadata
 | 
			
		||||
import warnings
 | 
			
		||||
from copy import deepcopy
 | 
			
		||||
 | 
			
		||||
from packaging import version
 | 
			
		||||
 | 
			
		||||
from ..utils import is_accelerate_available, is_bitsandbytes_available, logging
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if is_bitsandbytes_available():
 | 
			
		||||
    import bitsandbytes as bnb
 | 
			
		||||
    import torch
 | 
			
		||||
    import torch.nn as nn
 | 
			
		||||
 | 
			
		||||
    from ..pytorch_utils import Conv1D
 | 
			
		||||
 | 
			
		||||
if is_accelerate_available():
 | 
			
		||||
    from accelerate import init_empty_weights
 | 
			
		||||
    from accelerate.utils import find_tied_parameters
 | 
			
		||||
 | 
			
		||||
logger = logging.get_logger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def set_module_quantized_tensor_to_device(module, tensor_name, device, value=None, fp16_statistics=None):
 | 
			
		||||
    """
 | 
			
		||||
    A helper function to set a given tensor (parameter of buffer) of a module on a specific device (note that doing
 | 
			
		||||
    `param.to(device)` creates a new tensor not linked to the parameter, which is why we need this function). The
 | 
			
		||||
    function is adapted from `set_module_tensor_to_device` function from accelerate that is adapted to support the
 | 
			
		||||
    class `Int8Params` from `bitsandbytes`.
 | 
			
		||||
 | 
			
		||||
    Args:
 | 
			
		||||
        module (`torch.nn.Module`):
 | 
			
		||||
            The module in which the tensor we want to move lives.
 | 
			
		||||
        tensor_name (`str`):
 | 
			
		||||
            The full name of the parameter/buffer.
 | 
			
		||||
        device (`int`, `str` or `torch.device`):
 | 
			
		||||
            The device on which to set the tensor.
 | 
			
		||||
        value (`torch.Tensor`, *optional*):
 | 
			
		||||
            The value of the tensor (useful when going from the meta device to any other device).
 | 
			
		||||
        fp16_statistics (`torch.HalfTensor`, *optional*):
 | 
			
		||||
            The list of fp16 statistics to set on the module, used for serialization.
 | 
			
		||||
    """
 | 
			
		||||
    # Recurse if needed
 | 
			
		||||
    if "." in tensor_name:
 | 
			
		||||
        splits = tensor_name.split(".")
 | 
			
		||||
        for split in splits[:-1]:
 | 
			
		||||
            new_module = getattr(module, split)
 | 
			
		||||
            if new_module is None:
 | 
			
		||||
                raise ValueError(f"{module} has no attribute {split}.")
 | 
			
		||||
            module = new_module
 | 
			
		||||
        tensor_name = splits[-1]
 | 
			
		||||
 | 
			
		||||
    if tensor_name not in module._parameters and tensor_name not in module._buffers:
 | 
			
		||||
        raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.")
 | 
			
		||||
    is_buffer = tensor_name in module._buffers
 | 
			
		||||
    old_value = getattr(module, tensor_name)
 | 
			
		||||
 | 
			
		||||
    if old_value.device == torch.device("meta") and device not in ["meta", torch.device("meta")] and value is None:
 | 
			
		||||
        raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {device}.")
 | 
			
		||||
 | 
			
		||||
    is_4bit = False
 | 
			
		||||
    is_8bit = False
 | 
			
		||||
    if is_buffer or not is_bitsandbytes_available():
 | 
			
		||||
        is_8bit = False
 | 
			
		||||
        is_4bit = False
 | 
			
		||||
    else:
 | 
			
		||||
        is_4bit = hasattr(bnb.nn, "Params4bit") and isinstance(module._parameters[tensor_name], bnb.nn.Params4bit)
 | 
			
		||||
        is_8bit = isinstance(module._parameters[tensor_name], bnb.nn.Int8Params)
 | 
			
		||||
 | 
			
		||||
    if is_8bit or is_4bit:
 | 
			
		||||
        param = module._parameters[tensor_name]
 | 
			
		||||
        if param.device.type != "cuda":
 | 
			
		||||
            if value is None:
 | 
			
		||||
                new_value = old_value.to(device)
 | 
			
		||||
            elif isinstance(value, torch.Tensor):
 | 
			
		||||
                new_value = value.to("cpu")
 | 
			
		||||
                if value.dtype == torch.int8:
 | 
			
		||||
                    is_8bit_serializable = version.parse(importlib.metadata.version("bitsandbytes")) > version.parse(
 | 
			
		||||
                        "0.37.2"
 | 
			
		||||
                    )
 | 
			
		||||
                    if not is_8bit_serializable:
 | 
			
		||||
                        raise ValueError(
 | 
			
		||||
                            "Detected int8 weights but the version of bitsandbytes is not compatible with int8 serialization. "
 | 
			
		||||
                            "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`."
 | 
			
		||||
                        )
 | 
			
		||||
            else:
 | 
			
		||||
                new_value = torch.tensor(value, device="cpu")
 | 
			
		||||
 | 
			
		||||
            # Support models using `Conv1D` in place of `nn.Linear` (e.g. gpt2) by transposing the weight matrix prior to quantization.
 | 
			
		||||
            # Since weights are saved in the correct "orientation", we skip transposing when loading.
 | 
			
		||||
            if issubclass(module.source_cls, Conv1D) and fp16_statistics is None:
 | 
			
		||||
                new_value = new_value.T
 | 
			
		||||
 | 
			
		||||
            kwargs = old_value.__dict__
 | 
			
		||||
            if is_8bit:
 | 
			
		||||
                new_value = bnb.nn.Int8Params(new_value, requires_grad=False, **kwargs).to(device)
 | 
			
		||||
            elif is_4bit:
 | 
			
		||||
                new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(device)
 | 
			
		||||
 | 
			
		||||
            module._parameters[tensor_name] = new_value
 | 
			
		||||
            if fp16_statistics is not None:
 | 
			
		||||
                setattr(module.weight, "SCB", fp16_statistics.to(device))
 | 
			
		||||
 | 
			
		||||
    else:
 | 
			
		||||
        if value is None:
 | 
			
		||||
            new_value = old_value.to(device)
 | 
			
		||||
        elif isinstance(value, torch.Tensor):
 | 
			
		||||
            new_value = value.to(device)
 | 
			
		||||
        else:
 | 
			
		||||
            new_value = torch.tensor(value, device=device)
 | 
			
		||||
 | 
			
		||||
        if is_buffer:
 | 
			
		||||
            module._buffers[tensor_name] = new_value
 | 
			
		||||
        else:
 | 
			
		||||
            new_value = nn.Parameter(new_value, requires_grad=old_value.requires_grad)
 | 
			
		||||
            module._parameters[tensor_name] = new_value
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _replace_with_bnb_linear(
 | 
			
		||||
    model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, has_been_replaced=False
 | 
			
		||||
):
 | 
			
		||||
    """
 | 
			
		||||
    Private method that wraps the recursion for module replacement.
 | 
			
		||||
 | 
			
		||||
    Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
 | 
			
		||||
    """
 | 
			
		||||
    for name, module in model.named_children():
 | 
			
		||||
        if current_key_name is None:
 | 
			
		||||
            current_key_name = []
 | 
			
		||||
        current_key_name.append(name)
 | 
			
		||||
 | 
			
		||||
        if (isinstance(module, nn.Linear) or isinstance(module, Conv1D)) and name not in modules_to_not_convert:
 | 
			
		||||
            # Check if the current key is not in the `modules_to_not_convert`
 | 
			
		||||
            if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
 | 
			
		||||
                with init_empty_weights():
 | 
			
		||||
                    if isinstance(module, Conv1D):
 | 
			
		||||
                        in_features, out_features = module.weight.shape
 | 
			
		||||
                    else:
 | 
			
		||||
                        in_features = module.in_features
 | 
			
		||||
                        out_features = module.out_features
 | 
			
		||||
 | 
			
		||||
                    if quantization_config.quantization_method() == "llm_int8":
 | 
			
		||||
                        model._modules[name] = bnb.nn.Linear8bitLt(
 | 
			
		||||
                            in_features,
 | 
			
		||||
                            out_features,
 | 
			
		||||
                            module.bias is not None,
 | 
			
		||||
                            has_fp16_weights=quantization_config.llm_int8_has_fp16_weight,
 | 
			
		||||
                            threshold=quantization_config.llm_int8_threshold,
 | 
			
		||||
                        )
 | 
			
		||||
                        has_been_replaced = True
 | 
			
		||||
                    else:
 | 
			
		||||
                        if (
 | 
			
		||||
                            quantization_config.llm_int8_skip_modules is not None
 | 
			
		||||
                            and name in quantization_config.llm_int8_skip_modules
 | 
			
		||||
                        ):
 | 
			
		||||
                            pass
 | 
			
		||||
                        else:
 | 
			
		||||
                            model._modules[name] = bnb.nn.Linear4bit(
 | 
			
		||||
                                in_features,
 | 
			
		||||
                                out_features,
 | 
			
		||||
                                module.bias is not None,
 | 
			
		||||
                                quantization_config.bnb_4bit_compute_dtype,
 | 
			
		||||
                                compress_statistics=quantization_config.bnb_4bit_use_double_quant,
 | 
			
		||||
                                quant_type=quantization_config.bnb_4bit_quant_type,
 | 
			
		||||
                            )
 | 
			
		||||
                            has_been_replaced = True
 | 
			
		||||
                    # Store the module class in case we need to transpose the weight later
 | 
			
		||||
                    model._modules[name].source_cls = type(module)
 | 
			
		||||
                    # Force requires grad to False to avoid unexpected errors
 | 
			
		||||
                    model._modules[name].requires_grad_(False)
 | 
			
		||||
        if len(list(module.children())) > 0:
 | 
			
		||||
            _, has_been_replaced = _replace_with_bnb_linear(
 | 
			
		||||
                module,
 | 
			
		||||
                modules_to_not_convert,
 | 
			
		||||
                current_key_name,
 | 
			
		||||
                quantization_config,
 | 
			
		||||
                has_been_replaced=has_been_replaced,
 | 
			
		||||
            )
 | 
			
		||||
        # Remove the last key for recursion
 | 
			
		||||
        current_key_name.pop(-1)
 | 
			
		||||
    return model, has_been_replaced
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def replace_with_bnb_linear(model, modules_to_not_convert=None, current_key_name=None, quantization_config=None):
 | 
			
		||||
    """
 | 
			
		||||
    A helper function to replace all `torch.nn.Linear` modules by `bnb.nn.Linear8bit` modules from the `bitsandbytes`
 | 
			
		||||
    library. This will enable running your models using mixed int8 precision as described by the paper `LLM.int8():
 | 
			
		||||
    8-bit Matrix Multiplication for Transformers at Scale`. Make sure `bitsandbytes` compiled with the correct CUDA
 | 
			
		||||
    version of your hardware is installed before running this function. `pip install -i https://test.pypi.org/simple/
 | 
			
		||||
    bitsandbytes`
 | 
			
		||||
 | 
			
		||||
    The function will be run recursively and replace all `torch.nn.Linear` modules except for the `lm_head` that should
 | 
			
		||||
    be kept as a `torch.nn.Linear` module. The replacement is done under `init_empty_weights` context manager so no
 | 
			
		||||
    CPU/GPU memory is required to run this function. Int8 mixed-precision matrix decomposition works by separating a
 | 
			
		||||
    matrix multiplication into two streams: (1) and systematic feature outlier stream matrix multiplied in fp16
 | 
			
		||||
    (0.01%), (2) a regular stream of int8 matrix multiplication (99.9%). With this method, int8 inference with no
 | 
			
		||||
    predictive degradation is possible for very large models (>=176B parameters).
 | 
			
		||||
 | 
			
		||||
    Parameters:
 | 
			
		||||
        model (`torch.nn.Module`):
 | 
			
		||||
            Input model or `torch.nn.Module` as the function is run recursively.
 | 
			
		||||
        modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
 | 
			
		||||
            Names of the modules to not convert in `Linear8bitLt`. In practice we keep the `lm_head` in full precision
 | 
			
		||||
            for numerical stability reasons.
 | 
			
		||||
        current_key_name (`List[`str`]`, *optional*):
 | 
			
		||||
            An array to track the current key of the recursion. This is used to check whether the current key (part of
 | 
			
		||||
            it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
 | 
			
		||||
            `disk`).
 | 
			
		||||
    """
 | 
			
		||||
    modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert
 | 
			
    model, has_been_replaced = _replace_with_bnb_linear(
        model, modules_to_not_convert, current_key_name, quantization_config
    )

    if not has_been_replaced:
        logger.warning(
            "You are loading your model in 8bit or 4bit but no linear modules were found in your model."
            " Please double check your model architecture, or submit an issue on github if you think this is"
            " a bug."
        )

    return model


# For backward compatibility
def replace_8bit_linear(*args, **kwargs):
    warnings.warn(
        "`replace_8bit_linear` will be deprecated in a future version, please use `replace_with_bnb_linear` instead",
        FutureWarning,
    )
    return replace_with_bnb_linear(*args, **kwargs)


# For backward compatiblity
def set_module_8bit_tensor_to_device(*args, **kwargs):
    warnings.warn(
        "`set_module_8bit_tensor_to_device` will be deprecated in a future version, please use `set_module_quantized_tensor_to_device` instead",
        FutureWarning,
    )
    return set_module_quantized_tensor_to_device(*args, **kwargs)


def get_keys_to_not_convert(model):
    r"""
    An utility function to get the key of the module to keep in full precision if any For example for CausalLM modules
    we may want to keep the lm_head in full precision for numerical stability reasons. For other architectures, we want
    to keep the tied weights of the model. The function will return a list of the keys of the modules to not convert in
    int8.

    Parameters:
    model (`torch.nn.Module`):
        Input model
    """
    # Create a copy of the model and tie the weights, then
    # check if it contains tied weights
    tied_model = deepcopy(model)  # this has 0 cost since it is done inside `init_empty_weights` context manager`
    tied_model.tie_weights()

    tied_params = find_tied_parameters(tied_model)
    # For compatibility with Accelerate < 0.18
    if isinstance(tied_params, dict):
        tied_keys = sum(list(tied_params.values()), []) + list(tied_params.keys())
    else:
        tied_keys = sum(tied_params, [])
    has_tied_params = len(tied_keys) > 0

    # If there is not tied weights, we want to keep the lm_head(output_embedding) in full precision
    if not has_tied_params:
        output_emb = model.get_output_embeddings()
        if output_emb is not None:
            list_last_module = [name for name, module in model.named_modules() if id(module) == id(output_emb)]
            return list_last_module

    # otherwise, no tied weights, no output embedding defined, simply keep the last module in full precision
    list_modules = list(model.named_parameters())
    list_last_module = [list_modules[-1][0]]
    # add last module together with tied weights
    intersection = set(list_last_module) - set(tied_keys)
    list_untouched = list(set(tied_keys)) + list(intersection)

    # remove ".weight" from the keys
    names_to_remove = [".weight", ".bias"]
    filtered_module_names = []
    for name in list_untouched:
        for name_to_remove in names_to_remove:
            if name_to_remove in name:
                name = name.replace(name_to_remove, "")
        filtered_module_names.append(name)

    return filtered_module_names
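To make the intent of the two public helpers above concrete, here is a minimal sketch of how they could be combined when quantizing a model by hand. It is not part of the diff: it assumes `bitsandbytes` is installed, uses the new `transformers.integrations` import path this branch introduces, and the tiny checkpoint name is illustrative only.

# Minimal usage sketch (illustrative; requires `bitsandbytes` to be importable).
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from transformers.integrations import get_keys_to_not_convert, replace_with_bnb_linear

model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")

# Modules to keep in full precision, e.g. the lm_head / tied embeddings.
keep_in_full_precision = get_keys_to_not_convert(model)

# Recursively swap the remaining nn.Linear / Conv1D modules for bitsandbytes 8-bit layers.
model = replace_with_bnb_linear(
    model,
    modules_to_not_convert=keep_in_full_precision,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)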
src/transformers/integrations/deepspeed.py (389 additions, new file)
@@ -0,0 +1,389 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Integration with Deepspeed
"""

import importlib.util
import weakref
from functools import partialmethod

from ..dependency_versions_check import dep_version_check
from ..utils import is_accelerate_available, is_torch_available, logging


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


def is_deepspeed_available():
    return importlib.util.find_spec("deepspeed") is not None


if is_accelerate_available() and is_deepspeed_available():
    from accelerate.utils.deepspeed import HfDeepSpeedConfig as DeepSpeedConfig
else:
    # Inherits from a dummy `object` if accelerate is not available, so that python succeeds to import this file.
    # Deepspeed glue code will never inherit this dummy object as it checks if accelerate is available.
    from builtins import object as DeepSpeedConfig


class HfDeepSpeedConfig(DeepSpeedConfig):
    """
    This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.

    A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
    things like the Trainer object is not available (e.g. `from_pretrained` and `_get_resized_embeddings`). Therefore
    it's important that this object remains alive while the program is still running.

    [`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to sync the configuration
    with values of [`TrainingArguments`] by replacing special placeholder values: `"auto"`. Without this special logic
    the DeepSpeed configuration is not modified in any way.

    Args:
        config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.

    """

    def __init__(self, config_file_or_dict):
        # set global weakref object
        set_hf_deepspeed_config(self)
        dep_version_check("accelerate")
        dep_version_check("deepspeed")
        super().__init__(config_file_or_dict)


class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
    """
    The `HfTrainerDeepSpeedConfig` object is meant to be created during `TrainingArguments` object creation and has the
    same lifespan as the latter.
    """

    def __init__(self, config_file_or_dict):
        super().__init__(config_file_or_dict)
        self._dtype = None
        self.mismatches = []

    def dtype(self):
        if self._dtype is None:
            raise ValueError("trainer_config_process() wasn't called yet to tell dtype")
        return self._dtype

    def is_auto(self, ds_key_long):
        val = self.get_value(ds_key_long)
        if val is None:
            return False
        else:
            return val == "auto"

    def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True):
        """
        A utility method that massages the config file and can optionally verify that the values match.

        1. Replace "auto" values with `TrainingArguments` value.

        2. If it wasn't "auto" and `must_match` is true, then check that DS config matches Trainer
        config values and if mismatched add the entry to `self.mismatched` - will assert during
        `trainer_config_finalize` for one or more mismatches.

        """
        config, ds_key = self.find_config_node(ds_key_long)
        if config is None:
            return

        if config.get(ds_key) == "auto":
            config[ds_key] = hf_val
            return

        if not must_match:
            return

        ds_val = config.get(ds_key)
        if ds_val is not None and ds_val != hf_val:
            self.mismatches.append(f"- ds {ds_key_long}={ds_val} vs hf {hf_key}={hf_val}")

    fill_only = partialmethod(fill_match, must_match=False)

    def trainer_config_process(self, args):
        """
        Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
        creation.
        """
        # DeepSpeed does:
        # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps
        train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps
        self.fill_match(
            "train_micro_batch_size_per_gpu", args.per_device_train_batch_size, "per_device_train_batch_size"
        )
        self.fill_match("gradient_accumulation_steps", args.gradient_accumulation_steps, "gradient_accumulation_steps")
        self.fill_match("train_batch_size", train_batch_size, "train_batch_size (calculated)")
        self.fill_match("gradient_clipping", args.max_grad_norm, "max_grad_norm")

        self.fill_match("optimizer.params.lr", args.learning_rate, "learning_rate")
        self.fill_match("optimizer.params.betas", [args.adam_beta1, args.adam_beta2], "adam_beta1+adam_beta2")
        self.fill_match("optimizer.params.eps", args.adam_epsilon, "adam_epsilon")
        self.fill_match("optimizer.params.weight_decay", args.weight_decay, "weight_decay")

        self.fill_only("scheduler.params.warmup_min_lr", 0)  # not a trainer arg
        self.fill_match("scheduler.params.warmup_max_lr", args.learning_rate, "learning_rate")
        # total_num_steps - will get set in trainer_config_finalize

        # fp16
        if args.fp16 or args.fp16_full_eval:
            fp16_backend = "apex" if args.fp16_backend == "apex" else "amp"
        else:
            fp16_backend = None

        if args.save_on_each_node:
            # deepspeed uses shared storage by default. Let's override this setting if save_on_each_node == True
            self.config["checkpoint"] = self.config.get("checkpoint", {})
            self.config["checkpoint"]["use_node_local_storage"] = args.save_on_each_node

        # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set
        # any here unless the user did the work
        self.fill_match(
            "fp16.enabled",
            ((args.fp16 or args.fp16_full_eval) and fp16_backend == "amp"),
            "fp16|fp16_full_eval+fp16_backend(amp)",
        )

        # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any
        # ZeRO features
        self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)")
        self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level")

        self.fill_match("bf16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval")

        # deepspeed's default mode is fp16 unless there is a config that says differently
        if self.is_true("bf16.enabled"):
            self._dtype = torch.bfloat16
        elif self.is_false("fp16.enabled"):
            self._dtype = torch.float32
        else:
            self._dtype = torch.float16

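    # ------------------------------------------------------------------
    # Editor's illustration only (not part of this file): a minimal sketch of
    # what `fill_match` does to a config dict. `"auto"` placeholders are
    # replaced with the `TrainingArguments` value, while explicit values are
    # only checked and, when they differ, recorded in `self.mismatches`.
    #
    #   ds_config = {"train_micro_batch_size_per_gpu": "auto",
    #                "gradient_accumulation_steps": 2}
    #   hf_ds_config = HfTrainerDeepSpeedConfig(ds_config)
    #   # assuming args.per_device_train_batch_size == 8 and
    #   # args.gradient_accumulation_steps == 4:
    #   hf_ds_config.trainer_config_process(args)
    #   # -> "train_micro_batch_size_per_gpu" becomes 8 ("auto" filled in)
    #   # -> "gradient_accumulation_steps" stays 2, a mismatch is recorded,
    #   #    and trainer_config_finalize() below will raise on it.
    # ------------------------------------------------------------------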
    def trainer_config_finalize(self, args, model, num_training_steps):
        """
        This stage is run after we have the model and know num_training_steps.

        Now we can complete the configuration process.
        """
        # zero

        # deal with config keys that use `auto` value and rely on model's hidden_size
        hidden_size_based_keys = [
            "zero_optimization.reduce_bucket_size",
            "zero_optimization.stage3_prefetch_bucket_size",
            "zero_optimization.stage3_param_persistence_threshold",
        ]
        hidden_size_auto_keys = [x for x in hidden_size_based_keys if self.is_auto(x)]

        if len(hidden_size_auto_keys) > 0:
            if hasattr(model.config, "hidden_size"):
                hidden_size = model.config.hidden_size
            elif hasattr(model.config, "hidden_sizes"):
                # if there are many hidden sizes pick the largest one
                hidden_size = max(model.config.hidden_sizes)
            else:
                raise ValueError(
                    "The model's config file has neither `hidden_size` nor `hidden_sizes` entry, "
                    "therefore it's not possible to automatically fill out the following `auto` entries "
                    f"in the DeepSpeed config file: {hidden_size_auto_keys}. You can fix that by replacing "
                    "`auto` values for these keys with an integer value of your choice."
                )

            self.fill_only("zero_optimization.reduce_bucket_size", hidden_size * hidden_size)
            if self.is_zero3():
                # automatically assign the optimal config values based on model config
                self.fill_only("zero_optimization.stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size)
                self.fill_only("zero_optimization.stage3_param_persistence_threshold", 10 * hidden_size)

        # scheduler
        self.fill_match("scheduler.params.total_num_steps", num_training_steps, "num_training_steps (calculated)")
        self.fill_match("scheduler.params.warmup_num_steps", args.get_warmup_steps(num_training_steps), "warmup_steps")

        if len(self.mismatches) > 0:
            mismatches = "\n".join(self.mismatches)
            raise ValueError(
                "Please correct the following DeepSpeed config values that mismatch TrainingArguments"
                f" values:\n{mismatches}\nThe easiest method is to set these DeepSpeed config values to 'auto'."
            )


# keep the config object global to be able to access it anywhere during TrainingArguments life-cycle
_hf_deepspeed_config_weak_ref = None


def set_hf_deepspeed_config(hf_deepspeed_config_obj):
    # this is a special weakref global object to allow us to get to Deepspeed config from APIs
    # that don't have an easy way to get to the Deepspeed config outside of the Trainer domain.
    global _hf_deepspeed_config_weak_ref
    # will go away automatically when HfDeepSpeedConfig is destroyed (when TrainingArguments is destroyed)
    _hf_deepspeed_config_weak_ref = weakref.ref(hf_deepspeed_config_obj)


def unset_hf_deepspeed_config():
    # useful for unit tests to ensure the global state doesn't leak - call from `tearDown` method
    global _hf_deepspeed_config_weak_ref
    _hf_deepspeed_config_weak_ref = None


def is_deepspeed_zero3_enabled():
    if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None:
        return _hf_deepspeed_config_weak_ref().is_zero3()
    else:
        return False


def deepspeed_config():
    if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None:
        return _hf_deepspeed_config_weak_ref().config
    else:
        return None

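# Editor's illustration only (not part of this file): because the config is published via a
# module-level weakref, code far away from the Trainer can query the ZeRO stage, e.g.
#
#   from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
#
#   if is_deepspeed_zero3_enabled():
#       # e.g. `from_pretrained` uses this to let DeepSpeed partition weights with
#       # `deepspeed.zero.Init` instead of materializing them on a single rank.
#       ...
#
# Using a weakref (rather than a hard reference) keeps this module from prolonging the
# life of the TrainingArguments object that owns the config.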
def deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps, model_parameters):
    """
    A convenience wrapper that deals with optimizer and lr scheduler configuration.
    """
    from accelerate.utils import DummyOptim, DummyScheduler

    config = hf_deepspeed_config.config

    # Optimizer + Scheduler
    # Currently supported combos:
    # 1. DS scheduler + DS optimizer: Yes
    # 2. HF scheduler + HF optimizer: Yes
    # 3. DS scheduler + HF optimizer: Yes
    # 4. HF scheduler + DS optimizer: No
    #
    # Unless Offload is enabled in which case it's:
    # 1. DS scheduler + DS optimizer: Yes
    # 2. HF scheduler + HF optimizer: Mostly*
    # 3. DS scheduler + HF optimizer: Mostly*
    # 4. HF scheduler + DS optimizer: No
    #
    # Mostly*: All non-native DeepSpeed optimizers that have both CPU and GPU implementation should work (except LAMB)

    optimizer = None
    if "optimizer" in config:
        if args.adafactor:
            raise ValueError(
                "--adafactor was passed, but also found `optimizer` configured in the DeepSpeed config. "
                "Only one optimizer can be configured."
            )
        optimizer = DummyOptim(params=model_parameters)
    else:
        if hf_deepspeed_config.is_offload():
            logger.info(
                "Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the"
                " custom optimizer has both CPU and GPU implementation (except LAMB)"
            )

        # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
        # But trainer uses AdamW by default.
        optimizer = trainer.create_optimizer()
        # To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer`
        config["zero_allow_untested_optimizer"] = True

    lr_scheduler = None
    if "scheduler" in config:
        lr_scheduler = DummyScheduler(optimizer)
    else:
        if isinstance(optimizer, DummyOptim):
            raise ValueError(
                "Found `optimizer` configured in the DeepSpeed config, but no `scheduler`. "
                "Please configure a scheduler in the DeepSpeed config."
            )
        lr_scheduler = trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer)

    return optimizer, lr_scheduler


def deepspeed_init(trainer, num_training_steps, inference=False):
    """
    Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.

    If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made.

    Args:
        trainer: Trainer object
        num_training_steps: per single gpu
        resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load
        inference: launch in inference mode (no optimizer and no lr scheduler)

    Returns: optimizer, lr_scheduler

    We may use `deepspeed_init` more than once during the life of Trainer, when we do - it's a temp hack based on:
    https://github.com/microsoft/DeepSpeed/issues/1394#issuecomment-937405374 until Deepspeed fixes a bug where it
    can't resume from a checkpoint after it did some stepping https://github.com/microsoft/DeepSpeed/issues/1612

    """
    from deepspeed.utils import logger as ds_logger

    model = trainer.model
    args = trainer.args

    hf_deepspeed_config = trainer.accelerator.state.deepspeed_plugin.hf_ds_config

    # resume config update - some bits like `model` and `num_training_steps` only become available during train
    hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps)

    # set the Deepspeed log level consistent with the Trainer
    ds_logger.setLevel(args.get_process_log_level())

    if inference:
        # only Z3 makes sense for the inference
        if not hf_deepspeed_config.is_zero3():
            raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config")

        # in case the training config is re-used for inference
        hf_deepspeed_config.del_config_sub_tree("optimizer")
        hf_deepspeed_config.del_config_sub_tree("lr_scheduler")
        optimizer, lr_scheduler = None, None
        model_parameters = None
    else:
        trainer.optimizer = None  # important for when deepspeed_init is used as re-init
        model_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
        optimizer, lr_scheduler = deepspeed_optim_sched(
            trainer, hf_deepspeed_config, args, num_training_steps, model_parameters
        )

    # keep for quick debug:
    # from pprint import pprint; pprint(config)

    return optimizer, lr_scheduler


def deepspeed_load_checkpoint(deepspeed_engine, checkpoint_path):
    # it's possible that the user is trying to resume from model_path, which doesn't necessarily
    # contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's
    # a resume from a checkpoint and not just a local pretrained weight. So we check here if the
    # path contains what looks like a deepspeed checkpoint
    import glob

    deepspeed_checkpoint_dirs = sorted(glob.glob(f"{checkpoint_path}/global_step*"))

    if len(deepspeed_checkpoint_dirs) > 0:
        logger.info(f"Attempting to resume from {checkpoint_path}")
        # this magically updates self.optimizer and self.lr_scheduler
        load_path, _ = deepspeed_engine.load_checkpoint(
            checkpoint_path, load_optimizer_states=True, load_lr_scheduler_states=True
        )
        if load_path is None:
            raise ValueError(f"[deepspeed] failed to resume from checkpoint {checkpoint_path}")
    else:
        raise ValueError(f"Can't find a valid checkpoint at {checkpoint_path}")
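The `HfDeepSpeedConfig` docstring above hinges on one non-obvious rule: the object must be created before the model and kept referenced, otherwise the module-level weakref goes stale and ZeRO-3 handling is skipped during `from_pretrained`. A minimal sketch of that pattern outside the Trainer follows; it is not part of the diff, the config dict and model name are placeholders, and `deepspeed` plus `accelerate` must be installed.

# Illustrative sketch of the keep-alive pattern for ZeRO-3 outside the Trainer.
from transformers import AutoModelForCausalLM
from transformers.integrations.deepspeed import HfDeepSpeedConfig

ds_config = {"zero_optimization": {"stage": 3}, "train_micro_batch_size_per_gpu": 1}

# Must be instantiated *before* from_pretrained and kept alive for as long as the
# model is used, so the module-level weakref above keeps resolving.
dschf = HfDeepSpeedConfig(ds_config)

model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")
# from_pretrained now sees is_deepspeed_zero3_enabled() == True and defers weight
# materialization to DeepSpeed ZeRO-3 instead of loading everything on one rank.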
@@ -30,8 +30,8 @@ from typing import TYPE_CHECKING, Dict, Optional

import numpy as np

from . import __version__ as version
from .utils import flatten_dict, is_datasets_available, is_pandas_available, is_torch_available, logging
from .. import __version__ as version
from ..utils import flatten_dict, is_datasets_available, is_pandas_available, is_torch_available, logging


logger = logging.get_logger(__name__)

@@ -68,10 +68,10 @@ if TYPE_CHECKING and _has_neptune:
        except importlib.metadata.PackageNotFoundError:
            _has_neptune = False

from .trainer_callback import ProgressCallback, TrainerCallback  # noqa: E402
from .trainer_utils import PREFIX_CHECKPOINT_DIR, BestRun, IntervalStrategy  # noqa: E402
from .training_args import ParallelMode  # noqa: E402
from .utils import ENV_VARS_TRUE_VALUES, is_torch_tpu_available  # noqa: E402
from ..trainer_callback import ProgressCallback, TrainerCallback  # noqa: E402
from ..trainer_utils import PREFIX_CHECKPOINT_DIR, BestRun, IntervalStrategy  # noqa: E402
from ..training_args import ParallelMode  # noqa: E402
from ..utils import ENV_VARS_TRUE_VALUES, is_torch_tpu_available  # noqa: E402


# Integration functions:

@@ -14,7 +14,7 @@
import inspect
from typing import Optional

from ...utils import (
from ..utils import (
    check_peft_version,
    find_adapter_config_file,
    is_accelerate_available,

@@ -1,14 +0,0 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .peft import PeftAdapterMixin

@@ -1,15 +0,0 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .peft_mixin import PeftAdapterMixin

@@ -35,10 +35,9 @@ from torch.nn import CrossEntropyLoss

from .activations import get_activation
from .configuration_utils import PretrainedConfig
from .deepspeed import deepspeed_config, is_deepspeed_zero3_enabled
from .dynamic_module_utils import custom_object_save
from .generation import GenerationConfig, GenerationMixin
from .lib_integrations import PeftAdapterMixin
from .integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled
from .pytorch_utils import (  # noqa: F401
    Conv1D,
    apply_chunking_to_forward,

@@ -661,7 +660,7 @@ def _load_state_dict_into_meta_model(
    #   they won't get loaded.

    if is_quantized:
        from .utils.bitsandbytes import set_module_quantized_tensor_to_device
        from .integrations import set_module_quantized_tensor_to_device

    error_msgs = []

@@ -2944,7 +2943,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
            keep_in_fp32_modules = []

        if load_in_8bit or load_in_4bit:
            from .utils.bitsandbytes import get_keys_to_not_convert, replace_with_bnb_linear
            from .integrations import get_keys_to_not_convert, replace_with_bnb_linear

            llm_int8_skip_modules = quantization_config.llm_int8_skip_modules
            load_in_8bit_fp32_cpu_offload = quantization_config.llm_int8_enable_fp32_cpu_offload

@@ -3262,7 +3261,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
    ):
        is_safetensors = False
        if is_quantized:
            from .utils.bitsandbytes import set_module_quantized_tensor_to_device
            from .integrations import set_module_quantized_tensor_to_device

        if device_map is not None and "disk" in device_map.values():
            archive_file = (

@@ -25,7 +25,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
    BaseModelOutput,
    CausalLMOutput,

@@ -23,8 +23,8 @@ import torch.utils.checkpoint
from torch import nn

from ....activations import ACT2FN
from ....deepspeed import is_deepspeed_zero3_enabled
from ....file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ....integrations.deepspeed import is_deepspeed_zero3_enabled
from ....modeling_outputs import BaseModelOutput, CausalLMOutput
from ....modeling_utils import (
    PreTrainedModel,

@@ -29,7 +29,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import get_activation
from ...configuration_utils import PretrainedConfig
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,

@@ -23,7 +23,7 @@ import torch
import torch.nn as nn
from torch.nn import LayerNorm

from ...deepspeed import is_deepspeed_available
from ...integrations.deepspeed import is_deepspeed_available
from ...modeling_outputs import ModelOutput
from ...utils import (
    ContextManagers,

@@ -35,7 +35,7 @@ from torch import Tensor, nn
from torch.nn import CrossEntropyLoss, LayerNorm

from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,

@@ -24,7 +24,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (

@@ -23,7 +23,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,

@@ -24,7 +24,7 @@ from torch.nn import CrossEntropyLoss
from torch.utils.checkpoint import checkpoint

from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
    MoEModelOutput,
    MoEModelOutputWithPastAndCrossAttentions,

@@ -25,7 +25,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging

@@ -26,7 +26,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss, LayerNorm

from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import softmax_backward_data

@@ -25,7 +25,7 @@ from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, L1Loss

from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,

@@ -26,7 +26,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, Wav2Vec2BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (

@@ -26,7 +26,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
    BaseModelOutput,
    CausalLMOutput,

@@ -26,7 +26,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
    BaseModelOutput,
    CausalLMOutput,

@@ -25,7 +25,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
    BaseModelOutput,
    CausalLMOutput,

@@ -26,7 +26,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
    BaseModelOutput,
    CausalLMOutput,

@@ -40,7 +40,6 @@ import requests

from transformers import logging as transformers_logging

from .deepspeed import is_deepspeed_available
from .integrations import (
    is_clearml_available,
    is_fairscale_available,

@@ -49,6 +48,7 @@ from .integrations import (
    is_sigopt_available,
    is_wandb_available,
)
from .integrations.deepspeed import is_deepspeed_available
from .utils import (
    is_accelerate_available,
    is_apex_available,

@@ -58,9 +58,9 @@ from . import __version__
from .configuration_utils import PretrainedConfig
from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
from .debug_utils import DebugOption, DebugUnderflowOverflow
from .deepspeed import deepspeed_init, deepspeed_load_checkpoint
from .dependency_versions_check import dep_version_check
from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend
from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint
from .modelcard import TrainingSummary
from .modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model
from .models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_MAPPING_NAMES

@@ -1197,7 +1197,7 @@ class Trainer:
            # Rebuild the deepspeed config to reflect the updated training parameters
            from accelerate.utils import DeepSpeedPlugin

            from transformers.deepspeed import HfTrainerDeepSpeedConfig
            from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig

            self.args.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.args.deepspeed)
            self.args.hf_deepspeed_config.trainer_config_process(self.args)

@@ -3899,7 +3899,7 @@ class Trainer:

        if self.is_deepspeed_enabled:
            if getattr(self.args, "hf_deepspeed_config", None) is None:
                from transformers.deepspeed import HfTrainerDeepSpeedConfig
                from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig

                ds_plugin = self.accelerator.state.deepspeed_plugin

@@ -35,7 +35,7 @@ from torch import nn
from torch.utils.data import Dataset, IterableDataset, RandomSampler, Sampler
from torch.utils.data.distributed import DistributedSampler

from .deepspeed import is_deepspeed_zero3_enabled
from .integrations.deepspeed import is_deepspeed_zero3_enabled
from .tokenization_utils_base import BatchEncoding
from .utils import is_sagemaker_mp_enabled, is_torch_tpu_available, is_training_run_on_sagemaker, logging

@@ -20,8 +20,8 @@ import torch
from torch import nn
from torch.utils.data import Dataset

from .deepspeed import is_deepspeed_zero3_enabled
from .generation.configuration_utils import GenerationConfig
from .integrations.deepspeed import is_deepspeed_zero3_enabled
from .trainer import Trainer
from .utils import logging

@@ -1647,7 +1647,7 @@ class TrainingArguments:
            # - must be run before the model is created.
            if not is_accelerate_available():
                raise ValueError("--deepspeed requires Accelerate to be installed: `pip install accelerate`.")
            from transformers.deepspeed import HfTrainerDeepSpeedConfig
            from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig

            # will be used later by the Trainer
            # note: leave self.deepspeed unmodified in case a user relies on it not to be modified)
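Taken together, the hunks above move the DeepSpeed, PEFT and (below) bitsandbytes glue under `transformers.integrations`. For downstream code the visible effect is an import-path change; a hedged before/after sketch, assuming the deprecated modules keep re-exporting the old names through shims like the `FutureWarning` stub shown in the next diff:

# Old import paths (deprecated by this branch; expected to warn or eventually disappear):
from transformers.deepspeed import is_deepspeed_zero3_enabled
from transformers.utils.bitsandbytes import replace_with_bnb_linear

# New canonical locations after the move:
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
from transformers.integrations import PeftAdapterMixin, replace_with_bnb_linear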
@ -1,291 +1,28 @@
 | 
			
		||||
import importlib.metadata
 | 
			
		||||
# Copyright 2023 The HuggingFace Team. All rights reserved.
 | 
			
		||||
#
 | 
			
		||||
# Licensed under the Apache License, Version 2.0 (the "License");
 | 
			
		||||
# you may not use this file except in compliance with the License.
 | 
			
		||||
# You may obtain a copy of the License at
 | 
			
		||||
#
 | 
			
		||||
#     http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
#
 | 
			
		||||
# Unless required by applicable law or agreed to in writing, software
 | 
			
		||||
# distributed under the License is distributed on an "AS IS" BASIS,
 | 
			
		||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
			
		||||
# See the License for the specific language governing permissions and
 | 
			
		||||
# limitations under the License.
 | 
			
		||||
import warnings
 | 
			
		||||
from copy import deepcopy
 | 
			
		||||
 | 
			
		||||
from packaging import version
 | 
			
		||||
 | 
			
		||||
from ..utils import logging
 | 
			
		||||
from .import_utils import is_accelerate_available, is_bitsandbytes_available
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if is_bitsandbytes_available():
 | 
			
		||||
    import bitsandbytes as bnb
 | 
			
		||||
    import torch
 | 
			
		||||
    import torch.nn as nn
 | 
			
		||||
warnings.warn(
 | 
			
		||||
    "transformers.utils.bitsandbytes module is deprecated and will be removed in a future version. Please import bitsandbytes modules directly from transformers.integrations",
 | 
			
		||||
    FutureWarning,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
    from ..pytorch_utils import Conv1D
 | 
			
		||||
 | 
			
		||||
if is_accelerate_available():
 | 
			
		||||
    from accelerate import init_empty_weights
 | 
			
		||||
    from accelerate.utils import find_tied_parameters
 | 
			
		||||
 | 
			
		||||
logger = logging.get_logger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def set_module_quantized_tensor_to_device(module, tensor_name, device, value=None, fp16_statistics=None):
 | 
			
		||||
    """
 | 
			
		||||
    A helper function to set a given tensor (parameter of buffer) of a module on a specific device (note that doing
 | 
			
		||||
    `param.to(device)` creates a new tensor not linked to the parameter, which is why we need this function). The
 | 
			
		||||
    function is adapted from `set_module_tensor_to_device` function from accelerate that is adapted to support the
 | 
			
		||||
    class `Int8Params` from `bitsandbytes`.
 | 
			
		||||
 | 
			
		||||
    Args:
 | 
			
		||||
        module (`torch.nn.Module`):
 | 
			
		||||
            The module in which the tensor we want to move lives.
 | 
			
		||||
        tensor_name (`str`):
 | 
			
		||||
            The full name of the parameter/buffer.
 | 
			
		||||
        device (`int`, `str` or `torch.device`):
 | 
			
		||||
            The device on which to set the tensor.
 | 
			
		||||
        value (`torch.Tensor`, *optional*):
 | 
			
		||||
            The value of the tensor (useful when going from the meta device to any other device).
 | 
			
		||||
        fp16_statistics (`torch.HalfTensor`, *optional*):
 | 
			
		||||
            The list of fp16 statistics to set on the module, used for serialization.
 | 
			
		||||
    """
 | 
			
		||||
    # Recurse if needed
 | 
			
		||||
    if "." in tensor_name:
 | 
			
		||||
        splits = tensor_name.split(".")
 | 
			
		||||
        for split in splits[:-1]:
 | 
			
		||||
            new_module = getattr(module, split)
 | 
			
		||||
            if new_module is None:
 | 
			
		||||
                raise ValueError(f"{module} has no attribute {split}.")
 | 
			
		||||
            module = new_module
 | 
			
		||||
        tensor_name = splits[-1]
 | 
			
		||||
 | 
			
		||||
    if tensor_name not in module._parameters and tensor_name not in module._buffers:
 | 
			
		||||
        raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.")
 | 
			
		||||
    is_buffer = tensor_name in module._buffers
 | 
			
		||||
    old_value = getattr(module, tensor_name)
 | 
			
		||||
 | 
			
		||||
    if old_value.device == torch.device("meta") and device not in ["meta", torch.device("meta")] and value is None:
 | 
			
		||||
        raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {device}.")
 | 
			
		||||
 | 
			
		||||
    is_4bit = False
 | 
			
		||||
    is_8bit = False
 | 
			
		||||
    if is_buffer or not is_bitsandbytes_available():
 | 
			
		||||
        is_8bit = False
 | 
			
		||||
        is_4bit = False
 | 
			
		||||
    else:
 | 
			
		||||
        is_4bit = hasattr(bnb.nn, "Params4bit") and isinstance(module._parameters[tensor_name], bnb.nn.Params4bit)
 | 
			
		||||
        is_8bit = isinstance(module._parameters[tensor_name], bnb.nn.Int8Params)
 | 
			
		||||
 | 
			
		||||
    if is_8bit or is_4bit:
 | 
			
		||||
        param = module._parameters[tensor_name]
 | 
			
		||||
        if param.device.type != "cuda":
 | 
			
		||||
            if value is None:
 | 
			
		||||
                new_value = old_value.to(device)
 | 
			
		||||
            elif isinstance(value, torch.Tensor):
 | 
			
		||||
                new_value = value.to("cpu")
 | 
			
		||||
                if value.dtype == torch.int8:
 | 
			
		||||
                    is_8bit_serializable = version.parse(importlib.metadata.version("bitsandbytes")) > version.parse(
 | 
			
		||||
                        "0.37.2"
 | 
			
		||||
                    )
 | 
			
		||||
                    if not is_8bit_serializable:
 | 
			
		||||
                        raise ValueError(
 | 
			
		||||
                            "Detected int8 weights but the version of bitsandbytes is not compatible with int8 serialization. "
 | 
			
		||||
                            "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`."
 | 
			
		||||
                        )
 | 
			
		||||
            else:
 | 
			
		||||
                new_value = torch.tensor(value, device="cpu")
 | 
			
		||||
 | 
			
		||||
            # Support models using `Conv1D` in place of `nn.Linear` (e.g. gpt2) by transposing the weight matrix prior to quantization.
 | 
			
		||||
            # Since weights are saved in the correct "orientation", we skip transposing when loading.
 | 
			
		||||
            if issubclass(module.source_cls, Conv1D) and fp16_statistics is None:
 | 
			
		||||
                new_value = new_value.T
 | 
			
		||||
 | 
			
		||||
            kwargs = old_value.__dict__
 | 
			
		||||
            if is_8bit:
 | 
			
		||||
                new_value = bnb.nn.Int8Params(new_value, requires_grad=False, **kwargs).to(device)
 | 
			
		||||
            elif is_4bit:
 | 
			
		||||
                new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(device)
 | 
			
		||||
 | 
			
		||||
            module._parameters[tensor_name] = new_value
 | 
			
		||||
            if fp16_statistics is not None:
 | 
			
		||||
                setattr(module.weight, "SCB", fp16_statistics.to(device))
 | 
			
		||||
 | 
			
		||||
    else:
 | 
			
		||||
        if value is None:
 | 
			
		||||
            new_value = old_value.to(device)
 | 
			
		||||
        elif isinstance(value, torch.Tensor):
 | 
			
		||||
            new_value = value.to(device)
 | 
			
		||||
        else:
 | 
			
		||||
            new_value = torch.tensor(value, device=device)
 | 
			
		||||
 | 
			
		||||
        if is_buffer:
 | 
			
		||||
            module._buffers[tensor_name] = new_value
 | 
			
		||||
        else:
 | 
			
		||||
            new_value = nn.Parameter(new_value, requires_grad=old_value.requires_grad)
 | 
			
		||||
            module._parameters[tensor_name] = new_value
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _replace_with_bnb_linear(
 | 
			
		||||
    model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, has_been_replaced=False
 | 
			
		||||
):
 | 
			
		||||
    """
 | 
			
		||||
    Private method that wraps the recursion for module replacement.
 | 
			
		||||
 | 
			
		||||
    Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
 | 
			
		||||
    """
 | 
			
		||||
    for name, module in model.named_children():
 | 
			
		||||
        if current_key_name is None:
 | 
			
		||||
            current_key_name = []
 | 
			
		||||
        current_key_name.append(name)
 | 
			
		||||
 | 
			
		||||
        if (isinstance(module, nn.Linear) or isinstance(module, Conv1D)) and name not in modules_to_not_convert:
 | 
			
		||||
            # Check if the current key is not in the `modules_to_not_convert`
 | 
			
		||||
            if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
 | 
			
		||||
                with init_empty_weights():
 | 
			
		||||
                    if isinstance(module, Conv1D):
 | 
			
		||||
                        in_features, out_features = module.weight.shape
 | 
			
		||||
                    else:
 | 
			
		||||
                        in_features = module.in_features
 | 
			
		||||
                        out_features = module.out_features
 | 
			
		||||
 | 
			
		||||
                    if quantization_config.quantization_method() == "llm_int8":
 | 
			
		||||
                        model._modules[name] = bnb.nn.Linear8bitLt(
 | 
			
		||||
                            in_features,
 | 
			
		||||
                            out_features,
 | 
			
		||||
                            module.bias is not None,
 | 
			
		||||
                            has_fp16_weights=quantization_config.llm_int8_has_fp16_weight,
 | 
			
		||||
                            threshold=quantization_config.llm_int8_threshold,
 | 
			
		||||
                        )
 | 
			
		||||
                        has_been_replaced = True
 | 
			
		||||
                    else:
 | 
			
		||||
                        if (
 | 
			
		||||
                            quantization_config.llm_int8_skip_modules is not None
 | 
			
		||||
                            and name in quantization_config.llm_int8_skip_modules
 | 
			
		||||
                        ):
 | 
			
		||||
                            pass
 | 
			
		||||
                        else:
 | 
			
		||||
                            model._modules[name] = bnb.nn.Linear4bit(
 | 
			
		||||
                                in_features,
 | 
			
		||||
                                out_features,
 | 
			
		||||
                                module.bias is not None,
 | 
			
		||||
                                quantization_config.bnb_4bit_compute_dtype,
 | 
			
		||||
                                compress_statistics=quantization_config.bnb_4bit_use_double_quant,
 | 
			
		||||
                                quant_type=quantization_config.bnb_4bit_quant_type,
 | 
			
		||||
                            )
 | 
			
		||||
                            has_been_replaced = True
 | 
			
		||||
                    # Store the module class in case we need to transpose the weight later
 | 
			
		||||
                    model._modules[name].source_cls = type(module)
 | 
			
		||||
                    # Force requires grad to False to avoid unexpected errors
 | 
			
		||||
                    model._modules[name].requires_grad_(False)
 | 
			
		||||
        if len(list(module.children())) > 0:
 | 
			
		||||
            _, has_been_replaced = _replace_with_bnb_linear(
 | 
			
		||||
                module,
 | 
			
		||||
                modules_to_not_convert,
 | 
			
		||||
                current_key_name,
 | 
			
		||||
                quantization_config,
 | 
			
		||||
                has_been_replaced=has_been_replaced,
 | 
			
		||||
            )
 | 
			
		||||
        # Remove the last key for recursion
 | 
			
		||||
        current_key_name.pop(-1)
 | 
			
		||||
    return model, has_been_replaced
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def replace_with_bnb_linear(model, modules_to_not_convert=None, current_key_name=None, quantization_config=None):
    """
    A helper function to replace all `torch.nn.Linear` modules with `bnb.nn.Linear8bitLt` (or `bnb.nn.Linear4bit`)
    modules from the `bitsandbytes` library. This enables running models in mixed int8 precision as described in the
    paper `LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale`. Make sure that `bitsandbytes`, compiled
    for the CUDA version of your hardware, is installed before running this function. `pip install -i
    https://test.pypi.org/simple/ bitsandbytes`

    The function is run recursively and replaces all `torch.nn.Linear` modules except for the `lm_head`, which should
    be kept as a `torch.nn.Linear` module. The replacement is done under the `init_empty_weights` context manager, so
    no CPU/GPU memory is required to run this function. Int8 mixed-precision matrix decomposition works by separating
    a matrix multiplication into two streams: (1) a systematic feature-outlier stream matrix-multiplied in fp16
    (0.01%), (2) a regular stream of int8 matrix multiplications (99.9%). With this method, int8 inference with no
    predictive degradation is possible for very large models (>=176B parameters).

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        modules_to_not_convert (`List[str]`, *optional*, defaults to `["lm_head"]`):
            Names of the modules to not convert to `Linear8bitLt`. In practice we keep the `lm_head` in full precision
            for numerical stability reasons.
        current_key_name (`List[str]`, *optional*):
            An array to track the current key of the recursion. This is used to check whether the current key (or part
            of it) is in the list of modules to not convert (for instance, modules that are offloaded to `cpu` or
            `disk`).
    """
    modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert
    model, has_been_replaced = _replace_with_bnb_linear(
        model, modules_to_not_convert, current_key_name, quantization_config
    )

    if not has_been_replaced:
        logger.warning(
            "You are loading your model in 8bit or 4bit but no linear modules were found in your model."
            " Please double check your model architecture, or submit an issue on github if you think this is"
            " a bug."
        )

    return model
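
# Illustrative sketch (not part of the original file): a typical way to exercise this helper,
# assuming `accelerate` and `bitsandbytes` are installed; the model id is only an example.
#
#     from accelerate import init_empty_weights
#     from transformers import AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
#
#     config = AutoConfig.from_pretrained("facebook/opt-350m")
#     with init_empty_weights():
#         model = AutoModelForCausalLM.from_config(config)
#     # `lm_head` is kept in full precision by default (modules_to_not_convert=None -> ["lm_head"])
#     model = replace_with_bnb_linear(model, quantization_config=BitsAndBytesConfig(load_in_8bit=True))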


# For backward compatibility
def replace_8bit_linear(*args, **kwargs):
    warnings.warn(
        "`replace_8bit_linear` will be deprecated in a future version, please use `replace_with_bnb_linear` instead",
        FutureWarning,
    )
    return replace_with_bnb_linear(*args, **kwargs)


# For backward compatibility
def set_module_8bit_tensor_to_device(*args, **kwargs):
    warnings.warn(
        "`set_module_8bit_tensor_to_device` will be deprecated in a future version, please use `set_module_quantized_tensor_to_device` instead",
        FutureWarning,
    )
    return set_module_quantized_tensor_to_device(*args, **kwargs)


def get_keys_to_not_convert(model):
    r"""
    A utility function to get the keys of the modules to keep in full precision, if any. For example, for CausalLM
    modules we may want to keep the `lm_head` in full precision for numerical stability reasons. For other
    architectures, we want to keep the tied weights of the model. The function returns a list of the keys of the
    modules to not convert to int8.

    Parameters:
    model (`torch.nn.Module`):
        Input model
    """
    # Create a copy of the model and tie the weights, then
    # check if it contains tied weights
    tied_model = deepcopy(model)  # this has 0 cost since it is done inside `init_empty_weights` context manager
    tied_model.tie_weights()

    tied_params = find_tied_parameters(tied_model)
    # For compatibility with Accelerate < 0.18
    if isinstance(tied_params, dict):
        tied_keys = sum(list(tied_params.values()), []) + list(tied_params.keys())
    else:
        tied_keys = sum(tied_params, [])
    has_tied_params = len(tied_keys) > 0

    # If there are no tied weights, we want to keep the lm_head (output embedding) in full precision
    if not has_tied_params:
        output_emb = model.get_output_embeddings()
        if output_emb is not None:
            list_last_module = [name for name, module in model.named_modules() if id(module) == id(output_emb)]
            return list_last_module

    # Otherwise (tied weights and/or no output embedding defined), simply keep the last module in full precision
    list_modules = list(model.named_parameters())
    list_last_module = [list_modules[-1][0]]
    # add the last module together with the tied weights
    intersection = set(list_last_module) - set(tied_keys)
    list_untouched = list(set(tied_keys)) + list(intersection)

    # remove ".weight" and ".bias" from the keys
    names_to_remove = [".weight", ".bias"]
    filtered_module_names = []
    for name in list_untouched:
        for name_to_remove in names_to_remove:
            if name_to_remove in name:
                name = name.replace(name_to_remove, "")
        filtered_module_names.append(name)

    return filtered_module_names
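
# Illustrative sketch (not part of the original file): this helper is typically called on a model
# instantiated under `init_empty_weights` to decide which modules to skip during quantization,
# assuming `accelerate` is installed; the model id is only an example.
#
#     from accelerate import init_empty_weights
#     from transformers import AutoConfig, AutoModelForCausalLM
#
#     config = AutoConfig.from_pretrained("gpt2")
#     with init_empty_weights():
#         model = AutoModelForCausalLM.from_config(config)
#     print(get_keys_to_not_convert(model))  # e.g. the tied embedding/lm_head keys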

from ..integrations import (  # noqa
    get_keys_to_not_convert,
    replace_8bit_linear,
    replace_with_bnb_linear,
    set_module_8bit_tensor_to_device,
    set_module_quantized_tensor_to_device,
)
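
# Illustrative note (not part of the diff): this re-export keeps the old import location working
# after the move, so both of the following are expected to resolve to the same object (the old
# path is presumed to be `transformers.utils.bitsandbytes`, as the test change below suggests):
#
#     from transformers.utils.bitsandbytes import get_keys_to_not_convert         # old location
#     from transformers.integrations.bitsandbytes import get_keys_to_not_convert  # new location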

@@ -27,7 +27,11 @@ from parameterized import parameterized
 import tests.trainer.test_trainer
 from tests.trainer.test_trainer import TrainerIntegrationCommon  # noqa
 from transformers import AutoModel, TrainingArguments, is_torch_available, logging
-from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_available, unset_hf_deepspeed_config
+from transformers.integrations.deepspeed import (
+    HfDeepSpeedConfig,
+    is_deepspeed_available,
+    unset_hf_deepspeed_config,
+)
 from transformers.testing_utils import (
     CaptureLogger,
     CaptureStd,
@@ -113,7 +117,7 @@ def require_deepspeed_aio(test_case):
 if is_deepspeed_available():
     from deepspeed.utils import logger as deepspeed_logger  # noqa
     from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
-    from transformers.deepspeed import deepspeed_config, is_deepspeed_zero3_enabled  # noqa
+    from transformers.integrations.deepspeed import deepspeed_config, is_deepspeed_zero3_enabled  # noqa


 def get_launcher(distributed=False):
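
# Illustrative sketch (not part of the diff): after this move the DeepSpeed helpers are imported
# from `transformers.integrations.deepspeed`. A minimal usage sketch, assuming `deepspeed` is
# installed and `ds_config.json` is a valid DeepSpeed config file:
#
#     from transformers.integrations.deepspeed import HfDeepSpeedConfig, is_deepspeed_zero3_enabled
#
#     dschf = HfDeepSpeedConfig("ds_config.json")  # keep a reference alive while building the model
#     print(is_deepspeed_zero3_enabled())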

@@ -131,7 +131,7 @@ class MixedInt8Test(BaseMixedInt8Test):
         from accelerate import init_empty_weights

         from transformers import AutoModelForMaskedLM, Blip2ForConditionalGeneration, MptForCausalLM, OPTForCausalLM
-        from transformers.utils.bitsandbytes import get_keys_to_not_convert
+        from transformers.integrations.bitsandbytes import get_keys_to_not_convert

         model_id = "mosaicml/mpt-7b"
         config = AutoConfig.from_pretrained(

@@ -383,9 +383,11 @@ src/transformers/hyperparameter_search.py
 src/transformers/image_processing_utils.py
 src/transformers/image_transforms.py
 src/transformers/image_utils.py
-src/transformers/integrations.py
+src/transformers/integrations/bitsandbytes.py
+src/transformers/integrations/deepspeed.py
+src/transformers/integrations/integration_utils.py
+src/transformers/integrations/peft.py
 src/transformers/keras_callbacks.py
-src/transformers/lib_integrations/peft/peft_mixin.py
 src/transformers/modelcard.py
 src/transformers/modeling_flax_outputs.py
 src/transformers/modeling_flax_pytorch_utils.py