mirror of
				https://github.com/huggingface/transformers.git
				synced 2025-10-21 01:23:56 +08:00 
			
		
		
		
	Compare commits
	
		
			39 Commits
		
	
	
		
			v4.42.1
			...
			run-move-i
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 2b143945bf | |||
| 4b4c68157c | |||
| 33412d3759 | |||
| fa451d4604 | |||
| 3107a96b6c | |||
| 89f4ebd4e3 | |||
| 7b6098c34a | |||
| 10d3b77aa7 | |||
| 8ace6bd21a | |||
| 5773b33b5f | |||
| 72fd103a12 | |||
| d50051a215 | |||
| 080fc2fa90 | |||
| b756acedc7 | |||
| bb0a0259c6 | |||
| 310ceb1e10 | |||
| be38218dbe | |||
| b8fcf6143b | |||
| 615ac14ea5 | |||
| bd95ee271c | |||
| b4c4cf7c95 | |||
| 7cc7cbb5fa | |||
| 80d27756f6 | |||
| 656c41113e | |||
| c80cfd1aed | |||
| bc7a6ae4a6 | |||
| f4d8c833f1 | |||
| 566847461d | |||
| e4e245b16f | |||
| b2e167265d | |||
| de497d4f2e | |||
| f8afb0abc0 | |||
| b30adec90f | |||
| 026f53c4d2 | |||
| 190b83f7f4 | |||
| 582fbde0a3 | |||
| 7c0b4bb9af | |||
| 9453fa2538 | |||
| b04903bcd8 | 
							
								
								
									
										22
									
								
								.github/workflows/self-scheduled.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										22
									
								
								.github/workflows/self-scheduled.yml
									
									
									
									
										vendored
									
									
								
							| @ -8,11 +8,9 @@ name: Self-hosted runner (scheduled) | ||||
|  | ||||
| on: | ||||
|   repository_dispatch: | ||||
|   schedule: | ||||
|     - cron: "17 2 * * *" | ||||
|   push: | ||||
|     branches: | ||||
|       - run_scheduled_ci* | ||||
|       - run-move-integrations | ||||
|  | ||||
| env: | ||||
|   HF_HOME: /mnt/cache | ||||
| @ -43,7 +41,7 @@ jobs: | ||||
|     strategy: | ||||
|       matrix: | ||||
|         machine_type: [single-gpu, multi-gpu] | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} | ||||
|     container: | ||||
|       image: huggingface/transformers-all-latest-gpu | ||||
|       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
| @ -58,7 +56,7 @@ jobs: | ||||
|     strategy: | ||||
|       matrix: | ||||
|         machine_type: [single-gpu, multi-gpu] | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} | ||||
|     container: | ||||
|       image: huggingface/transformers-all-latest-gpu | ||||
|       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
| @ -85,7 +83,7 @@ jobs: | ||||
|         name: Identify models to test | ||||
|         working-directory: /transformers/tests | ||||
|         run: | | ||||
|           echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT | ||||
|           echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2[:10] + d1; print(d)')" >> $GITHUB_OUTPUT | ||||
|  | ||||
|       - name: NVIDIA-SMI | ||||
|         run: | | ||||
| @ -98,7 +96,7 @@ jobs: | ||||
|       matrix: | ||||
|         folders: ${{ fromJson(needs.setup.outputs.matrix) }} | ||||
|         machine_type: [single-gpu] | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} | ||||
|     container: | ||||
|       image: huggingface/transformers-all-latest-gpu | ||||
|       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
| @ -159,7 +157,7 @@ jobs: | ||||
|       matrix: | ||||
|         folders: ${{ fromJson(needs.setup.outputs.matrix) }} | ||||
|         machine_type: [multi-gpu] | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} | ||||
|     container: | ||||
|       image: huggingface/transformers-all-latest-gpu | ||||
|       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
| @ -219,7 +217,7 @@ jobs: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         machine_type: [single-gpu] | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} | ||||
|     container: | ||||
|       image: huggingface/transformers-all-latest-gpu | ||||
|       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
| @ -270,7 +268,7 @@ jobs: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         machine_type: [single-gpu, multi-gpu] | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} | ||||
|     container: | ||||
|       image: huggingface/transformers-pytorch-gpu | ||||
|       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
| @ -320,7 +318,7 @@ jobs: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         machine_type: [single-gpu, multi-gpu] | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} | ||||
|     container: | ||||
|       image: huggingface/transformers-tensorflow-gpu | ||||
|       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
| @ -371,7 +369,7 @@ jobs: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         machine_type: [single-gpu, multi-gpu] | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | ||||
|     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} | ||||
|     needs: setup | ||||
|     container: | ||||
|       image: huggingface/transformers-pytorch-deepspeed-latest-gpu | ||||
|  | ||||
| @ -2065,20 +2065,20 @@ In this case you usually need to raise the value of `initial_scale_power`. Setti | ||||
|  | ||||
| ## Non-Trainer Deepspeed Integration | ||||
|  | ||||
| The [`~deepspeed.HfDeepSpeedConfig`] is used to integrate Deepspeed into the 🤗 Transformers core | ||||
| The [`~integrations.HfDeepSpeedConfig`] is used to integrate Deepspeed into the 🤗 Transformers core | ||||
| functionality, when [`Trainer`] is not used. The only thing that it does is handle Deepspeed ZeRO-3 param gathering and automatically split the model onto multiple GPUs during the `from_pretrained` call. Everything else you have to do by yourself. | ||||
|  | ||||
| When using [`Trainer`] everything is automatically taken care of. | ||||
|  | ||||
| When not using [`Trainer`], to efficiently deploy DeepSpeed ZeRO-3, you must instantiate the | ||||
| [`~deepspeed.HfDeepSpeedConfig`] object before instantiating the model and keep that object alive. | ||||
| [`~integrations.HfDeepSpeedConfig`] object before instantiating the model and keep that object alive. | ||||
|  | ||||
| If you're using Deepspeed ZeRO-1 or ZeRO-2 you don't need to use `HfDeepSpeedConfig` at all. | ||||
|  | ||||
| For example, for a pretrained model: | ||||
|  | ||||
| ```python | ||||
| from transformers.deepspeed import HfDeepSpeedConfig | ||||
| from transformers.integrations import HfDeepSpeedConfig | ||||
| from transformers import AutoModel | ||||
| import deepspeed | ||||
|  | ||||
| @ -2092,7 +2092,7 @@ engine = deepspeed.initialize(model=model, config_params=ds_config, ...) | ||||
| or for a non-pretrained model: | ||||
|  | ||||
| ```python | ||||
| from transformers.deepspeed import HfDeepSpeedConfig | ||||
| from transformers.integrations import HfDeepSpeedConfig | ||||
| from transformers import AutoModel, AutoConfig | ||||
| import deepspeed | ||||
|  | ||||
| @ -2108,7 +2108,7 @@ Please note that if you're not using the [`Trainer`] integration, you're complet | ||||
|  | ||||
| ## HfDeepSpeedConfig | ||||
|  | ||||
| [[autodoc]] deepspeed.HfDeepSpeedConfig | ||||
| [[autodoc]] integrations.HfDeepSpeedConfig | ||||
|     - all | ||||
|  | ||||
| ### Custom DeepSpeed ZeRO Inference | ||||
| @ -2161,7 +2161,7 @@ Make sure to: | ||||
|  | ||||
|  | ||||
| from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM | ||||
| from transformers.deepspeed import HfDeepSpeedConfig | ||||
| from transformers.integrations import HfDeepSpeedConfig | ||||
| import deepspeed | ||||
| import os | ||||
| import torch | ||||
|  | ||||
| @ -32,7 +32,7 @@ from copy import deepcopy  # noqa | ||||
|  | ||||
| from parameterized import parameterized  # noqa | ||||
| from transformers import TrainingArguments, is_torch_available  # noqa | ||||
| from transformers.deepspeed import is_deepspeed_available  # noqa | ||||
| from transformers.integrations.deepspeed import is_deepspeed_available  # noqa | ||||
| from transformers.file_utils import WEIGHTS_NAME  # noqa | ||||
| from transformers.testing_utils import (  # noqa | ||||
|     CaptureLogger, | ||||
|  | ||||
| @ -94,6 +94,7 @@ _import_structure = { | ||||
|     "data.metrics": [], | ||||
|     "data.processors": [], | ||||
|     "debug_utils": [], | ||||
|     "deepspeed": [], | ||||
|     "dependency_versions_check": [], | ||||
|     "dependency_versions_table": [], | ||||
|     "dynamic_module_utils": [], | ||||
| @ -115,8 +116,6 @@ _import_structure = { | ||||
|         "is_tensorboard_available", | ||||
|         "is_wandb_available", | ||||
|     ], | ||||
|     "lib_integrations": [], | ||||
|     "lib_integrations.peft": [], | ||||
|     "modelcard": ["ModelCard"], | ||||
|     "modeling_tf_pytorch_utils": [ | ||||
|         "convert_tf_weight_name_to_pt_weight_name", | ||||
| @ -745,7 +744,6 @@ _import_structure = { | ||||
|         "is_vision_available", | ||||
|         "logging", | ||||
|     ], | ||||
|     "utils.bitsandbytes": [], | ||||
|     "utils.quantization_config": ["BitsAndBytesConfig", "GPTQConfig"], | ||||
| } | ||||
|  | ||||
| @ -1002,7 +1000,6 @@ else: | ||||
|         "TextDataset", | ||||
|         "TextDatasetForNextSentencePrediction", | ||||
|     ] | ||||
|     _import_structure["deepspeed"] = [] | ||||
|     _import_structure["generation"].extend( | ||||
|         [ | ||||
|             "BeamScorer", | ||||
|  | ||||
| @ -12,378 +12,29 @@ | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
| """ | ||||
| Integration with Deepspeed | ||||
| Integration with Deepspeed - kept for backward compatibility, if you plan to make any edit, make sure to modify the file | ||||
| in `integrations/deepspeed` instead. | ||||
|  | ||||
| Check: https://github.com/huggingface/transformers/pull/25599 | ||||
| """ | ||||
|  | ||||
| import importlib.util | ||||
| import weakref | ||||
| from functools import partialmethod | ||||
|  | ||||
| from .dependency_versions_check import dep_version_check | ||||
| from .utils import is_accelerate_available, is_torch_available, logging | ||||
|  | ||||
|  | ||||
| if is_torch_available(): | ||||
|     import torch | ||||
|  | ||||
| logger = logging.get_logger(__name__) | ||||
|  | ||||
|  | ||||
| def is_deepspeed_available(): | ||||
|     return importlib.util.find_spec("deepspeed") is not None | ||||
|  | ||||
|  | ||||
| if is_accelerate_available() and is_deepspeed_available(): | ||||
|     from accelerate.utils.deepspeed import HfDeepSpeedConfig as DeepSpeedConfig | ||||
| else: | ||||
|     # Inherits from a dummy `object` if accelerate is not available, so that python succeeds to import this file. | ||||
|     # Deepspeed glue code will never inherit this dummy object as it checks if accelerate is available. | ||||
|     from builtins import object as DeepSpeedConfig | ||||
|  | ||||
|  | ||||
| class HfDeepSpeedConfig(DeepSpeedConfig): | ||||
|     """ | ||||
|     This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage. | ||||
|  | ||||
|     A `weakref` of this object is stored in the module's globals to be able to access the config from areas where | ||||
|     things like the Trainer object is not available (e.g. `from_pretrained` and `_get_resized_embeddings`). Therefore | ||||
|     it's important that this object remains alive while the program is still running. | ||||
|  | ||||
|     [`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to sync the configuration | ||||
|     with values of [`TrainingArguments`] by replacing special placeholder values: `"auto"`. Without this special logic | ||||
|     the DeepSpeed configuration is not modified in any way. | ||||
|  | ||||
|     Args: | ||||
|         config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict. | ||||
|  | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, config_file_or_dict): | ||||
|         # set global weakref object | ||||
|         set_hf_deepspeed_config(self) | ||||
|         dep_version_check("accelerate") | ||||
|         dep_version_check("deepspeed") | ||||
|         super().__init__(config_file_or_dict) | ||||
|  | ||||
|  | ||||
| class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig): | ||||
|     """ | ||||
|     The `HfTrainerDeepSpeedConfig` object is meant to be created during `TrainingArguments` object creation and has the | ||||
|     same lifespan as the latter. | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, config_file_or_dict): | ||||
|         super().__init__(config_file_or_dict) | ||||
|         self._dtype = None | ||||
|         self.mismatches = [] | ||||
|  | ||||
|     def dtype(self): | ||||
|         if self._dtype is None: | ||||
|             raise ValueError("trainer_config_process() wasn't called yet to tell dtype") | ||||
|         return self._dtype | ||||
|  | ||||
|     def is_auto(self, ds_key_long): | ||||
|         val = self.get_value(ds_key_long) | ||||
|         if val is None: | ||||
|             return False | ||||
|         else: | ||||
|             return val == "auto" | ||||
|  | ||||
|     def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True): | ||||
|         """ | ||||
|         A utility method that massages the config file and can optionally verify that the values match. | ||||
|  | ||||
|         1. Replace "auto" values with `TrainingArguments` value. | ||||
|  | ||||
|         2. If it wasn't "auto" and `must_match` is true, then check that DS config matches Trainer | ||||
|         config values and if mismatched add the entry to `self.mismatches` - will assert during | ||||
|         `trainer_config_finalize` for one or more mismatches. | ||||
|  | ||||
|         """ | ||||
|         config, ds_key = self.find_config_node(ds_key_long) | ||||
|         if config is None: | ||||
|             return | ||||
|  | ||||
|         if config.get(ds_key) == "auto": | ||||
|             config[ds_key] = hf_val | ||||
|             return | ||||
|  | ||||
|         if not must_match: | ||||
|             return | ||||
|  | ||||
|         ds_val = config.get(ds_key) | ||||
|         if ds_val is not None and ds_val != hf_val: | ||||
|             self.mismatches.append(f"- ds {ds_key_long}={ds_val} vs hf {hf_key}={hf_val}") | ||||
|  | ||||
|     fill_only = partialmethod(fill_match, must_match=False) | ||||
|  | ||||
|     def trainer_config_process(self, args): | ||||
|         """ | ||||
|         Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object | ||||
|         creation. | ||||
|         """ | ||||
|         # DeepSpeed does: | ||||
|         # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps | ||||
|         train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps | ||||
|         self.fill_match( | ||||
|             "train_micro_batch_size_per_gpu", args.per_device_train_batch_size, "per_device_train_batch_size" | ||||
|         ) | ||||
|         self.fill_match("gradient_accumulation_steps", args.gradient_accumulation_steps, "gradient_accumulation_steps") | ||||
|         self.fill_match("train_batch_size", train_batch_size, "train_batch_size (calculated)") | ||||
|         self.fill_match("gradient_clipping", args.max_grad_norm, "max_grad_norm") | ||||
|  | ||||
|         self.fill_match("optimizer.params.lr", args.learning_rate, "learning_rate") | ||||
|         self.fill_match("optimizer.params.betas", [args.adam_beta1, args.adam_beta2], "adam_beta1+adam_beta2") | ||||
|         self.fill_match("optimizer.params.eps", args.adam_epsilon, "adam_epsilon") | ||||
|         self.fill_match("optimizer.params.weight_decay", args.weight_decay, "weight_decay") | ||||
|  | ||||
|         self.fill_only("scheduler.params.warmup_min_lr", 0)  # not a trainer arg | ||||
|         self.fill_match("scheduler.params.warmup_max_lr", args.learning_rate, "learning_rate") | ||||
|         # total_num_steps - will get set in trainer_config_finalize | ||||
|  | ||||
|         # fp16 | ||||
|         if args.fp16 or args.fp16_full_eval: | ||||
|             fp16_backend = "apex" if args.fp16_backend == "apex" else "amp" | ||||
|         else: | ||||
|             fp16_backend = None | ||||
|  | ||||
|         if args.save_on_each_node: | ||||
|             # deepspeed uses shared storage by default. Let's override this setting if save_on_each_node == True | ||||
|             self.config["checkpoint"] = self.config.get("checkpoint", {}) | ||||
|             self.config["checkpoint"]["use_node_local_storage"] = args.save_on_each_node | ||||
|  | ||||
|         # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set | ||||
|         # any here unless the user did the work | ||||
|         self.fill_match( | ||||
|             "fp16.enabled", | ||||
|             ((args.fp16 or args.fp16_full_eval) and fp16_backend == "amp"), | ||||
|             "fp16|fp16_full_eval+fp16_backend(amp)", | ||||
|         ) | ||||
|  | ||||
|         # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any | ||||
|         # ZeRO features | ||||
|         self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)") | ||||
|         self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level") | ||||
|  | ||||
|         self.fill_match("bf16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval") | ||||
|  | ||||
|         # deepspeed's default mode is fp16 unless there is a config that says differently | ||||
|         if self.is_true("bf16.enabled"): | ||||
|             self._dtype = torch.bfloat16 | ||||
|         elif self.is_false("fp16.enabled"): | ||||
|             self._dtype = torch.float32 | ||||
|         else: | ||||
|             self._dtype = torch.float16 | ||||
|  | ||||
|     def trainer_config_finalize(self, args, model, num_training_steps): | ||||
|         """ | ||||
|         This stage is run after we have the model and know num_training_steps. | ||||
|  | ||||
|         Now we can complete the configuration process. | ||||
|         """ | ||||
|         # zero | ||||
|  | ||||
|         # deal with config keys that use `auto` value and rely on model's hidden_size | ||||
|         hidden_size_based_keys = [ | ||||
|             "zero_optimization.reduce_bucket_size", | ||||
|             "zero_optimization.stage3_prefetch_bucket_size", | ||||
|             "zero_optimization.stage3_param_persistence_threshold", | ||||
|         ] | ||||
|         hidden_size_auto_keys = [x for x in hidden_size_based_keys if self.is_auto(x)] | ||||
|  | ||||
|         if len(hidden_size_auto_keys) > 0: | ||||
|             if hasattr(model.config, "hidden_size"): | ||||
|                 hidden_size = model.config.hidden_size | ||||
|             elif hasattr(model.config, "hidden_sizes"): | ||||
|                 # if there are many hidden sizes pick the largest one | ||||
|                 hidden_size = max(model.config.hidden_sizes) | ||||
|             else: | ||||
|                 raise ValueError( | ||||
|                     "The model's config file has neither `hidden_size` nor `hidden_sizes` entry, " | ||||
|                     "therefore it's not possible to automatically fill out the following `auto` entries " | ||||
|                     f"in the DeepSpeed config file: {hidden_size_auto_keys}. You can fix that by replacing " | ||||
|                     "`auto` values for these keys with an integer value of your choice." | ||||
|                 ) | ||||
|  | ||||
|             self.fill_only("zero_optimization.reduce_bucket_size", hidden_size * hidden_size) | ||||
|             if self.is_zero3(): | ||||
|                 # automatically assign the optimal config values based on model config | ||||
|                 self.fill_only("zero_optimization.stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size) | ||||
|                 self.fill_only("zero_optimization.stage3_param_persistence_threshold", 10 * hidden_size) | ||||
|  | ||||
|         # scheduler | ||||
|         self.fill_match("scheduler.params.total_num_steps", num_training_steps, "num_training_steps (calculated)") | ||||
|         self.fill_match("scheduler.params.warmup_num_steps", args.get_warmup_steps(num_training_steps), "warmup_steps") | ||||
|  | ||||
|         if len(self.mismatches) > 0: | ||||
|             mismatches = "\n".join(self.mismatches) | ||||
|             raise ValueError( | ||||
|                 "Please correct the following DeepSpeed config values that mismatch TrainingArguments" | ||||
|                 f" values:\n{mismatches}\nThe easiest method is to set these DeepSpeed config values to 'auto'." | ||||
|             ) | ||||
|  | ||||
|  | ||||
| # keep the config object global to be able to access it anywhere during TrainingArguments life-cycle | ||||
| _hf_deepspeed_config_weak_ref = None | ||||
|  | ||||
|  | ||||
| def set_hf_deepspeed_config(hf_deepspeed_config_obj): | ||||
|     # this is a special weakref global object to allow us to get to Deepspeed config from APIs | ||||
|     # that don't have an easy way to get to the Deepspeed config outside of the Trainer domain. | ||||
|     global _hf_deepspeed_config_weak_ref | ||||
|     # will go away automatically when HfDeepSpeedConfig is destroyed (when TrainingArguments is destroyed) | ||||
|     _hf_deepspeed_config_weak_ref = weakref.ref(hf_deepspeed_config_obj) | ||||
|  | ||||
|  | ||||
| def unset_hf_deepspeed_config(): | ||||
|     # useful for unit tests to ensure the global state doesn't leak - call from `tearDown` method | ||||
|     global _hf_deepspeed_config_weak_ref | ||||
|     _hf_deepspeed_config_weak_ref = None | ||||
|  | ||||
|  | ||||
| def is_deepspeed_zero3_enabled(): | ||||
|     if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: | ||||
|         return _hf_deepspeed_config_weak_ref().is_zero3() | ||||
|     else: | ||||
|         return False | ||||
|  | ||||
|  | ||||
| def deepspeed_config(): | ||||
|     if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: | ||||
|         return _hf_deepspeed_config_weak_ref().config | ||||
|     else: | ||||
|         return None | ||||
|  | ||||
|  | ||||
| def deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps, model_parameters): | ||||
|     """ | ||||
|     A convenience wrapper that deals with optimizer and lr scheduler configuration. | ||||
|     """ | ||||
|     from accelerate.utils import DummyOptim, DummyScheduler | ||||
|  | ||||
|     config = hf_deepspeed_config.config | ||||
|  | ||||
|     # Optimizer + Scheduler | ||||
|     # Currently supported combos: | ||||
|     # 1. DS scheduler + DS optimizer: Yes | ||||
|     # 2. HF scheduler + HF optimizer: Yes | ||||
|     # 3. DS scheduler + HF optimizer: Yes | ||||
|     # 4. HF scheduler + DS optimizer: No | ||||
|     # | ||||
|     # Unless Offload is enabled in which case it's: | ||||
|     # 1. DS scheduler + DS optimizer: Yes | ||||
|     # 2. HF scheduler + HF optimizer: Mostly* | ||||
|     # 3. DS scheduler + HF optimizer: Mostly* | ||||
|     # 4. HF scheduler + DS optimizer: No | ||||
|     # | ||||
|     # Mostly*: All non-native DeepSpeed optimizers that have both CPU and GPU implementation should work (except LAMB) | ||||
|  | ||||
|     optimizer = None | ||||
|     if "optimizer" in config: | ||||
|         if args.adafactor: | ||||
|             raise ValueError( | ||||
|                 "--adafactor was passed, but also found `optimizer` configured in the DeepSpeed config. " | ||||
|                 "Only one optimizer can be configured." | ||||
|             ) | ||||
|         optimizer = DummyOptim(params=model_parameters) | ||||
|     else: | ||||
|         if hf_deepspeed_config.is_offload(): | ||||
|             logger.info( | ||||
|                 "Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the" | ||||
|                 " custom optimizer has both CPU and GPU implementation (except LAMB)" | ||||
|             ) | ||||
|  | ||||
|         # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. | ||||
|         # But trainer uses AdamW by default. | ||||
|         optimizer = trainer.create_optimizer() | ||||
|         # To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer` | ||||
|         config["zero_allow_untested_optimizer"] = True | ||||
|  | ||||
|     lr_scheduler = None | ||||
|     if "scheduler" in config: | ||||
|         lr_scheduler = DummyScheduler(optimizer) | ||||
|     else: | ||||
|         if isinstance(optimizer, DummyOptim): | ||||
|             raise ValueError( | ||||
|                 "Found `optimizer` configured in the DeepSpeed config, but no `scheduler`. " | ||||
|                 "Please configure a scheduler in the DeepSpeed config." | ||||
|             ) | ||||
|         lr_scheduler = trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) | ||||
|  | ||||
|     return optimizer, lr_scheduler | ||||
|  | ||||
|  | ||||
| def deepspeed_init(trainer, num_training_steps, inference=False): | ||||
|     """ | ||||
|     Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. | ||||
|  | ||||
|     If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made. | ||||
|  | ||||
|     Args: | ||||
|         trainer: Trainer object | ||||
|         num_training_steps: per single gpu | ||||
|         resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load | ||||
|         inference: launch in inference mode (no optimizer and no lr scheduler) | ||||
|  | ||||
|     Returns: optimizer, lr_scheduler | ||||
|  | ||||
|     We may use `deepspeed_init` more than once during the life of Trainer, when we do - it's a temp hack based on: | ||||
|     https://github.com/microsoft/DeepSpeed/issues/1394#issuecomment-937405374 until Deepspeed fixes a bug where it | ||||
|     can't resume from a checkpoint after it did some stepping https://github.com/microsoft/DeepSpeed/issues/1612 | ||||
|  | ||||
|     """ | ||||
|     from deepspeed.utils import logger as ds_logger | ||||
|  | ||||
|     model = trainer.model | ||||
|     args = trainer.args | ||||
|  | ||||
|     hf_deepspeed_config = trainer.accelerator.state.deepspeed_plugin.hf_ds_config | ||||
|  | ||||
|     # resume config update - some bits like `model` and `num_training_steps` only become available during train | ||||
|     hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps) | ||||
|  | ||||
|     # set the Deepspeed log level consistent with the Trainer | ||||
|     ds_logger.setLevel(args.get_process_log_level()) | ||||
|  | ||||
|     if inference: | ||||
|         # only Z3 makes sense for the inference | ||||
|         if not hf_deepspeed_config.is_zero3(): | ||||
|             raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config") | ||||
|  | ||||
|         # in case the training config is re-used for inference | ||||
|         hf_deepspeed_config.del_config_sub_tree("optimizer") | ||||
|         hf_deepspeed_config.del_config_sub_tree("lr_scheduler") | ||||
|         optimizer, lr_scheduler = None, None | ||||
|         model_parameters = None | ||||
|     else: | ||||
|         trainer.optimizer = None  # important for when deepspeed_init is used as re-init | ||||
|         model_parameters = list(filter(lambda p: p.requires_grad, model.parameters())) | ||||
|         optimizer, lr_scheduler = deepspeed_optim_sched( | ||||
|             trainer, hf_deepspeed_config, args, num_training_steps, model_parameters | ||||
|         ) | ||||
|  | ||||
|     # keep for quick debug: | ||||
|     # from pprint import pprint; pprint(config) | ||||
|  | ||||
|     return optimizer, lr_scheduler | ||||
|  | ||||
|  | ||||
| def deepspeed_load_checkpoint(deepspeed_engine, checkpoint_path): | ||||
|     # it's possible that the user is trying to resume from model_path, which doesn't necessarily | ||||
|     # contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's | ||||
|     # a resume from a checkpoint and not just a local pretrained weight. So we check here if the | ||||
|     # path contains what looks like a deepspeed checkpoint | ||||
|     import glob | ||||
|  | ||||
|     deepspeed_checkpoint_dirs = sorted(glob.glob(f"{checkpoint_path}/global_step*")) | ||||
|  | ||||
|     if len(deepspeed_checkpoint_dirs) > 0: | ||||
|         logger.info(f"Attempting to resume from {checkpoint_path}") | ||||
|         # this magically updates self.optimizer and self.lr_scheduler | ||||
|         load_path, _ = deepspeed_engine.load_checkpoint( | ||||
|             checkpoint_path, load_optimizer_states=True, load_lr_scheduler_states=True | ||||
|         ) | ||||
|         if load_path is None: | ||||
|             raise ValueError(f"[deepspeed] failed to resume from checkpoint {checkpoint_path}") | ||||
|     else: | ||||
|         raise ValueError(f"Can't find a valid checkpoint at {checkpoint_path}") | ||||
| import warnings | ||||
|  | ||||
|  | ||||
| warnings.warn( | ||||
|     "transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations", | ||||
|     FutureWarning, | ||||
| ) | ||||
|  | ||||
| # Backward compatibility imports, to make sure all those objects can be found in integrations/deepspeed | ||||
| from .integrations.deepspeed import (  # noqa | ||||
|     HfDeepSpeedConfig, | ||||
|     HfTrainerDeepSpeedConfig, | ||||
|     deepspeed_config, | ||||
|     deepspeed_init, | ||||
|     deepspeed_load_checkpoint, | ||||
|     deepspeed_optim_sched, | ||||
|     is_deepspeed_available, | ||||
|     is_deepspeed_zero3_enabled, | ||||
|     set_hf_deepspeed_config, | ||||
|     unset_hf_deepspeed_config, | ||||
| ) | ||||
|  | ||||
| @ -24,7 +24,7 @@ import torch | ||||
| import torch.distributed as dist | ||||
| from torch import nn | ||||
|  | ||||
| from ..deepspeed import is_deepspeed_zero3_enabled | ||||
| from ..integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput | ||||
| from ..models.auto import ( | ||||
|     MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, | ||||
|  | ||||
							
								
								
									
										71
									
								
								src/transformers/integrations/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										71
									
								
								src/transformers/integrations/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,71 @@ | ||||
| # Copyright 2023 The HuggingFace Team. All rights reserved. | ||||
| # | ||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| # you may not use this file except in compliance with the License. | ||||
| # You may obtain a copy of the License at | ||||
| # | ||||
| #     http://www.apache.org/licenses/LICENSE-2.0 | ||||
| # | ||||
| # Unless required by applicable law or agreed to in writing, software | ||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
| from .bitsandbytes import ( | ||||
|     get_keys_to_not_convert, | ||||
|     replace_8bit_linear, | ||||
|     replace_with_bnb_linear, | ||||
|     set_module_8bit_tensor_to_device, | ||||
|     set_module_quantized_tensor_to_device, | ||||
| ) | ||||
| from .deepspeed import ( | ||||
|     HfDeepSpeedConfig, | ||||
|     HfTrainerDeepSpeedConfig, | ||||
|     deepspeed_config, | ||||
|     deepspeed_init, | ||||
|     deepspeed_load_checkpoint, | ||||
|     deepspeed_optim_sched, | ||||
|     is_deepspeed_available, | ||||
|     is_deepspeed_zero3_enabled, | ||||
|     set_hf_deepspeed_config, | ||||
|     unset_hf_deepspeed_config, | ||||
| ) | ||||
| from .integration_utils import ( | ||||
|     INTEGRATION_TO_CALLBACK, | ||||
|     AzureMLCallback, | ||||
|     ClearMLCallback, | ||||
|     CodeCarbonCallback, | ||||
|     CometCallback, | ||||
|     DagsHubCallback, | ||||
|     FlyteCallback, | ||||
|     MLflowCallback, | ||||
|     NeptuneCallback, | ||||
|     NeptuneMissingConfiguration, | ||||
|     TensorBoardCallback, | ||||
|     WandbCallback, | ||||
|     get_available_reporting_integrations, | ||||
|     get_reporting_integration_callbacks, | ||||
|     hp_params, | ||||
|     is_azureml_available, | ||||
|     is_clearml_available, | ||||
|     is_codecarbon_available, | ||||
|     is_comet_available, | ||||
|     is_dagshub_available, | ||||
|     is_fairscale_available, | ||||
|     is_flyte_deck_standard_available, | ||||
|     is_flytekit_available, | ||||
|     is_mlflow_available, | ||||
|     is_neptune_available, | ||||
|     is_optuna_available, | ||||
|     is_ray_available, | ||||
|     is_ray_tune_available, | ||||
|     is_sigopt_available, | ||||
|     is_tensorboard_available, | ||||
|     is_wandb_available, | ||||
|     rewrite_logs, | ||||
|     run_hp_search_optuna, | ||||
|     run_hp_search_ray, | ||||
|     run_hp_search_sigopt, | ||||
|     run_hp_search_wandb, | ||||
| ) | ||||
| from .peft import PeftAdapterMixin | ||||
							
								
								
									
										290
									
								
								src/transformers/integrations/bitsandbytes.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										290
									
								
								src/transformers/integrations/bitsandbytes.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,290 @@ | ||||
| import importlib.metadata | ||||
| import warnings | ||||
| from copy import deepcopy | ||||
|  | ||||
| from packaging import version | ||||
|  | ||||
| from ..utils import is_accelerate_available, is_bitsandbytes_available, logging | ||||
|  | ||||
|  | ||||
| if is_bitsandbytes_available(): | ||||
|     import bitsandbytes as bnb | ||||
|     import torch | ||||
|     import torch.nn as nn | ||||
|  | ||||
|     from ..pytorch_utils import Conv1D | ||||
|  | ||||
| if is_accelerate_available(): | ||||
|     from accelerate import init_empty_weights | ||||
|     from accelerate.utils import find_tied_parameters | ||||
|  | ||||
| logger = logging.get_logger(__name__) | ||||
|  | ||||
|  | ||||
def set_module_quantized_tensor_to_device(module, tensor_name, device, value=None, fp16_statistics=None):
    """
    Set a given tensor (parameter or buffer) of a module on a specific device, optionally giving it a new value.

    Doing `param.to(device)` creates a new tensor that is not linked to the module, which is why the entry stored in
    `module._parameters` / `module._buffers` is replaced here instead. The function is adapted from the
    `set_module_tensor_to_device` function of accelerate and additionally supports the `Int8Params` and `Params4bit`
    parameter classes from `bitsandbytes`.

    Args:
        module (`torch.nn.Module`):
            The module in which the tensor we want to move lives.
        tensor_name (`str`):
            The full name of the parameter/buffer. Dotted names are resolved through the submodules of `module`.
        device (`int`, `str` or `torch.device`):
            The device on which to set the tensor.
        value (`torch.Tensor`, *optional*):
            The value of the tensor (useful when going from the meta device to any other device). Anything that is
            not a `torch.Tensor` is converted with `torch.tensor`.
        fp16_statistics (`torch.HalfTensor`, *optional*):
            The list of fp16 statistics to set on the module, used for serialization. When given, it is stored as
            the `SCB` attribute of `module.weight`.

    Raises:
        ValueError: If a submodule on the dotted path does not exist, if `module` has no parameter or buffer named
            `tensor_name`, if the tensor is on the meta device and no `value` is given for a non-meta `device`, or
            if int8 weights are passed while the installed `bitsandbytes` is not newer than 0.37.2.
    """
    # Follow the dotted path so that `module` ends up being the direct owner of the tensor
    if "." in tensor_name:
        *parent_names, tensor_name = tensor_name.split(".")
        for parent_name in parent_names:
            # Default to `None`: a plain `getattr` raises `AttributeError` for a missing submodule, which made the
            # explicit error below unreachable.
            child = getattr(module, parent_name, None)
            if child is None:
                raise ValueError(f"{module} has no attribute {parent_name}.")
            module = child

    if tensor_name not in module._parameters and tensor_name not in module._buffers:
        raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.")
    is_buffer = tensor_name in module._buffers
    old_value = getattr(module, tensor_name)

    if old_value.device == torch.device("meta") and device not in ["meta", torch.device("meta")] and value is None:
        raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {device}.")

    # Only parameters (never buffers) can be bitsandbytes-quantized, and only when the library is installed
    is_4bit = False
    is_8bit = False
    if not is_buffer and is_bitsandbytes_available():
        # `Params4bit` is looked up with `hasattr` because the installed `bitsandbytes` may not provide it
        is_4bit = hasattr(bnb.nn, "Params4bit") and isinstance(module._parameters[tensor_name], bnb.nn.Params4bit)
        is_8bit = isinstance(module._parameters[tensor_name], bnb.nn.Int8Params)

    if is_8bit or is_4bit:
        param = module._parameters[tensor_name]
        # A quantized parameter that already lives on a CUDA device is left untouched
        if param.device.type != "cuda":
            if value is None:
                new_value = old_value.to(device)
            elif isinstance(value, torch.Tensor):
                new_value = value.to("cpu")
                if value.dtype == torch.int8:
                    is_8bit_serializable = version.parse(importlib.metadata.version("bitsandbytes")) > version.parse(
                        "0.37.2"
                    )
                    if not is_8bit_serializable:
                        raise ValueError(
                            "Detected int8 weights but the version of bitsandbytes is not compatible with int8 serialization. "
                            "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`."
                        )
            else:
                new_value = torch.tensor(value, device="cpu")

            # Support models using `Conv1D` in place of `nn.Linear` (e.g. gpt2) by transposing the weight matrix prior to quantization.
            # Since weights are saved in the correct "orientation", we skip transposing when loading.
            if issubclass(module.source_cls, Conv1D) and fp16_statistics is None:
                new_value = new_value.T

            # Carry the attributes of the previous quantized parameter over to the new one
            kwargs = old_value.__dict__
            if is_8bit:
                new_value = bnb.nn.Int8Params(new_value, requires_grad=False, **kwargs).to(device)
            elif is_4bit:
                new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(device)

            module._parameters[tensor_name] = new_value
            if fp16_statistics is not None:
                setattr(module.weight, "SCB", fp16_statistics.to(device))

    else:
        if value is None:
            new_value = old_value.to(device)
        elif isinstance(value, torch.Tensor):
            new_value = value.to(device)
        else:
            new_value = torch.tensor(value, device=device)

        if is_buffer:
            module._buffers[tensor_name] = new_value
        else:
            # Re-wrap as a parameter, keeping the `requires_grad` flag of the tensor being replaced
            new_value = nn.Parameter(new_value, requires_grad=old_value.requires_grad)
            module._parameters[tensor_name] = new_value
|  | ||||
|  | ||||
def _replace_with_bnb_linear(
    model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, has_been_replaced=False
):
    """
    Recursive worker that performs the module replacement for `replace_with_bnb_linear`.

    Each direct child of `model` that is an `nn.Linear` or a `Conv1D` and is not excluded through
    `modules_to_not_convert` is swapped for a `bnb.nn.Linear8bitLt` or a `bnb.nn.Linear4bit`, depending on
    `quantization_config.quantization_method()`. Children that have children of their own are visited recursively.

    Returns the converted model and a boolean that indicates whether at least one module has been replaced.
    """
    if current_key_name is None:
        current_key_name = []

    for name, module in model.named_children():
        # Track the path of the child being visited, e.g. ["decoder", "layers", "0"]
        current_key_name.append(name)

        if isinstance(module, (nn.Linear, Conv1D)) and name not in modules_to_not_convert:
            dotted_key = ".".join(current_key_name)
            # An excluded key also applies when it matches only a part of the dotted path
            if all(key not in dotted_key for key in modules_to_not_convert):
                with init_empty_weights():
                    if isinstance(module, Conv1D):
                        in_features, out_features = module.weight.shape
                    else:
                        in_features, out_features = module.in_features, module.out_features

                    if quantization_config.quantization_method() == "llm_int8":
                        model._modules[name] = bnb.nn.Linear8bitLt(
                            in_features,
                            out_features,
                            module.bias is not None,
                            has_fp16_weights=quantization_config.llm_int8_has_fp16_weight,
                            threshold=quantization_config.llm_int8_threshold,
                        )
                        has_been_replaced = True
                    else:
                        skipped_names = quantization_config.llm_int8_skip_modules
                        if skipped_names is None or name not in skipped_names:
                            model._modules[name] = bnb.nn.Linear4bit(
                                in_features,
                                out_features,
                                module.bias is not None,
                                quantization_config.bnb_4bit_compute_dtype,
                                compress_statistics=quantization_config.bnb_4bit_use_double_quant,
                                quant_type=quantization_config.bnb_4bit_quant_type,
                            )
                            has_been_replaced = True
                    # Remember the class of the original module in case the weight has to be transposed later
                    model._modules[name].source_cls = type(module)
                    # Force requires grad to False to avoid unexpected errors
                    model._modules[name].requires_grad_(False)

        # Descend into the original child whenever it has children of its own
        if next(module.children(), None) is not None:
            _, has_been_replaced = _replace_with_bnb_linear(
                module,
                modules_to_not_convert,
                current_key_name,
                quantization_config,
                has_been_replaced=has_been_replaced,
            )
        # Done with this child: drop its name from the tracked path
        current_key_name.pop()
    return model, has_been_replaced
|  | ||||
|  | ||||
def replace_with_bnb_linear(model, modules_to_not_convert=None, current_key_name=None, quantization_config=None):
    """
    Replace the linear modules of `model` by their `bitsandbytes` quantized counterparts.

    This enables running models in mixed int8 precision as described by the paper `LLM.int8(): 8-bit Matrix
    Multiplication for Transformers at Scale`. Make sure a `bitsandbytes` build matching the CUDA version of your
    hardware is installed before running this function.

    The model is walked recursively by `_replace_with_bnb_linear`, which performs the replacement under the
    `init_empty_weights` context manager. Modules listed in `modules_to_not_convert` (the `lm_head` by default) are
    left untouched. A warning is logged when no module has been replaced at all.

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
            Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical
            stability reasons.
        current_key_name (`List[`str`]`, *optional*):
            An array to track the current key of the recursion. This is used to check whether the current key (part
            of it) is not in the list of modules to not convert (for instance modules that are offloaded to `cpu` or
            `disk`).
        quantization_config (*optional*):
            Quantization settings forwarded to `_replace_with_bnb_linear`, which reads the quantization method and
            the int8 / 4-bit options from it.

    Returns:
        The model that was passed in, with its eligible modules replaced.
    """
    if modules_to_not_convert is None:
        modules_to_not_convert = ["lm_head"]

    model, has_been_replaced = _replace_with_bnb_linear(
        model, modules_to_not_convert, current_key_name, quantization_config
    )
    if has_been_replaced:
        return model

    # Nothing was converted: most likely the architecture contains no linear module at all
    logger.warning(
        "You are loading your model in 8bit or 4bit but no linear modules were found in your model."
        " Please double check your model architecture, or submit an issue on github if you think this is"
        " a bug."
    )
    return model
|  | ||||
|  | ||||
# Kept for backward compatibility
def replace_8bit_linear(*args, **kwargs):
    """Deprecated name of `replace_with_bnb_linear`: emits a `FutureWarning`, then forwards all arguments."""
    deprecation_message = (
        "`replace_8bit_linear` will be deprecated in a future version, please use `replace_with_bnb_linear` instead"
    )
    warnings.warn(deprecation_message, FutureWarning)
    return replace_with_bnb_linear(*args, **kwargs)
|  | ||||
|  | ||||
# Kept for backward compatibility
def set_module_8bit_tensor_to_device(*args, **kwargs):
    """Deprecated name of `set_module_quantized_tensor_to_device`: emits a `FutureWarning`, then forwards all arguments."""
    deprecation_message = "`set_module_8bit_tensor_to_device` will be deprecated in a future version, please use `set_module_quantized_tensor_to_device` instead"
    warnings.warn(deprecation_message, FutureWarning)
    return set_module_quantized_tensor_to_device(*args, **kwargs)
|  | ||||
|  | ||||
def get_keys_to_not_convert(model):
    r"""
    An utility function to get the keys of the modules to keep in full precision, if any. For example for CausalLM
    modules we may want to keep the lm_head in full precision for numerical stability reasons. For other
    architectures, we want to keep the tied weights of the model.

    Parameters:
    model (`torch.nn.Module`):
        Input model. It must provide `tie_weights()` and `get_output_embeddings()`.

    Returns:
        `List[str]`: When the model has no tied weights but defines output embeddings, the names under which the
        output embedding module is registered. Otherwise the names of the tied parameters together with the name of
        the last parameter of the model, each without its trailing `.weight` / `.bias`.
    """
    # Tie the weights on a copy of the model so that the input model is left untouched, then look for tied weights.
    # NOTE(review): the copy is presumably cheap because callers run this under `init_empty_weights` - confirm.
    tied_model = deepcopy(model)
    tied_model.tie_weights()

    tied_params = find_tied_parameters(tied_model)
    # For compatibility with Accelerate < 0.18
    if isinstance(tied_params, dict):
        tied_keys = sum(list(tied_params.values()), []) + list(tied_params.keys())
    else:
        tied_keys = sum(tied_params, [])
    has_tied_params = len(tied_keys) > 0

    # If there are no tied weights, we want to keep the lm_head (output embedding) in full precision
    if not has_tied_params:
        output_emb = model.get_output_embeddings()
        if output_emb is not None:
            list_last_module = [name for name, module in model.named_modules() if id(module) == id(output_emb)]
            return list_last_module

    # Reached when the model has tied weights, or when it has none and no output embedding is defined either:
    # keep the tied weights plus the owner of the very last parameter in full precision
    last_param_name = list(model.named_parameters())[-1][0]
    tied_key_set = set(tied_keys)
    untied_last = {last_param_name} - tied_key_set
    list_untouched = list(tied_key_set) + list(untied_last)

    # Turn parameter names into module names by dropping the trailing ".weight" / ".bias" component only. A plain
    # `str.replace` would also mangle names that merely contain these substrings (e.g. "conv.weight_g").
    suffixes_to_strip = (".weight", ".bias")
    filtered_module_names = []
    for name in list_untouched:
        for suffix in suffixes_to_strip:
            if name.endswith(suffix):
                name = name[: -len(suffix)]
                break
        filtered_module_names.append(name)

    return filtered_module_names
							
								
								
									
										389
									
								
								src/transformers/integrations/deepspeed.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										389
									
								
								src/transformers/integrations/deepspeed.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,389 @@ | ||||
| # Copyright 2020 The HuggingFace Team. All rights reserved. | ||||
| # | ||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| # you may not use this file except in compliance with the License. | ||||
| # You may obtain a copy of the License at | ||||
| # | ||||
| #     http://www.apache.org/licenses/LICENSE-2.0 | ||||
| # | ||||
| # Unless required by applicable law or agreed to in writing, software | ||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
| """ | ||||
| Integration with Deepspeed | ||||
| """ | ||||
|  | ||||
| import importlib.util | ||||
| import weakref | ||||
| from functools import partialmethod | ||||
|  | ||||
| from ..dependency_versions_check import dep_version_check | ||||
| from ..utils import is_accelerate_available, is_torch_available, logging | ||||
|  | ||||
|  | ||||
| if is_torch_available(): | ||||
|     import torch | ||||
|  | ||||
| logger = logging.get_logger(__name__) | ||||
|  | ||||
|  | ||||
def is_deepspeed_available():
    """Return `True` when an import spec can be found for the `deepspeed` package, `False` otherwise."""
    deepspeed_spec = importlib.util.find_spec("deepspeed")
    return deepspeed_spec is not None
|  | ||||
|  | ||||
| if is_accelerate_available() and is_deepspeed_available(): | ||||
|     from accelerate.utils.deepspeed import HfDeepSpeedConfig as DeepSpeedConfig | ||||
| else: | ||||
|     # Inherits from a dummy `object` if accelerate is not available, so that python succeeds to import this file. | ||||
|     # Deepspeed glue code will never inherit this dummy object as it checks if accelerate is available. | ||||
|     from builtins import object as DeepSpeedConfig | ||||
|  | ||||
|  | ||||
class HfDeepSpeedConfig(DeepSpeedConfig):
    """
    Holds a DeepSpeed configuration dictionary and can be quickly queried for things like the zero stage.

    On creation, a `weakref` to the instance is stored in this module's globals, so that the configuration can be
    reached from places where objects like the Trainer are not available (e.g. `from_pretrained` and
    `_get_resized_embeddings`). Because only a weak reference is kept, the caller has to keep this object alive for
    as long as the program needs the configuration.

    [`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to sync the
    configuration with values of [`TrainingArguments`] by replacing special placeholder values: `"auto"`. Without
    this special logic the DeepSpeed configuration is not modified in any way.

    Args:
        config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.

    """

    def __init__(self, config_file_or_dict):
        # Publish the instance through the module-level weakref before doing anything else
        set_hf_deepspeed_config(self)
        # Both integrations must be installed in a supported version
        for package_name in ("accelerate", "deepspeed"):
            dep_version_check(package_name)
        super().__init__(config_file_or_dict)
|  | ||||
|  | ||||
class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
    """
    The `HfTrainerDeepSpeedConfig` object is meant to be created during `TrainingArguments` object creation and has the
    same lifespan as the latter.

    On top of `HfDeepSpeedConfig` it replaces the `"auto"` placeholders of the DeepSpeed configuration with values
    taken from `TrainingArguments` (`trainer_config_process`) and from the model (`trainer_config_finalize`), and
    collects the entries that contradict `TrainingArguments` in `self.mismatches`.
    """

    def __init__(self, config_file_or_dict):
        super().__init__(config_file_or_dict)
        # Set by `trainer_config_process`
        self._dtype = None
        # Human-readable descriptions of DeepSpeed values that disagree with `TrainingArguments`
        self.mismatches = []

    def dtype(self):
        """
        Return the torch dtype selected by `trainer_config_process`.

        Raises:
            ValueError: If `trainer_config_process` has not been called yet.
        """
        if self._dtype is None:
            raise ValueError("trainer_config_process() wasn't called yet to tell dtype")
        return self._dtype

    def is_auto(self, ds_key_long):
        """Return `True` if the config entry `ds_key_long` (dotted key) holds the placeholder `"auto"`."""
        # A missing entry (`None`) compares unequal to "auto", so no separate check is needed
        return self.get_value(ds_key_long) == "auto"

    def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True):
        """
        A utility method that massages the config file and can optionally verify that the values match.

        1. Replace "auto" values with `TrainingArguments` value.

        2. If it wasn't "auto" and `must_match` is true, then check that DS config matches Trainer
        config values and if mismatched add the entry to `self.mismatches` - `trainer_config_finalize` will raise
        for one or more mismatches.

        Args:
            ds_key_long (`str`): Dotted key of the DeepSpeed config entry, e.g. `"optimizer.params.lr"`.
            hf_val: The value to use for an `"auto"` entry and to compare a concrete entry against.
            hf_key (`str`, *optional*): Name of the matching `TrainingArguments` value, only used in the mismatch
                report.
            must_match (`bool`, *optional*, defaults to `True`): Whether a concrete entry that differs from
                `hf_val` is recorded as a mismatch.
        """
        config, ds_key = self.find_config_node(ds_key_long)
        # Nothing to fill or to check when no config node was found for this key
        if config is None:
            return

        if config.get(ds_key) == "auto":
            config[ds_key] = hf_val
            return

        if not must_match:
            return

        ds_val = config.get(ds_key)
        if ds_val is not None and ds_val != hf_val:
            self.mismatches.append(f"- ds {ds_key_long}={ds_val} vs hf {hf_key}={hf_val}")

    # Same as `fill_match`, but a concrete value that differs from `hf_val` is never recorded as a mismatch
    fill_only = partialmethod(fill_match, must_match=False)

    def trainer_config_process(self, args):
        """
        Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
        creation.

        Besides filling the config, it selects the dtype that `dtype()` returns afterwards.
        """
        # DeepSpeed does:
        # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps
        train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps
        self.fill_match(
            "train_micro_batch_size_per_gpu", args.per_device_train_batch_size, "per_device_train_batch_size"
        )
        self.fill_match("gradient_accumulation_steps", args.gradient_accumulation_steps, "gradient_accumulation_steps")
        self.fill_match("train_batch_size", train_batch_size, "train_batch_size (calculated)")
        self.fill_match("gradient_clipping", args.max_grad_norm, "max_grad_norm")

        self.fill_match("optimizer.params.lr", args.learning_rate, "learning_rate")
        self.fill_match("optimizer.params.betas", [args.adam_beta1, args.adam_beta2], "adam_beta1+adam_beta2")
        self.fill_match("optimizer.params.eps", args.adam_epsilon, "adam_epsilon")
        self.fill_match("optimizer.params.weight_decay", args.weight_decay, "weight_decay")

        self.fill_only("scheduler.params.warmup_min_lr", 0)  # not a trainer arg
        self.fill_match("scheduler.params.warmup_max_lr", args.learning_rate, "learning_rate")
        # total_num_steps - will get set in trainer_config_finalize

        # fp16
        if args.fp16 or args.fp16_full_eval:
            fp16_backend = "apex" if args.fp16_backend == "apex" else "amp"
        else:
            fp16_backend = None

        if args.save_on_each_node:
            # deepspeed uses shared storage by default. Let's override this setting if save_on_each_node == True
            self.config["checkpoint"] = self.config.get("checkpoint", {})
            self.config["checkpoint"]["use_node_local_storage"] = args.save_on_each_node

        # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set
        # any here unless the user did the work
        self.fill_match(
            "fp16.enabled",
            ((args.fp16 or args.fp16_full_eval) and fp16_backend == "amp"),
            "fp16|fp16_full_eval+fp16_backend(amp)",
        )

        # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any
        # ZeRO features
        self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)")
        self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level")

        self.fill_match("bf16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval")

        # deepspeed's default mode is fp16 unless there is a config that says differently
        if self.is_true("bf16.enabled"):
            self._dtype = torch.bfloat16
        elif self.is_false("fp16.enabled"):
            self._dtype = torch.float32
        else:
            self._dtype = torch.float16

    def trainer_config_finalize(self, args, model, num_training_steps):
        """
        This stage is run after we have the model and know num_training_steps.

        Now we can complete the configuration process.

        Raises:
            ValueError: If an `auto` entry depends on the hidden size but the model config has neither `hidden_size`
                nor `hidden_sizes`, or if any DeepSpeed value mismatches `TrainingArguments`.
        """
        # zero

        # deal with config keys that use `auto` value and rely on model's hidden_size
        hidden_size_based_keys = [
            "zero_optimization.reduce_bucket_size",
            "zero_optimization.stage3_prefetch_bucket_size",
            "zero_optimization.stage3_param_persistence_threshold",
        ]
        hidden_size_auto_keys = [x for x in hidden_size_based_keys if self.is_auto(x)]

        if len(hidden_size_auto_keys) > 0:
            if hasattr(model.config, "hidden_size"):
                hidden_size = model.config.hidden_size
            elif hasattr(model.config, "hidden_sizes"):
                # if there are many hidden sizes pick the largest one
                hidden_size = max(model.config.hidden_sizes)
            else:
                raise ValueError(
                    "The model's config file has neither `hidden_size` nor `hidden_sizes` entry, "
                    "therefore it's not possible to automatically fill out the following `auto` entries "
                    f"in the DeepSpeed config file: {hidden_size_auto_keys}. You can fix that by replacing "
                    "`auto` values for these keys with an integer value of your choice."
                )

            self.fill_only("zero_optimization.reduce_bucket_size", hidden_size * hidden_size)
            if self.is_zero3():
                # automatically assign the optimal config values based on model config
                # the bucket size is an integer setting, so the scaled (float) product is truncated to an int
                self.fill_only("zero_optimization.stage3_prefetch_bucket_size", int(0.9 * hidden_size * hidden_size))
                self.fill_only("zero_optimization.stage3_param_persistence_threshold", 10 * hidden_size)

        # scheduler
        self.fill_match("scheduler.params.total_num_steps", num_training_steps, "num_training_steps (calculated)")
        self.fill_match("scheduler.params.warmup_num_steps", args.get_warmup_steps(num_training_steps), "warmup_steps")

        # Report every mismatch collected by `fill_match` in one go
        if len(self.mismatches) > 0:
            mismatches = "\n".join(self.mismatches)
            raise ValueError(
                "Please correct the following DeepSpeed config values that mismatch TrainingArguments"
                f" values:\n{mismatches}\nThe easiest method is to set these DeepSpeed config values to 'auto'."
            )
|  | ||||
|  | ||||
| # keep the config object global to be able to access it anywhere during TrainingArguments life-cycle | ||||
| # holds either `None` or the `weakref.ref` stored by `set_hf_deepspeed_config`, so this global on its own never | ||||
| # keeps the config object alive | ||||
| _hf_deepspeed_config_weak_ref = None | ||||
|  | ||||
|  | ||||
| def set_hf_deepspeed_config(hf_deepspeed_config_obj): | ||||
|     """ | ||||
|     Register `hf_deepspeed_config_obj` as the module-wide DeepSpeed config, stored as a weak reference. | ||||
|  | ||||
|     Args: | ||||
|         hf_deepspeed_config_obj: the config object later reachable through `is_deepspeed_zero3_enabled` and | ||||
|             `deepspeed_config`. Only a `weakref.ref` to it is stored, so the caller has to keep it alive. | ||||
|     """ | ||||
|     # this is a special weakref global object to allow us to get to Deepspeed config from APIs | ||||
|     # that don't have an easy way to get to the Deepspeed config outside of the Trainer domain. | ||||
|     global _hf_deepspeed_config_weak_ref | ||||
|     # will go away automatically when HfDeepSpeedConfig is destroyed (when TrainingArguments is destroyed) | ||||
|     _hf_deepspeed_config_weak_ref = weakref.ref(hf_deepspeed_config_obj) | ||||
|  | ||||
|  | ||||
| def unset_hf_deepspeed_config(): | ||||
|     """ | ||||
|     Drop the module-wide DeepSpeed config reference by resetting the global back to `None`. | ||||
|     """ | ||||
|     # useful for unit tests to ensure the global state doesn't leak - call from `tearDown` method | ||||
|     global _hf_deepspeed_config_weak_ref | ||||
|     _hf_deepspeed_config_weak_ref = None | ||||
|  | ||||
|  | ||||
| def is_deepspeed_zero3_enabled(): | ||||
|     if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: | ||||
|         return _hf_deepspeed_config_weak_ref().is_zero3() | ||||
|     else: | ||||
|         return False | ||||
|  | ||||
|  | ||||
| def deepspeed_config(): | ||||
|     if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: | ||||
|         return _hf_deepspeed_config_weak_ref().config | ||||
|     else: | ||||
|         return None | ||||
|  | ||||
|  | ||||
| def deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps, model_parameters): | ||||
|     """ | ||||
|     A convenience wrapper that deals with optimizer and lr scheduler configuration. | ||||
|     """ | ||||
|     from accelerate.utils import DummyOptim, DummyScheduler | ||||
|  | ||||
|     config = hf_deepspeed_config.config | ||||
|  | ||||
|     # Optimizer + Scheduler | ||||
|     # Currently supported combos: | ||||
|     # 1. DS scheduler + DS optimizer: Yes | ||||
|     # 2. HF scheduler + HF optimizer: Yes | ||||
|     # 3. DS scheduler + HF optimizer: Yes | ||||
|     # 4. HF scheduler + DS optimizer: No | ||||
|     # | ||||
|     # Unless Offload is enabled in which case it's: | ||||
|     # 1. DS scheduler + DS optimizer: Yes | ||||
|     # 2. HF scheduler + HF optimizer: Mostly* | ||||
|     # 3. DS scheduler + HF optimizer: Mostly* | ||||
|     # 4. HF scheduler + DS optimizer: No | ||||
|     # | ||||
|     # Mostly*: All non-native DeepSpeed optimizers that have both CPU and GPU implementation should work (except LAMB) | ||||
|  | ||||
|     optimizer = None | ||||
|     if "optimizer" in config: | ||||
|         if args.adafactor: | ||||
|             raise ValueError( | ||||
|                 "--adafactor was passed, but also found `optimizer` configured in the DeepSpeed config. " | ||||
|                 "Only one optimizer can be configured." | ||||
|             ) | ||||
|         optimizer = DummyOptim(params=model_parameters) | ||||
|     else: | ||||
|         if hf_deepspeed_config.is_offload(): | ||||
|             logger.info( | ||||
|                 "Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the" | ||||
|                 " custom optimizer has both CPU and GPU implementation (except LAMB)" | ||||
|             ) | ||||
|  | ||||
|         # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. | ||||
|         # But trainer uses AdamW by default. | ||||
|         optimizer = trainer.create_optimizer() | ||||
|         # To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer` | ||||
|         config["zero_allow_untested_optimizer"] = True | ||||
|  | ||||
|     lr_scheduler = None | ||||
|     if "scheduler" in config: | ||||
|         lr_scheduler = DummyScheduler(optimizer) | ||||
|     else: | ||||
|         if isinstance(optimizer, DummyOptim): | ||||
|             raise ValueError( | ||||
|                 "Found `optimizer` configured in the DeepSpeed config, but no `scheduler`. " | ||||
|                 "Please configure a scheduler in the DeepSpeed config." | ||||
|             ) | ||||
|         lr_scheduler = trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) | ||||
|  | ||||
|     return optimizer, lr_scheduler | ||||
|  | ||||
|  | ||||
| def deepspeed_init(trainer, num_training_steps, inference=False): | ||||
|     """ | ||||
|     Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. | ||||
|  | ||||
|     If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made. | ||||
|  | ||||
|     Args: | ||||
|         trainer: Trainer object | ||||
|         num_training_steps: per single gpu | ||||
|         resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load | ||||
|         inference: launch in inference mode (no optimizer and no lr scheduler) | ||||
|  | ||||
|     Returns: optimizer, lr_scheduler | ||||
|  | ||||
|     We may use `deepspeed_init` more than once during the life of Trainer, when we do - it's a temp hack based on: | ||||
|     https://github.com/microsoft/DeepSpeed/issues/1394#issuecomment-937405374 until Deepspeed fixes a bug where it | ||||
|     can't resume from a checkpoint after it did some stepping https://github.com/microsoft/DeepSpeed/issues/1612 | ||||
|  | ||||
|     """ | ||||
|     from deepspeed.utils import logger as ds_logger | ||||
|  | ||||
|     model = trainer.model | ||||
|     args = trainer.args | ||||
|  | ||||
|     hf_deepspeed_config = trainer.accelerator.state.deepspeed_plugin.hf_ds_config | ||||
|  | ||||
|     # resume config update - some bits like `model` and `num_training_steps` only become available during train | ||||
|     hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps) | ||||
|  | ||||
|     # set the Deepspeed log level consistent with the Trainer | ||||
|     ds_logger.setLevel(args.get_process_log_level()) | ||||
|  | ||||
|     if inference: | ||||
|         # only Z3 makes sense for the inference | ||||
|         if not hf_deepspeed_config.is_zero3(): | ||||
|             raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config") | ||||
|  | ||||
|         # in case the training config is re-used for inference | ||||
|         hf_deepspeed_config.del_config_sub_tree("optimizer") | ||||
|         hf_deepspeed_config.del_config_sub_tree("lr_scheduler") | ||||
|         optimizer, lr_scheduler = None, None | ||||
|         model_parameters = None | ||||
|     else: | ||||
|         trainer.optimizer = None  # important for when deepspeed_init is used as re-init | ||||
|         model_parameters = list(filter(lambda p: p.requires_grad, model.parameters())) | ||||
|         optimizer, lr_scheduler = deepspeed_optim_sched( | ||||
|             trainer, hf_deepspeed_config, args, num_training_steps, model_parameters | ||||
|         ) | ||||
|  | ||||
|     # keep for quick debug: | ||||
|     # from pprint import pprint; pprint(config) | ||||
|  | ||||
|     return optimizer, lr_scheduler | ||||
|  | ||||
|  | ||||
| def deepspeed_load_checkpoint(deepspeed_engine, checkpoint_path): | ||||
|     # it's possible that the user is trying to resume from model_path, which doesn't necessarily | ||||
|     # contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's | ||||
|     # a resume from a checkpoint and not just a local pretrained weight. So we check here if the | ||||
|     # path contains what looks like a deepspeed checkpoint | ||||
|     import glob | ||||
|  | ||||
|     deepspeed_checkpoint_dirs = sorted(glob.glob(f"{checkpoint_path}/global_step*")) | ||||
|  | ||||
|     if len(deepspeed_checkpoint_dirs) > 0: | ||||
|         logger.info(f"Attempting to resume from {checkpoint_path}") | ||||
|         # this magically updates self.optimizer and self.lr_scheduler | ||||
|         load_path, _ = deepspeed_engine.load_checkpoint( | ||||
|             checkpoint_path, load_optimizer_states=True, load_lr_scheduler_states=True | ||||
|         ) | ||||
|         if load_path is None: | ||||
|             raise ValueError(f"[deepspeed] failed to resume from checkpoint {checkpoint_path}") | ||||
|     else: | ||||
|         raise ValueError(f"Can't find a valid checkpoint at {checkpoint_path}") | ||||
| @ -30,8 +30,8 @@ from typing import TYPE_CHECKING, Dict, Optional | ||||
| 
 | ||||
| import numpy as np | ||||
| 
 | ||||
| from . import __version__ as version | ||||
| from .utils import flatten_dict, is_datasets_available, is_pandas_available, is_torch_available, logging | ||||
| from .. import __version__ as version | ||||
| from ..utils import flatten_dict, is_datasets_available, is_pandas_available, is_torch_available, logging | ||||
| 
 | ||||
| 
 | ||||
| logger = logging.get_logger(__name__) | ||||
| @ -68,10 +68,10 @@ if TYPE_CHECKING and _has_neptune: | ||||
|         except importlib.metadata.PackageNotFoundError: | ||||
|             _has_neptune = False | ||||
| 
 | ||||
| from .trainer_callback import ProgressCallback, TrainerCallback  # noqa: E402 | ||||
| from .trainer_utils import PREFIX_CHECKPOINT_DIR, BestRun, IntervalStrategy  # noqa: E402 | ||||
| from .training_args import ParallelMode  # noqa: E402 | ||||
| from .utils import ENV_VARS_TRUE_VALUES, is_torch_tpu_available  # noqa: E402 | ||||
| from ..trainer_callback import ProgressCallback, TrainerCallback  # noqa: E402 | ||||
| from ..trainer_utils import PREFIX_CHECKPOINT_DIR, BestRun, IntervalStrategy  # noqa: E402 | ||||
| from ..training_args import ParallelMode  # noqa: E402 | ||||
| from ..utils import ENV_VARS_TRUE_VALUES, is_torch_tpu_available  # noqa: E402 | ||||
| 
 | ||||
| 
 | ||||
| # Integration functions: | ||||
| @ -14,7 +14,7 @@ | ||||
| import inspect | ||||
| from typing import Optional | ||||
| 
 | ||||
| from ...utils import ( | ||||
| from ..utils import ( | ||||
|     check_peft_version, | ||||
|     find_adapter_config_file, | ||||
|     is_accelerate_available, | ||||
| @ -1,14 +0,0 @@ | ||||
| # Copyright 2023 The HuggingFace Team. All rights reserved. | ||||
| # | ||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| # you may not use this file except in compliance with the License. | ||||
| # You may obtain a copy of the License at | ||||
| # | ||||
| #     http://www.apache.org/licenses/LICENSE-2.0 | ||||
| # | ||||
| # Unless required by applicable law or agreed to in writing, software | ||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
| from .peft import PeftAdapterMixin | ||||
| @ -1,15 +0,0 @@ | ||||
| # Copyright 2023 The HuggingFace Team. All rights reserved. | ||||
| # | ||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| # you may not use this file except in compliance with the License. | ||||
| # You may obtain a copy of the License at | ||||
| # | ||||
| #     http://www.apache.org/licenses/LICENSE-2.0 | ||||
| # | ||||
| # Unless required by applicable law or agreed to in writing, software | ||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
|  | ||||
| from .peft_mixin import PeftAdapterMixin | ||||
| @ -35,10 +35,9 @@ from torch.nn import CrossEntropyLoss | ||||
|  | ||||
| from .activations import get_activation | ||||
| from .configuration_utils import PretrainedConfig | ||||
| from .deepspeed import deepspeed_config, is_deepspeed_zero3_enabled | ||||
| from .dynamic_module_utils import custom_object_save | ||||
| from .generation import GenerationConfig, GenerationMixin | ||||
| from .lib_integrations import PeftAdapterMixin | ||||
| from .integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled | ||||
| from .pytorch_utils import (  # noqa: F401 | ||||
|     Conv1D, | ||||
|     apply_chunking_to_forward, | ||||
| @ -661,7 +660,7 @@ def _load_state_dict_into_meta_model( | ||||
|     #   they won't get loaded. | ||||
|  | ||||
|     if is_quantized: | ||||
|         from .utils.bitsandbytes import set_module_quantized_tensor_to_device | ||||
|         from .integrations import set_module_quantized_tensor_to_device | ||||
|  | ||||
|     error_msgs = [] | ||||
|  | ||||
| @ -2944,7 +2943,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix | ||||
|             keep_in_fp32_modules = [] | ||||
|  | ||||
|         if load_in_8bit or load_in_4bit: | ||||
|             from .utils.bitsandbytes import get_keys_to_not_convert, replace_with_bnb_linear | ||||
|             from .integrations import get_keys_to_not_convert, replace_with_bnb_linear | ||||
|  | ||||
|             llm_int8_skip_modules = quantization_config.llm_int8_skip_modules | ||||
|             load_in_8bit_fp32_cpu_offload = quantization_config.llm_int8_enable_fp32_cpu_offload | ||||
| @ -3262,7 +3261,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix | ||||
|     ): | ||||
|         is_safetensors = False | ||||
|         if is_quantized: | ||||
|             from .utils.bitsandbytes import set_module_quantized_tensor_to_device | ||||
|             from .integrations import set_module_quantized_tensor_to_device | ||||
|  | ||||
|         if device_map is not None and "disk" in device_map.values(): | ||||
|             archive_file = ( | ||||
|  | ||||
| @ -25,7 +25,7 @@ from torch import nn | ||||
| from torch.nn import CrossEntropyLoss | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...modeling_outputs import ( | ||||
|     BaseModelOutput, | ||||
|     CausalLMOutput, | ||||
|  | ||||
| @ -23,8 +23,8 @@ import torch.utils.checkpoint | ||||
| from torch import nn | ||||
|  | ||||
| from ....activations import ACT2FN | ||||
| from ....deepspeed import is_deepspeed_zero3_enabled | ||||
| from ....file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward | ||||
| from ....integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ....modeling_outputs import BaseModelOutput, CausalLMOutput | ||||
| from ....modeling_utils import ( | ||||
|     PreTrainedModel, | ||||
|  | ||||
| @ -29,7 +29,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss | ||||
|  | ||||
| from ...activations import get_activation | ||||
| from ...configuration_utils import PretrainedConfig | ||||
| from ...deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...modeling_outputs import ( | ||||
|     BaseModelOutput, | ||||
|     MaskedLMOutput, | ||||
|  | ||||
| @ -23,7 +23,7 @@ import torch | ||||
| import torch.nn as nn | ||||
| from torch.nn import LayerNorm | ||||
|  | ||||
| from ...deepspeed import is_deepspeed_available | ||||
| from ...integrations.deepspeed import is_deepspeed_available | ||||
| from ...modeling_outputs import ModelOutput | ||||
| from ...utils import ( | ||||
|     ContextManagers, | ||||
|  | ||||
| @ -35,7 +35,7 @@ from torch import Tensor, nn | ||||
| from torch.nn import CrossEntropyLoss, LayerNorm | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...modeling_outputs import ( | ||||
|     BaseModelOutput, | ||||
|     BaseModelOutputWithPastAndCrossAttentions, | ||||
|  | ||||
| @ -24,7 +24,7 @@ from torch import nn | ||||
| from torch.nn import CrossEntropyLoss | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput | ||||
| from ...modeling_utils import PreTrainedModel | ||||
| from ...utils import ( | ||||
|  | ||||
| @ -23,7 +23,7 @@ from torch import nn | ||||
| from torch.nn import CrossEntropyLoss | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...modeling_outputs import ( | ||||
|     BaseModelOutput, | ||||
|     BaseModelOutputWithPastAndCrossAttentions, | ||||
|  | ||||
| @ -24,7 +24,7 @@ from torch.nn import CrossEntropyLoss | ||||
| from torch.utils.checkpoint import checkpoint | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...modeling_outputs import ( | ||||
|     MoEModelOutput, | ||||
|     MoEModelOutputWithPastAndCrossAttentions, | ||||
|  | ||||
| @ -25,7 +25,7 @@ from torch import nn | ||||
| from torch.nn import CrossEntropyLoss | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput | ||||
| from ...modeling_utils import PreTrainedModel | ||||
| from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging | ||||
|  | ||||
| @ -26,7 +26,7 @@ from torch import nn | ||||
| from torch.nn import CrossEntropyLoss, LayerNorm | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput | ||||
| from ...modeling_utils import PreTrainedModel | ||||
| from ...pytorch_utils import softmax_backward_data | ||||
|  | ||||
| @ -25,7 +25,7 @@ from torch import nn | ||||
| from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, L1Loss | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...modeling_outputs import ( | ||||
|     BaseModelOutput, | ||||
|     BaseModelOutputWithPastAndCrossAttentions, | ||||
|  | ||||
| @ -26,7 +26,7 @@ from torch import nn | ||||
| from torch.nn import CrossEntropyLoss | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, Wav2Vec2BaseModelOutput | ||||
| from ...modeling_utils import PreTrainedModel | ||||
| from ...utils import ( | ||||
|  | ||||
| @ -26,7 +26,7 @@ from torch import nn | ||||
| from torch.nn import CrossEntropyLoss | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...modeling_outputs import ( | ||||
|     BaseModelOutput, | ||||
|     CausalLMOutput, | ||||
|  | ||||
| @ -26,7 +26,7 @@ from torch import nn | ||||
| from torch.nn import CrossEntropyLoss | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...modeling_outputs import ( | ||||
|     BaseModelOutput, | ||||
|     CausalLMOutput, | ||||
|  | ||||
| @ -25,7 +25,7 @@ from torch import nn | ||||
| from torch.nn import CrossEntropyLoss | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...modeling_outputs import ( | ||||
|     BaseModelOutput, | ||||
|     CausalLMOutput, | ||||
|  | ||||
| @ -26,7 +26,7 @@ from torch import nn | ||||
| from torch.nn import CrossEntropyLoss | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from ...modeling_outputs import ( | ||||
|     BaseModelOutput, | ||||
|     CausalLMOutput, | ||||
|  | ||||
| @ -40,7 +40,6 @@ import requests | ||||
|  | ||||
| from transformers import logging as transformers_logging | ||||
|  | ||||
| from .deepspeed import is_deepspeed_available | ||||
| from .integrations import ( | ||||
|     is_clearml_available, | ||||
|     is_fairscale_available, | ||||
| @ -49,6 +48,7 @@ from .integrations import ( | ||||
|     is_sigopt_available, | ||||
|     is_wandb_available, | ||||
| ) | ||||
| from .integrations.deepspeed import is_deepspeed_available | ||||
| from .utils import ( | ||||
|     is_accelerate_available, | ||||
|     is_apex_available, | ||||
|  | ||||
| @ -58,9 +58,9 @@ from . import __version__ | ||||
| from .configuration_utils import PretrainedConfig | ||||
| from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator | ||||
| from .debug_utils import DebugOption, DebugUnderflowOverflow | ||||
| from .deepspeed import deepspeed_init, deepspeed_load_checkpoint | ||||
| from .dependency_versions_check import dep_version_check | ||||
| from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend | ||||
| from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint | ||||
| from .modelcard import TrainingSummary | ||||
| from .modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model | ||||
| from .models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_MAPPING_NAMES | ||||
| @ -1197,7 +1197,7 @@ class Trainer: | ||||
|             # Rebuild the deepspeed config to reflect the updated training parameters | ||||
|             from accelerate.utils import DeepSpeedPlugin | ||||
|  | ||||
|             from transformers.deepspeed import HfTrainerDeepSpeedConfig | ||||
|             from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig | ||||
|  | ||||
|             self.args.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.args.deepspeed) | ||||
|             self.args.hf_deepspeed_config.trainer_config_process(self.args) | ||||
| @ -3899,7 +3899,7 @@ class Trainer: | ||||
|  | ||||
|         if self.is_deepspeed_enabled: | ||||
|             if getattr(self.args, "hf_deepspeed_config", None) is None: | ||||
|                 from transformers.deepspeed import HfTrainerDeepSpeedConfig | ||||
|                 from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig | ||||
|  | ||||
|                 ds_plugin = self.accelerator.state.deepspeed_plugin | ||||
|  | ||||
|  | ||||
| @ -35,7 +35,7 @@ from torch import nn | ||||
| from torch.utils.data import Dataset, IterableDataset, RandomSampler, Sampler | ||||
| from torch.utils.data.distributed import DistributedSampler | ||||
|  | ||||
| from .deepspeed import is_deepspeed_zero3_enabled | ||||
| from .integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from .tokenization_utils_base import BatchEncoding | ||||
| from .utils import is_sagemaker_mp_enabled, is_torch_tpu_available, is_training_run_on_sagemaker, logging | ||||
|  | ||||
|  | ||||
| @ -20,8 +20,8 @@ import torch | ||||
| from torch import nn | ||||
| from torch.utils.data import Dataset | ||||
|  | ||||
| from .deepspeed import is_deepspeed_zero3_enabled | ||||
| from .generation.configuration_utils import GenerationConfig | ||||
| from .integrations.deepspeed import is_deepspeed_zero3_enabled | ||||
| from .trainer import Trainer | ||||
| from .utils import logging | ||||
|  | ||||
|  | ||||
| @ -1647,7 +1647,7 @@ class TrainingArguments: | ||||
|             # - must be run before the model is created. | ||||
|             if not is_accelerate_available(): | ||||
|                 raise ValueError("--deepspeed requires Accelerate to be installed: `pip install accelerate`.") | ||||
|             from transformers.deepspeed import HfTrainerDeepSpeedConfig | ||||
|             from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig | ||||
|  | ||||
|             # will be used later by the Trainer | ||||
|             # note: leave self.deepspeed unmodified in case a user relies on it not to be modified) | ||||
|  | ||||
| @ -1,291 +1,28 @@ | ||||
| import importlib.metadata | ||||
| # Copyright 2023 The HuggingFace Team. All rights reserved. | ||||
| # | ||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| # you may not use this file except in compliance with the License. | ||||
| # You may obtain a copy of the License at | ||||
| # | ||||
| #     http://www.apache.org/licenses/LICENSE-2.0 | ||||
| # | ||||
| # Unless required by applicable law or agreed to in writing, software | ||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
| import warnings | ||||
| from copy import deepcopy | ||||
|  | ||||
| from packaging import version | ||||
|  | ||||
| from ..utils import logging | ||||
| from .import_utils import is_accelerate_available, is_bitsandbytes_available | ||||
|  | ||||
|  | ||||
| if is_bitsandbytes_available(): | ||||
|     import bitsandbytes as bnb | ||||
|     import torch | ||||
|     import torch.nn as nn | ||||
| warnings.warn( | ||||
|     "transformers.utils.bitsandbytes module is deprecated and will be removed in a future version. Please import bitsandbytes modules directly from transformers.integrations", | ||||
|     FutureWarning, | ||||
| ) | ||||
|  | ||||
|     from ..pytorch_utils import Conv1D | ||||
|  | ||||
| if is_accelerate_available(): | ||||
|     from accelerate import init_empty_weights | ||||
|     from accelerate.utils import find_tied_parameters | ||||
|  | ||||
| logger = logging.get_logger(__name__) | ||||
|  | ||||
|  | ||||
| def set_module_quantized_tensor_to_device(module, tensor_name, device, value=None, fp16_statistics=None): | ||||
|     """ | ||||
|     A helper function to set a given tensor (parameter or buffer) of a module on a specific device (note that doing | ||||
|     `param.to(device)` creates a new tensor not linked to the parameter, which is why we need this function). The | ||||
|     function is adapted from `set_module_tensor_to_device` function from accelerate that is adapted to support the | ||||
|     class `Int8Params` from `bitsandbytes`. | ||||
|  | ||||
|     Args: | ||||
|         module (`torch.nn.Module`): | ||||
|             The module in which the tensor we want to move lives. | ||||
|         tensor_name (`str`): | ||||
|             The full name of the parameter/buffer. | ||||
|         device (`int`, `str` or `torch.device`): | ||||
|             The device on which to set the tensor. | ||||
|         value (`torch.Tensor`, *optional*): | ||||
|             The value of the tensor (useful when going from the meta device to any other device). | ||||
|         fp16_statistics (`torch.HalfTensor`, *optional*): | ||||
|             The list of fp16 statistics to set on the module, used for serialization. | ||||
|  | ||||
|     Raises: | ||||
|         ValueError: if a sub-module named in `tensor_name` resolves to `None`, if the last component of | ||||
|             `tensor_name` is neither a parameter nor a buffer of the resolved module, if the tensor is on the meta | ||||
|             device and no `value` is given to move it to another device, or if int8 weights are passed while the | ||||
|             installed `bitsandbytes` version is not above 0.37.2. | ||||
|     """ | ||||
|     # Recurse if needed | ||||
|     if "." in tensor_name: | ||||
|         splits = tensor_name.split(".") | ||||
|         for split in splits[:-1]: | ||||
|             new_module = getattr(module, split) | ||||
|             if new_module is None: | ||||
|                 raise ValueError(f"{module} has no attribute {split}.") | ||||
|             module = new_module | ||||
|         tensor_name = splits[-1] | ||||
|  | ||||
|     if tensor_name not in module._parameters and tensor_name not in module._buffers: | ||||
|         raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") | ||||
|     is_buffer = tensor_name in module._buffers | ||||
|     old_value = getattr(module, tensor_name) | ||||
|  | ||||
|     # a tensor on the meta device can only be moved to another device when a `value` is supplied | ||||
|     if old_value.device == torch.device("meta") and device not in ["meta", torch.device("meta")] and value is None: | ||||
|         raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {device}.") | ||||
|  | ||||
|     is_4bit = False | ||||
|     is_8bit = False | ||||
|     # buffers are never treated as quantized parameters; without bitsandbytes both flags stay False | ||||
|     if is_buffer or not is_bitsandbytes_available(): | ||||
|         is_8bit = False | ||||
|         is_4bit = False | ||||
|     else: | ||||
|         # `Params4bit` is looked up with `hasattr` first - presumably it is missing from some bitsandbytes versions | ||||
|         is_4bit = hasattr(bnb.nn, "Params4bit") and isinstance(module._parameters[tensor_name], bnb.nn.Params4bit) | ||||
|         is_8bit = isinstance(module._parameters[tensor_name], bnb.nn.Int8Params) | ||||
|  | ||||
|     if is_8bit or is_4bit: | ||||
|         param = module._parameters[tensor_name] | ||||
|         # a quantized parameter that already is on a cuda device is left untouched (there is no `else` branch) | ||||
|         if param.device.type != "cuda": | ||||
|             if value is None: | ||||
|                 new_value = old_value.to(device) | ||||
|             elif isinstance(value, torch.Tensor): | ||||
|                 # staged on cpu here, the move to `device` happens when the bnb parameter is rebuilt below | ||||
|                 new_value = value.to("cpu") | ||||
|                 if value.dtype == torch.int8: | ||||
|                     is_8bit_serializable = version.parse(importlib.metadata.version("bitsandbytes")) > version.parse( | ||||
|                         "0.37.2" | ||||
|                     ) | ||||
|                     if not is_8bit_serializable: | ||||
|                         raise ValueError( | ||||
|                             "Detected int8 weights but the version of bitsandbytes is not compatible with int8 serialization. " | ||||
|                             "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`." | ||||
|                         ) | ||||
|             else: | ||||
|                 new_value = torch.tensor(value, device="cpu") | ||||
|  | ||||
|             # Support models using `Conv1D` in place of `nn.Linear` (e.g. gpt2) by transposing the weight matrix prior to quantization. | ||||
|             # Since weights are saved in the correct "orientation", we skip transposing when loading. | ||||
|             if issubclass(module.source_cls, Conv1D) and fp16_statistics is None: | ||||
|                 new_value = new_value.T | ||||
|  | ||||
|             # the attributes of the old parameter are forwarded as keyword arguments to the rebuilt parameter | ||||
|             kwargs = old_value.__dict__ | ||||
|             if is_8bit: | ||||
|                 new_value = bnb.nn.Int8Params(new_value, requires_grad=False, **kwargs).to(device) | ||||
|             elif is_4bit: | ||||
|                 new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(device) | ||||
|  | ||||
|             module._parameters[tensor_name] = new_value | ||||
|             if fp16_statistics is not None: | ||||
|                 setattr(module.weight, "SCB", fp16_statistics.to(device)) | ||||
|  | ||||
|     else: | ||||
|         # regular (non quantized) parameter or buffer: move / materialize it directly on `device` | ||||
|         if value is None: | ||||
|             new_value = old_value.to(device) | ||||
|         elif isinstance(value, torch.Tensor): | ||||
|             new_value = value.to(device) | ||||
|         else: | ||||
|             new_value = torch.tensor(value, device=device) | ||||
|  | ||||
|         if is_buffer: | ||||
|             module._buffers[tensor_name] = new_value | ||||
|         else: | ||||
|             # re-wrap as a parameter, keeping the `requires_grad` flag of the tensor being replaced | ||||
|             new_value = nn.Parameter(new_value, requires_grad=old_value.requires_grad) | ||||
|             module._parameters[tensor_name] = new_value | ||||
|  | ||||
|  | ||||
def _replace_with_bnb_linear(
    model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, has_been_replaced=False
):
    """
    Private method that wraps the recursion for module replacement.

    Walks the direct children of `model`, swaps every eligible `nn.Linear` / `Conv1D` child for its `bitsandbytes`
    counterpart (built under `init_empty_weights`) and then recurses into children that own sub-modules.

    Parameters:
        model (`torch.nn.Module`):
            Module whose children are inspected. It is modified in place through `model._modules`.
        modules_to_not_convert (`List[str]`, *optional*):
            Module names to leave untouched. A child is skipped when its own name is in the list or when any entry
            is a substring of its dotted key. `None` is treated as an empty list.
        current_key_name (`List[str]`, *optional*):
            Stack of child names leading to `model`. It is pushed to / popped from in place during the traversal.
        quantization_config:
            Object exposing `quantization_method()` and the `llm_int8_*` / `bnb_4bit_*` attributes read below. It is
            only accessed once an eligible layer is found.
        has_been_replaced (`bool`, *optional*, defaults to `False`):
            Running flag carried through the recursion.

    Returns the converted model and a boolean that indicates if the conversion has been successful or not.
    """
    # The `None` defaults would otherwise fail on the first membership test / `append` below.
    if modules_to_not_convert is None:
        modules_to_not_convert = []
    if current_key_name is None:
        current_key_name = []

    for name, module in model.named_children():
        current_key_name.append(name)

        if isinstance(module, (nn.Linear, Conv1D)) and name not in modules_to_not_convert:
            # Check if the current key is not in the `modules_to_not_convert`
            full_key = ".".join(current_key_name)
            if not any(key in full_key for key in modules_to_not_convert):
                with init_empty_weights():
                    if isinstance(module, Conv1D):
                        # `Conv1D` weights are unpacked as (in_features, out_features)
                        in_features, out_features = module.weight.shape
                    else:
                        in_features = module.in_features
                        out_features = module.out_features

                    if quantization_config.quantization_method() == "llm_int8":
                        model._modules[name] = bnb.nn.Linear8bitLt(
                            in_features,
                            out_features,
                            module.bias is not None,
                            has_fp16_weights=quantization_config.llm_int8_has_fp16_weight,
                            threshold=quantization_config.llm_int8_threshold,
                        )
                        has_been_replaced = True
                    else:
                        skip_modules = quantization_config.llm_int8_skip_modules
                        if skip_modules is None or name not in skip_modules:
                            model._modules[name] = bnb.nn.Linear4bit(
                                in_features,
                                out_features,
                                module.bias is not None,
                                quantization_config.bnb_4bit_compute_dtype,
                                compress_statistics=quantization_config.bnb_4bit_use_double_quant,
                                quant_type=quantization_config.bnb_4bit_quant_type,
                            )
                            has_been_replaced = True
                    # NOTE: the two statements below also run when the 4-bit branch skipped the replacement, in
                    # which case they tag and freeze the original (unconverted) module.
                    # Store the module class in case we need to transpose the weight later
                    model._modules[name].source_cls = type(module)
                    # Force requires grad to False to avoid unexpected errors
                    model._modules[name].requires_grad_(False)
        # `module` is still the original child here, so the recursion follows the original sub-tree.
        if len(list(module.children())) > 0:
            _, has_been_replaced = _replace_with_bnb_linear(
                module,
                modules_to_not_convert,
                current_key_name,
                quantization_config,
                has_been_replaced=has_been_replaced,
            )
        # Remove the last key for recursion
        current_key_name.pop(-1)
    return model, has_been_replaced
|  | ||||
|  | ||||
def replace_with_bnb_linear(model, modules_to_not_convert=None, current_key_name=None, quantization_config=None):
    """
    Swap the `torch.nn.Linear` layers of `model` for their `bitsandbytes` quantized equivalents.

    This enables running models with mixed int8 precision as described in the paper `LLM.int8(): 8-bit Matrix
    Multiplication for Transformers at Scale`. A `bitsandbytes` build matching the CUDA version of your hardware
    must be installed before calling this function (`pip install -i https://test.pypi.org/simple/ bitsandbytes`).

    The replacement is applied recursively to every `torch.nn.Linear` module except the ones listed in
    `modules_to_not_convert` (by default the `lm_head`, which stays a regular `torch.nn.Linear`). New layers are
    created under the `init_empty_weights` context manager, so no CPU/GPU memory is required to run this function.

    Int8 mixed-precision matrix decomposition splits a matrix multiplication into two streams: (1) a systematic
    feature outlier stream multiplied in fp16 (0.01%) and (2) a regular stream of int8 matrix multiplication
    (99.9%). With this method, int8 inference with no predictive degradation is possible for very large models
    (>=176B parameters).

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
            Names of the modules to not convert in `Linear8bitLt`. In practice we keep the `lm_head` in full precision
            for numerical stability reasons.
        current_key_name (`List[`str`]`, *optional*):
            An array to track the current key of the recursion. This is used to check whether the current key (part of
            it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
            `disk`).
        quantization_config (*optional*):
            Quantization settings, forwarded unchanged to `_replace_with_bnb_linear`.

    Returns:
        The `model` that was passed in, after conversion. A warning is logged when nothing was replaced.
    """
    if modules_to_not_convert is None:
        modules_to_not_convert = ["lm_head"]

    converted_model, any_layer_replaced = _replace_with_bnb_linear(
        model, modules_to_not_convert, current_key_name, quantization_config
    )

    if any_layer_replaced:
        return converted_model

    logger.warning(
        "You are loading your model in 8bit or 4bit but no linear modules were found in your model."
        " Please double check your model architecture, or submit an issue on github if you think this is"
        " a bug."
    )
    return converted_model
|  | ||||
|  | ||||
| # For backward compatibility | ||||
def replace_8bit_linear(*args, **kwargs):
    """
    Deprecated alias of `replace_with_bnb_linear`, kept for backward compatibility.

    Emits a `FutureWarning`, then forwards every positional and keyword argument unchanged and returns whatever
    `replace_with_bnb_linear` returns.
    """
    deprecation_message = (
        "`replace_8bit_linear` will be deprecated in a future version, please use `replace_with_bnb_linear` instead"
    )
    warnings.warn(deprecation_message, FutureWarning)
    return replace_with_bnb_linear(*args, **kwargs)
|  | ||||
|  | ||||
| # For backward compatibility | ||||
def set_module_8bit_tensor_to_device(*args, **kwargs):
    """
    Deprecated alias of `set_module_quantized_tensor_to_device`, kept for backward compatibility.

    Emits a `FutureWarning`, then forwards every positional and keyword argument unchanged and returns whatever
    `set_module_quantized_tensor_to_device` returns.
    """
    deprecation_message = (
        "`set_module_8bit_tensor_to_device` will be deprecated in a future version, please use `set_module_quantized_tensor_to_device` instead"
    )
    warnings.warn(deprecation_message, FutureWarning)
    return set_module_quantized_tensor_to_device(*args, **kwargs)
|  | ||||
|  | ||||
def get_keys_to_not_convert(model):
    r"""
    An utility function to get the keys of the modules to keep in full precision, if any. For example for CausalLM
    modules we may want to keep the lm_head in full precision for numerical stability reasons. For other
    architectures, we want to keep the tied weights of the model.

    Parameters:
    model (`torch.nn.Module`):
        Input model. It must expose `tie_weights()` and `get_output_embeddings()`; it is not modified, the weights are
        tied on a deep copy.

    Returns:
        `List[str]`: names of the modules to not convert in int8. When the model has no tied weights and defines an
        output embedding, this is the name(s) of that module. Otherwise it is the tied keys plus the owner of the last
        parameter, with a trailing `.weight` / `.bias` removed.
    """
    # Tie the weights on a copy and check whether the result contains tied weights.
    # NOTE(review): the copy is presumably cheap because callers build the model under `init_empty_weights` —
    # confirm against the call sites.
    tied_model = deepcopy(model)
    tied_model.tie_weights()

    tied_params = find_tied_parameters(tied_model)
    # For compatibility with Accelerate < 0.18, which returns a dict instead of a list of lists
    if isinstance(tied_params, dict):
        tied_keys = [key for group in tied_params.values() for key in group] + list(tied_params)
    else:
        tied_keys = [key for group in tied_params for key in group]
    has_tied_params = len(tied_keys) > 0

    # If there are no tied weights, we want to keep the lm_head (output embedding) in full precision
    if not has_tied_params:
        output_emb = model.get_output_embeddings()
        if output_emb is not None:
            return [name for name, module in model.named_modules() if module is output_emb]

    # Reached when there are tied weights, or when no output embedding is defined: keep the tied weights and the
    # module owning the last parameter in full precision.
    list_modules = list(model.named_parameters())
    # A model without parameters has no "last module"; avoid indexing into an empty list.
    list_last_module = [list_modules[-1][0]] if list_modules else []
    # add last module together with tied weights, without duplicating a key that is already tied
    last_not_tied = set(list_last_module) - set(tied_keys)
    list_untouched = list(set(tied_keys)) + list(last_not_tied)

    # Turn parameter names into module names by dropping a trailing ".weight" / ".bias". Only the suffix is
    # removed, so names that merely contain these substrings (e.g. "proj.weight_g") are left intact.
    suffixes_to_remove = (".weight", ".bias")
    filtered_module_names = []
    for name in list_untouched:
        for suffix in suffixes_to_remove:
            if name.endswith(suffix):
                name = name[: -len(suffix)]
                break
        filtered_module_names.append(name)

    return filtered_module_names
| from ..integrations import (  # noqa | ||||
|     get_keys_to_not_convert, | ||||
|     replace_8bit_linear, | ||||
|     replace_with_bnb_linear, | ||||
|     set_module_8bit_tensor_to_device, | ||||
|     set_module_quantized_tensor_to_device, | ||||
| ) | ||||
|  | ||||
| @ -27,7 +27,11 @@ from parameterized import parameterized | ||||
| import tests.trainer.test_trainer | ||||
| from tests.trainer.test_trainer import TrainerIntegrationCommon  # noqa | ||||
| from transformers import AutoModel, TrainingArguments, is_torch_available, logging | ||||
| from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_available, unset_hf_deepspeed_config | ||||
| from transformers.integrations.deepspeed import ( | ||||
|     HfDeepSpeedConfig, | ||||
|     is_deepspeed_available, | ||||
|     unset_hf_deepspeed_config, | ||||
| ) | ||||
| from transformers.testing_utils import ( | ||||
|     CaptureLogger, | ||||
|     CaptureStd, | ||||
| @ -113,7 +117,7 @@ def require_deepspeed_aio(test_case): | ||||
| if is_deepspeed_available(): | ||||
|     from deepspeed.utils import logger as deepspeed_logger  # noqa | ||||
|     from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint | ||||
|     from transformers.deepspeed import deepspeed_config, is_deepspeed_zero3_enabled  # noqa | ||||
|     from transformers.integrations.deepspeed import deepspeed_config, is_deepspeed_zero3_enabled  # noqa | ||||
|  | ||||
|  | ||||
| def get_launcher(distributed=False): | ||||
|  | ||||
| @ -131,7 +131,7 @@ class MixedInt8Test(BaseMixedInt8Test): | ||||
|         from accelerate import init_empty_weights | ||||
|  | ||||
|         from transformers import AutoModelForMaskedLM, Blip2ForConditionalGeneration, MptForCausalLM, OPTForCausalLM | ||||
|         from transformers.utils.bitsandbytes import get_keys_to_not_convert | ||||
|         from transformers.integrations.bitsandbytes import get_keys_to_not_convert | ||||
|  | ||||
|         model_id = "mosaicml/mpt-7b" | ||||
|         config = AutoConfig.from_pretrained( | ||||
|  | ||||
| @ -383,9 +383,11 @@ src/transformers/hyperparameter_search.py | ||||
| src/transformers/image_processing_utils.py | ||||
| src/transformers/image_transforms.py | ||||
| src/transformers/image_utils.py | ||||
| src/transformers/integrations.py | ||||
| src/transformers/integrations/bitsandbytes.py | ||||
| src/transformers/integrations/deepspeed.py | ||||
| src/transformers/integrations/integration_utils.py | ||||
| src/transformers/integrations/peft.py | ||||
| src/transformers/keras_callbacks.py | ||||
| src/transformers/lib_integrations/peft/peft_mixin.py | ||||
| src/transformers/modelcard.py | ||||
| src/transformers/modeling_flax_outputs.py | ||||
| src/transformers/modeling_flax_pytorch_utils.py | ||||
|  | ||||
		Reference in New Issue
	
	Block a user
	