check

2025-11-16 07:40:53 +08:00 · 2025-11-06 14:22:59 +01:00
869 changed files with 11986 additions and 11412 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -46,8 +46,8 @@ jobs:
            - run: uv pip install -U -e .
            - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV"
            - run: mkdir -p test_preparation
-            - run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt || true
-            - run: python utils/tests_fetcher.py --filter_tests || true
+            - run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt
+            - run: python utils/tests_fetcher.py --filter_tests
            - run: export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)" && echo $GIT_COMMIT_MESSAGE && python .circleci/create_circleci_config.py --fetcher_folder test_preparation
            - run: |
                if [ ! -s test_preparation/generated_config.yml ]; then
@@ -98,8 +98,8 @@ jobs:
            - run: uv pip install -U -e .
            - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV"
            - run: mkdir -p test_preparation
-            - run: python utils/tests_fetcher.py --fetch_all | tee tests_fetched_summary.txt || true
-            - run: python utils/tests_fetcher.py --filter_tests || true
+            - run: python utils/tests_fetcher.py --fetch_all | tee tests_fetched_summary.txt
+            - run: python utils/tests_fetcher.py --filter_tests
            - run: export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)" && echo $GIT_COMMIT_MESSAGE && python .circleci/create_circleci_config.py --fetcher_folder test_preparation
            - run: |
                if [ ! -s test_preparation/generated_config.yml ]; then
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@@ -185,8 +185,8 @@ class CircleCIJob:
            },
            # During the CircleCI docker images build time, we might already (or not) download the data.
            # If it's done already, the files are inside the directory `/test_data/`.
-            # {"run": {"name": "fetch hub objects before pytest", "command": "cp -r /test_data/* . 2>/dev/null || true; python3 utils/fetch_hub_objects_for_ci.py"}},
-            # {"run": {"name": "download and unzip hub cache", "command": 'curl -L -o huggingface-cache.tar.gz https://huggingface.co/datasets/hf-internal-testing/hf_hub_cache/resolve/main/huggingface-cache.tar.gz && apt-get install pigz && tar --use-compress-program="pigz -d -p 8" -xf huggingface-cache.tar.gz && mv -n hub/* /root/.cache/huggingface/hub/ && ls -la /root/.cache/huggingface/hub/'}},
+            {"run": {"name": "fetch hub objects before pytest", "command": "cp -r /test_data/* . 2>/dev/null || true; python3 utils/fetch_hub_objects_for_ci.py"}},
+            {"run": {"name": "download and unzip hub cache", "command": 'curl -L -o huggingface-cache.tar.gz https://huggingface.co/datasets/hf-internal-testing/hf_hub_cache/resolve/main/huggingface-cache.tar.gz && apt-get install pigz && tar --use-compress-program="pigz -d -p 8" -xf huggingface-cache.tar.gz && mv -n hub/* /root/.cache/huggingface/hub/ && ls -la /root/.cache/huggingface/hub/'}},
            {"run": {
                "name": "Run tests",
                "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {junit_flags} {repeat_on_failure_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -32,16 +32,16 @@ jobs:
      options: --gpus all --privileged --ipc host
    steps:
      - name: Get repo
-        uses: actions/checkout@v5
+        uses: actions/checkout@v4
        with:
-          fetch-depth: 1
+          ref: ${{ github.event.pull_request.head.sha || github.sha }}

      - name: Install benchmark script dependencies
        run: python3 -m pip install -r benchmark_v2/requirements.txt kernels

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]"
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]" && python3 -m pip uninstall -y torchvision # temp fix

      - name: Run benchmark
        run: |
--- a/.github/workflows/check-workflow-permissions.yml
+++ b/.github/workflows/check-workflow-permissions.yml
@@ -1,23 +0,0 @@
---
-name: Check Permissions Advisor
-
-on:
-  workflow_dispatch:
-    inputs:
-      workflow_name:
-        description: 'Workflow file name'
-        type: string
-      run_count:
-        description: 'Number of runs to analyze'
-        type: string
-        default: "10"
-
-jobs:
-  advisor:
-    uses: huggingface/security-workflows/.github/workflows/permissions-advisor-reusable.yml@main
-    permissions:
-      actions: read
-      contents: read
-    with:
-      workflow_name: ${{ inputs.workflow_name }}
-      run_count: ${{ fromJSON(inputs.run_count) }}
--- a/.github/workflows/check_failed_tests.yml
+++ b/.github/workflows/check_failed_tests.yml
@@ -125,7 +125,7 @@ jobs:
            const { data: merge_commit }  = await github.rest.repos.getCommit({
              owner: pr.base.repo.owner.login,
              repo: pr.base.repo.name,
-              ref: '${{ inputs.commit_sha }}',
+              ref: pr.merge_commit_sha,
            });

            core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha);
--- a/.github/workflows/pr_slow_ci_suggestion.yml
+++ b/.github/workflows/pr_slow_ci_suggestion.yml
@@ -30,8 +30,6 @@ jobs:

      # We need to use `${{ ... }}` here to avoid `Argument list too long` error when a PR changes a lot of files.
      # (We could also try to use artifact approach, but it's more involved).
-      # `CodeQL` doesn't identify any security issue here. Also `PR_FILES` is from `get-pr-info.yml` by using an api
-      # `github.rest.pulls.listFiles`, which is fine.
      - name: Write pr_files file
        run: |
          cat > pr_files.txt << 'EOF'
--- a/1
+++ b/1
@@ -45,7 +45,6 @@ repo-consistency:
 	python utils/check_modular_conversion.py
 	python utils/check_dummies.py
 	python utils/check_repo.py
-	python utils/check_init_weights_data.py
 	python utils/check_inits.py
 	python utils/check_pipeline_typing.py
 	python utils/check_config_docstrings.py
--- a/benchmark/requirements.txt
+++ b/benchmark/requirements.txt
@@ -1,5 +1,6 @@
 gpustat==1.1.1
 psutil==6.0.0
 psycopg2==2.9.9
+torch>=2.4.0
 hf_xet
-pandas>=1.5.0
+pandas>=1.5.0
--- a/benchmark_v2/framework/benchmark_config.py
+++ b/benchmark_v2/framework/benchmark_config.py
@@ -36,7 +36,6 @@ class BenchmarkConfig:
        warmup_iterations: int = 5,
        measurement_iterations: int = 20,
        gpu_monitoring: bool = True,  # NOTE: you may want to disable this at times as we have obsvered it could heavily slow down benchmarks on AMD
-        continuous_batching: bool = False,
        batch_size: int = 1,
        sequence_length: int = 128,
        num_tokens_to_generate: int = 128,
@@ -52,7 +51,6 @@ class BenchmarkConfig:
        self.warmup_iterations = warmup_iterations
        self.measurement_iterations = measurement_iterations
        self.gpu_monitoring = gpu_monitoring
-        self.continuous_batching = continuous_batching
        # Input parameters
        self.batch_size = batch_size
        self.sequence_length = sequence_length
@@ -87,22 +85,6 @@ class BenchmarkConfig:
        if is_fa:
            logger.warning("Flash attention does not support compile mode. Turning off compile mode.")
            self.compile_mode = None
-        # Handle SDPA backend if not determined by the config (needs to be done before skipping duplicates)
-        if self.attn_implementation == "sdpa" and self.sdpa_backend is None:
-            default_backend = "flash_attention"  # FIXME: torch has a _cur_sdpa_kernel_backends but it fails
-            logger.warning(f"No SDPA backend provided, using {default_backend} instead.")
-            self.sdpa_backend = default_backend
-        if self.continuous_batching:
-            if self.attn_implementation == "flex_attention":
-                logger.error(
-                    "disabling continuous batching because of invalid configuration: flex attention is not supported"
-                )
-                self.continuous_batching = False
-            elif self.attn_implementation == "sdpa" and self.sdpa_backend is not None:
-                logger.warning(
-                    "when continuous batching is enabled, sdpa_backend must be None because of the attention mask, setting it to None"
-                )
-                self.sdpa_backend = "math"

    @property
    def hash(self) -> str:
@@ -118,7 +100,6 @@ class BenchmarkConfig:
            attn_code += f"_{self.sdpa_backend}" if self.attn_implementation == "sdpa" else ""
            compile_str = f"compiled_{self.compile_mode}" if self.compile_mode is not None else "uncompiled"
            kernelize_str = "kernelized" if self.kernelize else "unkernelized"
-            continuous_batching_str = "cb" if self.continuous_batching else "generate"
            sep = "-"
        else:
            iter_str = f"{self.warmup_iterations} warmup, {self.measurement_iterations} iterations"
@@ -128,11 +109,8 @@ class BenchmarkConfig:
            attn_code += f" with {self.sdpa_backend} backend" if self.attn_implementation == "sdpa" else ""
            compile_str = "compiled" if self.compile_mode is not None else "not compiled"
            kernelize_str = "kernelized" if self.kernelize else "not kernelized"
-            continuous_batching_str = "continuous batching" if self.continuous_batching else "regular generate"
            sep = ", "
-        return sep.join(
-            [iter_str, gpu_monitor_str, dimensions_str, attn_code, compile_str, kernelize_str, continuous_batching_str]
-        )
+        return sep.join([iter_str, gpu_monitor_str, dimensions_str, attn_code, compile_str, kernelize_str])

    def to_dict(self) -> dict[str, Any]:
        return {
@@ -140,7 +118,6 @@ class BenchmarkConfig:
            "warmup_iterations": self.warmup_iterations,
            "measurement_iterations": self.measurement_iterations,
            "gpu_monitoring": self.gpu_monitoring,
-            "continuous_batching": self.continuous_batching,
            "batch_size": self.batch_size,
            "sequence_length": self.sequence_length,
            "num_tokens_to_generate": self.num_tokens_to_generate,
@@ -157,7 +134,6 @@ class BenchmarkConfig:
            warmup_iterations=data.get("warmup_iterations", 5),
            measurement_iterations=data.get("measurement_iterations", 20),
            gpu_monitoring=data.get("gpu_monitoring", False),
-            continuous_batching=data.get("continuous_batching", False),
            batch_size=data.get("batch_size", 1),
            sequence_length=data.get("sequence_length", 128),
            num_tokens_to_generate=data.get("num_tokens_to_generate", 128),
@@ -215,17 +191,15 @@ def get_config_by_level(level: int) -> list[BenchmarkConfig]:
            # Usually there is not much to gain by compiling with other modes, but we allow it for level 4
            compile_modes = BenchmarkConfig.all_compiled_modes if level >= 4 else [None, "default"]
            for cm in compile_modes:
-                for kernelize_on in {False, KERNELIZATION_AVAILABLE}:
-                    for cb_on in [False, True]:
-                        configs.append(
-                            BenchmarkConfig(
-                                attn_implementation=attn_implementation,
-                                sdpa_backend=sdpa_backend,
-                                compile_mode=cm,
-                                kernelize=kernelize_on,
-                                continuous_batching=cb_on,
-                            )
+                for kernelize_on in [False, KERNELIZATION_AVAILABLE]:
+                    configs.append(
+                        BenchmarkConfig(
+                            attn_implementation=attn_implementation,
+                            sdpa_backend=sdpa_backend,
+                            compile_mode=cm,
+                            kernelize=kernelize_on,
                        )
+                    )
        return configs
    # Otherwise, we add the configs for the given level
    if level >= 0:
@@ -233,10 +207,8 @@ def get_config_by_level(level: int) -> list[BenchmarkConfig]:
    if level >= 1:
        configs.append(BenchmarkConfig(attn_implementation="flash_attention_2"))
        configs.append(BenchmarkConfig(attn_implementation="eager", compile_mode="default"))
-        configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", continuous_batching=True))
    if level >= 2:
        configs.append(BenchmarkConfig(attn_implementation="sdpa", compile_mode="default"))
        configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", kernelize=True))
        configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", kernelize=True))
-        configs.append(BenchmarkConfig(attn_implementation="paged|sdpa", continuous_batching=True))
    return configs
--- a/benchmark_v2/framework/benchmark_runner.py
+++ b/benchmark_v2/framework/benchmark_runner.py
@@ -234,9 +234,8 @@ class BenchmarkRunner:
            self.logger.info(f"Running benchmark scenario: {config.name}")

            # Quick validation: try one measurement first to see if this scenario works
-            generate_fn = self.time_generate_batch if config.continuous_batching else self.time_generate
            flush_memory()
-            e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = generate_fn(
+            e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
                max_new_tokens=1, gpu_monitor=None
            )
            if e2e_latency < 0:
@@ -246,14 +245,14 @@ class BenchmarkRunner:
            # Warmup runs
            self.logger.info(f"Warming up with {config.warmup_iterations} iterations...")
            for _ in trange(config.warmup_iterations):
-                _ = generate_fn(max_new_tokens=config.num_tokens_to_generate)
+                _ = self.time_generate(max_new_tokens=config.num_tokens_to_generate)
            self.logger.info("Warmup over.")

            # Measurement runs
            result = BenchmarkResult()
            self.logger.info(f"Benchmarking with {config.measurement_iterations} iterations.")
            for _ in trange(config.measurement_iterations):
-                e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = generate_fn(
+                e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
                    max_new_tokens=config.num_tokens_to_generate,
                    gpu_monitor=(GPUMonitor(logger=self.logger) if config.gpu_monitoring else None),
                )
@@ -275,58 +274,6 @@ class BenchmarkRunner:
                "config": config,
            }

-    # TODO: refactor `generate_batch` to handle streaming so we can use it here
-    def time_generate_batch(
-        self,
-        max_new_tokens: int,
-        gpu_monitor: GPUMonitor | None = None,
-    ) -> tuple[float, list[float], str, GPURawMetrics | None]:
-        if gpu_monitor is not None:
-            gpu_monitor.start()
-        config = GenerationConfig(
-            max_new_tokens=max_new_tokens,
-            eos_token_id=self.tokenizer.eos_token_id,
-            pad_token_id=self.tokenizer.pad_token_id,
-            do_sample=True,
-        )
-        manager = self.model.init_continuous_batching(config)
-        manager.start()
-        try:
-            first_req_results = []
-            timestamps = []
-            wall_time_0 = time.perf_counter()
-            inputs = self.inputs["input_ids"].tolist()
-            manager.add_requests(inputs, max_new_tokens=max_new_tokens, streaming=True)
-            first_req_id = None
-            num_requests = len(inputs)
-            finished_requests = 0
-            while finished_requests < num_requests:
-                # NOTE: I don't like having the extra if stmt here, but hopefully won't degrade perf too much
-                result = manager.get_result()
-                if result:
-                    timestamps.append(time.perf_counter() - wall_time_0)
-                    if result.is_finished():
-                        finished_requests += 1
-                    if first_req_id is None:
-                        first_req_id = result.request_id
-                    if result.request_id == first_req_id:
-                        first_req_results.append(result)
-                else:
-                    if not manager.is_running():
-                        raise RuntimeError("Generation thread exited unexpectedly")
-            wall_time_1 = time.perf_counter()
-            gpu_metrics = gpu_monitor.stop_and_collect() if gpu_monitor is not None else None
-            decoded_output = self.tokenizer.decode(
-                [res.generated_tokens[0] for res in first_req_results], skip_special_tokens=True
-            )
-            shape_and_decoded_output = f"{(1, len(first_req_results))} | {decoded_output}"
-            e2e_latency = wall_time_1 - wall_time_0
-            return e2e_latency, timestamps, shape_and_decoded_output, gpu_metrics
-        except Exception as e:
-            raise e
-        finally:
-            manager.stop()
-
    def time_generate(
        self,
        max_new_tokens: int,
@@ -392,6 +339,12 @@ class BenchmarkRunner:

        n_configs = len(benchmark_configs)
        for i, config in enumerate(benchmark_configs):
+            # Handle SDPA backend if not determined by the config (needs to be done before skipping duplicates)
+            if config.attn_implementation == "sdpa" and config.sdpa_backend is None:
+                default_backend = "flash_attention"  # FIXME: torch has a _cur_sdpa_kernel_backends but it fails
+                self.logger.warning(f"No SDPA backend provided, using {default_backend} instead.")
+                config.sdpa_backend = default_backend
+
            # Skip if already run
            if config.hash in all_results:
                self.logger.info(f"Skipping duplicate config {config.name} for model {model_id} ({i + 1}/{n_configs})")
@@ -415,27 +368,21 @@ class BenchmarkRunner:
            self.cleanup()
            self.save_results(model_id, all_results, timestamp=timestamp)

-        if len(all_results) < 1:
-            raise RuntimeError("No benchmark was run succesfully")
-
        if pretty_print_summary:
            print()
            print("=" * 100)
            print(f"Finished benchmarks in {time.perf_counter() - start_time:.2f} seconds")
            print(f"Total number of benchmarks: {len(all_results)}")
-            print("First run metadata:")
-            first_key = list(all_results.keys())[0]
-            first_metadata = all_results[first_key]["metadata"].to_dict()
-            hardware_info = first_metadata.pop("hardware_info")
-            pretty_print_dict(first_metadata | hardware_info, tabs=1)
+            if len(all_results) > 0:
+                print("First run metadata:")
+                first_key = list(all_results.keys())[0]
+                first_metadata = all_results[first_key]["metadata"].to_dict()
+                hardware_info = first_metadata.pop("hardware_info")
+                pretty_print_dict(first_metadata | hardware_info, tabs=1)
            for result in all_results.values():
                print("=" * 100)
                print(f"Config: {result['config'].infer_name(compact=False)}\n")
-                result["measurements"].pprint(
-                    batch_size=result["config"].batch_size,
-                    num_generated_tokens=result["config"].num_tokens_to_generate,
-                    tabs=1,
-                )
+                result["measurements"].pprint(batch_size=result["config"].batch_size, tabs=1)
            print("=" * 100)

        return (timestamp, all_results)
--- a/benchmark_v2/framework/data_classes.py
+++ b/benchmark_v2/framework/data_classes.py
@@ -36,17 +36,16 @@ def add_unit_to_duration(stats: dict[str, float]) -> dict[str, str]:
    return stats


-def equalize_lengths_and_collate(stats: dict[str, dict[str, str]]) -> dict[str, str]:
-    """Note: This operation is destructive as it will update values in place before returning a new correctly formatted dict"""
+def equalize_lengths_and_collate(stats: list[dict[str, str]]) -> list[str]:
    keys = ["avg", "std", "min", "med", "max", "p95"]
    for key in keys:
-        max_length = max(len(stat[key]) for stat in stats.values())
-        for stat in stats.values():
+        max_length = max(len(stat[key]) for stat in stats)
+        for stat in stats:
            stat[key] = stat[key].ljust(max_length, " ")
-    return {name: " ".join([f"{key}={stat[key]}" for key in keys]) for name, stat in stats.items()}
+    return [" ".join([f"{key}={stat[key]}" for key in keys]) for stat in stats]


-def pretty_print_dict(data: dict[str, str], tabs: int = 0) -> None:
+def pretty_print_dict(data: dict[str, Any], tabs: int = 0) -> None:
    max_key_length = max([len(key) for key in data.keys()])
    for key, value in data.items():
        tabs_str = "  " * tabs
@@ -142,19 +141,27 @@ class BenchmarkResult:
    def get_measured_itl(self) -> list[float]:
        return [(dt[-1] - dt[0]) / (len(dt) - 1) for dt in self.token_generation_times if len(dt) > 1]

-    def get_throughput(self, total_generated_tokens: int) -> list[float]:
-        return [total_generated_tokens / e2e_latency for e2e_latency in self.e2e_latency]
+    def get_throughput(self, batch_size: int) -> float:
+        return [
+            batch_size * len(dt) / e2e_latency
+            for e2e_latency, dt in zip(self.e2e_latency, self.token_generation_times)
+        ]

-    def pprint(self, batch_size: int = 0, num_generated_tokens: int = 0, tabs: int = 0) -> None:
-        measurements = {
-            "E2E Latency": add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
-            "Time to First Token": add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
-        }
-        itl_values = self.get_measured_itl()
-        if len(itl_values) > 0:
-            measurements["Inter-Token Latency"] = add_unit_to_duration(compute_basic_statistics(itl_values))
+    def pprint(self, batch_size: int = 0, tabs: int = 0) -> None:
+        stats_to_collate = [
+            add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
+            add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
+            add_unit_to_duration(compute_basic_statistics(self.get_measured_itl())),
+        ]
        if batch_size > 0:
-            throughput_stats = compute_basic_statistics(self.get_throughput(batch_size * num_generated_tokens))
-            measurements["Throughput"] = {key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()}
-        dict_to_pprint = equalize_lengths_and_collate(measurements)
+            throughput_stats = compute_basic_statistics(self.get_throughput(batch_size))
+            stats_to_collate.append({key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()})
+        collated_stats = equalize_lengths_and_collate(stats_to_collate)
+        dict_to_pprint = {
+            "E2E Latency": collated_stats[0],
+            "Time to First Token": collated_stats[1],
+            "Inter-Token Latency": collated_stats[2],
+        }
+        if batch_size > 0:
+            dict_to_pprint["Throughput"] = collated_stats[3]
        pretty_print_dict(dict_to_pprint, tabs=tabs)
--- a/benchmark_v2/requirements.txt
+++ b/benchmark_v2/requirements.txt
@@ -2,5 +2,6 @@ numpy>=1.21.0
 psutil>=5.8.0
 gpustat>=1.0.0
 torch>=2.0.0
+transformers>=4.30.0
 datasets>=2.10.0
 huggingface_hub>=0.16.0
--- a/benchmark_v2/run_benchmarks.py
+++ b/benchmark_v2/run_benchmarks.py
@@ -80,10 +80,6 @@ if __name__ == "__main__":
    logger.info(f"Benchmark run UUID: {benchmark_run_uuid}")
    logger.info(f"Output directory: {args.output_dir}")

-    # We cannot compute ITL if we don't have at least two measurements
-    if any(n <= 1 for n in args.num_tokens_to_generate):
-        raise ValueError("--num_tokens_to_generate arguments should be larger than 1")
-
    # Error out if one of the arguments is not provided
    if len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 0:
        raise ValueError(
--- a/docs/source/de/add_new_model.md
+++ b/docs/source/de/add_new_model.md
@@ -508,16 +508,16 @@ BERT `_init_weights` Methode:
 def _init_weights(self, module):
    """Initialize the weights"""
    if isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
    elif isinstance(module, nn.Embedding):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
-        module.bias.zero_()
-        module.weight.fill_(1.0)
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
 ```

 Sie können weitere benutzerdefinierte Schemata verwenden, wenn Sie eine spezielle Initialisierung für einige Module benötigen. Zum Beispiel in
@@ -533,9 +533,9 @@ def _init_weights(self, module):
        module.project_hid._is_hf_initialized = True
        module.project_q._is_hf_initialized = True
    elif isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
 ```

 Das Flag `_is_hf_initialized` wird intern verwendet, um sicherzustellen, dass wir ein Submodul nur einmal initialisieren. Wenn Sie es auf
--- a/docs/source/en/add_new_model.md
+++ b/docs/source/en/add_new_model.md
@@ -314,16 +314,16 @@ Random initialization occurs in the `_init_weights` method of `BrandNewLlamaPreT
 def _init_weights(self, module):
    """Initialize the weights"""
    if isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
    elif isinstance(module, nn.Embedding):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
-        module.bias.zero_()
-        module.weight.fill_(1.0)
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
 ```

 The initialization scheme can look different if you need to adapt it to your model. For example, [`Wav2Vec2ForPreTraining`] initializes [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) in its last two linear layers.
@@ -339,9 +339,9 @@ def _init_weights(self, module):
        module.project_hid._is_hf_initialized = True
        module.project_q._is_hf_initialized = True
    elif isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
 ```

 ### Convert checkpoints to Transformers
--- a/docs/source/en/model_doc/dinov3.md
+++ b/docs/source/en/model_doc/dinov3.md
@@ -169,9 +169,6 @@ print("Pooled output shape:", pooled_output.shape)
 [[autodoc]] DINOv3ViTModel
    - forward

-## DINOv3ViTBackbone    
-[[autodoc]] DINOv3ViTBackbone
-
 ## DINOv3ConvNextModel

 [[autodoc]] DINOv3ConvNextModel
--- a/docs/source/en/model_doc/smolvlm.md
+++ b/docs/source/en/model_doc/smolvlm.md
@@ -159,7 +159,7 @@ conversation3 = [

 conversations = [conversation1, conversation2, conversation3]
 inputs = processor.apply_chat_template(
-    conversations,
+    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
--- a/docs/source/en/perf_infer_gpu_multi.md
+++ b/docs/source/en/perf_infer_gpu_multi.md
@@ -149,7 +149,7 @@ The example below packs `up_proj` and `gate_proj` into a single `gate_up_proj` m
 ```python
 class Llama4TextExperts(nn.Module):
    ...
-    self.gate_up_proj = nn.Parameter(torch.zeros(self.num_experts, self.hidden_size, 2 * self.expert_dim))
+    self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
 ```

 Batch matrix multiplication can be used in the `forward` pass to compute the output of the `gate_up_proj` module.
--- a/docs/source/it/migration.md
+++ b/docs/source/it/migration.md
@@ -170,7 +170,7 @@ Per quanto riguarda la classe `TrainingArguments`:
 - L'argomento `evaluate_during_training` di `TrainingArguments` è deprecato a favore di `eval_strategy`.

 Per quanto riguarda il modello Transfo-XL:
-- L'attributo di configurazione `tie_weight` di Transfo-XL diventa `tie_word_embeddings`.
+- L'attributo di configurazione `tie_weight` di Transfo-XL diventa `tie_words_embeddings`.
 - Il metodo di modellazione `reset_length` di Transfo-XL diventa `reset_memory_length`.

 Per quanto riguarda le pipeline:
--- a/docs/source/ja/add_new_model.md
+++ b/docs/source/ja/add_new_model.md
@@ -406,16 +406,16 @@ model = BrandNewBertModel(BrandNewBertConfig())
 def _init_weights(self, module):
    """Initialize the weights"""
    if isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
    elif isinstance(module, nn.Embedding):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
-        module.bias.zero_()
-        module.weight.fill_(1.0)
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
 ```

 特定のモジュールに特別な初期化が必要な場合、カスタムスキームをさらに持つことができます。たとえば、
@@ -431,9 +431,9 @@ def _init_weights(self, module):
        module.project_hid._is_hf_initialized = True
        module.project_q._is_hf_initialized = True
    elif isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
 ```

 `_is_hf_initialized`フラグは、サブモジュールを一度だけ初期化することを確実にするために内部で使用されます。
--- a/docs/source/ko/add_new_model.md
+++ b/docs/source/ko/add_new_model.md
@ -348,16 +348,16 @@ model = BrandNewBertModel(BrandNewBertConfig())
 def _init_weights(self, module):
    """Initialize the weights"""
    if isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
    elif isinstance(module, nn.Embedding):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
-        module.bias.zero_()
-        module.weight.fill_(1.0)
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
 ```

 몇 가지 모듈에 대해 특별한 초기화가 필요한 경우 사용자 정의 방식을 사용할 수도 있습니다. 예를 들어, `Wav2Vec2ForPreTraining`에서 마지막 두 개의 선형 레이어는 일반적인 PyTorch `nn.Linear`의 초기화를 가져야 하지만, 다른 모든 레이어는 위와 같은 초기화를 사용해야 합니다. 이는 다음과 같이 코드화됩니다:
@ -371,9 +371,9 @@ def _init_weights(self, module):
        module.project_hid._is_hf_initialized = True
        module.project_q._is_hf_initialized = True
    elif isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
 ```

 `_is_hf_initialized` 플래그는 서브모듈을 한 번만 초기화하도록 내부적으로 사용됩니다. `module.project_q` 및 `module.project_hid`에 대해 `True`로 설정함으로써, 우리가 수행한 사용자 정의 초기화가 이후에 덮어쓰이지 않도록 합니다. 즉, `_init_weights` 함수가 이들에게 적용되지 않습니다.
--- a/docs/source/ko/perf_infer_gpu_multi.md
+++ b/docs/source/ko/perf_infer_gpu_multi.md
@ -152,7 +152,7 @@ class ParallelInterface(MutableMapping):
 ```python
 class Llama4TextExperts(nn.Module):
    ...
-    self.gate_up_proj = nn.Parameter(torch.zeros(self.num_experts, self.hidden_size, 2 * self.expert_dim))
+    self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
 ```

 배치 행렬 곱셈을 `forward` 패스에서 사용하여 `gate_up_proj` 모듈의 출력을 계산할 수 있습니다.
--- a/examples/modular-transformers/modeling_dummy_bert.py
+++ b/examples/modular-transformers/modeling_dummy_bert.py
@ -502,10 +502,16 @@ class DummyBertLMPredictionHead(nn.Module):

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
-        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def _tie_weights(self):
+        self.decoder.bias = self.bias
+
    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
@ -530,18 +536,18 @@ class DummyBertPreTrainedModel(PreTrainedModel):
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
-            module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
-                module.bias.zero_()
+                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
-            module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
-            module.bias.zero_()
-            module.weight.fill_(1.0)
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
        elif isinstance(module, DummyBertLMPredictionHead):
-            module.bias.zero_()
+            module.bias.data.zero_()


@auto_docstring(
--- a/examples/modular-transformers/modeling_my_new_model2.py
+++ b/examples/modular-transformers/modeling_my_new_model2.py
@ -265,7 +265,7 @@ class MyNewModel2PreTrainedModel(PreTrainedModel):

        # We initialize with 0s to be 1 centered as the RMSNorm here does (1 + weight)
        if "RMSNorm" in module.__class__.__name__:
-            module.weight.zero_()
+            module.weight.data.zero_()


 class MyNewModel2ForSequenceClassification(GenericForSequenceClassification, MyNewModel2PreTrainedModel):
--- a/examples/modular-transformers/modeling_new_task_model.py
+++ b/examples/modular-transformers/modeling_new_task_model.py
@ -104,9 +104,9 @@ class NewTaskModelPreTrainedModel(PreTrainedModel):
        std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)

        if isinstance(module, nn.Linear):
-            module.weight.normal_(mean=0.0, std=std)
+            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
-                module.bias.zero_()
+                module.bias.data.zero_()


 def token_type_ids_mask_function(
@ -428,7 +428,7 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
-    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
+    _tied_weights_keys = ["lm_head.weight"]
    main_input_name: ClassVar[str] = "doc_input_ids"  # transformers-related

    def __init__(self, config):
@ -440,15 +440,7 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
        self.custom_text_proj = nn.Linear(self.config.text_config.hidden_size, self.embedding_dim)

        if self.language_model._tied_weights_keys is not None:
-            prefix = "model.language_model."
-            prefixed_mapping = {
-                f"{prefix}{target}": f"{prefix}{source}"
-                for target, source in self.language_model._tied_weights_keys.items()
-            }
-            if isinstance(self._tied_weights_keys, dict):
-                self._tied_weights_keys.update(prefixed_mapping)
-            else:
-                self._tied_weights_keys = prefixed_mapping
+            self._tied_weights_keys = [f"model.language_model.{k}" for k in self.language_model._tied_weights_keys]
        self.post_init()

    def get_input_embeddings(self):
--- a/examples/modular-transformers/modeling_roberta.py
+++ b/examples/modular-transformers/modeling_roberta.py
@ -505,10 +505,16 @@ class RobertaLMPredictionHead(nn.Module):

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
-        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def _tie_weights(self):
+        self.decoder.bias = self.bias
+
    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
@ -533,18 +539,18 @@ class RobertaPreTrainedModel(PreTrainedModel):
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
-            module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
-                module.bias.zero_()
+                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
-            module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
-            module.bias.zero_()
-            module.weight.fill_(1.0)
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
        elif isinstance(module, RobertaLMPredictionHead):
-            module.bias.zero_()
+            module.bias.data.zero_()


@auto_docstring(
--- a/examples/modular-transformers/modeling_test_detr.py
+++ b/examples/modular-transformers/modeling_test_detr.py
@ -846,11 +846,11 @@ class TestDetrPreTrainedModel(PreTrainedModel):
            nn.init.xavier_uniform_(module.output_proj.weight.data)
            nn.init.constant_(module.output_proj.bias.data, 0.0)
        elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
-            module.weight.normal_(mean=0.0, std=std)
+            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
-                module.bias.zero_()
+                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
-            module.weight.normal_(mean=0.0, std=std)
+            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        if hasattr(module, "reference_points") and not self.config.two_stage:
--- a/examples/modular-transformers/modular_new_task_model.py
+++ b/examples/modular-transformers/modular_new_task_model.py
@ -19,15 +19,7 @@ class NewTaskModelForNewTask(PaliGemmaForConditionalGeneration):
        self.custom_text_proj = nn.Linear(self.config.text_config.hidden_size, self.embedding_dim)

        if self.language_model._tied_weights_keys is not None:
-            prefix = "model.language_model."
-            prefixed_mapping = {
-                f"{prefix}{target}": f"{prefix}{source}"
-                for target, source in self.language_model._tied_weights_keys.items()
-            }
-            if isinstance(self._tied_weights_keys, dict):
-                self._tied_weights_keys.update(prefixed_mapping)
-            else:
-                self._tied_weights_keys = prefixed_mapping
+            self._tied_weights_keys = [f"model.language_model.{k}" for k in self.language_model._tied_weights_keys]

        self.post_init()

--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@ -127,7 +127,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/examples/pytorch/language-modeling/run_fim_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py
@ -132,7 +132,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@ -130,7 +130,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@ -128,7 +128,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the HuggingFace Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@ -151,7 +151,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@ -223,7 +223,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@ -120,7 +120,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@ -212,7 +212,7 @@ def parse_args():
    parser.add_argument(
        "--use_slow_tokenizer",
        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
    )
    parser.add_argument(
        "--per_device_train_batch_size",
--- a/setup.py
+++ b/setup.py
@ -138,7 +138,7 @@ _deps = [
    "pyyaml>=5.1",
    "pydantic>=2",
    "pytest>=7.2.0",
-    "pytest-asyncio>=1.2.0",
+    "pytest-asyncio",
    "pytest-rerunfailures<16.0",
    "pytest-timeout",
    "pytest-xdist",
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@ -876,7 +876,7 @@ class PreTrainedConfig(PushToHubMixin):
        if hasattr(self, "quantization_config"):
            serializable_config_dict["quantization_config"] = (
                self.quantization_config.to_dict()
-                if not isinstance(self.quantization_config, dict) and self.quantization_config is not None
+                if not isinstance(self.quantization_config, dict)
                else self.quantization_config
            )
        self.dict_dtype_to_str(serializable_config_dict)
@ -910,7 +910,7 @@ class PreTrainedConfig(PushToHubMixin):
        if hasattr(self, "quantization_config"):
            output["quantization_config"] = (
                self.quantization_config.to_dict()
-                if not isinstance(self.quantization_config, dict) and self.quantization_config is not None
+                if not isinstance(self.quantization_config, dict)
                else self.quantization_config
            )
        self.dict_dtype_to_str(output)
--- a/src/transformers/conversion_mapping.py
+++ b/src/transformers/conversion_mapping.py
@ -1,136 +0,0 @@
-# coding=utf-8
-# Copyright (C) 2025 the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from copy import deepcopy
-
-from .core_model_loading import Concatenate, MergeModulelist, WeightConverter
-from .utils import is_torch_available
-
-
-if is_torch_available():
-    import torch
-
-
-def _build_checkpoint_conversion_mapping():
-    mapping = {
-        "mixtral": [
-            WeightConverter(
-                source_keys=[
-                    "block_sparse_moe.experts.*.w1.weight",
-                    "block_sparse_moe.experts.*.w3.weight",
-                ],  # you give me a list of 2 keys, I collect a list of a list of tensors
-                target_keys="mlp.experts.gate_up_proj",  # target key gets the list of two tensors
-                operations=[
-                    MergeModulelist(
-                        dim=0
-                    ),  # each process has two lists of tensors, we cat each list. -> we end up with 2 tensors
-                    Concatenate(dim=1),  # each process has 2 tensors, gate and up, we concat them into gate_up
-                ],  # we want the loading to add this shard operation here. Though we can't shard after concats and merge, needs to be first
-            ),
-            WeightConverter(
-                source_keys=[
-                    "block_sparse_moe.experts.*.w2.weight",
-                ],
-                target_keys="mlp.experts.down_proj",  # target key gets the list of two tensors
-                operations=[
-                    MergeModulelist(
-                        dim=0
-                    ),  # each process has two lists of tensors, we cat each list. -> we end up with 2 tensors
-                ],  # we want the loading to add this shard operation here. Though we can't shard after concats and merge, needs to be first
-            ),
-            # WeightConverter(
-            #     ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],
-            #     "self_attn.qkv_proj",
-            #     operations=[Concatenate(dim=0)],  # more like stack?
-            # ),
-            WeightConverter("*.block_sparse_moe.", "*.mlp."),
-        ],
-        "qwen2_moe": [
-            WeightConverter(
-                source_keys=[
-                    "mlp.experts.*.gate_proj.weight",
-                    "mlp.experts.*.up_proj.weight",
-                ],
-                target_keys="mlp.experts.gate_up_proj",
-                operations=[MergeModulelist(dim=0), Concatenate(dim=1)],
-            ),
-            WeightConverter(
-                source_keys=["mlp.experts.*.down_proj.weight"],
-                target_keys="mlp.experts.down_proj",
-                operations=[MergeModulelist(dim=0)],
-            ),
-        ],
-        "legacy": [
-            WeightConverter(
-                source_keys="LayerNorm.gamma",
-                target_keys="LayerNorm.weight",
-            ),
-            WeightConverter(
-                source_keys="LayerNorm.beta",
-                target_keys="LayerNorm.bias",
-            ),
-        ],
-    }
-    if hasattr(torch.nn.utils.parametrizations, "weight_norm"):
-        mapping["legacy"] += [
-            WeightConverter(
-                source_keys="weight_g",
-                target_keys="parametrizations.weight.original0",
-            ),
-            WeightConverter(
-                source_keys="weight_v",
-                target_keys="parametrizations.weight.original1",
-            ),
-        ]
-    else:
-        mapping["legacy"] += [
-            WeightConverter(
-                source_keys="parametrizations.weight.original0",
-                target_keys="weight_g",
-            ),
-            WeightConverter(
-                source_keys="parametrizations.weight.original1",
-                target_keys="weight_v",
-            ),
-        ]
-
-    mapping["phimoe"] = mapping["mixtral"].copy()
-    mapping["deepseek_v2"] = mapping["qwen2_moe"].copy()
-    mapping["deepseek_v3"] = mapping["qwen2_moe"].copy()
-    mapping["dot1"] = mapping["qwen2_moe"].copy()
-    mapping["ernie_4_5_moe"] = mapping["qwen2_moe"].copy()
-    mapping["glm4_moe"] = mapping["qwen2_moe"].copy()
-    mapping["glm4v_moe"] = mapping["qwen2_moe"].copy()
-    mapping["jamba"] = mapping["qwen2_moe"].copy()
-    mapping["lfm2_moe"] = mapping["mixtral"].copy()
-    mapping["long_cat_flash"] = mapping["qwen2_moe"].copy()
-    mapping["qwen3_moe"] = mapping["qwen2_moe"].copy()
-    mapping["qwen3_omni_moe"] = mapping["qwen2_moe"].copy()
-    mapping["qwen3_next"] = mapping["qwen2_moe"].copy()
-    mapping["qwen3_vl_moe"] = mapping["qwen2_moe"].copy()
-    mapping["hunyuan_v1_moe"] = mapping["qwen2_moe"].copy()
-    mapping["minimax"] = mapping["mixtral"].copy()
-
-    return mapping
-
-
-_checkpoint_conversion_mapping_cache = None
-
-
-def get_checkpoint_conversion_mapping(model_type):
-    global _checkpoint_conversion_mapping_cache
-    _checkpoint_conversion_mapping_cache = _build_checkpoint_conversion_mapping()
-    globals()["_checkpoint_conversion_mapping"] = _checkpoint_conversion_mapping_cache
-    return deepcopy(_checkpoint_conversion_mapping_cache.get(model_type, None))
--- a/src/transformers/core_model_loading.py
+++ b/src/transformers/core_model_loading.py
@ -1,733 +0,0 @@
-# coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Core helpers for loading model checkpoints."""
-
-from __future__ import annotations
-
-import itertools
-import os
-import re
-from abc import abstractmethod
-from collections import defaultdict
-from collections.abc import MutableMapping, MutableSet, Sequence
-from concurrent.futures import Future, ThreadPoolExecutor
-from contextlib import contextmanager
-from dataclasses import dataclass, field
-from functools import partial
-from types import MethodType
-from typing import TYPE_CHECKING, Any, Optional, Union
-
-import torch
-
-from .integrations.tensor_parallel import ALL_PARALLEL_STYLES, DTensor, Replicate, TensorParallelLayer
-from .utils import is_torch_greater_or_equal, logging
-
-
-_torch_distributed_available = torch.distributed.is_available()
-_is_dtensor_available = _torch_distributed_available and is_torch_greater_or_equal("2.5")
-if _is_dtensor_available:
-    from torch.distributed.tensor import DTensor
-
-if TYPE_CHECKING:
-    from .modeling_utils import PreTrainedModel
-    from .quantizers import HfQuantizer
-
-
-logger = logging.get_logger(__name__)
-
-str_to_torch_dtype = {
-    "BOOL": torch.bool,
-    "U8": torch.uint8,
-    "I8": torch.int8,
-    "I16": torch.int16,
-    "F16": torch.float16,
-    "BF16": torch.bfloat16,
-    "I32": torch.int32,
-    "F32": torch.float32,
-    "F64": torch.float64,
-    "I64": torch.int64,
-    "F8_E4M3": torch.float8_e4m3fn,
-    "F8_E5M2": torch.float8_e5m2,
-}
-
-
-logger = logging.get_logger(__name__)
-
-
-def _glob_to_regex_src(glob: str, *, digits_only: bool = True) -> str:
-    """
-    Convert a glob with '*' into a regex *source* string. We don't use `glob.translate`
-    '*' matches (\\d+) if digits_only else (.+). Inner groups are non-capturing.
-    """
-    star = r"(\d+)" if digits_only else r"(.+)"
-    return glob.replace(r"\*", star)
-
-
-def build_glob_alt(
-    globs: list[str],
-) -> tuple[re.Pattern, dict[str, str]]:
-    r"""
-    Build one compiled regex alternation with a named group per glob. This allows to run a single
-    re.match and get the correct group name to finally get which pattern matched.
-    Returns (compiled_regex, name->glob map).
-
-    Example:
-
-    ```py
-    >>> reg, map_ = build_glob_alt(["mlp.*.w1", "mlp.*.w2"])
-    >>> print(reg)
-    (re.compile(r'(?P<g0>.*mlp\.(\d+)\.w1)|(?P<g1>.*mlp\.(\d+)\.w2)', re.UNICODE),
-    >>> print(map_)
-    {'g0': 'mlp.*.w1', 'g1': 'mlp.*.w2'})
-    >>> match_ = reg.match("model.layers.0.mlp.0.w1.weight")
-    >>> print(match_.lastgroup)
-    'g0'
-    >>> print(map_[match_.lastgroup])
-    mlp.*.w1
-    ```
-    """
-    name_map: dict[str, str] = {}
-    parts: list[str] = []
-
-    for i, g in enumerate(globs):
-        name = f"g{i}"
-        name_map[name] = g
-        pat_src = _glob_to_regex_src(g)
-        prefix_src = ""
-        if pat_src.startswith("*"):
-            prefix_src = "."
-        elif not pat_src.startswith(r"\^") and not pat_src.startswith(r".*"):
-            prefix_src = ".*"
-
-        parts.append(f"(?P<{name}>{prefix_src}{pat_src})")
-
-    alt_src = "|".join(parts).replace("\\^", "^").replace("\\.", r"\.")
-    try:
-        reg = re.compile(alt_src)
-    except re.error as e:
-        logger.error(f"Error compiling regex for alternation: {alt_src}")
-        raise e
-
-    return reg, name_map
-
-
-def match_glob(key: str, alt: re.Pattern, name_map: dict[str, str]) -> Optional[str]:
-    """
-    Match the key against the alternation; return the original glob string that matched.
-    """
-    m = alt.match(key)
-    if not m:
-        return None
-    return name_map.get(m.lastgroup)
-
-
-class ConversionOps:
-    """Base class for weight conversion operations."""
-
-    # The inverse operation class, will be used when saving the checkpoint
-    reverse_op: type[ConversionOps]
-
-    @abstractmethod
-    def convert(
-        self, value: Union[dict[str, torch.Tensor], Sequence[torch.Tensor], torch.Tensor], *args, **kwargs
-    ) -> torch.Tensor:
-        raise NotImplementedError
-
-
-class Chunk(ConversionOps):
-    """Split a tensor along ``dim`` into equally sized chunks or using explicit ``sizes``."""
-
-    reverse_op: type[ConversionOps]
-
-    def __init__(self, dim: int = 0, chunks: Optional[int] = None, sizes: Optional[Sequence[int]] = None):
-        if chunks is None and sizes is None:
-            raise ValueError("`chunks` or `sizes` must be provided for Chunk operations.")
-        if chunks is not None and chunks <= 0:
-            raise ValueError("`chunks` must be a strictly positive integer.")
-        self.dim = dim
-        self.chunks = chunks
-        self.sizes = list(sizes) if sizes is not None else None
-        self.reverse_op = Concatenate
-
-    def convert(self, value: torch.Tensor, *args, **kwargs) -> list[torch.Tensor]:
-        if not isinstance(value, torch.Tensor):
-            raise TypeError("Chunk expects a torch.Tensor as input.")
-        if self.sizes is not None:
-            return list(torch.split(value, self.sizes, dim=self.dim))
-        return list(torch.chunk(value, self.chunks, dim=self.dim))
-
-
-class Concatenate(ConversionOps):
-    """Concatenate tensors along `dim` using a reusable buffer."""
-
-    reverse_op: type[ConversionOps]
-
-    def __init__(self, dim: int = 0):
-        self.dim = dim
-        self.reverse_op = Chunk
-
-    @torch.no_grad
-    def convert(self, value: Sequence[torch.Tensor], *args, **kwargs) -> torch.Tensor:
-        if isinstance(value[0], list):
-            value = [v[0] for v in value]
-        tensors = value
-        if not tensors:
-            raise ValueError("Fuse requires at least one tensor to concatenate.")
-
-        return torch.cat(tuple(tensors), dim=self.dim)
-
-
-class MergeModulelist(Concatenate):
-    """
-    Merge a list of tensors into a single tensor along the first dimension.
-    We explicitly define this because for EP or TP you want to make sure you know what you are doing!
-
-    """
-
-    def __init__(self, dim: int = 0):
-        super().__init__(dim=dim)
-        self.reverse_op = SplitModulelist
-
-    @torch.no_grad
-    def convert(self, value: Sequence[torch.Tensor], *args, **kwargs) -> list[torch.Tensor]:
-        merged = []
-        for group in value:
-            if not isinstance(group, Sequence) or len(group) == 0:
-                raise ValueError("MergeModulelist requires non-empty sub-sequences.")
-            group = [k for k in group if k.ndim]
-            merged.append(torch.stack(group, dim=self.dim))
-        return merged
-
-
-class SplitModulelist(ConversionOps):
-    """Inverse of :class:`MergeModulelist` using explicit split sizes per group."""
-
-    def __init__(self, sizes: Sequence[Sequence[int]], dim: int = 0):
-        if not isinstance(sizes, Sequence) or not all(isinstance(sub, Sequence) and sub for sub in sizes):
-            raise ValueError("`sizes` must be a sequence of non-empty sequences of integers.")
-        self.sizes = [list(sub) for sub in sizes]
-        self.dim = dim
-        self.reverse_op = MergeModulelist
-
-    @torch.no_grad
-    def convert(self, value: Sequence[torch.Tensor], *, context: dict[str, Any]) -> list[list[torch.Tensor]]:
-        if not isinstance(value, Sequence):
-            raise TypeError("SplitModulelist expects a sequence of tensors.")
-        if len(value) != len(self.sizes):
-            raise ValueError("Number of tensors does not match the provided split specifications.")
-
-        result: list[list[torch.Tensor]] = []
-        for tensor, split_sizes in zip(value, self.sizes):
-            if not isinstance(tensor, torch.Tensor):
-                raise TypeError("SplitModulelist can only split torch.Tensor instances.")
-            splits = torch.split(tensor, split_sizes, dim=self.dim)
-            result.append(list(splits))
-        return result
-
-
-class PermuteForRope(ConversionOps):
-    """
-    Applies the permutation required to convert complex RoPE weights to the split sin/cos format.
-    """
-
-    def __init__(self):
-        pass
-
-    def _apply(self, tensor: torch.Tensor) -> torch.Tensor:
-        dim1, dim2 = tensor.shape
-        n_heads = self.config.getattr("num_attention_heads", 1)
-
-        tensor = tensor.view(n_heads, dim1 // n_heads // 2, 2, dim2)
-        tensor = tensor.transpose(1, 2).reshape(dim1, dim2)
-        return tensor
-
-    @torch.no_grad
-    def convert(
-        self, value: Union[dict[str, torch.Tensor], Sequence[torch.Tensor], torch.Tensor], config
-    ) -> Union[dict[str, torch.Tensor], list[torch.Tensor], torch.Tensor]:
-        self.config = config
-        out = [[self._apply(x) for x in inner] if isinstance(inner, list) else self._apply(inner) for inner in value]
-        return out
-
-
-@dataclass(slots=True)
-class WeightConverter:
-    r"""
-    A weight converter that acts on a pattern of source keys.
-    The keys need to be collected based on the target keys.
-
-    With wild card, glob patterns are matched, so you have to be detailed with what to match. If you match:
-    `model.layers.*.experts.*` -> it will act on all of them
-    {"model.layers.*.experts.*": []}
-    but
-    `experts.*.mlp` will be layer specific.
-    {"model.layers.1.experts.*": [], }
-    - source_keys: str | list[str] (wildcards '*' match digits)
-    - target_keys: str | list[str] | None
-    - distributed_operation / operations / quantization_operations are ALWAYS lists.
-
-    TODO: for BNB we need to collect model.weight.quant_state_keys
-    """
-
-    source_keys: Union[str, list[str]]
-    target_keys: Optional[Union[str, list[str]]] = None
-    operations: list[ConversionOps] = field(default_factory=list, repr=False)
-
-    distributed_operation: Optional[TensorParallelLayer] = None
-    quantization_operation: Optional[ConversionOps] = None
-
-    def __post_init__(self):
-        if not isinstance(self.source_keys, list):
-            self.source_keys = [self.source_keys]
-        targets_were_none = False
-        if not isinstance(self.target_keys, list):
-            if self.target_keys is None:
-                self.target_keys = list(self.source_keys)
-                targets_were_none = True
-            else:
-                self.target_keys = [self.target_keys]
-
-        if not targets_were_none and bool(len(self.source_keys) - 1) + bool(len(self.target_keys) - 1) >= 2:
-            raise ValueError(
-                f"source keys={self.source_keys}, target_keys={self.target_keys} but you can only have one to many, one to one or many to one."
-            )
-
-
-@dataclass(slots=True)
-class ConversionEntry:
-    weight_converter: WeightConverter
-    collected_tensors: dict = field(default_factory=lambda: defaultdict(dict))
-
-
-GLOBAL_WORKERS = min(16, (os.cpu_count() or 8) * 2)  # NVMe: 8-16; HDD/NFS: 2-4
-
-
-# Factory function to create LoadedParameter subclasses dynamically
-def get_loaded_parameter_class(base_cls):
-    """
-    base_cls: an nn.Parameter subclass (or nn.Parameter) or a Tensor
-    Returns a new class that combines the base_cls with LoadedParameterMixin
-
-    """
-
-    class LoadedParam(base_cls):
-        _inplace_methods = [
-            "add_",
-            "mul_",
-            "clamp_",
-            "zero_",
-            "fill_",
-            "normal_",
-            "uniform_",
-            "copy_",
-            "erfinv_",
-            "log_",
-            "__getitem__",
-            "neg_",
-            "exp_",
-            "sub_",
-        ]
-
-        def __new__(cls, from_existing, **kwargs):
-            if isinstance(from_existing, torch.nn.Parameter):
-                inst = super().__new__(cls, from_existing.data, from_existing.requires_grad, **from_existing.__dict__)
-            else:
-                inst = super().__new__(cls, from_existing)
-            # we store the original object to get it back later on
-            inst._original = from_existing
-            # Explicitly override all in-place methods per instance
-            for method_name in inst._inplace_methods:
-                setattr(inst, method_name, MethodType(inst._skip, inst))
-
-            return inst
-
-        def _skip(self, *args, **kwargs):
-            """Helper to skip in-place operations."""
-            return self
-
-        def __repr__(self):
-            return f"LoadedParameter(data={self.data})"
-
-        @property
-        def data(self):
-            return super().data
-
-        @data.setter
-        def data(self, new):
-            pass
-
-    def __lt__(self, other):
-        return torch.Tensor.__lt__(self, other)
-
-    def __le__(self, other):
-        return torch.Tensor.__le__(self, other)
-
-    def __gt__(self, other):
-        return torch.Tensor.__gt__(self, other)
-
-    def __ge__(self, other):
-        return torch.Tensor.__ge__(self, other)
-
-    def __eq__(self, other):
-        return torch.Tensor.__eq__(self, other)
-
-    def __ne__(self, other):
-        return torch.Tensor.__ne__(self, other)
-
-    def __iadd__(self, *args, **kwargs):
-        return self
-
-    def __isub__(self, *args, **kwargs):
-        return self
-
-    def __imul__(self, *args, **kwargs):
-        return self
-
-    def __imatmul__(self, *args, **kwargs):
-        return self
-
-    def __itruediv__(self, *args, **kwargs):
-        return self
-
-    def __ifloordiv__(self, *args, **kwargs):
-        return self
-
-    def __imod__(self, *args, **kwargs):
-        return self
-
-    def __ipow__(self, *args, **kwargs):
-        return self
-
-    def __iand__(self, *args, **kwargs):
-        return self
-
-    def __ior__(self, *args, **kwargs):
-        return self
-
-    def __ixor__(self, *args, **kwargs):
-        return self
-
-    def __ilshift__(self, *args, **kwargs):
-        return self
-
-    def __irshift__(self, *args, **kwargs):
-        return self
-
-    return LoadedParam
-
-
-def _materialize_copy(tensor, dtype=None):
-    tensor = tensor[...]
-    if dtype is not None:
-        tensor = tensor.to(dtype)
-    return tensor
-
-
-def spawn_materialize(thread_pool, tensor, dtype=None) -> Future:
-    def _job():
-        return _materialize_copy(tensor, dtype)
-
-    return thread_pool.submit(_job)
-
-
-def spawn_tp_materialize(thread_pool, tensor, sharding_method, tensor_idx, dtype=None) -> Future:
-    def _job():
-        return sharding_method.shard_tensor(tensor, param_casting_dtype=dtype, tensor_idx=tensor_idx)[0]
-
-    return thread_pool.submit(_job)
-
-
-def dot_natural_key(s: str):
-    parts = s.split(".")
-    for i, p in enumerate(parts):
-        # whole-segment digits -> int; otherwise leave as str
-        if p.isdigit():
-            parts[i] = int(p)
-    return parts
-
-
-@contextmanager
-def log_to_misc(
-    layer_name: str,
-    misc: MutableMapping[str, str],
-    extras: Any = None,
-    op: Union[list[ConversionOps], ConversionOps, None] = None,
-):
-    # A simple helper to handle errors with contextual messages.
-    try:
-        yield
-    except Exception as e:
-
-        def _format_op_name(curr_op: Union[list[ConversionOps], ConversionOps, None]) -> Optional[str]:
-            if curr_op is None:
-                return None
-            if isinstance(curr_op, (list, tuple, set)):
-                names = [o.__class__.__name__ for o in curr_op if o is not None]
-                if not names:
-                    return None
-                return ", ".join(names)
-            return curr_op.__class__.__name__
-
-        op_name = _format_op_name(op)
-        if isinstance(extras, tuple) and len(extras) == 2:
-            values, target_keys = extras
-            descriptor = f"{op_name} " if op_name else ""
-            misc[layer_name] = (
-                f"{e}\nError: {descriptor}on tensors destined for {target_keys}. Ckpt contains: {len(values[0])}"
-            )
-        elif isinstance(extras, str):
-            suffix = f" via {op_name}" if op_name else ""
-            misc[layer_name] = f"{e}\nError{suffix} when processing parameter {extras}"
-        elif extras is None and op_name:
-            misc[layer_name] = f"{op_name}: {e}"
-        else:
-            misc[layer_name] = f"{extras} |Error: {e}"
-        raise SkipLayer()
-
-
-def set_param_for_module(
-    model: PreTrainedModel,
-    layer_name: str,
-    param_value: torch.Tensor,
-    mismatch_keys: MutableSet[tuple[str, torch.Size, torch.Size]],
-    missing_keys: MutableSet[str],
-    misc: MutableMapping[str, Any],
-    distributed_operation: Optional[TensorParallelLayer],
-):
-    with log_to_misc(layer_name, misc, layer_name):
-        module_path, _, param_name = layer_name.rpartition(".")
-        module_obj = model.get_submodule(module_path) if module_path else model
-        param_value = param_value[0] if isinstance(param_value, list) else param_value[...]
-        ref = getattr(module_obj, param_name)
-
-        use_dtensor = hasattr(distributed_operation, "use_dtensor") and distributed_operation.use_dtensor
-        if not isinstance(param_value, torch.nn.Parameter):
-            if distributed_operation is not None:
-                param_value = DTensor.from_local(
-                    param_value,
-                    distributed_operation.device_mesh,
-                    getattr(distributed_operation, "shard", Replicate()),
-                    run_check=False,
-                    shape=ref.size(),
-                    stride=ref.stride(),
-                )
-                if not use_dtensor:
-                    # we convert to local
-                    param_value = param_value.to_local()
-            if param_name not in module_obj._buffers:
-                param_value = torch.nn.Parameter(param_value, requires_grad=param_value.is_floating_point())
-        param_value = get_loaded_parameter_class(param_value.__class__)(from_existing=param_value)
-
-        # Remove from missing keys (it's either mismatched, or all good)
-        missing_keys.discard(layer_name)
-        if ref is not None and ref.shape != param_value.shape:
-            mismatch_keys.add((layer_name, param_value.shape, ref.shape))
-            module_obj.param_name._is_hf_initialized = False  # Needs to be initialized
-        else:
-            param_value._is_hf_initialized = True  # super important otherwise _init_weight re-initi if bias is missing
-            setattr(module_obj, param_name, param_value)
-
-
-class SkipLayer(Exception):
-    """Control-flow sentinel: abort processing of the current layer only."""
-
-    pass
-
-
-def convert_and_load_state_dict_in_model(
-    model: PreTrainedModel,
-    state_dict: dict[str, Any],
-    weight_mapping: dict[str, WeightConverter] | None,
-    tp_plan: dict[str, str] | None,
-    quantizer: HfQuantizer | None,
-    dtype: torch.dtype | None = None,
-    device_map: dict | None = None,
-    dtype_plan: dict | None = None,
-    device_mesh: torch.distributed.device_mesh.DeviceMesh | None = None,
-):
-    """
-    Convert a state dict according to a weight mapping (one WeightConverter per glob pattern),
-    collecting tensors per *layer instance* (the concrete indices captured from '*').
-    """
-
-    prefix = model.base_model_prefix
-    tp_plan = tp_plan or {}  # {glob_pattern: plan_obj_or_key}
-    device_map = device_map or {}  # {exact_target_key: device}
-    dtype_plan = dtype_plan or {}  # {glob_pattern: dtype}
-    weight_mapping = weight_mapping or {}  # {glob_pattern: WeightConverter}
-    meta_model_state_dict = model.state_dict()
-    missing_keys = set(meta_model_state_dict.keys())
-
-    misc = {}
-    mismatch_keys = set()
-    unexpected_keys = set()
-    # Global thread_pool
-    thread_pool = ThreadPoolExecutor(max_workers=GLOBAL_WORKERS)
-
-    _patterns = list(itertools.chain.from_iterable([k.source_keys for k in weight_mapping]))
-    source_to_target = {sk: k for k in weight_mapping for sk in k.source_keys}
-    weight_pattern_alt, weight_pattern_by_group_name = build_glob_alt(_patterns)
-    tp_plan_alt, tp_plan_by_group_name = build_glob_alt(list(tp_plan.keys()))
-    dtype_policy_alt, dtype_policy_by_group_name = build_glob_alt(list(dtype_plan.keys()))
-
-    state_dict = sorted(state_dict.items(), key=lambda kv: dot_natural_key(kv[0]))
-    # 1. Create the conversion entries
-    by_conversion_pattern: dict[str, ConversionEntry] = {}
-    for original_key, tensor in state_dict:
-        matched_pattern = match_glob(original_key, weight_pattern_alt, weight_pattern_by_group_name)
-        if matched_pattern is not None:
-            converter = source_to_target[matched_pattern]  # TODO make sure its the ref
-            sub_with_extractor = partial(re.sub, matched_pattern.replace("*", r"(\d+)"), string=original_key)
-            entry_key = "|".join(converter.target_keys)
-            target_key = "|".join(map(sub_with_extractor, [k.replace("*", "\\1") for k in converter.target_keys]))
-            entry: ConversionEntry = by_conversion_pattern.setdefault(entry_key, ConversionEntry(converter))
-            converter_key = sub_with_extractor(matched_pattern)
-        else:
-            converter = WeightConverter(original_key)
-            converter_key = entry_key = target_key = original_key
-            entry = by_conversion_pattern.setdefault(converter_key, ConversionEntry(converter))
-
-        _dtype = dtype
-        new_target_key = []  # test_load_with_mismatched_shapes for AutoModel.from_pretrained(AutoForCausal, vocab=10)
-        for t in target_key.split("|"):
-            if t.startswith(prefix) and meta_model_state_dict.get(t.replace(f"{prefix}.", "")) is not None:
-                t = t.replace(f"{prefix}.", "")
-            elif meta_model_state_dict.get(f"{prefix}.{t}") is not None:
-                t = f"{prefix}.{t}"
-            new_target_key.append(t)
-            empty_param = meta_model_state_dict.get(t)
-            # If it does not exist, it's unexpected
-            if empty_param is None:
-                unexpected_keys.add(t)
-                continue
-
-            if quantizer is not None and quantizer.param_needs_quantization(model, t):
-                if quantizer.__class__.__name__ == "FineGrainedFP8HfQuantizer":
-                    from .integrations.finegrained_fp8 import Fp8Quantize
-
-                    converter.quantization_operation = Fp8Quantize()  # TODO support other methods
-                else:
-                    raise ValueError("This quantization method is gonna be supported SOOOON")
-            else:
-                _dtype = dtype
-                matched_dtype_pattern = match_glob(t, dtype_policy_alt, dtype_policy_by_group_name)
-                if matched_dtype_pattern is not None:
-                    _dtype = dtype_plan[matched_dtype_pattern]
-                elif empty_param.dtype != _dtype:
-                    _dtype = empty_param.dtype
-
-        first_target_key = new_target_key[0]
-        target_key = "|".join(new_target_key)
-
-        future = None
-        if device_mesh:
-            if matched_tp_pattern := match_glob(first_target_key, tp_plan_alt, tp_plan_by_group_name):
-                empty_param = meta_model_state_dict.get(first_target_key)
-                if getattr(converter, "distributed_operation", {}) is None:
-                    tp_layer = ALL_PARALLEL_STYLES[model.tp_plan[matched_tp_pattern]].__class__
-                    converter.distributed_operation = tp_layer(
-                        device_mesh=device_mesh, rank=device_map[""].index, empty_param=empty_param.clone()
-                    )
-                    # VERY IMPORTANT: this tells us wether we collected stuffs or not.
-                shard_index = len(entry.collected_tensors[target_key].get(converter_key, []))
-                future = spawn_tp_materialize(
-                    thread_pool,
-                    tensor,
-                    _dtype,
-                    converter.distributed_operation,
-                    shard_index,
-                )
-
-        if future is None:  # If not TP, async materialize the tensors. TODO handle disk offload?
-            future = spawn_materialize(thread_pool, tensor, _dtype)
-        entry.collected_tensors[target_key].setdefault(converter_key, []).append(future)
-
-    # 2. Actually convert the ckpt
-    inverse_converters = {}
-    keys = list(by_conversion_pattern.keys())
-
-    with logging.tqdm(total=len(keys), desc="Loading weights") as pbar:
-        for key in keys[::-1]:  # revert to process simple keys first
-            group = by_conversion_pattern.pop(key)
-            converter = group.weight_converter
-            operations = converter.operations if isinstance(converter.operations, list) else [converter.operations]
-            for layer_name, tensors_for_this_layer in group.collected_tensors.items():
-                pbar.update(1)
-                pbar.set_postfix({"Materializing param": layer_name})
-                pbar.refresh()
-                concrete_target_keys = layer_name.split("|")
-                try:
-                    if bool(set(concrete_target_keys) - unexpected_keys):
-                        with log_to_misc(layer_name, misc):
-                            values = [[k.result() for k in inner] for inner in tensors_for_this_layer.values()]
-
-                        for op in operations:
-                            with log_to_misc(layer_name, misc, (values, concrete_target_keys), operations):
-                                values = op.convert(values, model.config)
-
-                        values = [values] if not isinstance(values, list) else values
-                        with log_to_misc(layer_name, misc, (values, concrete_target_keys), operations):
-                            realized_value = {
-                                k: t for k, t in zip(concrete_target_keys, values) if k not in unexpected_keys
-                            }
-
-                        for k in list(realized_value.keys()).copy():
-                            if op := converter.quantization_operation:
-                                with log_to_misc(layer_name, misc, op=op):
-                                    realized_value.update(
-                                        op.convert(
-                                            {k: realized_value.pop(k)}, quant_config=quantizer.quantization_config
-                                        )
-                                    )
-
-                        for k, output_value in realized_value.items():
-                            for src in converter.source_keys:  # what should happen to k when we meet k at saving
-                                inverse_converters[k] = {src: converter}
-                            set_param_for_module(
-                                model,
-                                k,
-                                output_value,
-                                mismatch_keys,
-                                missing_keys,
-                                misc,
-                                converter.distributed_operation,
-                            )
-
-                except SkipLayer:
-                    continue
-            del group
-
-    model.inverse_converters = inverse_converters
-    thread_pool.shutdown(wait=False)
-    return missing_keys, unexpected_keys, mismatch_keys, misc
-
-
-# TODO this is not done yet!
-def revert_weight_conversion(model, state_dict):
-    mapping = getattr(model, "_checkpoint_conversion_mapping", {})  # IDK why but setting this will fail all llava.
-    reverse_key_mapping = [(v, k) for k, v in mapping.items()]
-    original_state_dict = {}
-    for key, value in state_dict.items():
-        for pattern, inverse_converter in reverse_key_mapping:
-            # TODO FIXME you name it
-            replacement = inverse_converter.lstrip("^")  # strip off un-needed chars and patterns
-            replacement = re.sub(r"\(.*\)", "", replacement)
-            key, n_replace = re.subn(pattern, replacement, key)
-            # Early exit of the loop
-            if n_replace > 0:
-                break
-        original_state_dict[key] = value
-    state_dict = original_state_dict
-    return state_dict
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@ -723,7 +723,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):

            if self.mask_replace_prob < 1:
                warnings.warn(
-                    "Random token replacement is not supported with whole word masking. "
+                    "Random token replacement is not supported with whole word masking.",
                    "Setting mask_replace_prob to 1.",
                )
                self.mask_replace_prob = 1
--- a/src/transformers/data/datasets/glue.py
+++ b/src/transformers/data/datasets/glue.py
@ -82,7 +82,7 @@ class GlueDataset(Dataset):
        cache_dir: Optional[str] = None,
    ):
        warnings.warn(
-            "This dataset will be removed from the library soon, preprocessing should be handled with the Hugging Face Datasets "
+            "This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
            "library. You can have a look at this example script for pointers: "
            "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py",
            FutureWarning,
--- a/src/transformers/data/metrics/init.py
+++ b/src/transformers/data/metrics/init.py
@ -21,7 +21,7 @@ if is_sklearn_available():


 DEPRECATION_WARNING = (
-    "This metric will be removed from the library soon, metrics should be handled with the Hugging Face Evaluate "
+    "This metric will be removed from the library soon, metrics should be handled with the 🤗 Evaluate "
    "library. You can have a look at this example script for pointers: "
    "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
 )
--- a/src/transformers/data/processors/glue.py
+++ b/src/transformers/data/processors/glue.py
@ -28,7 +28,7 @@ from .utils import DataProcessor, InputExample, InputFeatures
 logger = logging.get_logger(__name__)

 DEPRECATION_WARNING = (
-    "This {0} will be removed from the library soon, preprocessing should be handled with the Hugging Face Datasets "
+    "This {0} will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
    "library. You can have a look at this example script for pointers: "
    "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
 )
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@ -48,7 +48,7 @@ deps = {
    "pyyaml": "pyyaml>=5.1",
    "pydantic": "pydantic>=2",
    "pytest": "pytest>=7.2.0",
-    "pytest-asyncio": "pytest-asyncio>=1.2.0",
+    "pytest-asyncio": "pytest-asyncio",
    "pytest-rerunfailures": "pytest-rerunfailures<16.0",
    "pytest-timeout": "pytest-timeout",
    "pytest-xdist": "pytest-xdist",
--- a/src/transformers/generation/continuous_batching/continuous_api.py
+++ b/src/transformers/generation/continuous_batching/continuous_api.py
@ -807,7 +807,7 @@ class ContinuousBatchingManager:
        """Check if the background generation thread is running."""
        return self._generation_thread is not None and self._generation_thread.is_alive()

-    def stop(self, block: bool = True, timeout: Optional[float] = None) -> None:
+    def stop(self, block: bool = False, timeout: Optional[float] = None) -> None:
        """Signal the background thread to stop.

        Args:
@ -818,15 +818,14 @@ class ContinuousBatchingManager:
            logger.warning("Manager not started.")
            return

-        stop_trigger_time = perf_counter()
        if not self.stop_event.is_set():
            self.stop_event.set()
            logger.info("Stopping continuous batching manager...")

        if block:
-            self.join(stop_trigger_time, timeout)
+            self.join(timeout)

-    def join(self, stop_trigger_time: float, timeout: Optional[float] = None) -> None:
+    def join(self, timeout: Optional[float] = None) -> None:
        """Wait for the background thread to finish.

        Args:
@ -835,10 +834,9 @@ class ContinuousBatchingManager:
        if self._generation_thread is not None:
            self._generation_thread.join(timeout=timeout)
            if self._generation_thread.is_alive():
-                logger.warning(f"Generation thread did not exit after join timeout ({timeout}).")
+                logger.warning("Generation thread did not exit after join timeout.")
            else:
-                end = perf_counter()
-                logger.info(f"Continuous Batching Manager stopped after {end - stop_trigger_time:.2f}s.")
+                logger.info("Continuous Batching Manager stopped.")
                self._generation_thread = None

    def add_request(
@ -879,11 +877,9 @@ class ContinuousBatchingManager:
        self.input_queue.put(state, block=True, timeout=10)  # XXX: pass timeout as fn arg?
        return request_id

-    def add_requests(
-        self, inputs: list[list[int]], max_new_tokens: Optional[int] = None, streaming: bool = False
-    ) -> None:
+    def add_requests(self, inputs: list[list[int]], max_new_tokens: Optional[int] = None) -> None:
        for input_ids in inputs:
-            self.add_request(input_ids, max_new_tokens=max_new_tokens, streaming=streaming)
+            self.add_request(input_ids, max_new_tokens=max_new_tokens)

    def cancel_request(self, request_id: str) -> None:
        """Cancel a request by its ID.
@ -894,7 +890,6 @@ class ContinuousBatchingManager:
        if self.batch_processor is not None:
            self.batch_processor.scheduler.set_request_cancellation(request_id)

-    # TODO:handle benchmarking properly when updating / fixing the requeue logic
    def get_result(
        self, request_id: Optional[str] = None, timeout: Optional[float] = None
    ) -> Optional[GenerationOutput]:
@ -910,7 +905,6 @@ class ContinuousBatchingManager:
            return None
        try:
            result = self.output_queue.get(block=True, timeout=timeout)
-            # NOTE: requeue logic here
            if request_id is not None and result.request_id != request_id:
                self.output_queue.put(result)
                return None
@ -1098,7 +1092,6 @@ class ContinuousMixin:
            num_kv_cuda_graphs=num_kv_cuda_graphs,
        )

-    # TODO: support streaming
    @traced
    @torch.inference_mode()
    def generate_batch(
@ -1155,7 +1148,7 @@ class ContinuousMixin:
                        result = manager.get_result(timeout=1)
                        if result:
                            req_id = result.request_id
-                            if result.is_finished():
+                            if result.status == RequestStatus.FINISHED:
                                results[req_id] = result
                                finished_count += 1
                                pbar.update(1)
--- a/src/transformers/generation/continuous_batching/requests.py
+++ b/src/transformers/generation/continuous_batching/requests.py
@ -19,7 +19,6 @@ from typing import Optional

 import torch

-from ...utils import is_torch_xpu_available
 from ...utils.logging import logging
 from ...utils.metrics import traced

@ -36,13 +35,6 @@ def get_device_and_memory_breakdown() -> tuple[torch.device, int, int, int]:
        total_memory = torch.cuda.get_device_properties(device).total_memory
        reserved_memory = torch.cuda.memory_reserved(device)
        allocated_memory = torch.cuda.memory_allocated(device)
-    elif is_torch_xpu_available():
-        device = torch.device("xpu")
-        torch.xpu.empty_cache()
-        torch.xpu.synchronize()
-        total_memory = torch.xpu.get_device_properties(device).total_memory
-        reserved_memory = torch.xpu.memory_reserved(device)
-        allocated_memory = torch.xpu.memory_allocated(device)
    elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
        device = torch.device("mps")
        # MPS memory reporting (PyTorch 2.0+)
@ -91,9 +83,6 @@ class GenerationOutput:
    status: RequestStatus = RequestStatus.PENDING
    created_time: float = field(default_factory=time.time)

-    def is_finished(self) -> bool:
-        return self.status == RequestStatus.FINISHED
-

@dataclass
 class RequestState:
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@ -608,7 +608,7 @@ class GenerationMixin(ContinuousMixin):
        use_cache = kwargs.get("use_cache")
        if use_cache is None:
            use_cache = getattr(self.config, "use_cache", False)
-        if past_key_values is not None or use_cache:
+        if past_key_values is None or use_cache:
            # TODO (joao): handle the case where cache length == input_ids length. The function below results in an
            # exception because we get empty input_ids after slicing. In essence, we need to roll back the cache 1
            # token to recompute the logits for the first token to be generated (but not all caches support roll backs)
@ -1635,12 +1635,7 @@ class GenerationMixin(ContinuousMixin):

        # TransformersKwargs are model-agnostic attention and generation arguments such as 'output_attentions'
        for key, value in model_kwargs.items():
-            if (
-                value is not None
-                and key not in model_args
-                and key not in TransformersKwargs.__optional_keys__
-                and key != "debug_io"
-            ):
+            if value is not None and key not in model_args and key not in TransformersKwargs.__optional_keys__:
                unused_model_args.append(key)

        if unused_model_args:
--- a/src/transformers/generation/watermarking.py
+++ b/src/transformers/generation/watermarking.py
@ -383,11 +383,10 @@ class BayesianDetectorModel(PreTrainedModel):
        )
        self.prior = torch.nn.Parameter(torch.tensor([self.base_rate]))

-    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Parameter):
-            nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            module.weight.data.normal_(mean=0.0, std=0.02)

    def _compute_posterior(
        self,
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@ -821,26 +821,14 @@ def split_to_tiles(images: "torch.Tensor", num_tiles_height: int, num_tiles_widt
    return image


+def _cast_tensor_to_float(x):
+    if x.is_floating_point():
+        return x
+    return x.float()
+
+
 def _group_images_by_shape(nested_images, *paired_inputs, is_nested: bool = False):
-    """
-    Helper function to flatten a single level of nested image and batch structures and group by shape.
-    Args:
-        nested_images (list):
-            A list of images or a single tensor
-        paired_inputs (Any, *optional*):
-            Zero or more lists that mirror the structure of `nested_images` (flat list, or list of lists when
-            `is_nested=True`). Each element is paired 1:1 with the corresponding image so it can be grouped by the
-            same shape key. These paired values are grouped alongside `nested_images` but are not stacked in the output, so
-            they do not need to be tensors.
-        is_nested (bool, *optional*, defaults to False):
-            Whether the images are nested.
-    Returns:
-        tuple[dict, ...]:
-            - A dictionary with shape as key and list of images with that shape as value
-            - A dictionary with shape as key and list of paired values with that shape as value
-            - A dictionary mapping original indices to (shape, index) tuples
-            - A dictionary mapping original indices to (shape, index) tuples for each paired input
-    """
+    """Helper function to flatten a single level of nested image and batch structures and group by shape."""
    grouped_images = defaultdict(list)
    grouped_images_index = {}
    paired_grouped_values = [defaultdict(list) for _ in paired_inputs]
@ -892,20 +880,27 @@ def _reconstruct_nested_structure(indices, processed_images):
    return result


-def _iterate_items(items, is_nested: bool):
-    """
-    Helper function to iterate over items yielding (key, item) pairs.
+def _disable_grouping_output_nested(images, *paired_inputs):
+    """Build the disable_grouping output tuple for a single-level nested structure."""
+    outer_range = range(len(images))
+    inner_ranges = [range(len(images[i])) for i in outer_range]

-    For nested structures, yields ((row_index, col_index), item).
-    For flat structures, yields (index, item).
-    """
-    if is_nested:
-        for i, row in enumerate(items):
-            for j, item in enumerate(row):
-                yield (i, j), item
-    else:
-        for i, item in enumerate(items):
-            yield i, item
+    # Precompute all (i, j) pairs
+    ij_pairs = [(i, j) for i in outer_range for j in inner_ranges[i]]
+
+    images_dict = {(i, j): images[i][j].unsqueeze(0) for (i, j) in ij_pairs}
+    paired_dicts = [{(i, j): paired_list[i][j].unsqueeze(0) for (i, j) in ij_pairs} for paired_list in paired_inputs]
+    index_map = {(i, j): ((i, j), 0) for (i, j) in ij_pairs}
+    return images_dict, *paired_dicts, index_map
+
+
+def _disable_grouping_output_flat(images, *paired_inputs):
+    """Build the disable_grouping output tuple for a flat list structure."""
+    idx_range = range(len(images))
+    images_dict = {i: images[i].unsqueeze(0) for i in idx_range}
+    paired_dicts = [{i: paired_list[i].unsqueeze(0) for i in idx_range} for paired_list in paired_inputs]
+    index_map = {i: (i, 0) for i in idx_range}
+    return images_dict, *paired_dicts, index_map


 def group_images_by_shape(
@ -925,7 +920,7 @@ def group_images_by_shape(
    Args:
        images (Union[list["torch.Tensor"], "torch.Tensor"]):
            A list of images or a single tensor
-        paired_inputs (Any, *optional*):
+        *paired_inputs (Any):
            Zero or more lists that mirror the structure of `images` (flat list, or list of lists when
            `is_nested=True`). Each element is paired 1:1 with the corresponding image so it can be grouped by the
            same shape key. These paired values are grouped alongside `images` but are not stacked in the output, so
@ -949,14 +944,10 @@ def group_images_by_shape(
        disable_grouping = device == "cpu"

    if disable_grouping:
-        return (
-            {key: img.unsqueeze(0) for key, img in _iterate_items(images, is_nested)},
-            *[
-                {key: item.unsqueeze(0) for key, item in _iterate_items(paired_list, is_nested)}
-                for paired_list in paired_inputs
-            ],
-            {key: (key, 0) for key, _ in _iterate_items(images, is_nested)},
-        )
+        if is_nested:
+            return _disable_grouping_output_nested(images, *paired_inputs)
+        else:
+            return _disable_grouping_output_flat(images, *paired_inputs)

    # Handle single level nested structure
    grouped_images, *paired_grouped_values, grouped_images_index = _group_images_by_shape(
@ -999,3 +990,14 @@ def reorder_images(
        ]

    return _reconstruct_nested_structure(grouped_images_index, processed_images)
+
+
+class NumpyToTensor:
+    """
+    Callable transform that turns a numpy image into a channels-first PyTorch tensor.
+    """
+
+    def __call__(self, image: np.ndarray):
+        # Like PyTorch/torchvision, incoming numpy images are assumed to be laid out as (height, width, channels)
+        # c.f. https://github.com/pytorch/vision/blob/61d97f41bc209e1407dcfbd685d2ee2da9c1cdad/torchvision/transforms/functional.py#L154
+        channels_first = np.transpose(image, (2, 0, 1))
+        tensor = torch.from_numpy(channels_first)
+        # The transposed view is strided; make the memory layout dense before handing it out
+        return tensor.contiguous()
--- a/src/transformers/initialization.py
+++ b/src/transformers/initialization.py
@ -1,210 +0,0 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from contextlib import contextmanager
-from functools import wraps
-
-import torch
-
-
-def uniform_(
-    tensor: torch.Tensor, a: float = 0.0, b: float = 1.0, generator: torch.Generator | None = None
-) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        return torch.nn.init.uniform_(tensor, a=a, b=b, generator=generator)
-    return tensor
-
-
-def normal_(
-    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, generator: torch.Generator | None = None
-) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        return torch.nn.init.normal_(tensor, mean=mean, std=std, generator=generator)
-    return tensor
-
-
-def constant_(tensor: torch.Tensor, val: float) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        return torch.nn.init.constant_(tensor, val=val)
-    return tensor
-
-
-def ones_(tensor: torch.Tensor) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        return torch.nn.init.ones_(tensor)
-    return tensor
-
-
-def zeros_(tensor: torch.Tensor) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        return torch.nn.init.zeros_(tensor)
-    return tensor
-
-
-def eye_(tensor: torch.Tensor) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        return torch.nn.init.eye_(tensor)
-    return tensor
-
-
-def dirac_(tensor: torch.Tensor, groups: int = 1) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        return torch.nn.init.dirac_(tensor, groups=groups)
-    return tensor
-
-
-def xavier_uniform_(tensor: torch.Tensor, gain: float = 1.0, generator: torch.Generator | None = None) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        return torch.nn.init.xavier_uniform_(tensor, gain=gain, generator=generator)
-    return tensor
-
-
-def xavier_normal_(tensor: torch.Tensor, gain: float = 1.0, generator: torch.Generator | None = None) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        return torch.nn.init.xavier_normal_(tensor, gain=gain, generator=generator)
-    return tensor
-
-
-def kaiming_uniform_(
-    tensor: torch.Tensor,
-    a: float = 0,
-    mode: str = "fan_in",
-    nonlinearity: str = "leaky_relu",
-    generator: torch.Generator | None = None,
-) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        return torch.nn.init.kaiming_uniform_(tensor, a=a, mode=mode, nonlinearity=nonlinearity, generator=generator)
-    return tensor
-
-
-def kaiming_normal_(
-    tensor: torch.Tensor,
-    a: float = 0,
-    mode: str = "fan_in",
-    nonlinearity: str = "leaky_relu",
-    generator: torch.Generator | None = None,
-) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        return torch.nn.init.kaiming_normal_(tensor, a=a, mode=mode, nonlinearity=nonlinearity, generator=generator)
-    return tensor
-
-
-def trunc_normal_(
-    tensor: torch.Tensor,
-    mean: float = 0.0,
-    std: float = 1.0,
-    a: float = -2.0,
-    b: float = 2.0,
-    generator: torch.Generator | None = None,
-) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        return torch.nn.init.trunc_normal_(tensor, mean=mean, std=std, a=a, b=b, generator=generator)
-    return tensor
-
-
-def orthogonal_(
-    tensor: torch.Tensor,
-    gain: float = 1,
-    generator: torch.Generator | None = None,
-) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        return torch.nn.init.orthogonal_(tensor, gain=gain, generator=generator)
-    return tensor
-
-
-def sparse_(
-    tensor: torch.Tensor, sparsity: float, std: float = 0.01, generator: torch.Generator | None = None
-) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        return torch.nn.init.sparse_(tensor, sparsity=sparsity, std=std, generator=generator)
-    return tensor
-
-
-def copy_(tensor: torch.Tensor, other: torch.Tensor) -> torch.Tensor:
-    if not getattr(tensor, "_is_hf_initialized", False):
-        with torch.no_grad():
-            return tensor.copy_(other)
-    return tensor
-
-
-TORCH_INIT_FUNCTIONS = (
-    "uniform_",
-    "normal_",
-    "constant_",
-    "ones_",
-    "zeros_",
-    "eye_",
-    "dirac_",
-    "xavier_uniform_",
-    "xavier_normal_",
-    "kaiming_uniform_",
-    "kaiming_normal_",
-    "trunc_normal_",
-    "orthogonal_",
-    "sparse_",
-)
-
-
-@contextmanager
-def no_init_weights():
-    """
-    Context manager to globally disable weight initialization to speed up loading large models.
-    """
-    global _init_weights
-    old_init_weights = _init_weights
-
-    _init_weights = False
-
-    def _skip_init(*args, **kwargs):
-        pass
-
-    # Save the original initialization functions
-    for name, init_func in TORCH_INIT_FUNCTIONS.items():
-        setattr(torch.nn.init, name, _skip_init)
-
-    try:
-        yield
-    finally:
-        _init_weights = old_init_weights
-        # Restore the original initialization functions
-        for name, init_func in TORCH_INIT_FUNCTIONS.items():
-            setattr(torch.nn.init, name, init_func)
-
-
-@contextmanager
-def guard_torch_init():
-    """
-    Guard the `torch.nn.init` primitive functions to behave exactly like the functions in this file, i.e. be
-    protected against the `_is_hf_initialized` flag to avoid re-init if the param was already loaded.
-    """
-    originals = {}
-
-    def make_wrapper(fn):
-        @wraps(fn)
-        def wrapped(*args, **kwargs):
-            # Tensor can come positionally or as a kwarg
-            tensor = args[0] if args else kwargs.get("tensor")
-            if not getattr(tensor, "_is_hf_initialized", False):
-                return fn(*args, **kwargs)
-            return tensor
-
-        return wrapped
-
-    try:
-        for name in TORCH_INIT_FUNCTIONS:
-            originals[name] = getattr(torch.nn.init, name)
-            setattr(torch.nn.init, name, make_wrapper(originals[name]))
-        yield
-    finally:
-        for name, fn in originals.items():
-            setattr(torch.nn.init, name, fn)
--- a/src/transformers/integrations/accelerate.py
+++ b/src/transformers/integrations/accelerate.py
@ -512,8 +512,10 @@ def accelerate_disk_offload(
    checkpoint_files,
    device_map,
    checkpoint_keys,
+    key_renaming_mapping,
    sharded_metadata,
    dtype,
+    reverse_key_renaming_mapping,
 ):
    disk_only_shard_files = []
    if disk_offload_folder is not None:
@ -532,13 +534,19 @@ def accelerate_disk_offload(
            weight_map = dict.fromkeys(checkpoint_keys, checkpoint_files[0])
        else:
            folder = os.path.sep.join(checkpoint_files[0].split(os.path.sep)[:-1])
+            # Fix the weight map keys according to the key mapping
+            weight_map = {
+                key_renaming_mapping[k]: v
+                for k, v in sharded_metadata["weight_map"].items()
+                if k in key_renaming_mapping
+            }
            weight_map = {k: os.path.join(folder, v) for k, v in weight_map.items()}
            # Find potential checkpoints containing only offloaded weights
            disk_only_shard_files = get_disk_only_shard_files(device_map, weight_map)
        disk_offload_index = {
            name: {
                "safetensors_file": file,
-                "weight_name": name,
+                "weight_name": reverse_key_renaming_mapping[name],
                "dtype": str_dtype,
            }
            for name, file in weight_map.items()
--- a/src/transformers/integrations/bitsandbytes.py
+++ b/src/transformers/integrations/bitsandbytes.py
@ -1,4 +1,5 @@
 import inspect
+from copy import deepcopy
 from inspect import signature

 from ..utils import (
@ -23,6 +24,7 @@ if is_accelerate_available():
    import accelerate
    from accelerate import init_empty_weights
    from accelerate.hooks import add_hook_to_module, remove_hook_from_module
+    from accelerate.utils import find_tied_parameters

 logger = logging.get_logger(__name__)

@ -149,6 +151,52 @@ def replace_with_bnb_linear(model, modules_to_not_convert=None, current_key_name
    return model


+def get_keys_to_not_convert(model):
+    r"""
+    Utility function returning the names of the modules to keep in full precision, i.e. not to convert to int8.
+
+    For CausalLM-style models we may want to keep the lm_head in full precision for numerical stability reasons.
+    For other architectures, we want to keep the modules holding the tied weights of the model.
+
+    Parameters:
+    model (`torch.nn.Module`):
+        Input model
+
+    Returns:
+        `list[str]`: The names of the modules to not convert. When the model has no tied weights and defines an
+        output embedding, this is the name(s) of that output embedding module. Otherwise it is the modules owning
+        the tied parameters plus the module owning the last parameter of the model.
+    """
+    # Create a copy of the model and tie the weights, then
+    # check if it contains tied weights
+    tied_model = deepcopy(model)  # this has 0 cost since it is done inside `init_empty_weights` context manager
+    tied_model.tie_weights()
+
+    tied_params = find_tied_parameters(tied_model)
+    # Flatten the groups of tied parameter names into a single list
+    tied_keys = [key for group in tied_params for key in group]
+
+    # If there are no tied weights, we want to keep the lm_head (output embedding) in full precision
+    if not tied_keys:
+        output_emb = model.get_output_embeddings()
+        if output_emb is not None:
+            return [name for name, module in model.named_modules() if module is output_emb]
+
+    # Otherwise (tied weights exist, or no output embedding is defined), keep the tied weights together with the
+    # last parameter of the model in full precision
+    last_param_name = list(model.named_parameters())[-1][0]
+    list_untouched = list(set(tied_keys))
+    if last_param_name not in tied_keys:
+        list_untouched.append(last_param_name)
+
+    # Turn parameter names into module names by dropping the trailing ".weight" / ".bias". Only the suffix is
+    # removed: a plain `str.replace` would also mangle names that merely contain one of these substrings
+    # (e.g. "decoder.bias_k" would become "decoder_k").
+    suffixes_to_remove = (".weight", ".bias")
+    filtered_module_names = []
+    for name in list_untouched:
+        for suffix in suffixes_to_remove:
+            if name.endswith(suffix):
+                name = name[: -len(suffix)]
+                break
+        filtered_module_names.append(name)
+
+    return filtered_module_names
+
+
 # Copied from PEFT: https://github.com/huggingface/peft/blob/47b3712898539569c02ec5b3ed4a6c36811331a1/src/peft/utils/integrations.py#L41
 def dequantize_bnb_weight(weight: "torch.nn.Parameter", dtype: "torch.dtype", state=None):
    """
--- a/src/transformers/integrations/executorch.py
+++ b/src/transformers/integrations/executorch.py
@ -11,6 +11,7 @@
 # specific language governing permissions and limitations under the License.

 import logging
+from collections.abc import Callable
 from typing import Optional

 import torch
@ -23,7 +24,13 @@ from ..cache_utils import (
    StaticCache,
 )
 from ..generation.configuration_utils import GenerationConfig
-from ..modeling_utils import PreTrainedModel
+from ..masking_utils import (
+    ALL_MASK_ATTENTION_FUNCTIONS,
+    _ignore_causal_mask_sdpa,
+    _is_torch_greater_or_equal_than_2_5,
+    prepare_padding_mask,
+)
+from ..modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ..pytorch_utils import (
    is_torch_greater_or_equal,
    is_torch_greater_or_equal_than_2_3,
@ -222,6 +229,10 @@ class TorchExportableModuleForDecoderOnlyLM(torch.nn.Module):
                "Using `StaticCache` for export as `layer_types` is not specified or `sliding_window` is `null` in the config."
            )
            self.model = TorchExportableModuleWithStaticCache(model, batch_size, max_cache_len, device)
+        # This is the same as sdpa, but mask creation does not use `vmap` which is not exportable
+        ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", sdpa_mask_without_vmap)
+        ALL_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", ALL_ATTENTION_FUNCTIONS["sdpa"])
+        self.model.model.config._attn_implementation = "sdpa_without_vmap"

    def forward(
        self,
@ -757,6 +768,11 @@ def convert_and_export_with_cache(

    import torch.export._trace

+    # This is the same as sdpa, but mask creation does not use `vmap` which is not exportable
+    ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", sdpa_mask_without_vmap)
+    ALL_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", ALL_ATTENTION_FUNCTIONS["sdpa"])
+    model.config._attn_implementation = "sdpa_without_vmap"
+
    with torch.no_grad():
        # TODO: The default inputs only work for text models. We need to add support for vision/audio models.
        example_input_ids = (
@ -1020,6 +1036,11 @@ def export_with_dynamic_cache(
    if not is_torch_greater_or_equal_than_2_3:
        raise ImportError("torch >= 2.3 is required.")

+    # This is the same as sdpa, but mask creation does not use `vmap` which is not exportable
+    ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", sdpa_mask_without_vmap)
+    ALL_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", ALL_ATTENTION_FUNCTIONS["sdpa"])
+    model.config._attn_implementation = "sdpa_without_vmap"
+
    register_dynamic_cache_export_support()

    with torch.no_grad():
@ -1088,3 +1109,92 @@ def _unflatten_dynamic_cache(values, context: torch.utils._pytree.Context):
        value = value_list[idx] if idx < len(value_list) else None
        cache.update(key, value, idx)
    return cache
+
+
+def sdpa_mask_without_vmap(
+    batch_size: int,
+    cache_position: torch.Tensor,
+    kv_length: int,
+    kv_offset: int = 0,
+    mask_function: Optional[Callable] = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    local_size: Optional[int] = None,
+    allow_is_causal_skip: bool = True,
+    allow_torch_fix: bool = True,
+    **kwargs,
+) -> Optional[torch.Tensor]:
+    """
+    Create a 4D boolean mask of shape `(batch_size, 1, query_length, kv_length)` where a value of True indicates that
+    the element should take part in the attention computation, and False that it should not.
+
+    This is similar to `masking_utils.sdpa_mask` but does not use `vmap` which is incompatible with export.
+
+    Args:
+        batch_size (`int`):
+            The batch size of the input sequence.
+        cache_position (`torch.Tensor`):
+            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
+        kv_length (`int`):
+            The size that the key and value states will have during the attention computation.
+        kv_offset (`int`, optional):
+            An optional offset to indicate at which first position the key and values states will refer to.
+        mask_function (`Callable`, optional):
+            The mask factory function describing the mask pattern. It is accepted for signature compatibility but is
+            not called here: the pattern (causal, sliding window or chunked) is derived from `kwargs["config"]`.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
+        local_size (`int`, optional):
+            The size of the local attention, if we do not use full attention. This is used only if `allow_is_causal_skip=True`
+            to try to skip mask creation if possible.
+        allow_is_causal_skip (`bool`, optional):
+            Whether to allow to return `None` for the mask under conditions where we can use the `is_causal` argument in
+            `torch.sdpa` instead. Default to `True`.
+        allow_torch_fix (`bool`, optional):
+            Whether to update the mask in case a query is not attending to any tokens, to solve a bug in torch's older
+            versions. We need an arg to skip it when using eager. By default `True`.
+        **kwargs:
+            Must contain a `config` entry. Its `sliding_window` and `attention_chunk_size` attributes (when present and
+            not `None`) select the sliding-window or chunked pattern on top of the causal mask.
+
+    Returns:
+        `Optional[torch.Tensor]`: `None` when mask creation can be skipped (see `allow_is_causal_skip`), otherwise the
+        mask expanded to `(batch_size, 1, query_length, kv_length)`.
+
+    Raises:
+        KeyError: If `config` is not passed through `kwargs`.
+        ValueError: If the config defines both `sliding_window` and `attention_chunk_size`.
+    """
+
+    q_length = cache_position.shape[0]
+    # Potentially pad the 2D mask, and slice it correctly
+    padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset)
+
+    # Under specific conditions, we can avoid materializing the mask, instead relying on the `is_causal` argument
+    if allow_is_causal_skip and _ignore_causal_mask_sdpa(padding_mask, q_length, kv_length, local_size):
+        return None
+
+    # Similar to `kv_arange = torch.arange(start=kv_offset, end=kv_offset + kv_length, device=cache_position.device)`
+    # but without data-dependent slicing (i.e. torch.compile friendly)
+    kv_arange = torch.arange(kv_length, device=cache_position.device)
+    kv_arange += kv_offset
+    # Column vector of query positions, so comparisons against `kv_arange` broadcast to (query_length, kv_length)
+    reshaped_cache_position = cache_position.view(-1, 1)
+
+    # This is a bit hacky to know what pattern we are using, but all mask creation function actually forward
+    # the config through kwargs anyway, so it allows to rely on it
+    # Usually, the `mask_function` is the only entry-point to define the pattern - we could do for loops over it,
+    # but this is more efficient
+    sliding_window = getattr(kwargs["config"], "sliding_window", None)
+    chunk_size = getattr(kwargs["config"], "attention_chunk_size", None)
+
+    if sliding_window is not None and chunk_size is not None:
+        raise ValueError("Cannot use both `sliding_window` and `attention_chunk_size`")
+
+    # Simplest and most efficient way to obtain a causal mask
+    causal_mask = kv_arange <= reshaped_cache_position
+    # If using sliding window, add the sliding mask
+    if sliding_window is not None:
+        # Keep only the keys that are strictly less than `sliding_window` positions behind the query
+        sliding_mask_overlay = kv_arange > reshaped_cache_position - sliding_window
+        causal_mask *= sliding_mask_overlay
+    # If using chunk attention, add the chunked mask
+    elif chunk_size is not None:
+        # Keep only the keys that fall in the same chunk as the query
+        chunked_mask_overlay = kv_arange // chunk_size == reshaped_cache_position // chunk_size
+        causal_mask *= chunked_mask_overlay
+
+    # Add the batch and head dimensions, then expand the batch dimension to `batch_size`
+    causal_mask = causal_mask[None, None, :, :].expand(batch_size, -1, -1, -1)
+    if padding_mask is not None:
+        # Broadcast the 2D padding mask over the head and query dimensions
+        causal_mask = causal_mask * padding_mask[:, None, None, :]
+
+    # Due to a bug in some older torch version, we need to update the mask in case a query is not attending to any
+    # tokens (due to padding). See details in https://github.com/pytorch/pytorch/issues/110213
+    if not _is_torch_greater_or_equal_than_2_5 and allow_torch_fix:
+        causal_mask |= torch.all(~causal_mask, dim=-1, keepdim=True)
+    return causal_mask
--- a/src/transformers/integrations/finegrained_fp8.py
+++ b/src/transformers/integrations/finegrained_fp8.py
@ -13,11 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import re
-from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Optional

-from ..core_model_loading import ConversionOps
 from ..utils import is_accelerate_available, is_torch_accelerator_available, is_torch_available, logging


@ -33,18 +30,6 @@ if is_accelerate_available():


 logger = logging.get_logger(__name__)
-try:
-    _FP8_DTYPE = torch.float8_e4m3fn
-    _FP8_MIN = torch.finfo(_FP8_DTYPE).min
-    _FP8_MAX = torch.finfo(_FP8_DTYPE).max
-    _FP8_IS_INT = False
-except AttributeError:
-    _FP8_DTYPE = torch.int8
-    _FP8_MIN, _FP8_MAX = -127, 127
-    _FP8_IS_INT = True
-    logger.warning_once(
-        "torch.float8_e4m3fn not available; falling back to int8 emulation for Fp8Quantize operations."
-    )


 # Copied from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py
@ -347,12 +332,6 @@ class FP8Linear(nn.Linear):
        if self.weight.element_size() > 1:
            return F.linear(input, self.weight, self.bias)
        else:
-            if isinstance(self.weight, torch.distributed.tensor.DTensor):
-                weight = self.weight._local_tensor.contiguous()
-                scale_inv = self.weight_scale_inv._local_tensor.contiguous()
-            else:
-                weight = self.weight.contiguous()
-                scale_inv = self.weight_scale_inv.contiguous()
            # Context manager used to switch among the available accelerators
            device_type = torch.accelerator.current_accelerator().type if is_torch_accelerator_available() else "cuda"
            torch_accelerator_module = getattr(torch, device_type, torch.cuda)
@ -360,9 +339,9 @@ class FP8Linear(nn.Linear):
                qinput, scale = act_quant(input, self.block_size[1])
                output = w8a8_block_fp8_matmul_triton(
                    qinput,
-                    weight,
+                    self.weight,
                    scale,
-                    scale_inv,
+                    self.weight_scale_inv,
                    self.block_size,
                    output_dtype=input.dtype,
                )
@ -371,124 +350,9 @@ class FP8Linear(nn.Linear):
            torch_accelerator_module.synchronize()
            if self.bias is not None:
                output = output + self.bias
-            output = torch.nan_to_num(output, nan=0.0)
            return output.to(dtype=input.dtype)


-def _ceil_div(a, b):
-    return (a + b - 1) // b
-
-
-class FP8Expert(nn.Module):
-    dtype = torch.float8_e4m3fn
-
-    def __init__(self, config, block_size, device):
-        super().__init__()
-
-        from ..activations import ACT2FN
-
-        self.block_size = block_size
-        self.num_experts = config.num_local_experts
-        self.hidden_dim = config.hidden_size
-        self.intermediate_dim = config.intermediate_size
-
-        Wg_out, Wg_in = 2 * self.intermediate_dim, self.hidden_dim
-        Wd_out, Wd_in = self.hidden_dim, self.intermediate_dim
-
-        self.gate_up_proj = nn.Parameter(
-            torch.zeros(self.num_experts, Wg_out, Wg_in, dtype=FP8Expert.dtype, device=device)
-        )
-        self.down_proj = nn.Parameter(
-            torch.zeros(self.num_experts, Wd_out, Wd_in, dtype=FP8Expert.dtype, device=device)
-        )
-
-        # Create inverse scale tiles only when using 1-byte types (fp8)
-        if self.gate_up_proj.element_size() == 1:
-            bo, bi = self.block_size
-
-            # gate_up tiles: ceil(Wg_out/bo) x ceil(Wg_in/bi)
-            gu_scale_o = _ceil_div(Wg_out, bo)
-            gu_scale_i = _ceil_div(Wg_in, bi)
-            self.gate_up_proj_scales_inv = nn.Parameter(
-                torch.zeros(self.num_experts, gu_scale_o, gu_scale_i, dtype=torch.float32, device=device)
-            )
-
-            # down tiles: ceil(Wd_out/bo) x ceil(Wd_in/bi)
-            dp_scale_o = _ceil_div(Wd_out, bo)
-            dp_scale_i = _ceil_div(Wd_in, bi)
-            self.down_proj_scales_inv = nn.Parameter(
-                torch.zeros(self.num_experts, dp_scale_o, dp_scale_i, dtype=torch.float32, device=device)
-            )
-        else:
-            # Match FP8Linear behavior when not using 1-byte weights
-            self.register_parameter("gate_up_proj_scale_inv", None)
-            self.register_parameter("down_proj_scale_inv", None)
-
-        # (Optional) bias per projection — many MoEs omit bias; keep None to match your FP8Linear default
-        self.register_parameter("gate_up_bias", None)
-        self.register_parameter("down_bias", None)
-
-        # Activation used in the MLP (same as your config / ACT2FN)
-        # Keep a handle here; actual usage happens in forward of your MoE block
-        self.act_fn = ACT2FN[config.hidden_act]
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        top_k_index: torch.Tensor,
-        top_k_weights: torch.Tensor,
-    ) -> torch.Tensor:
-        final_hidden_states = torch.zeros_like(hidden_states)
-        num_experts = top_k_weights.shape[1]
-        with torch.no_grad():
-            expert_mask = torch.nn.functional.one_hot(top_k_index, num_classes=num_experts + 1)
-            expert_mask = expert_mask.permute(2, 1, 0)
-            expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-
-        for expert_idx in expert_hit:
-            expert_idx = expert_idx[0]
-            if expert_idx == num_experts:
-                continue
-            _, token_idx = torch.where(expert_mask[expert_idx])
-            current_state = hidden_states.index_select(0, token_idx)
-            gate, up = self.linear(
-                current_state, self.gate_up_proj[expert_idx], self.gate_up_proj_scales_inv[expert_idx]
-            ).chunk(2, dim=-1)
-            current_hidden_states = self.act_fn(gate) * up
-            current_hidden_states = self.linear(
-                current_hidden_states, self.down_proj[expert_idx], self.down_proj_scales_inv[expert_idx]
-            )
-
-            routing_weights = top_k_weights[token_idx, expert_idx].unsqueeze(-1)
-            current_hidden_states = current_hidden_states * routing_weights.to(current_hidden_states.dtype)
-            final_hidden_states.index_add_(0, token_idx, current_hidden_states.to(final_hidden_states.dtype))
-
-        return final_hidden_states
-
-    def linear(self, input: torch.Tensor, weight: torch.Tensor, weight_scale_inv: torch.Tensor) -> torch.Tensor:
-        if weight.element_size() > 1:
-            return F.linear(input, weight, None)
-        else:
-            # Context manager used to switch among the available accelerators
-            device_type = torch.accelerator.current_accelerator().type if is_torch_accelerator_available() else "cuda"
-            torch_accelerator_module = getattr(torch, device_type, torch.cuda)
-            with torch_accelerator_module.device(input.device):
-                qinput, scale = act_quant(input, self.block_size[1])
-                output = w8a8_block_fp8_matmul_triton(
-                    qinput,
-                    weight,
-                    scale,
-                    weight_scale_inv,
-                    self.block_size,
-                    output_dtype=input.dtype,
-                )
-            # Blocks the CPU until all accelerator operations on the specified device are complete. It is used to ensure that the results of the
-            # preceding operations are ready before proceeding
-            torch_accelerator_module.synchronize()
-            return output.to(dtype=input.dtype)
-
-
-# TODO: we do need this.... but not recursive...
 def _replace_with_fp8_linear(
    model,
    tp_plan=None,
@ -497,48 +361,40 @@ def _replace_with_fp8_linear(
    quantization_config=None,
    has_been_replaced=False,
 ):
-    iterator = list(model.named_parameters()).copy()
-    for name, empty_tensor in iterator:
-        current_key_name = name
-        name = name.rsplit(".", 1)[0] if "." in name else name
-        module = model.get_submodule(name)
+    """Replace Linear layers with FP8Linear."""
+    if current_key_name is None:
+        current_key_name = []

-        current_key_name_str = re.sub(r"\d+", "*", current_key_name)
-        if not any(key in current_key_name_str for key in (modules_to_not_convert or [])):
-            with init_empty_weights():
-                if (
-                    "gate_up_proj" in current_key_name
-                    or "down_proj" in current_key_name
-                    and "experts" in current_key_name
-                ):  # Experts!
-                    in_features = empty_tensor.size(-2)
-                    out_features = empty_tensor.size(-1)
-                    model.set_submodule(
-                        name,
-                        FP8Expert(
-                            config=model.config,
-                            block_size=quantization_config.weight_block_size,
-                            device=empty_tensor.device,
-                        ),
-                    )
+    for name, module in model.named_children():
+        current_key_name.append(name)

-                elif isinstance(module, nn.Linear):
-                    in_features = module.in_features
-                    out_features = module.out_features
-                    model.set_submodule(
-                        name,
-                        FP8Linear(
-                            in_features=in_features,
-                            out_features=out_features,
-                            bias=module.bias is not None,
-                            device=module.weight.device,
-                            dtype=module.weight.dtype,
-                            activation_scheme=quantization_config.activation_scheme,
-                            block_size=quantization_config.weight_block_size,
-                        ),
+        if isinstance(module, nn.Linear) and name not in (modules_to_not_convert or []):
+            current_key_name_str = ".".join(current_key_name)
+            if not any(key in current_key_name_str for key in (modules_to_not_convert or [])):
+                with init_empty_weights():
+                    model._modules[name] = FP8Linear(
+                        in_features=module.in_features,
+                        out_features=module.out_features,
+                        bias=module.bias is not None,
+                        device=module.weight.device,
+                        dtype=module.weight.dtype,
+                        activation_scheme=quantization_config.activation_scheme,
+                        block_size=quantization_config.weight_block_size,
                    )
-                has_been_replaced = True
-        # when changing a layer the TP PLAN for that layer should be updated. TODO
+                    has_been_replaced = True
+            # when changing a layer the TP PLAN for that layer should be updated. TODO
+
+        if len(list(module.children())) > 0:
+            _, has_been_replaced = _replace_with_fp8_linear(
+                module,
+                tp_plan,
+                modules_to_not_convert,
+                current_key_name,
+                quantization_config,
+                has_been_replaced=has_been_replaced,
+            )
+
+        current_key_name.pop(-1)

    return model, has_been_replaced

@ -549,7 +405,7 @@ def replace_with_fp8_linear(
    quantization_config=None,
 ):
    """Helper function to replace model layers with FP8 versions."""
-    modules_to_not_convert += ["lm_head"]
+    modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert

    if quantization_config.modules_to_not_convert is not None:
        modules_to_not_convert.extend(quantization_config.modules_to_not_convert)
@ -568,133 +424,3 @@ def replace_with_fp8_linear(
        )

    return model
-
-
-class QuantizationOp(ConversionOps):
-    """Base class for quantization operations."""
-
-    pass
-
-
-class Fp8Quantize(QuantizationOp):
-    """
-    A quantization operation that creates two tensors, weight and scale out of a weight.
-    """
-
-    reverse_op: type[ConversionOps]
-
-    def __init__(self, block_size: Optional[tuple[int, int]] = None):
-        self.block_size = block_size
-        self.reverse_op = Fp8Dequantize
-
-    def convert(self, input_dict: torch.Tensor, *, quant_config: dict[str, Any]) -> dict[str, torch.Tensor]:
-        # Unpack single key/value (value may be wrapped in a list)
-        target_keys, value = tuple(input_dict.items())[0]
-        value = value[0] if isinstance(value, list) else value
-
-        # Resolve block size (support dict-like or attr-like quant_config)
-        block_size = None
-        if quant_config is not None:
-            if isinstance(quant_config, dict):
-                block_size = quant_config.get("weight_block_size")
-            else:
-                block_size = getattr(quant_config, "weight_block_size", None)
-        if block_size is None:
-            block_size = (value.shape[-2], value.shape[-1])
-
-        block_m, block_n = block_size
-        rows, cols = value.shape[-2], value.shape[-1]
-
-        # Enforce exact tiling like your original
-        if rows % block_m != 0 or cols % block_n != 0:
-            raise ValueError(
-                f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_m}, {block_n}). for {target_keys}"
-            )
-
-        # Leading dims can be empty (2D) or include num_experts/... (3D+)
-        leading_shape = value.shape[:-2]
-        rows_tiles = rows // block_m
-        cols_tiles = cols // block_n
-
-        original_shape = value.shape
-        value_fp32 = value.to(torch.float32)
-
-        # Reshape to (..., rows_tiles, block_m, cols_tiles, block_n)
-        reshaped = value_fp32.reshape(*leading_shape, rows_tiles, block_m, cols_tiles, block_n)
-
-        # Per-tile max-abs over the block dims
-        # dims: block_m is at -3, block_n is at -1 after the reshape
-        max_abs = reshaped.abs().amax(dim=(-3, -1))
-        safe_max_abs = torch.where(max_abs > 0, max_abs, torch.ones_like(max_abs))
-
-        # Tile scale (we store inverse scale like your Linear: weight_scale_inv)
-        scales = _FP8_MAX / safe_max_abs
-        scales = torch.where(max_abs > 0, scales, torch.ones_like(scales))  # keep zeros stable
-
-        # Broadcast scales back over the block dims and quantize
-        # max_abs/scales shape: (..., rows_tiles, cols_tiles)
-        scales_broadcast = scales.unsqueeze(-1).unsqueeze(-3)  # -> (..., rows_tiles, 1, cols_tiles, 1)
-        scaled = reshaped * scales_broadcast
-
-        if _FP8_IS_INT:
-            quantized = torch.clamp(scaled.round(), min=_FP8_MIN, max=_FP8_MAX).to(_FP8_DTYPE)
-        else:
-            quantized = torch.clamp(scaled, min=_FP8_MIN, max=_FP8_MAX).to(_FP8_DTYPE)
-
-        quantized = quantized.reshape(original_shape)
-
-        inv_scales = (1.0 / scales).to(torch.float32)  # shape: (*leading, rows_tiles, cols_tiles)
-        if target_keys.endswith("weight"):
-            scale_key = target_keys.rsplit(".", 1)[0] + ".weight_scale_inv"
-        else:
-            scale_key = target_keys + "_scales_inv"
-
-        # Return both quantized weights and per-tile inverse scales (keeps leading dims, e.g., num_experts)
-        return {
-            target_keys: quantized,
-            scale_key: inv_scales,
-        }
-
-
-class Fp8Dequantize(QuantizationOp):
-    """Inverse operation of :class:`Fp8Quantize`. Takes a pair (weight, scale) and reconstructs the fp32 tensor."""
-
-    def __init__(self, block_size: Optional[tuple[int, int]] = None):
-        self.block_size = block_size
-        self.reverse_op = Fp8Quantize
-
-    def convert(
-        self,
-        value: Union[Sequence[torch.Tensor], dict[str, torch.Tensor]],
-        *,
-        context: dict[str, Any],
-    ) -> torch.Tensor:
-        if isinstance(value, dict):
-            tensors = list(value.values())
-        else:
-            tensors = list(value) if isinstance(value, Sequence) else [value]
-        if len(tensors) != 2:
-            raise ValueError("Fp8Dequantize expects exactly two tensors: quantized weights and scales.")
-        quantized, scales = tensors
-        if not isinstance(quantized, torch.Tensor) or not isinstance(scales, torch.Tensor):
-            raise TypeError("Fp8Dequantize expects tensors as inputs.")
-
-        quantized_fp32 = quantized.to(torch.float32)
-        rows, cols = quantized_fp32.shape[-2:]
-        block_size = self.block_size
-        if block_size is None:
-            quant_config = context.get("quantization_config")
-            block_size = getattr(quant_config, "weight_block_size", None)
-        if block_size is None:
-            block_size = (rows, cols)
-        block_m, block_n = block_size
-        if rows % block_m != 0 or cols % block_n != 0:
-            raise ValueError(
-                f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_m}, {block_n})."
-            )
-
-        reshaped = quantized_fp32.reshape(-1, rows // block_m, block_m, cols // block_n, block_n)
-        expanded_scales = scales.to(torch.float32).reshape(-1, rows // block_m, cols // block_n)
-        expanded_scales = expanded_scales.unsqueeze(-1).unsqueeze(2)
-        dequantized = reshaped * expanded_scales
-        return dequantized.reshape(quantized_fp32.shape)
--- a/src/transformers/integrations/integration_utils.py
+++ b/src/transformers/integrations/integration_utils.py
@ -38,7 +38,7 @@ from transformers.utils.import_utils import _is_package_available


 if os.getenv("WANDB_MODE") == "offline":
-    print("[INFO] Running in WANDB offline mode")
+    print("⚙️  Running in WANDB offline mode")

 from .. import PreTrainedModel, TrainingArguments
 from .. import __version__ as version
--- a/src/transformers/integrations/mxfp4.py
+++ b/src/transformers/integrations/mxfp4.py
@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from ..utils import is_accelerate_available, is_torch_available, is_torch_xpu_available, logging
+from ..utils import is_accelerate_available, is_torch_available, logging


 if is_torch_available():
@ -114,9 +114,6 @@ def convert_moe_packed_tensors(
    if not blocks.is_cuda and torch.cuda.is_available():
        blocks = blocks.cuda()
        scales = scales.cuda()
-    elif (blocks.device.type != "xpu") and is_torch_xpu_available():
-        blocks = blocks.to("xpu")
-        scales = scales.to("xpu")

    scales = scales.to(torch.int32) - 127  # TODO that's because 128=2**7

@ -354,8 +351,6 @@ def dequantize(module, param_name, param_value, target_device, dq_param_name, **
                dequantized = convert_moe_packed_tensors(getattr(module, blocks_attr), getattr(module, scales_attr))
                if target_device == "cpu" and torch.cuda.is_available():
                    torch.cuda.empty_cache()
-                elif target_device == "cpu" and is_torch_xpu_available():
-                    torch.xpu.empty_cache()
                setattr(module, proj, torch.nn.Parameter(dequantized.to(target_device)))
                delattr(module, blocks_attr)
                delattr(module, scales_attr)
@ -400,7 +395,7 @@ def load_and_swizzle_mxfp4(module, param_name, param_value, target_device, trito
        else:
            blocks = blocks.reshape(local_experts, -1, module.intermediate_size // 2)
        if getattr(target_device, "type", target_device) == "cpu":
-            target_device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
+            target_device = "cuda"
        blocks = blocks.to(target_device).contiguous()
        scales = scales.to(target_device).contiguous()
        with on_device(target_device):
--- a/src/transformers/integrations/peft.py
+++ b/src/transformers/integrations/peft.py
@ -236,7 +236,7 @@ class PeftAdapterMixin:
                **adapter_kwargs,
            )
            peft_config.inference_mode = not is_trainable
-        # TODO: WE NEED TOO APPLY OUR DYNAMIC WEIGHT CONVERSION AT SOME POINT HERE!
+
        # Create and add fresh new adapters into the model.
        inject_adapter_in_model(peft_config, self, adapter_name, **peft_load_kwargs)

--- a/src/transformers/integrations/tensor_parallel.py
+++ b/src/transformers/integrations/tensor_parallel.py
@ -18,7 +18,6 @@ import operator
 import os
 import re
 from functools import partial, reduce
-from typing import Optional

 import torch
 import torch.distributed as dist
@ -307,7 +306,7 @@ def repack_weights(
    return final_ordered_tensor


-def get_tensor_shard(param, empty_param, device_mesh, rank, dim, tensor_idx: Optional[int] = None):
+def get_tensor_shard(param, empty_param, device_mesh, rank, dim):
    """
    Generalized tensor sharding across a multi-dimensional device mesh.
    Extract only the fraction of the parameter owned by the given `rank` when the parameter would have gone sharding at provided `dim`.
@ -359,57 +358,32 @@ def get_tensor_shard(param, empty_param, device_mesh, rank, dim, tensor_idx: Opt
        rank (int): Global rank of the current process/device.
        dim (int): Dimension along which to shard the tensor.
    """
-    param_dim = empty_param.ndim
+    param_dim = empty_param.dim()
+
+    if dim < 0:
+        dim = param_dim + dim
+    if dim >= param_dim:
+        raise ValueError(f"dim {dim} is out of bounds for tensor of dimension {param_dim}")
+
    # Flatten the mesh to get the total number of devices
    mesh_shape = device_mesh.shape
    world_size = reduce(operator.mul, mesh_shape)
-    if dim < 0:
-        dim = param_dim + dim
-    if empty_param.dim() == 3 and dim == 1 and len(param.get_shape()) == 2:
-        dim = 0
-    elif empty_param.dim() == 3 and dim == 2 and len(param.get_shape()) == 2:
-        dim = 0
-
-    shard_size = math.ceil(empty_param.size(dim) / world_size)
-    start = rank * shard_size
-    end = min(start + shard_size, empty_param.size(dim))
-
-    if dim >= param_dim:
-        raise ValueError(f"dim {dim} is out of bounds for tensor of dimension {param_dim}")

    if rank >= world_size:
        raise ValueError(f"Rank {rank} is out of bounds for mesh size {world_size}")

-    # we have the full tensor not 1 part of it.
-    # in that case, we just assume that the weight was properly saved
-    # and thus because we TP if the layer is colwise it should not use this. Layer should be packed_colwise
-    # to inform that it needs to read form a packed tensor. It will also take care of the module list thingy.
-    # here we take care of potential chunking / layer split / layer chunking.
-    # The only "hard" case is? if we collect q,k,v -> merge it into qkv. In that case
-    # actually we still shard dim=0 does not change
-    # so only case is if the dim of the empty param is 3 and the shard dim is 0 -> we put the
-    # tensor on a certain device (with the input tensor_index)
-    dimensions = param.get_shape()
+    shard_size = math.ceil(empty_param.shape[dim] / world_size)
+    start = rank * shard_size

-    if empty_param.dim() == 3 and dim == 0 and len(param.get_shape()) == 2:
-        # special case we don't "shard" just send this entire tensor to the correct rank.
-        if start <= tensor_idx < end:
-            # this tensor does need to be materialized on this device:
-            return param[:]
-        else:
-            return torch.empty([], dtype=torch.int64, device=rank)
-
-    slice_indices = [slice(None)] * len(param.get_shape())
-
-    if start < param.get_shape()[dim]:
+    # Construct slicing index dynamically
+    end = min(start + shard_size, empty_param.shape[dim])
+    slice_indices = [slice(None)] * param_dim
+    if start < empty_param.shape[dim]:
        slice_indices[dim] = slice(start, end)
-        param = param[tuple(slice_indices)]
-        if isinstance(param, list):  # TODO handle the modulelist case!
-            param = [p[:] for p in param]
-        return param
-
+        return param[tuple(slice_indices)]
+    dimensions = list(param.shape)
    dimensions[dim] = 0
-    return torch.empty(tuple(dimensions), dtype=torch.int64)  # empty allocates memory....
+    return torch.empty(tuple(dimensions), dtype=torch.int64)


 def distribute_module(
@ -436,19 +410,6 @@ class TensorParallelLayer:
    """

    use_dtensor = True
-    device_mesh = None
-    rank = None
-
-    # Used to compare the shape of the original tensor
-    empty_param = None
-
-    # Used to init the corresponding DTensor
-    shard = None
-
-    def __init__(self, device_mesh=None, rank=None, empty_param=None):
-        self.rank = rank
-        self.device_mesh = device_mesh
-        self.empty_param = empty_param

    @staticmethod
    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh): ...
@ -478,12 +439,12 @@ class GatherParallel(TensorParallelLayer):

    def __init__(
        self,
+        *,
        input_layouts: Placement | None = None,
        output_layouts: Placement | None = None,
        use_local_output: bool = True,
-        **kwargs,
    ):
-        super().__init__(**kwargs)
+        super().__init__()
        self.input_layouts = (input_layouts or Replicate(),)
        self.output_layouts = output_layouts
        self.desired_input_layouts = (Replicate(),)
@ -504,21 +465,6 @@ class GatherParallel(TensorParallelLayer):
            dist.all_reduce(outputs[0], op=dist.ReduceOp.SUM, async_op=False)
        return outputs

-    def shard_tensor(
-        self,
-        param,
-        param_type=None,
-        param_casting_dtype=None,
-        to_contiguous=None,
-        rank=None,
-        device_mesh=None,
-        tensor_idx=None,
-    ):
-        shard = [Replicate()]
-        parameter = param[...].to(param_casting_dtype)
-        self.shard = shard
-        return parameter, shard
-
    def prepare_module_tp(self, module: nn.Module, device_mesh) -> nn.Module:
        distribute_module(
            module,
@ -547,23 +493,6 @@ class IsolatedParallel(TensorParallelLayer):
        # TODO: figure out dynamo support for instance method and switch this to instance method
        return outputs

-    def shard_tensor(
-        self,
-        param,
-        param_type=None,
-        param_casting_dtype=None,
-        to_contiguous=None,
-        rank=None,
-        device_mesh=None,
-        tensor_idx=None,
-    ):
-        mesh = device_mesh or self.device_mesh
-        parameter = param[...].to(param_casting_dtype)
-        if mesh is not None:
-            parameter = parameter / mesh.size()
-        self.shard = None
-        return parameter, None
-
    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        param = param[...].to(param_casting_dtype)
        if to_contiguous:
@ -586,8 +515,8 @@ class ReplicateParallel(TensorParallelLayer):
    This class is used to replicate computation in a TP layer (used in SP regions when we don't use sequence parallelism for example)
    """

-    def __init__(self, use_dtensor=True, use_local_output=True, **kwargs):
-        super().__init__(**kwargs)
+    def __init__(self, *, use_dtensor=True, use_local_output=True):
+        super().__init__()
        self.input_layouts = (Replicate(),)
        self.output_layouts = (Replicate(),)
        self.desired_input_layouts = (Replicate(),)
@ -608,33 +537,12 @@ class ReplicateParallel(TensorParallelLayer):
    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
        return outputs.to_local() if use_local_output and isinstance(outputs, DTensor) else outputs

-    def shard_tensor(
-        self,
-        param,
-        param_type=None,
-        param_casting_dtype=None,
-        to_contiguous=None,
-        rank=None,
-        device_mesh=None,
-        tensor_idx=None,
-    ):
-        parameter = param[...].to(param_casting_dtype)
-        shard = [Replicate()]
-        self.shard = shard
-        return parameter, shard
-
    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
-        parameter, shard = self.shard_tensor(
-            param,
-            param_type=param_type,
-            param_casting_dtype=param_casting_dtype,
-            to_contiguous=to_contiguous,
-            rank=rank,
-            device_mesh=device_mesh,
-        )
-        if self.use_dtensor:
-            parameter = DTensor.from_local(parameter, device_mesh, shard, run_check=False)
-        return parameter
+        param = param[...].to(param_casting_dtype)
+        if to_contiguous:
+            param = param.contiguous()
+        param = DTensor.from_local(param, device_mesh, [Replicate()], run_check=False)
+        return param


 class ColwiseParallel(TensorParallelLayer):
@ -644,13 +552,13 @@ class ColwiseParallel(TensorParallelLayer):

    def __init__(
        self,
+        *,
        input_layouts: Placement | None = None,
        output_layouts: Placement | None = None,
        use_local_output: bool = True,
        use_dtensor=True,
-        **kwargs,
    ):
-        super().__init__(**kwargs)
+        super().__init__()
        self.input_layouts = (input_layouts or Replicate(),)
        self.output_layouts = (output_layouts or Shard(-1),)
        self.desired_input_layouts = (Replicate(),)
@ -670,34 +578,18 @@ class ColwiseParallel(TensorParallelLayer):
            input_tensor = input_tensor.redistribute(placements=desired_input_layouts, async_op=False)
        return input_tensor

-    def shard_tensor(
-        self,
-        param,
-        param_type=None,
-        param_casting_dtype=None,
-        to_contiguous=None,
-        rank=None,
-        device_mesh=None,
-        tensor_idx=None,
-    ):
-        device_mesh = self.device_mesh
-        empty_param = self.empty_param
-        rank = self.rank
-        if param_type == "bias":
-            parameter = get_tensor_shard(param, empty_param, device_mesh, rank, -1, tensor_idx)
-            shard = [Shard(-1)]
-        else:
-            shard = [Shard(-2)]
-            parameter = get_tensor_shard(param, empty_param, device_mesh, rank, -2, tensor_idx)
-        parameter = parameter.to(param_casting_dtype)
-        self.shard = shard
-        return parameter, shard
-
    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        # colwise shard weight/bias to Shard(0), weight be Shard(-2) (0 if you have 1 dim only)
        # means Colwise as Linear is input * weight^T + bias, where
        # weight would become Shard(1)
-        parameter, shard = self.shard_tensor(param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh)
+        if param_type == "bias":
+            parameter = get_tensor_shard(param, empty_param, device_mesh, rank, -1)
+            shard = [Shard(-1)]
+        else:
+            shard = [Shard(-2)]
+            parameter = get_tensor_shard(param, empty_param, device_mesh, rank, -2)
+
+        parameter = parameter.to(param_casting_dtype)
        if to_contiguous:
            parameter = parameter.contiguous()
        if self.use_dtensor:
@ -716,21 +608,6 @@ class ColwiseParallel(TensorParallelLayer):


 class PackedColwiseParallel(ColwiseParallel):
-    def shard_tensor(
-        self,
-        param,
-        param_type=None,
-        param_casting_dtype=None,
-        to_contiguous=None,
-        rank=None,
-        device_mesh=None,
-        tensor_idx=None,
-    ):
-        device_mesh = device_mesh or self.device_mesh
-        empty_param = self.empty_param
-        rank = rank if rank is not None else self.rank
-        return get_packed_weights(param, empty_param, device_mesh, rank, -2).to(param_casting_dtype), [Shard(-2)]
-
    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        # colwise shard weight/bias to Shard(0), weight be Shard(-2) (0 if you have 1 dim only)
        # means Colwise as Linear is input * weight^T + bias, where
@ -765,41 +642,18 @@ class RowwiseParallel(TensorParallelLayer):

    def __init__(
        self,
+        *,
        input_layouts: Placement | None = None,
        output_layouts: Placement | None = None,
        use_local_output: bool = True,
        use_dtensor=True,
-        **kwargs,
    ):
-        super().__init__(**kwargs)
+        super().__init__()
        self.input_layouts = (input_layouts or Shard(-1),)
        self.output_layouts = (output_layouts or Replicate(),)
        self.use_local_output = use_local_output
        self.use_dtensor = use_dtensor

-    def shard_tensor(
-        self,
-        param,
-        param_type=None,
-        param_casting_dtype=None,
-        to_contiguous=None,
-        rank=None,
-        device_mesh=None,
-        tensor_idx=None,
-    ):
-        device_mesh = device_mesh or self.device_mesh
-        empty_param = self.empty_param
-        rank = rank if rank is not None else self.rank
-        if param_type == "bias":
-            shard = [Replicate()]
-            parameter = param[...]
-        else:
-            parameter = get_tensor_shard(param, empty_param, device_mesh, rank, -1, tensor_idx=tensor_idx)
-            shard = [Shard(-1)]
-        parameter = parameter.to(param_casting_dtype)
-        self.shard = shard
-        return parameter, shard
-
    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        # Rowwise shard weight to Shard(1), bias to Replicate(), weight be Shard(1)
        # means Rowwise as nn.Linear is input * weight^T + bias, where
@ -871,21 +725,6 @@ class RowwiseParallel(TensorParallelLayer):


 class PackedRowwiseParallel(RowwiseParallel):
-    def shard_tensor(
-        self,
-        param,
-        param_type=None,
-        param_casting_dtype=None,
-        to_contiguous=None,
-        rank=None,
-        device_mesh=None,
-        tensor_idx=None,
-    ):
-        device_mesh = device_mesh or self.device_mesh
-        empty_param = self.empty_param
-        rank = rank if rank is not None else self.rank
-        return get_packed_weights(param, empty_param, device_mesh, rank, -1), [Shard(-1)]
-
    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        # colwise shard weight/bias to Shard(0), weight be Shard(-2) (0 if you have 1 dim only)
        # means Colwise as Linear is input * weight^T + bias, where
@ -944,8 +783,8 @@ class SequenceParallel(TensorParallelLayer):
        to ensure that they are replicated.
    """

-    def __init__(self, sequence_dim: int = 1, use_local_output: bool = False, use_dtensor=False, **kwargs):
-        super().__init__(**kwargs)
+    def __init__(self, *, sequence_dim: int = 1, use_local_output: bool = False, use_dtensor=False):
+        super().__init__()
        self.input_layouts = (Replicate(),)
        self.desired_input_layouts = (Shard(1),)
        self.output_layouts = (Replicate(),)
@ -954,21 +793,6 @@ class SequenceParallel(TensorParallelLayer):
        self.sequence_sharding = (Shard(sequence_dim),)
        self.use_local_output = use_local_output

-    def shard_tensor(
-        self,
-        param,
-        param_type=None,
-        param_casting_dtype=None,
-        to_contiguous=None,
-        rank=None,
-        device_mesh=None,
-        tensor_idx=None,
-    ):
-        parameter = param[...].to(param_casting_dtype)
-        shard = [Replicate()]
-        self.shard = shard
-        return parameter, shard
-
    @staticmethod
    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
        input_tensor = inputs[0]
@ -1003,34 +827,10 @@ class GroupedGemmParallel(TensorParallelLayer):
    Applies Expert Parallelism to MoE experts by loading the correct experts on each device.
    """

-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
+    def __init__(self):
+        super().__init__()
        self.use_dtensor = False

-    def shard_tensor(
-        self,
-        param,
-        param_type=None,
-        param_casting_dtype=None,
-        to_contiguous=None,
-        rank=None,
-        device_mesh=None,
-        tensor_idx=None,
-    ):
-        empty_param = self.empty_param
-        ep_rank = self.rank
-        device_mesh = self.device_mesh
-
-        global_num_experts = empty_param.shape[0]
-        if global_num_experts % device_mesh.size() != 0:
-            raise ValueError(
-                f"Global number of experts must be divisible by number of devices: {global_num_experts} % {device_mesh.size()} != 0"
-            )
-        local_num_experts = global_num_experts // device_mesh.size()
-        parameter = param[ep_rank * local_num_experts : (ep_rank + 1) * local_num_experts].to(param_casting_dtype)
-        self.shard = None
-        return parameter, None
-
    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        ep_rank = rank
        global_num_experts = empty_param.shape[0]
@ -1051,8 +851,8 @@ class RouterParallel(TensorParallelLayer):
    """

    def __init__(self, *args, **kwargs):
-        super().__init__(**kwargs)
        self.args = args
+        self.kwargs = kwargs
        self.use_dtensor = False

    @staticmethod
@ -1117,20 +917,6 @@ class RouterParallel(TensorParallelLayer):
        )  # masking class for one hot
        return router_scores, router_indices

-    def shard_tensor(
-        self,
-        param,
-        param_type=None,
-        param_casting_dtype=None,
-        to_contiguous=None,
-        rank=None,
-        device_mesh=None,
-        tensor_idx=None,
-    ):
-        parameter = param[...].to(param_casting_dtype)
-        self.shard = None
-        return parameter, None
-
    def partition_tensor(self, param, empty_param, param_type, param_casting_dtype, to_contiguous, rank, device_mesh):
        # TODO: i'd like for this to be the default
        param = param[...].to(param_casting_dtype)
@ -1273,9 +1059,6 @@ def shard_and_distribute_module(
    if current_shard_plan is not None:
        try:
            tp_layer = ALL_PARALLEL_STYLES[current_shard_plan]
-            tp_layer.empty_param = empty_param
-            tp_layer.device_mesh = device_mesh
-            tp_layer.rank = rank
            param = tp_layer.partition_tensor(
                param, empty_param, param_type, param_casting_dtype, is_contiguous, rank, device_mesh
            )
--- a/src/transformers/masking_utils.py
+++ b/src/transformers/masking_utils.py
@ -82,10 +82,8 @@ def causal_mask_function(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int)
 def bidirectional_mask_function(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
    """
    This creates a full bidirectional mask.
-
-    NOTE: It is important to keep an index-based version for non-vmap expansion.
    """
-    return q_idx >= 0
+    return q_idx.new_ones((), dtype=torch.bool)


 def sliding_window_overlay(sliding_window: int) -> Callable:
@ -112,6 +110,18 @@ def chunked_overlay(chunk_size: int, left_padding: torch.Tensor) -> Callable:
    return inner_mask


+def _legacy_chunked_overlay(chunk_size: int) -> Callable:
+    """
+    Same as the above function, but does not correctly account for left padding tokens.
+    Only kept for compatibility with older torch versions (< 2.6).
+    """
+
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        return kv_idx // chunk_size == q_idx // chunk_size
+
+    return inner_mask
+
+
 def sliding_window_causal_mask_function(sliding_window: int) -> Callable:
    """
    This return the mask_function function to create a sliding window mask.
@ -123,6 +133,8 @@ def chunked_causal_mask_function(chunk_size: int, left_padding: torch.Tensor) ->
    """
    This return the mask_function function to create a chunked attention mask.
    """
+    if not _is_torch_greater_or_equal_than_2_6:
+        return and_masks(_legacy_chunked_overlay(chunk_size), causal_mask_function)
    return and_masks(chunked_overlay(chunk_size, left_padding), causal_mask_function)


@ -163,17 +175,52 @@ def add_offsets_to_mask_function(mask_function: Callable, q_offset: int, kv_offs
    return inner_mask


+def _vmap_for_bhqkv(mask_function: Callable, bh_indices: bool = True) -> Callable:
+    """
+    Used to vmap our mask_functions over the q_idx and kv_idx dimensions of the inputs. Optionally, vmap over
+    the batch and head indices as well if `bh_indices=True`.
+    Using vmap here allows us to keep the performance of vectorized ops, while having a single set of primitive
+    functions between attention interfaces (i.e. between flex and sdpa/eager, FA2 being a bit different).
+
+    Args:
+        mask_function (`Callable`):
+            The mask_function to vmap.
+        bh_indices (`bool`, *optional*, defaults to `True`):
+            Whether to vmap over the batch and head indices as well, or only q and kv indices.
+
+    Returns:
+        Callable: The vmapped function.
+    """
+    # We vmap the function 2 times, broadcasting the [q_idx, kv_idx] dimensions
+    dimensions = [(None, None, None, 0), (None, None, 0, None)]
+    if bh_indices:
+        # We extend broadcasting over the [batch_idx, head_idx] dimensions
+        dimensions.extend([(None, 0, None, None), (0, None, None, None)])
+
+    for dims in dimensions:
+        mask_function = torch.vmap(mask_function, in_dims=dims, out_dims=0)
+    return mask_function
+
+
 def prepare_padding_mask(
-    attention_mask: Optional[torch.Tensor], kv_length: int, kv_offset: int
+    attention_mask: Optional[torch.Tensor], kv_length: int, kv_offset: int, _slice: bool = True
 ) -> Optional[torch.Tensor]:
    """
-    From the 2D attention mask, prepare the correct padding mask to use by potentially padding it.
+    From the 2D attention mask, prepare the correct padding mask to use by potentially padding it, and slicing
+    according to the `kv_offset` if `_slice` is `True`.
    """
    local_padding_mask = attention_mask
    if attention_mask is not None:
        # Pad it if necessary
        if (padding_length := kv_length + kv_offset - attention_mask.shape[-1]) > 0:
            local_padding_mask = torch.nn.functional.pad(attention_mask, (0, padding_length))
+        # For flex, we should not slice the mask, only use an offset
+        if _slice:
+            # Equivalent to: `local_padding_mask = local_padding_mask[:, kv_offset : kv_offset + kv_length]`,
+            # but without data-dependent slicing (i.e. torch.compile friendly)
+            mask_indices = torch.arange(kv_length, device=local_padding_mask.device)
+            mask_indices += kv_offset
+            local_padding_mask = local_padding_mask[:, mask_indices]
    return local_padding_mask


@ -235,39 +282,7 @@ def _ignore_bidirectional_mask_sdpa(padding_mask: Optional[torch.Tensor]) -> boo
    return False


-def _vmap_expansion_sdpa(mask_function: Callable) -> Callable:
-    """
-    Used to vmap our mask_functions over the all 4 dimensions (b_idx, h_idx, q_idx, kv_idx) of the inputs.
-    Using vmap here allows us to keep the performance of vectorized ops, while having a single set of primitive
-    functions between attention interfaces (i.e. between flex and sdpa/eager, FA2 being a bit different).
-    """
-    # We vmap the function over all 4 dimensions, broadcasting [b_idx, h_idx, q_idx, kv_idx]
-    dimensions = [(None, None, None, 0), (None, None, 0, None), (None, 0, None, None), (0, None, None, None)]
-    for dims in dimensions:
-        mask_function = torch.vmap(mask_function, in_dims=dims, out_dims=0)
-    return mask_function
-
-
-def _non_vmap_expansion_sdpa(
-    batch_indices: torch.Tensor, head_indices: torch.Tensor, q_indices: torch.Tensor, kv_indices: torch.Tensor
-):
-    """
-    Used to broadcast our mask_functions over the all 4 dimensions (b_idx, h_idx, q_idx, kv_idx) of the inputs.
-    Allows the usage of any index-based mask function without relying on vmap.
-
-    NOTE: This is limited to index based functions only and is not guaranteed to work otherwise.
-
-    Reference:
-        - https://github.com/huggingface/optimum-onnx/blob/c123e8f4fab61b54a8e0e31ce74462bcacca576e/optimum/exporters/onnx/model_patcher.py#L362-L365
-    """
-    batch_indices = batch_indices[:, None, None, None]
-    head_indices = head_indices[None, :, None, None]
-    q_indices = q_indices[None, None, :, None]
-    kv_indices = kv_indices[None, None, None, :]
-    return batch_indices, head_indices, q_indices, kv_indices
-
-
-def sdpa_mask(
+def sdpa_mask_recent_torch(
    batch_size: int,
    cache_position: torch.Tensor,
    kv_length: int,
@ -277,8 +292,6 @@ def sdpa_mask(
    local_size: Optional[int] = None,
    allow_is_causal_skip: bool = True,
    allow_is_bidirectional_skip: bool = False,
-    allow_torch_fix: bool = True,
-    use_vmap: bool = False,
    **kwargs,
 ) -> Optional[torch.Tensor]:
    """
@ -311,12 +324,6 @@ def sdpa_mask(
        allow_is_bidirectional_skip (`bool`, optional):
            Whether to allow to return `None` for the mask under conditions where we do not have to add any bias,
            i.e. full attention without any padding. Default to `False`.
-        allow_torch_fix (`bool`, optional):
-            Whether to update the mask in case a query is not attending to any tokens, to solve a bug in torch's older
-            versions. We need an arg to skip it when using eager. By default `True`.
-        use_vmap (`bool`, optional):
-            Whether to use `vmap` during the mask construction or not. Allows powerful custom patterns that may not be
-            index-based (for the cost of speed performance). By default `False`.


    ## Creating a simple causal mask:
@ -384,8 +391,97 @@ def sdpa_mask(

    """
    q_length = cache_position.shape[0]
+    # Potentially pad the 2D mask. It is not sliced here (`_slice=False`): `kv_offset` is applied through the indices used below
+    padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset, _slice=False)

-    # Potentially pad the 2D mask
+    # Under specific conditions, we can avoid materializing the mask
+    #   1. Causal masks can rely on the `is_causal` argument
+    #   2. Bidirectional do not need any further processing (no bias)
+    if allow_is_causal_skip and _ignore_causal_mask_sdpa(padding_mask, q_length, kv_length, kv_offset, local_size):
+        return None
+    if allow_is_bidirectional_skip and _ignore_bidirectional_mask_sdpa(padding_mask):
+        return None
+
+    # vmap can incur performance issues as reported in #41566 for bidirectional mask as we only need to expand the
+    # padding mask. Thus, we allow early exit here if we do not detect any modification to the base mask function
+    if mask_function is bidirectional_mask_function:
+        if padding_mask is not None:
+            # used for slicing without data-dependent slicing
+            mask_indices = torch.arange(kv_length, device=cache_position.device) + kv_offset
+            return padding_mask[:, None, None, mask_indices].expand(-1, -1, q_length, -1)
+        else:
+            return torch.ones(batch_size, 1, q_length, kv_length, dtype=torch.bool, device=cache_position.device)
+
+    # Similar to `kv_arange = torch.arange(start=kv_offset, end=kv_offset + kv_length, device=cache_position.device)`
+    # but without data-dependent slicing (i.e. torch.compile friendly)
+    kv_arange = torch.arange(kv_length, device=cache_position.device)
+    kv_arange += kv_offset
+
+    # Potentially add the padding 2D mask
+    if padding_mask is not None:
+        mask_function = and_masks(mask_function, padding_mask_function(padding_mask))
+
+    batch_arange = torch.arange(batch_size, device=cache_position.device)
+    head_arange = torch.arange(1, device=cache_position.device)
+    # This creates the 4D mask easily. Note that we need this context manager as vmap cannot handle slicing a tensor from
+    # a scalar tensor (it internally calls `.item()` which vmap does not allow, but this context works around it).
+    # We don't need to add an offset to the mask_function either, as we vmap directly over the correct q and kv indices
+    with TransformGetItemToIndex():
+        causal_mask = _vmap_for_bhqkv(mask_function)(batch_arange, head_arange, cache_position, kv_arange)
+
+    return causal_mask
+
+
+def sdpa_mask_older_torch(
+    batch_size: int,
+    cache_position: torch.Tensor,
+    kv_length: int,
+    kv_offset: int = 0,
+    mask_function: Callable = causal_mask_function,
+    attention_mask: Optional[torch.Tensor] = None,
+    local_size: Optional[int] = None,
+    allow_is_causal_skip: bool = True,
+    allow_torch_fix: bool = True,
+    allow_is_bidirectional_skip: bool = False,
+    **kwargs,
+) -> Optional[torch.Tensor]:
+    """
+    NOTE: This function is only used when torch version is torch<2.6 - see `sdpa_mask_recent_torch` otherwise.
+
+    Create a 4D boolean mask of shape `(batch_size, 1, query_length, kv_length)` where a value of True indicates that
+    the element should take part in the attention computation, and False that it should not.
+    If `allow_torch_fix=True` (the default), rows corresponding to query tokens that do not attend
+    to any other tokens (due to padding) will be fully attended to instead, in order to avoid `nan` propagation (this does
+    not change the final result).
+
+    Args:
+        batch_size (`int`):
+            The batch size of the input sequence.
+        cache_position (`torch.Tensor`):
+            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
+        kv_length (`int`):
+            The size that the key and value states will have during the attention computation.
+        kv_offset (`int`, optional):
+            An optional offset to indicate at which first position the key and values states will refer to.
+        mask_function (`Callable`):
+            The mask factory function describing the mask pattern.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
+        local_size (`int`, optional):
+            The size of the local attention, if we do not use full attention. This is used only if `allow_is_causal_skip=True`
+            to try to skip mask creation if possible.
+        allow_is_causal_skip (`bool`, optional):
+            Whether to allow to return `None` for the mask under conditions where we can use the `is_causal` argument in
+            `torch.sdpa` instead. Default to `True`.
+        allow_torch_fix (`bool`, optional):
+            Whether to update the mask in case a query is not attending to any tokens, to solve a bug in torch's older
+            versions. We need an arg to skip it when using eager. By default `True`.
+        allow_is_bidirectional_skip (`bool`, optional):
+            Whether to allow to return `None` for the mask under conditions where we do not have to add any bias,
+            i.e. full attention without any padding. Default to `False`.
+    """
+    q_length = cache_position.shape[0]
+    # Potentially pad the 2D mask, and slice it correctly
    padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset)

    # Under specific conditions, we can avoid materializing the mask
@ -396,45 +492,38 @@ def sdpa_mask(
    if allow_is_bidirectional_skip and _ignore_bidirectional_mask_sdpa(padding_mask):
        return None

-    # Potentially add the padding 2D mask
-    if padding_mask is not None:
-        mask_function = and_masks(mask_function, padding_mask_function(padding_mask))
+    # vmap can incur performance issues as reported in #41566 for bidirectional mask as we only need to expand the
+    # padding mask. Thus, we allow early exit here if we do not detect any modification to the base mask function
+    if mask_function is bidirectional_mask_function:
+        if padding_mask is not None:
+            return padding_mask[:, None, None, :].expand(-1, -1, q_length, -1)
+        else:
+            return torch.ones(batch_size, 1, q_length, kv_length, dtype=torch.bool, device=cache_position.device)

-    batch_arange = torch.arange(batch_size, device=cache_position.device)
-    head_arange = torch.arange(1, device=cache_position.device)
    # Similar to `kv_arange = torch.arange(start=kv_offset, end=kv_offset + kv_length, device=cache_position.device)`
    # but without data-dependent slicing (i.e. torch.compile friendly)
-    kv_arange = torch.arange(kv_length, device=cache_position.device) + kv_offset
+    kv_arange = torch.arange(kv_length, device=cache_position.device)
+    kv_arange += kv_offset

-    # Actual mask creation
-    # Option 1: Fast non-vmap mask creation (default)
-    if not use_vmap:
-        # Apply mask function element-wise through broadcasting
-        attention_mask = mask_function(*_non_vmap_expansion_sdpa(batch_arange, head_arange, cache_position, kv_arange))
-        # Expand the mask to match batch size and query length if they weren't used in the mask function
-        attention_mask = attention_mask.expand(batch_size, -1, q_length, kv_length)
-
-    # Option 2: Vmap mask creation (torch>=2.6 and custom patterns)
-    elif _is_torch_greater_or_equal_than_2_6:
-        # This creates the 4D mask easily. Note that we need this context manager as vmap cannot handle slicing a tensor from
-        # scalar tensor (it internally calls `.item()` which vmap does not allow, but this context works around it
-        # We don't need to add an offset to the mask_function either, as we vmap directly the correct indices for k and kv indices
-        with TransformGetItemToIndex():
-            attention_mask = _vmap_expansion_sdpa(mask_function)(batch_arange, head_arange, cache_position, kv_arange)
-
-    # Option 3: Error out since it indicates that the user did something custom, which they shouldn't have (torch<2.6)
-    else:
-        raise ValueError(
-            "The vmap functionality for mask creation is only supported from torch>=2.6. "
-            "Please update your torch version or use `use_vmap=False` with index-based masks."
-        )
+    # This creates the 4D mask easily. Note that we do not include vmap over the batch_idx dimension as well,
+    # as vmap cannot handle slicing a tensor from a scalar tensor (it internally calls `.item()` which vmap does not allow).
+    # However, in more recent versions of PyTorch, a trick was introduced to handle it - which is the reason we have
+    # `sdpa_mask_recent_torch`, as it allows more general `mask_function`
+    causal_mask = _vmap_for_bhqkv(mask_function, bh_indices=False)(None, None, cache_position, kv_arange)
+    causal_mask = causal_mask[None, None, :, :].expand(batch_size, -1, -1, -1)
+    if padding_mask is not None:
+        causal_mask = causal_mask * padding_mask[:, None, None, :]

    # Due to a bug in versions of torch<2.5, we need to update the mask in case a query is not attending to any
    # tokens (due to padding). See details in https://github.com/pytorch/pytorch/issues/110213
    if not _is_torch_greater_or_equal_than_2_5 and allow_torch_fix:
-        attention_mask = attention_mask | torch.all(~attention_mask, dim=-1, keepdim=True)
+        causal_mask |= torch.all(~causal_mask, dim=-1, keepdim=True)
+    return causal_mask

-    return attention_mask
+
+# We use the version with newer torch whenever possible, as it is more general and can handle arbitrary mask functions
+# (especially mask_function indexing a tensor, such as the padding mask function)
+sdpa_mask = sdpa_mask_recent_torch if _is_torch_greater_or_equal_than_2_6 else sdpa_mask_older_torch


 def eager_mask(
@ -445,7 +534,6 @@ def eager_mask(
    mask_function: Callable = causal_mask_function,
    attention_mask: Optional[torch.Tensor] = None,
    dtype: torch.dtype = torch.float32,
-    use_vmap: bool = False,
    **kwargs,
 ) -> torch.Tensor:
    """
@ -468,14 +556,10 @@ def eager_mask(
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
        dtype (`torch.dtype`, optional):
            The dtype to use for the mask. By default, `torch.float32`.
-        use_vmap (`bool`, optional):
-            Whether to use `vmap` during the mask construction or not. Allows powerful custom patterns that may not be
-            index-based (for the cost of speed performance). By default `False`.
    """
    # The masks for eager attention are simply boolean mask from sdpa, casted to 0 and -inf
    _ = kwargs.pop("allow_is_causal_skip", None)
    _ = kwargs.pop("allow_is_bidirectional_skip", None)
-    _ = kwargs.pop("allow_torch_fix", None)
    mask = sdpa_mask(
        batch_size=batch_size,
        cache_position=cache_position,
@ -486,7 +570,6 @@ def eager_mask(
        allow_is_causal_skip=False,
        allow_is_bidirectional_skip=False,
        allow_torch_fix=False,
-        use_vmap=use_vmap,
        **kwargs,
    )
    min_dtype = torch.finfo(dtype).min
@ -572,7 +655,7 @@ def flex_attention_mask(
        if not _is_torch_greater_or_equal_than_2_6 and pad_len > 0:
            attention_mask = torch.nn.functional.pad(attention_mask, value=0, pad=(0, pad_len))

-        padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset)
+        padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset, _slice=False)
        mask_function = and_masks(mask_function, padding_mask_function(padding_mask))

    # Add the offsets on top (because flex interface only allows length, not start and end indices)
@ -768,11 +851,6 @@ def create_causal_mask(
    mask_factory_function = causal_mask_function
    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]

-    # Defaulting to using non-vmap based mask creations except when detecting
-    # users passing custom mask functions (as we cannot guarantee that they
-    # are properly index-based as required by our implementation).
-    use_vmap = False
-
    # Do not allow skip if we are compiling (this is to match BC)
    # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
    if _is_torch_xpu_available:
@ -789,16 +867,14 @@ def create_causal_mask(
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
        allow_is_causal_skip = False
-        use_vmap = True
    if and_mask_function is not None:
        if not _is_torch_greater_or_equal_than_2_6:
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
        allow_is_causal_skip = False
-        use_vmap = True

    # If we detected packing format
-    if packed_sequence_mask is not None:
+    if packed_sequence_mask is not None and _is_torch_greater_or_equal_than_2_6:
        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
        allow_is_causal_skip = False

@ -813,7 +889,6 @@ def create_causal_mask(
        allow_is_causal_skip=allow_is_causal_skip,  # additional kwarg for sdpa
        dtype=dtype,  # Additional kwarg for eager
        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
-        use_vmap=use_vmap,  # Short-circuit to non-vmap expansions for the mask
    )
    return causal_mask

@ -867,10 +942,6 @@ def create_bidirectional_mask(

    # Allow skipping the mask creation except we have additional masking operators (and/or masks)
    allow_is_bidirectional_skip = True
-    # Defaulting to using non-vmap based mask creations except when detecting
-    # users passing custom mask functions (as we cannot guarantee that they
-    # are properly index-based as required by our implementation).
-    use_vmap = False

    # Allow slight deviations from the base mask
    # Note that it is very important to apply this before any other deviations of the mask (such as packed sequence mask,
@ -880,13 +951,11 @@ def create_bidirectional_mask(
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
        allow_is_bidirectional_skip = False
-        use_vmap = True
    if and_mask_function is not None:
        if not _is_torch_greater_or_equal_than_2_6:
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
        allow_is_bidirectional_skip = False
-        use_vmap = True

    # We now create the mask
    attention_mask = mask_interface(
@ -901,7 +970,6 @@ def create_bidirectional_mask(
        allow_is_bidirectional_skip=allow_is_bidirectional_skip,
        dtype=dtype,  # Additional kwarg for eager
        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
-        use_vmap=use_vmap,  # Short-circuit to non-vmap expansions for the mask
    )
    return attention_mask

@ -964,10 +1032,6 @@ def create_sliding_window_causal_mask(
    mask_factory_function = sliding_window_causal_mask_function(sliding_window)
    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]

-    # Defaulting to using non-vmap based mask creations except when detecting
-    # users passing custom mask functions (as we cannot guarantee that they
-    # are properly index-based as required by our implementation).
-    use_vmap = False
    # Do not allow skip if we are compiling (this is to match BC)
    # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
    allow_is_causal_skip = not getattr(past_key_values, "is_compileable", False)
@ -980,16 +1044,14 @@ def create_sliding_window_causal_mask(
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
        allow_is_causal_skip = False
-        use_vmap = True
    if and_mask_function is not None:
        if not _is_torch_greater_or_equal_than_2_6:
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
        allow_is_causal_skip = False
-        use_vmap = True

    # If we detected packing format
-    if packed_sequence_mask is not None:
+    if packed_sequence_mask is not None and _is_torch_greater_or_equal_than_2_6:
        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
        allow_is_causal_skip = False

@ -1005,7 +1067,6 @@ def create_sliding_window_causal_mask(
        local_size=sliding_window,  # Additional kwarg for sdpa
        dtype=dtype,  # Additional kwarg for eager
        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
-        use_vmap=use_vmap,  # Short-circuit to non-vmap expansions for the mask
    )
    return causal_mask

@ -1079,13 +1140,20 @@ def create_chunked_causal_mask(
        left_padding_tokens = (attention_mask.cumsum(dim=-1) == torch.zeros_like(attention_mask)).sum(dim=-1)
    else:
        left_padding_tokens = torch.zeros(batch_size, device=cache_position.device, dtype=int)
+    # Raise a warning for older versions if the problematic left-padding situation arises
+    if (
+        not _is_torch_greater_or_equal_than_2_6
+        and kv_length + kv_offset > chunk_size
+        and (left_padding_tokens > 0).any()
+    ):
+        logger.warning_once(
+            "Due to limitations of your current torch version, we cannot correctly account for the left-padding "
+            "when computing the chunked attention pattern. This will lead to a wrong attention mask for the padded "
+            "sequences. Behavior will be undefined. Please upgrade to `torch>=2.6` to solve this issue."
+        )
    mask_factory_function = chunked_causal_mask_function(chunk_size, left_padding_tokens)
    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]

-    # Defaulting to using non-vmap based mask creations except when detecting
-    # users passing custom mask functions (as we cannot guarantee that they
-    # are properly index-based as required by our implementation).
-    use_vmap = False
    # Do not allow skip if we are compiling (this is to match BC)
    # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
    allow_is_causal_skip = not getattr(past_key_values, "is_compileable", False)
@ -1098,16 +1166,14 @@ def create_chunked_causal_mask(
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
        allow_is_causal_skip = False
-        use_vmap = True
    if and_mask_function is not None:
        if not _is_torch_greater_or_equal_than_2_6:
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
        allow_is_causal_skip = False
-        use_vmap = True

    # If we detected packing format
-    if packed_sequence_mask is not None:
+    if packed_sequence_mask is not None and _is_torch_greater_or_equal_than_2_6:
        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
        allow_is_causal_skip = False

@ -1123,7 +1189,6 @@ def create_chunked_causal_mask(
        local_size=chunk_size,  # Additional kwarg for sdpa
        dtype=dtype,  # Additional kwarg for eager
        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
-        use_vmap=use_vmap,  # Short-circuit to non-vmap expansions for the mask
    )
    return causal_mask

--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
--- a/src/transformers/models/aimv2/modeling_aimv2.py
+++ b/src/transformers/models/aimv2/modeling_aimv2.py
@ -29,7 +29,6 @@ import torch
 import torch.nn.functional as F
 from torch import nn

-from ... import initialization as init
 from ...activations import ACT2FN
 from ...integrations import use_kernel_forward_from_hub
 from ...masking_utils import create_causal_mask
@ -407,14 +406,13 @@ class Aimv2PreTrainedModel(PreTrainedModel):
    _supports_flash_attn = True
    _supports_flex_attn = True

-    @torch.no_grad()
    def _init_weights(self, module):
        super()._init_weights(module)
        if hasattr(module, "logit_scale"):
            if isinstance(module.logit_scale, nn.Parameter):
-                init.constant_(module.logit_scale, math.log(1 / 0.07))
+                module.logit_scale.data.fill_(math.log(1 / 0.07))
        elif isinstance(module, Aimv2AttentionPoolingHead):
-            init.normal_(module.cls_token, mean=0.0, std=self.config.initializer_range)
+            module.cls_token.data.normal_(mean=0.0, std=self.config.initializer_range)


@auto_docstring(
--- a/src/transformers/models/aimv2/modular_aimv2.py
+++ b/src/transformers/models/aimv2/modular_aimv2.py
@ -22,7 +22,6 @@ import torch
 import torch.nn.functional as F
 from torch import nn

-from ... import initialization as init
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
@ -450,14 +449,13 @@ class Aimv2PreTrainedModel(PreTrainedModel):
    _supports_flash_attn = True
    _supports_flex_attn = True

-    @torch.no_grad()
    def _init_weights(self, module):
        super()._init_weights(module)
        if hasattr(module, "logit_scale"):
            if isinstance(module.logit_scale, nn.Parameter):
-                init.constant_(module.logit_scale, math.log(1 / 0.07))
+                module.logit_scale.data.fill_(math.log(1 / 0.07))
        elif isinstance(module, Aimv2AttentionPoolingHead):
-            init.normal_(module.cls_token, mean=0.0, std=self.config.initializer_range)
+            module.cls_token.data.normal_(mean=0.0, std=self.config.initializer_range)


@auto_docstring(
--- a/src/transformers/models/albert/modeling_albert.py
+++ b/src/transformers/models/albert/modeling_albert.py
@ -22,7 +22,6 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

-from ... import initialization as init
 from ...activations import ACT2FN
 from ...masking_utils import create_bidirectional_mask
 from ...modeling_outputs import (
@ -303,22 +302,21 @@ class AlbertPreTrainedModel(PreTrainedModel):
        "attentions": AlbertAttention,
    }

-    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
-            init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
-                init.zeros_(module.bias)
+                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
-            init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
-                init.zeros_(module.weight[module.padding_idx])
+                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
-            init.zeros_(module.bias)
-            init.ones_(module.weight)
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
        elif isinstance(module, AlbertMLMHead):
-            init.zeros_(module.bias)
+            module.bias.data.zero_()


@dataclass
@ -427,10 +425,7 @@ class AlbertModel(AlbertPreTrainedModel):
    """
 )
 class AlbertForPreTraining(AlbertPreTrainedModel):
-    _tied_weights_keys = {
-        "predictions.decoder.weight": "albert.embeddings.word_embeddings.weight",
-        "predictions.decoder.bias": "predictions.bias",
-    }
+    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config: AlbertConfig):
        super().__init__(config)
@ -530,6 +525,7 @@ class AlbertMLMHead(nn.Module):
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
        self.activation = ACT2FN[config.hidden_act]
+        self.decoder.bias = self.bias

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
@ -541,6 +537,14 @@ class AlbertMLMHead(nn.Module):

        return prediction_scores

+    def _tie_weights(self) -> None:
+        # For accelerate compatibility and to not break backward compatibility
+        if self.decoder.bias.device.type == "meta":
+            self.decoder.bias = self.bias
+        else:
+            # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
+            self.bias = self.decoder.bias
+

 class AlbertSOPHead(nn.Module):
    def __init__(self, config: AlbertConfig):
@ -557,10 +561,7 @@ class AlbertSOPHead(nn.Module):

@auto_docstring
 class AlbertForMaskedLM(AlbertPreTrainedModel):
-    _tied_weights_keys = {
-        "predictions.decoder.weight": "albert.embeddings.word_embeddings.weight",
-        "predictions.decoder.bias": "predictions.bias",
-    }
+    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)
--- a/src/transformers/models/align/modeling_align.py
+++ b/src/transformers/models/align/modeling_align.py
@ -22,7 +22,6 @@ from typing import Any, Optional, Union
 import torch
 from torch import nn

-from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import (
@ -824,25 +823,24 @@ class AlignPreTrainedModel(PreTrainedModel):
    input_modalities = ["image", "text"]
    supports_gradient_checkpointing = True

-    @torch.no_grad()
    def _init_weights(self, module: nn.Module):
        """Initialize the weights"""
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d)):
-            init.normal_(module.weight, mean=0.0, std=std)
+            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
-                init.zeros_(module.bias)
+                module.bias.data.zero_()
        elif isinstance(module, AlignModel):
-            init.xavier_uniform_(module.text_projection.weight)
-            init.zeros_(module.text_projection.bias)
-            init.constant_(module.temperature, self.config.temperature_init_value)
+            nn.init.xavier_uniform_(module.text_projection.weight)
+            module.text_projection.bias.data.zero_()
+            module.temperature.data.fill_(self.config.temperature_init_value)
        elif isinstance(module, nn.Embedding):
-            init.normal_(module.weight, mean=0.0, std=std)
+            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
-                init.zeros_(module.weight[module.padding_idx])
+                module.weight.data[module.padding_idx].zero_()
        if isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
-            init.zeros_(module.bias)
-            init.ones_(module.weight)
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)


@auto_docstring(
--- a/src/transformers/models/align/processing_align.py
+++ b/src/transformers/models/align/processing_align.py
@ -59,6 +59,9 @@ class AlignProcessor(ProcessorMixin):

    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "EfficientNetImageProcessor"
+    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
    valid_processor_kwargs = AlignProcessorKwargs

    def __init__(self, image_processor, tokenizer):
--- a/src/transformers/models/altclip/modeling_altclip.py
+++ b/src/transformers/models/altclip/modeling_altclip.py
@ -22,7 +22,6 @@ from typing import Any, Optional, Union
 import torch
 import torch.nn as nn

-from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import (
@ -771,49 +770,50 @@ class AltCLIPPreTrainedModel(PreTrainedModel):
    supports_gradient_checkpointing = True
    _no_split_module = []

-    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, AltCLIPVisionEmbeddings):
            factor = self.config.initializer_factor
-            init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
-            init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
-            init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, AltCLIPAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
-            init.normal_(module.q_proj.weight, std=in_proj_std)
-            init.normal_(module.k_proj.weight, std=in_proj_std)
-            init.normal_(module.v_proj.weight, std=in_proj_std)
-            init.normal_(module.out_proj.weight, std=out_proj_std)
+            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, AltCLIPMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
-            init.normal_(module.fc1.weight, std=fc_std)
-            init.normal_(module.fc2.weight, std=in_proj_std)
+            nn.init.normal_(module.fc1.weight, std=fc_std)
+            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, AltCLIPModel):
-            init.normal_(
+            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
-            init.normal_(
+            module.text_projection._is_hf_initialized = True
+            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )
+            module.visual_projection._is_hf_initialized = True
        elif isinstance(module, nn.LayerNorm):
-            init.zeros_(module.bias)
-            init.ones_(module.weight)
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear):
-            init.normal_(module.weight, mean=0.0, std=self.config.initializer_factor)
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor)
            if module.bias is not None:
-                init.zeros_(module.bias)
+                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
-            init.normal_(module.weight, mean=0.0, std=self.config.initializer_factor)
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor)
            if module.padding_idx is not None:
-                init.zeros_(module.weight[module.padding_idx])
+                module.weight.data[module.padding_idx].zero_()


 class AltCLIPVisionTransformer(nn.Module):
--- a/src/transformers/models/altclip/processing_altclip.py
+++ b/src/transformers/models/altclip/processing_altclip.py
@ -35,6 +35,10 @@ class AltCLIPProcessor(ProcessorMixin):
            The tokenizer is a required input.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast")
+    tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")
+
    @deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor")
    def __init__(self, image_processor=None, tokenizer=None):
        super().__init__(image_processor, tokenizer)
--- a/src/transformers/models/apertus/modeling_apertus.py
+++ b/src/transformers/models/apertus/modeling_apertus.py
@ -429,7 +429,7 @@ class ApertusModel(ApertusPreTrainedModel):

@auto_docstring
 class ApertusForCausalLM(ApertusPreTrainedModel, GenerationMixin):
-    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

--- a/src/transformers/models/arcee/modeling_arcee.py
+++ b/src/transformers/models/arcee/modeling_arcee.py
@ -434,7 +434,7 @@ class ArceeModel(ArceePreTrainedModel):

@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
 class ArceeForCausalLM(ArceePreTrainedModel, GenerationMixin):
-    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

--- a/src/transformers/models/aria/modeling_aria.py
+++ b/src/transformers/models/aria/modeling_aria.py
@ -25,7 +25,6 @@ from typing import Optional, Union
 import torch
 from torch import nn

-from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
@ -586,11 +585,10 @@ class AriaTextPreTrainedModel(PreTrainedModel):
        "attentions": AriaTextAttention,
    }

-    @torch.no_grad()
    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, AriaGroupedExpertsGemm):
-            init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)


@auto_docstring
@ -610,11 +608,10 @@ class AriaPreTrainedModel(PreTrainedModel):
        "attentions": AriaTextAttention,
    }

-    @torch.no_grad()
    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, AriaProjector):
-            init.trunc_normal_(module.query, std=self.config.initializer_range)
+            nn.init.trunc_normal_(module.query, std=self.config.initializer_range)


 class AriaTextRotaryEmbedding(nn.Module):
@ -763,7 +760,7 @@ class AriaTextModel(AriaTextPreTrainedModel):

@auto_docstring
 class AriaTextForCausalLM(AriaTextPreTrainedModel, GenerationMixin):
-    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

@ -1056,7 +1053,7 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
-    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
+    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: AriaConfig):
        super().__init__(config)
--- a/src/transformers/models/aria/modular_aria.py
+++ b/src/transformers/models/aria/modular_aria.py
@ -19,7 +19,6 @@ import numpy as np
 import torch
 from torch import nn

-from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache
 from ...configuration_utils import PreTrainedConfig
@ -907,6 +906,10 @@ class AriaProcessor(ProcessorMixin):
            A dictionary indicating size conversions for images.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AriaImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
    def __init__(
        self,
        image_processor=None,
@ -1188,11 +1191,10 @@ class AriaTextPreTrainedModel(PreTrainedModel):
        "attentions": AriaTextAttention,
    }

-    @torch.no_grad()
    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, AriaGroupedExpertsGemm):
-            init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)


 class AriaPreTrainedModel(LlamaPreTrainedModel):
@ -1201,11 +1203,10 @@ class AriaPreTrainedModel(LlamaPreTrainedModel):
    _can_compile_fullgraph = False  # MoE models don't work with torch.compile (dynamic slicing)
    _supports_attention_backend = True

-    @torch.no_grad()
    def _init_weights(self, module):
        PreTrainedModel._init_weights(self, module)
        if isinstance(module, AriaProjector):
-            init.trunc_normal_(module.query, std=self.config.initializer_range)
+            nn.init.trunc_normal_(module.query, std=self.config.initializer_range)


 class AriaTextModel(LlamaModel):
@ -1219,7 +1220,7 @@ class AriaTextModel(LlamaModel):


 class AriaTextForCausalLM(AriaTextPreTrainedModel, LlamaForCausalLM):
-    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: AriaTextConfig):
        super().__init__(config)
@ -1358,8 +1359,6 @@ class AriaModel(LlavaModel):
    """
 )
 class AriaForConditionalGeneration(LlavaForConditionalGeneration):
-    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
-
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
--- a/src/transformers/models/aria/processing_aria.py
+++ b/src/transformers/models/aria/processing_aria.py
@ -67,6 +67,10 @@ class AriaProcessor(ProcessorMixin):
            A dictionary indicating size conversions for images.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AriaImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
    def __init__(
        self,
        image_processor=None,
--- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py
+++ b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py
@ -272,9 +272,7 @@ if __name__ == "__main__":
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    parser.add_argument(
-        "--push_to_hub",
-        action="store_true",
-        help="Whether or not to push the converted model to the Hugging Face hub.",
+        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
    )

    args = parser.parse_args()
--- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
+++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
@ -20,7 +20,6 @@ from typing import Optional, Union
 import torch
 from torch import nn

-from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, SequenceClassifierOutput
@ -301,20 +300,23 @@ class ASTPreTrainedModel(PreTrainedModel):
        "attentions": ASTSelfAttention,
    }

-    @torch.no_grad()
    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
-            init.trunc_normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
+            # `trunc_normal_cpu` not implemented in `half` issues
+            module.weight.data = nn.init.trunc_normal_(
+                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
+            ).to(module.weight.dtype)
            if module.bias is not None:
-                init.zeros_(module.bias)
+                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
-            init.zeros_(module.bias)
-            init.ones_(module.weight)
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
        elif isinstance(module, ASTEmbeddings):
-            init.zeros_(module.cls_token)
-            init.zeros_(module.position_embeddings)
-            init.zeros_(module.distillation_token)
+            module.cls_token.data.zero_()
+            module.position_embeddings.data.zero_()
+            module.distillation_token.data.zero_()


@auto_docstring
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@ -223,7 +223,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
        ("layoutlm", "LayoutLMConfig"),
        ("layoutlmv2", "LayoutLMv2Config"),
        ("layoutlmv3", "LayoutLMv3Config"),
-        ("layoutxlm", "LayoutLMv2Config"),
        ("led", "LEDConfig"),
        ("levit", "LevitConfig"),
        ("lfm2", "Lfm2Config"),
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@ -41,7 +41,6 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
        ("audio-spectrogram-transformer", "ASTFeatureExtractor"),
        ("clap", "ClapFeatureExtractor"),
        ("clvp", "ClvpFeatureExtractor"),
-        ("csm", "EncodecFeatureExtractor"),
        ("dac", "DacFeatureExtractor"),
        ("data2vec-audio", "Wav2Vec2FeatureExtractor"),
        ("dia", "DiaFeatureExtractor"),
@ -50,20 +49,14 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
        ("granite_speech", "GraniteSpeechFeatureExtractor"),
        ("hubert", "Wav2Vec2FeatureExtractor"),
        ("kyutai_speech_to_text", "KyutaiSpeechToTextFeatureExtractor"),
-        ("markuplm", "MarkupLMFeatureExtractor"),
        ("mctct", "MCTCTFeatureExtractor"),
        ("mimi", "EncodecFeatureExtractor"),
        ("moonshine", "Wav2Vec2FeatureExtractor"),
        ("moshi", "EncodecFeatureExtractor"),
-        ("musicgen", "EncodecFeatureExtractor"),
-        ("musicgen_melody", "MusicgenMelodyFeatureExtractor"),
        ("parakeet_ctc", "ParakeetFeatureExtractor"),
        ("parakeet_encoder", "ParakeetFeatureExtractor"),
        ("phi4_multimodal", "Phi4MultimodalFeatureExtractor"),
        ("pop2piano", "Pop2PianoFeatureExtractor"),
-        ("qwen2_5_omni", "WhisperFeatureExtractor"),
-        ("qwen2_audio", "WhisperFeatureExtractor"),
-        ("qwen3_omni_moe", "WhisperFeatureExtractor"),
        ("seamless_m4t", "SeamlessM4TFeatureExtractor"),
        ("seamless_m4t_v2", "SeamlessM4TFeatureExtractor"),
        ("sew", "Wav2Vec2FeatureExtractor"),
@ -73,7 +66,6 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
        ("unispeech", "Wav2Vec2FeatureExtractor"),
        ("unispeech-sat", "Wav2Vec2FeatureExtractor"),
        ("univnet", "UnivNetFeatureExtractor"),
-        ("voxtral", "WhisperFeatureExtractor"),
        ("wav2vec2", "Wav2Vec2FeatureExtractor"),
        ("wav2vec2-bert", "Wav2Vec2FeatureExtractor"),
        ("wav2vec2-conformer", "Wav2Vec2FeatureExtractor"),
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@ -62,9 +62,7 @@ else:
            ("aimv2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
            ("aimv2_vision_model", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
            ("align", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")),
-            ("altclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
            ("aria", ("AriaImageProcessor", None)),
-            ("aya_vision", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
            ("beit", ("BeitImageProcessor", "BeitImageProcessorFast")),
            ("bit", ("BitImageProcessor", "BitImageProcessorFast")),
            ("blip", ("BlipImageProcessor", "BlipImageProcessorFast")),
@ -75,8 +73,6 @@ else:
            ("clip", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
            ("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("cohere2_vision", (None, "Cohere2VisionImageProcessorFast")),
-            ("colpali", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
-            ("colqwen2", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
            ("conditional_detr", ("ConditionalDetrImageProcessor", "ConditionalDetrImageProcessorFast")),
            ("convnext", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
            ("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
@ -99,10 +95,8 @@ else:
            ("efficientformer", ("EfficientFormerImageProcessor", None)),
            ("efficientloftr", ("EfficientLoFTRImageProcessor", "EfficientLoFTRImageProcessorFast")),
            ("efficientnet", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")),
-            ("emu3", ("Emu3ImageProcessor", None)),
            ("eomt", ("EomtImageProcessor", "EomtImageProcessorFast")),
            ("flava", ("FlavaImageProcessor", "FlavaImageProcessorFast")),
-            ("florence2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
            ("focalnet", ("BitImageProcessor", "BitImageProcessorFast")),
            ("fuyu", ("FuyuImageProcessor", "FuyuImageProcessorFast")),
            ("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")),
@ -120,13 +114,11 @@ else:
            ("ijepa", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("imagegpt", ("ImageGPTImageProcessor", "ImageGPTImageProcessorFast")),
            ("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")),
-            ("internvl", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
            ("janus", ("JanusImageProcessor", "JanusImageProcessorFast")),
            ("kosmos-2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
            ("kosmos-2.5", ("Kosmos2_5ImageProcessor", "Kosmos2_5ImageProcessorFast")),
            ("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")),
            ("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),
-            ("layoutxlm", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessor")),
            ("levit", ("LevitImageProcessor", "LevitImageProcessorFast")),
            ("lfm2_vl", (None, "Lfm2VlImageProcessorFast")),
            ("lightglue", ("LightGlueImageProcessor", "LightGlueImageProcessorFast")),
@ -149,7 +141,6 @@ else:
            ("mobilevitv2", ("MobileViTImageProcessor", "MobileViTImageProcessorFast")),
            ("nat", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("nougat", ("NougatImageProcessor", "NougatImageProcessorFast")),
-            ("omdet-turbo", ("DetrImageProcessor", "DetrImageProcessorFast")),
            ("oneformer", ("OneFormerImageProcessor", "OneFormerImageProcessorFast")),
            ("ovis2", ("Ovis2ImageProcessor", "Ovis2ImageProcessorFast")),
            ("owlv2", ("Owlv2ImageProcessor", "Owlv2ImageProcessorFast")),
@ -164,17 +155,14 @@ else:
            ("prompt_depth_anything", ("PromptDepthAnythingImageProcessor", "PromptDepthAnythingImageProcessorFast")),
            ("pvt", ("PvtImageProcessor", "PvtImageProcessorFast")),
            ("pvt_v2", ("PvtImageProcessor", "PvtImageProcessorFast")),
-            ("qwen2_5_omni", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
            ("qwen2_5_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
            ("qwen2_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
-            ("qwen3_omni_moe", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
            ("qwen3_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
            ("regnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
            ("resnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
            ("rt_detr", ("RTDetrImageProcessor", "RTDetrImageProcessorFast")),
            ("sam", ("SamImageProcessor", "SamImageProcessorFast")),
            ("sam2", (None, "Sam2ImageProcessorFast")),
-            ("sam2_video", (None, "Sam2ImageProcessorFast")),
            ("sam_hq", ("SamImageProcessor", "SamImageProcessorFast")),
            ("segformer", ("SegformerImageProcessor", "SegformerImageProcessorFast")),
            ("seggpt", ("SegGptImageProcessor", None)),
@ -192,14 +180,12 @@ else:
            ("textnet", ("TextNetImageProcessor", "TextNetImageProcessorFast")),
            ("timesformer", ("VideoMAEImageProcessor", None)),
            ("timm_wrapper", ("TimmWrapperImageProcessor", None)),
-            ("trocr", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("tvlt", ("TvltImageProcessor", None)),
            ("tvp", ("TvpImageProcessor", "TvpImageProcessorFast")),
            ("udop", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),
            ("upernet", ("SegformerImageProcessor", "SegformerImageProcessorFast")),
            ("van", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
            ("video_llama_3", ("VideoLlama3ImageProcessor", "VideoLlama3ImageProcessorFast")),
-            ("video_llava", ("VideoLlavaImageProcessor", None)),
            ("videomae", ("VideoMAEImageProcessor", None)),
            ("vilt", ("ViltImageProcessor", "ViltImageProcessorFast")),
            ("vipllava", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
@ -538,9 +524,10 @@ class AutoImageProcessor:
                )
                use_fast = False
            if use_fast:
-                # Check if the fast image processor class exists
-                image_processor_class_fast = get_image_processor_class_from_name(image_processor_type)
-                if image_processor_class_fast is None:
+                for image_processors in IMAGE_PROCESSOR_MAPPING_NAMES.values():
+                    if image_processor_type in image_processors:
+                        break
+                else:
                    image_processor_type = image_processor_type[:-4]
                    use_fast = False
                    logger.warning_once(
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@ -1700,7 +1700,6 @@ MODEL_FOR_BACKBONE_MAPPING_NAMES = OrderedDict(
        ("dinov2", "Dinov2Backbone"),
        ("dinov2_with_registers", "Dinov2WithRegistersBackbone"),
        ("dinov3_convnext", "DINOv3ConvNextBackbone"),
-        ("dinov3_vit", "DINOv3ViTBackbone"),
        ("focalnet", "FocalNetBackbone"),
        ("hgnet_v2", "HGNetV2Backbone"),
        ("hiera", "HieraBackbone"),
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@ -107,7 +107,6 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
        ("mllama", "MllamaProcessor"),
        ("mm-grounding-dino", "GroundingDinoProcessor"),
        ("moonshine", "Wav2Vec2Processor"),
-        ("omdet-turbo", "OmDetTurboProcessor"),
        ("oneformer", "OneFormerProcessor"),
        ("ovis2", "Ovis2Processor"),
        ("owlv2", "Owlv2Processor"),
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@ -72,7 +72,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
            ),
        ),
        ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
-        ("altclip", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
        ("arcee", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ("aria", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ("aya_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
@ -157,7 +156,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
        ("cohere", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
        ("cohere2", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
-        ("cohere2_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
        ("colpali", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ("colqwen2", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
        ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
@ -226,7 +224,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
            ),
        ),
        ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
-        ("donut", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
        (
            "dpr",
            (
@ -241,7 +238,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ("ernie4_5_moe", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)),
        ("esm", ("EsmTokenizer", None)),
-        ("evolla", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        (
            "exaone4",
            (
@ -256,13 +252,10 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
            ("FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None, None),
        ),
        ("flaubert", ("FlaubertTokenizer", None)),
-        ("flava", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        ("flex_olmo", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)),
-        ("florence2", ("BartTokenizer", "BartTokenizerFast" if is_tokenizers_available() else None)),
        ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)),
        ("fsmt", ("FSMTTokenizer", None)),
        ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
-        ("fuyu", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        (
            "gemma",
            (
@ -311,7 +304,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ("glm4_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
        ("glm4v", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
        ("glm4v_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
-        ("got_ocr2", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
        ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
        ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
        ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
@ -322,7 +314,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
        ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)),
        ("granite", ("GPT2Tokenizer", None)),
-        ("granite_speech", ("GPT2Tokenizer", None)),
        ("granitemoe", ("GPT2Tokenizer", None)),
        ("granitemoehybrid", ("GPT2Tokenizer", None)),
        ("granitemoeshared", ("GPT2Tokenizer", None)),
@ -362,14 +353,11 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
            ),
        ),
        ("kosmos-2.5", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
-        ("kyutai_speech_to_text", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
        ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
        ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
        ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
        ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
        ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
-        ("lfm2", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
-        ("lfm2_vl", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
        ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
        (
            "llama",
@ -410,7 +398,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ("mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
        ("mamba2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
        ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
-        ("markuplm", ("MarkupLMTokenizer", "MarkupLMTokenizerFast" if is_tokenizers_available() else None)),
        (
            "mbart",
            (
@ -497,7 +484,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
                "NllbTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
-        ("nougat", (None, "NougatTokenizerFast" if is_tokenizers_available() else None)),
        (
            "nystromformer",
            (
@ -519,7 +505,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
            ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None),
        ),
        ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
-        ("ovis2", (None, "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
        ("owlv2", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
        ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
        ("paligemma", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
@ -545,7 +530,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
                None,
            ),
        ),
-        ("perception_lm", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        (
            "persimmon",
            (
@ -555,7 +539,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ),
        ("phi", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
        ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
-        ("phi4_multimodal", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)),
        ("phimoe", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ("phobert", ("PhobertTokenizer", None)),
        ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
@ -569,7 +552,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
            ),
        ),
        ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
-        ("pop2piano", ("Pop2PianoTokenizer", None)),
        ("prophetnet", ("ProphetNetTokenizer", None)),
        ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        (
@ -676,7 +658,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
            ),
        ),
        ("smollm3", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
-        ("smolvlm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
        ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
        ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
        ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
@ -711,7 +692,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ("tapas", ("TapasTokenizer", None)),
        ("tapex", ("TapexTokenizer", None)),
        ("transfo-xl", ("TransfoXLTokenizer", None)),
-        ("trocr", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
        ("tvp", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        (
            "udop",
@ -727,14 +707,9 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
                "T5TokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
-        ("video_llama_3", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
        ("video_llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        ("vipllava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
-        (
-            "vision_text_dual_encoder",
-            ("PreTrainedTokenizer", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
-        ),
        ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
        ("vits", ("VitsTokenizer", None)),
        (
@ -750,7 +725,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
        ("wav2vec2-bert", ("Wav2Vec2CTCTokenizer", None)),
        ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
        ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
-        ("wav2vec2_with_lm", ("Wav2Vec2CTCTokenizer", None)),
        ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)),
        ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
        (
@ -1186,7 +1160,7 @@ class AutoTokenizer:
                The configuration corresponding to the model to register.
            slow_tokenizer_class ([`PretrainedTokenizer`], *optional*):
                The slow tokenizer to register.
-            fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*):
+            fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*):
                The fast tokenizer to register.
        """
        if slow_tokenizer_class is None and fast_tokenizer_class is None:
--- a/src/transformers/models/auto/video_processing_auto.py
+++ b/src/transformers/models/auto/video_processing_auto.py
@ -60,7 +60,6 @@ else:
            ("qwen3_vl_moe", "Qwen3VLVideoProcessor"),
            ("sam2_video", "Sam2VideoVideoProcessor"),
            ("smolvlm", "SmolVLMVideoProcessor"),
-            ("video_llama_3", "VideoLlama3VideoProcessor"),
            ("video_llava", "VideoLlavaVideoProcessor"),
            ("videomae", "VideoMAEVideoProcessor"),
            ("vjepa2", "VJEPA2VideoProcessor"),
@ -292,7 +291,7 @@ class AutoVideoProcessor:

                # Some models have different image processors, e.g. InternVL uses GotOCRImageProcessor
                # We cannot use GotOCRVideoProcessor when falling back for BC and should try to infer from config later on
-                if video_processor_class_from_name(video_processor_class_inferred) is not None:
+                if video_processor_class_inferred in VIDEO_PROCESSOR_MAPPING_NAMES.values():
                    video_processor_class = video_processor_class_inferred
            if "AutoImageProcessor" in config_dict.get("auto_map", {}):
                image_processor_auto_map = config_dict["auto_map"]["AutoImageProcessor"]
--- a/src/transformers/models/autoformer/modeling_autoformer.py
+++ b/src/transformers/models/autoformer/modeling_autoformer.py
@ -24,7 +24,6 @@ import numpy as np
 import torch
 from torch import nn

-from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...modeling_attn_mask_utils import (
@ -827,22 +826,21 @@ class AutoformerPreTrainedModel(PreTrainedModel):
    main_input_name = "past_values"
    supports_gradient_checkpointing = True

-    @torch.no_grad()
    def _init_weights(self, module: nn.Module):
        std = self.config.init_std
        if isinstance(module, (nn.Linear, nn.Conv1d)):
-            init.normal_(module.weight, mean=0.0, std=std)
+            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
-                init.zeros_(module.bias)
+                module.bias.data.zero_()
        elif isinstance(module, AutoformerSinusoidalPositionalEmbedding):
            module._init_weight()
        elif isinstance(module, nn.Embedding):
-            init.normal_(module.weight, mean=0.0, std=std)
+            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
-                init.zeros_(module.weight[module.padding_idx])
+                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
-            init.ones_(module.weight)
-            init.zeros_(module.bias)
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()

    # copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask
    def _update_full_mask(
--- a/src/transformers/models/aya_vision/modeling_aya_vision.py
+++ b/src/transformers/models/aya_vision/modeling_aya_vision.py
@ -338,7 +338,7 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
-    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
+    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: AyaVisionConfig):
        super().__init__(config)
--- a/src/transformers/models/aya_vision/processing_aya_vision.py
+++ b/src/transformers/models/aya_vision/processing_aya_vision.py
@ -70,6 +70,10 @@ class AyaVisionProcessor(ProcessorMixin):
            in a chat into a tokenizable string.
    """

+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
    def __init__(
        self,
        image_processor=None,
--- a/src/transformers/models/bamba/modeling_bamba.py
+++ b/src/transformers/models/bamba/modeling_bamba.py
@ -32,7 +32,6 @@ from torch import nn

 from transformers.activations import ACT2FN

-from ... import initialization as init
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
 from ...integrations import use_kernel_forward_from_hub
@ -1127,13 +1126,12 @@ class BambaPreTrainedModel(PreTrainedModel):
    # Note: only supports HybridMambaAttentionDynamicCache
    _is_stateful = True

-    @torch.no_grad()
    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, BambaMixer):
-            init.ones_(module.dt_bias)
-            init.copy_(module.A_log, torch.log(torch.arange(1, module.num_heads + 1)))
-            init.ones_(module.D)
+            module.dt_bias.data.fill_(1.0)
+            module.A_log.data = torch.log(torch.arange(1, module.num_heads + 1))
+            module.D.data.fill_(1.0)


@auto_docstring
@ -1385,7 +1383,7 @@ class BambaModel(BambaPreTrainedModel):

@auto_docstring
 class BambaForCausalLM(BambaPreTrainedModel, GenerationMixin):
-    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

--- a/src/transformers/models/bamba/modular_bamba.py
+++ b/src/transformers/models/bamba/modular_bamba.py
@ -42,7 +42,6 @@ from transformers.models.mamba2.modeling_mamba2 import (
    segment_sum,
 )

-from ... import initialization as init
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_utils import PreTrainedModel
@ -801,13 +800,12 @@ class BambaPreTrainedModel(PreTrainedModel):
    # Note: only supports HybridMambaAttentionDynamicCache
    _is_stateful = True

-    @torch.no_grad()
    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, BambaMixer):
-            init.ones_(module.dt_bias)
-            init.copy_(module.A_log, torch.log(torch.arange(1, module.num_heads + 1)))
-            init.ones_(module.D)
+            module.dt_bias.data.fill_(1.0)
+            module.A_log.data = torch.log(torch.arange(1, module.num_heads + 1))
+            module.D.data.fill_(1.0)


@auto_docstring
--- a/src/transformers/models/bark/modeling_bark.py
+++ b/src/transformers/models/bark/modeling_bark.py
@ -329,6 +329,23 @@ class BarkPreTrainedModel(PreTrainedModel):
    supports_gradient_checkpointing = False
    _supports_flash_attn = True

+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, (nn.Linear,)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
    @property
    def device(self) -> torch.device:
        """
@ -893,9 +910,6 @@ class BarkFineModel(BarkPreTrainedModel):
        # non-causal gpt-like model with one embedding layer and one lm_head for each codebook of Encodec
        super().__init__(config)
        self.config = config
-        self._tied_weights_keys = {}
-        for i in range(self.config.n_codes_total - self.config.n_codes_given):
-            self._tied_weights_keys[f"lm_heads.{i}.weight"] = f"input_embeds_layers.{i + 1}.weight"

        # initialize a modified non causal GPT-like model
        # note that for there is one embedding layer and one lm_head for each codebook of Encodec
@ -1011,6 +1025,25 @@ class BarkFineModel(BarkPreTrainedModel):

        return model_embeds

+    def _tie_weights(self):
+        if getattr(self.config, "tie_word_embeddings", True):
+            self._tied_weights_keys = []
+            output_embeddings = self.get_output_embeddings()
+            input_embeddings = self.get_input_embeddings()
+
+            for i in range(self.config.n_codes_total - self.config.n_codes_given):
+                # self.input_embeds_layers[i + 1].weight = self.lm_heads[i].weight
+                self._tie_embedding_weights(output_embeddings[i], input_embeddings[i + 1])
+                self._tied_weights_keys.append(f"lm_heads.{i}.weight")
+
+    def tie_weights(self):
+        """
+        Tie the weights between the input embeddings list and the output embeddings list.
+        """
+        for module in self.modules():
+            if hasattr(module, "_tie_weights"):
+                module._tie_weights()
+
    @auto_docstring
    def forward(
        self,
@ -1547,6 +1580,14 @@ class BarkModel(BarkPreTrainedModel, GenerationMixin):

        return audio

+    def tie_weights(self):
+        """
+        Tie the weights between the input embeddings list and the output embeddings list.
+        """
+        for module in self.modules():
+            if hasattr(module, "_tie_weights"):
+                module._tie_weights()
+

 __all__ = [
    "BarkFineModel",
--- a/src/transformers/models/bark/processing_bark.py
+++ b/src/transformers/models/bark/processing_bark.py
@ -49,6 +49,9 @@ class BarkProcessor(ProcessorMixin):

    """

+    tokenizer_class = "AutoTokenizer"
+    attributes = ["tokenizer"]
+
    preset_shape = {
        "semantic_prompt": 1,  # 1D array of shape (X,)
        "coarse_prompt": 2,  # 2D array of shape (2,X)
--- a/src/transformers/models/bart/configuration_bart.py
+++ b/src/transformers/models/bart/configuration_bart.py
@ -164,7 +164,7 @@ class BartConfig(PreTrainedConfig):
            forced_eos_token_id=forced_eos_token_id,
            **kwargs,
        )
-        self.tie_encoder_decoder = True
+
        # ensure backward compatibility for BART CNN models
        if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
            self.forced_bos_token_id = self.bos_token_id
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@ -476,6 +476,20 @@ class BartPreTrainedModel(PreTrainedModel):

    _can_compile_fullgraph = True

+    def _init_weights(self, module):
+        std = self.config.init_std
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+
    @property
    def dummy_inputs(self):
        pad_token = self.config.pad_token_id
@ -513,7 +527,7 @@ class BartEncoder(BartPreTrainedModel):
        embed_tokens (nn.Embedding): output embedding
    """

-    def __init__(self, config: BartConfig):
+    def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)

        self.dropout = config.dropout
@ -524,9 +538,12 @@ class BartEncoder(BartPreTrainedModel):
        self.max_source_positions = config.max_position_embeddings
        embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

-        self.embed_tokens = BartScaledWordEmbedding(
-            config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
-        )
+        if embed_tokens is not None:
+            self.embed_tokens = embed_tokens
+        else:
+            self.embed_tokens = BartScaledWordEmbedding(
+                config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
+            )

        self.embed_positions = BartLearnedPositionalEmbedding(
            config.max_position_embeddings,
@ -657,7 +674,7 @@ class BartDecoder(BartPreTrainedModel):
        embed_tokens (nn.Embedding): output embedding
    """

-    def __init__(self, config: BartConfig):
+    def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
@ -665,9 +682,12 @@ class BartDecoder(BartPreTrainedModel):
        self.max_target_positions = config.max_position_embeddings
        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

-        self.embed_tokens = BartScaledWordEmbedding(
-            config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
-        )
+        if embed_tokens is not None:
+            self.embed_tokens = embed_tokens
+        else:
+            self.embed_tokens = BartScaledWordEmbedding(
+                config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
+            )

        self.embed_positions = BartLearnedPositionalEmbedding(
            config.max_position_embeddings,
@ -879,10 +899,7 @@ class BartDecoder(BartPreTrainedModel):

@auto_docstring
 class BartModel(BartPreTrainedModel):
-    _tied_weights_keys = {
-        "decoder.embed_tokens.weight": "shared.weight",
-        "encoder.embed_tokens.weight": "shared.weight",
-    }
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: BartConfig):
        super().__init__(config)
@ -891,12 +908,24 @@ class BartModel(BartPreTrainedModel):
        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
        self.shared = BartScaledWordEmbedding(vocab_size, config.d_model, padding_idx, embed_scale=embed_scale)

-        self.encoder = BartEncoder(config)
-        self.decoder = BartDecoder(config)
+        self.encoder = BartEncoder(config, self.shared)
+        self.decoder = BartDecoder(config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            # Some checkpoints (e.g. "facebook/bart-large-cnn") store the embedding weight in decoder.embed_tokens, so check for that here; see issue #36247
+            if self.shared.weight.device == torch.device(
+                "meta"
+            ) and self.decoder.embed_tokens.weight.device != torch.device("meta"):
+                self._tie_embedding_weights(self.encoder.embed_tokens, self.decoder.embed_tokens)
+                self._tie_embedding_weights(self.shared, self.decoder.embed_tokens)
+            else:
+                self._tie_embedding_weights(self.encoder.embed_tokens, self.shared)
+                self._tie_embedding_weights(self.decoder.embed_tokens, self.shared)
+
    def get_input_embeddings(self):
        return self.shared

@ -1023,9 +1052,7 @@ class BartModel(BartPreTrainedModel):
 )
 class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin):
    base_model_prefix = "model"
-    _tied_weights_keys = {
-        "lm_head.weight": "model.shared.weight",
-    }
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
    _keys_to_ignore_on_load_missing = ["final_logits_bias"]

    def __init__(self, config: BartConfig):
@ -1059,6 +1086,11 @@ class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin):
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self.model._tie_weights()
+            self._tie_embedding_weights(self.lm_head, self.model.shared)
+
    @auto_docstring
    def forward(
        self,
@ -1208,6 +1240,8 @@ class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin):
    """
 )
 class BartForSequenceClassification(BartPreTrainedModel):
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
+
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.model = BartModel(config)
@ -1340,6 +1374,8 @@ class BartForSequenceClassification(BartPreTrainedModel):

@auto_docstring
 class BartForQuestionAnswering(BartPreTrainedModel):
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
+
    def __init__(self, config):
        super().__init__(config)

@ -1477,9 +1513,7 @@ class BartDecoderWrapper(BartPreTrainedModel):
    """
 )
 class BartForCausalLM(BartPreTrainedModel, GenerationMixin):
-    _tied_weights_keys = {
-        "lm_head.weight": "model.decoder.embed_tokens.weight",
-    }
+    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        config.is_decoder = True
--- a/src/transformers/models/beit/modeling_beit.py
+++ b/src/transformers/models/beit/modeling_beit.py
@ -24,7 +24,6 @@ import torch
 from torch import Tensor, nn
 from torch.nn import CrossEntropyLoss

-from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import (
@ -693,22 +692,31 @@ class BeitPreTrainedModel(PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [r".*relative_position_index.*"]
    _supports_sdpa = True

-    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
-        super()._init_weights(module)
-        if isinstance(module, BeitEmbeddings):
-            init.zeros_(module.cls_token)
+        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, BeitEmbeddings):
+            module.cls_token.data.zero_()
            if module.mask_token is not None:
-                init.zeros_(module.mask_token)
+                module.mask_token.data.zero_()
            if module.position_embeddings is not None:
-                init.zeros_(module.position_embeddings)
+                module.position_embeddings.data.zero_()
        elif isinstance(module, BeitRelativePositionBias):
-            init.zeros_(module.relative_position_bias_table)
+            module.relative_position_bias_table.data.zero_()
        elif isinstance(module, BeitLayer):
            if module.lambda_1 is not None:
-                init.constant_(module.lambda_1, self.config.layer_scale_init_value)
-                init.constant_(module.lambda_2, self.config.layer_scale_init_value)
+                module.lambda_1.data.fill_(self.config.layer_scale_init_value)
+                module.lambda_2.data.fill_(self.config.layer_scale_init_value)


@auto_docstring
--- a/src/transformers/models/bert/modeling_bert.py
+++ b/src/transformers/models/bert/modeling_bert.py
@ -24,7 +24,6 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

-from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@ -507,9 +506,16 @@ class BertLMPredictionHead(nn.Module):

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
-        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def _tie_weights(self):
+        self.decoder.bias = self.bias
+
    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
@ -563,12 +569,21 @@ class BertPreTrainedModel(PreTrainedModel):
        "cross_attentions": BertCrossAttention,
    }

-    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
-        super()._init_weights(module)
-        if isinstance(module, BertLMPredictionHead):
-            init.zeros_(module.bias)
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, BertLMPredictionHead):
+            module.bias.data.zero_()


@dataclass
@ -755,10 +770,7 @@ class BertModel(BertPreTrainedModel):
    """
 )
 class BertForPreTraining(BertPreTrainedModel):
-    _tied_weights_keys = {
-        "cls.predictions.decoder.weight": "bert.embeddings.word_embeddings.weight",
-        "cls.predictions.decoder.bias": "cls.predictions.bias",
-    }
+    _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)
@ -852,10 +864,7 @@ class BertForPreTraining(BertPreTrainedModel):
    """
 )
 class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):
-    _tied_weights_keys = {
-        "cls.predictions.decoder.weight": "bert.embeddings.word_embeddings.weight",
-        "cls.predictions.decoder.bias": "cls.predictions.bias",
-    }
+    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)
@ -939,10 +948,7 @@ class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):

@auto_docstring
 class BertForMaskedLM(BertPreTrainedModel):
-    _tied_weights_keys = {
-        "cls.predictions.decoder.weight": "bert.embeddings.word_embeddings.weight",
-        "cls.predictions.decoder.bias": "cls.predictions.bias",
-    }
+    _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)
--- a/src/transformers/models/bert_generation/modeling_bert_generation.py
+++ b/src/transformers/models/bert_generation/modeling_bert_generation.py
@ -20,7 +20,6 @@ from typing import Optional, Union
 import torch
 from torch import nn

-from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@ -457,12 +456,21 @@ class BertGenerationPreTrainedModel(PreTrainedModel):
        "cross_attentions": BertGenerationCrossAttention,
    }

-    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
-        super()._init_weights(module)
-        if isinstance(module, BertGenerationOnlyLMHead):
-            init.zeros_(module.bias)
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, BertGenerationOnlyLMHead):
+            module.bias.data.zero_()


@auto_docstring(
@ -621,11 +629,20 @@ class BertGenerationOnlyLMHead(nn.Module):
        super().__init__()
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        logits = self.decoder(hidden_states)
        return logits

+    def _tie_weights(self):
+        # For accelerate compatibility and to not break backward compatibility
+        if self.decoder.bias.device.type == "meta":
+            self.decoder.bias = self.bias
+        else:
+            # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
+            self.bias = self.decoder.bias
+

@auto_docstring(
    custom_intro="""
@ -633,10 +650,7 @@ class BertGenerationOnlyLMHead(nn.Module):
    """
 )
 class BertGenerationDecoder(BertGenerationPreTrainedModel, GenerationMixin):
-    _tied_weights_keys = {
-        "lm_head.decoder.weight": "bert.embeddings.word_embeddings.weight",
-        "lm_head.decoder.bias": "lm_head.bias",
-    }
+    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)
--- a/src/transformers/models/big_bird/modeling_big_bird.py
+++ b/src/transformers/models/big_bird/modeling_big_bird.py
@ -23,7 +23,6 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

-from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
@ -1465,9 +1464,16 @@ class BigBirdLMPredictionHead(nn.Module):

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
-        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def _tie_weights(self):
+        self.decoder.bias = self.bias
+
    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
@ -1515,12 +1521,21 @@ class BigBirdPreTrainedModel(PreTrainedModel):
    base_model_prefix = "bert"
    supports_gradient_checkpointing = True

-    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
-        super()._init_weights(module)
-        if isinstance(module, BigBirdLMPredictionHead):
-            init.zeros_(module.bias)
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, BigBirdLMPredictionHead):
+            module.bias.data.zero_()


@dataclass
@ -1884,10 +1899,7 @@ class BigBirdModel(BigBirdPreTrainedModel):


 class BigBirdForPreTraining(BigBirdPreTrainedModel):
-    _tied_weights_keys = {
-        "cls.predictions.decoder.bias": "cls.predictions.bias",
-        "cls.predictions.decoder.weight": "bert.embeddings.word_embeddings.weight",
-    }
+    _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)
@ -1987,10 +1999,7 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):

@auto_docstring
 class BigBirdForMaskedLM(BigBirdPreTrainedModel):
-    _tied_weights_keys = {
-        "cls.predictions.decoder.bias": "cls.predictions.bias",
-        "cls.predictions.decoder.weight": "bert.embeddings.word_embeddings.weight",
-    }
+    _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)
@ -2132,10 +2141,7 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
    """
 )
 class BigBirdForCausalLM(BigBirdPreTrainedModel, GenerationMixin):
-    _tied_weights_keys = {
-        "cls.predictions.decoder.bias": "cls.predictions.bias",
-        "cls.predictions.decoder.weight": "bert.embeddings.word_embeddings.weight",
-    }
+    _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)
--- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
+++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
@ -1539,6 +1539,20 @@ class BigBirdPegasusPreTrainedModel(PreTrainedModel):

    _can_compile_fullgraph = True

+    def _init_weights(self, module):
+        std = self.config.init_std
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+
    @property
    def dummy_inputs(self):
        pad_token = self.config.pad_token_id
@ -1560,7 +1574,7 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
        embed_tokens (nn.Embedding): output embedding
    """

-    def __init__(self, config: BigBirdPegasusConfig):
+    def __init__(self, config: BigBirdPegasusConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)

        self.attention_type = config.attention_type
@ -1578,6 +1592,9 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
            config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
        )

+        if embed_tokens is not None:
+            self.embed_tokens.weight = embed_tokens.weight
+
        self.embed_positions = BigBirdPegasusLearnedPositionalEmbedding(
            config.max_position_embeddings,
            embed_dim,
@ -1832,7 +1849,7 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
        embed_tokens (nn.Embedding): output embedding
    """

-    def __init__(self, config: BigBirdPegasusConfig):
+    def __init__(self, config: BigBirdPegasusConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
@ -1844,6 +1861,9 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
            config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
        )

+        if embed_tokens is not None:
+            self.embed_tokens.weight = embed_tokens.weight
+
        self.embed_positions = BigBirdPegasusLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
@ -2055,10 +2075,7 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):

@auto_docstring
 class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
-    _tied_weights_keys = {
-        "encoder.embed_tokens.weight": "shared.weight",
-        "decoder.embed_tokens.weight": "shared.weight",
-    }
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: BigBirdPegasusConfig):
        super().__init__(config)
@ -2069,8 +2086,8 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
            vocab_size, config.d_model, padding_idx, embed_scale=embed_scale
        )

-        self.encoder = BigBirdPegasusEncoder(config)
-        self.decoder = BigBirdPegasusDecoder(config)
+        self.encoder = BigBirdPegasusEncoder(config, self.shared)
+        self.decoder = BigBirdPegasusDecoder(config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()
@ -2083,6 +2100,11 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_embedding_weights(self.encoder.embed_tokens, self.shared)
+            self._tie_embedding_weights(self.decoder.embed_tokens, self.shared)
+
    def get_encoder(self):
        return self.encoder

@ -2191,9 +2213,7 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
 # Copied from transformers.models.bart.modeling_bart.BartForConditionalGeneration with Bart->BigBirdPegasus, BART->BIGBIRD_PEGASUS
 class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, GenerationMixin):
    base_model_prefix = "model"
-    _tied_weights_keys = {
-        "lm_head.weight": "model.shared.weight",
-    }
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
    _keys_to_ignore_on_load_missing = ["final_logits_bias"]

    def __init__(self, config: BigBirdPegasusConfig):
@ -2227,6 +2247,11 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, Gene
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self.model._tie_weights()
+            self._tie_embedding_weights(self.lm_head, self.model.shared)
+
    @auto_docstring
    # Ignore copy
    def forward(
@ -2349,6 +2374,8 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, Gene
    """
 )
 class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
+
    def __init__(self, config: BigBirdPegasusConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.model = BigBirdPegasusModel(config)
@ -2470,6 +2497,8 @@ class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):

@auto_docstring
 class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel):
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
+
    def __init__(self, config):
        super().__init__(config)

@ -2592,6 +2621,8 @@ class BigBirdPegasusDecoderWrapper(BigBirdPegasusPreTrainedModel):


 class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
    def __init__(self, config):
        config.is_decoder = True
        config.is_encoder_decoder = False
--- a/src/transformers/models/biogpt/modeling_biogpt.py
+++ b/src/transformers/models/biogpt/modeling_biogpt.py
@ -510,7 +510,7 @@ class BioGptModel(BioGptPreTrainedModel):
    """
 )
 class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
-    _tied_weights_keys = {"output_projection.weight": "biogpt.embed_tokens.weight"}
+    _tied_weights_keys = ["output_projection.weight"]

    def __init__(self, config):
        super().__init__(config)
--- a/src/transformers/models/biogpt/modular_biogpt.py
+++ b/src/transformers/models/biogpt/modular_biogpt.py
@ -332,7 +332,7 @@ class BioGptModel(BioGptPreTrainedModel):
    """
 )
 class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
-    _tied_weights_keys = {"output_projection.weight": "biogpt.embed_tokens.weight"}
+    _tied_weights_keys = ["output_projection.weight"]

    def __init__(self, config):
        super().__init__(config)
--- a/src/transformers/models/bit/modeling_bit.py
+++ b/src/transformers/models/bit/modeling_bit.py
@ -22,7 +22,6 @@ import numpy as np
 import torch
 from torch import Tensor, nn

-from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_outputs import (
    BackboneOutput,
@ -629,20 +628,19 @@ class BitPreTrainedModel(PreTrainedModel):
    main_input_name = "pixel_values"
    _no_split_modules = ["BitEmbeddings"]

-    @torch.no_grad()
    def _init_weights(self, module):
        if isinstance(module, nn.Conv2d):
-            init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
        # copied from the `reset_parameters` method of `class Linear(Module)` in `torch`.
        elif isinstance(module, nn.Linear):
-            init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+            nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
            if module.bias is not None:
-                fan_in, _ = init._calculate_fan_in_and_fan_out(module.weight)
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
-                init.uniform_(module.bias, -bound, bound)
+                nn.init.uniform_(module.bias, -bound, bound)
        elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
-            init.constant_(module.weight, 1)
-            init.constant_(module.bias, 0)
+            nn.init.constant_(module.weight, 1)
+            nn.init.constant_(module.bias, 0)


@auto_docstring
--- a/src/transformers/models/bitnet/modeling_bitnet.py
+++ b/src/transformers/models/bitnet/modeling_bitnet.py
@ -433,7 +433,7 @@ class BitNetModel(BitNetPreTrainedModel):

@auto_docstring
 class BitNetForCausalLM(BitNetPreTrainedModel, GenerationMixin):
-    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = None
    _pp_plan = None

--- a/Show More
+++ b/Show More