Update

2025-11-05 00:14:54 +08:00 · 2025-11-03 14:47:02 -08:00 · 2025-11-03 13:04:11 -08:00 · 2025-11-03 12:07:47 -08:00 · 2025-11-03 12:05:34 -08:00 · 2025-11-03 10:46:42 -08:00
498 changed files with 7867 additions and 3679 deletions
--- a/.bc-linter.yml
+++ b/.bc-linter.yml
@ -13,3 +13,4 @@ exclude:
  - "**/benchmarks/**"
  - "**/test_*.py"
  - "**/*_test.py"
+  - "tools/**"
--- a/.ci/docker/common/install_acl.sh
+++ b/.ci/docker/common/install_acl.sh
@ -3,7 +3,7 @@

 set -eux

-ACL_VERSION=${ACL_VERSION:-"v25.02"}
+ACL_VERSION=${ACL_VERSION:-"v52.6.0"}
 ACL_INSTALL_DIR="/acl"

 # Clone ACL
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -97,7 +97,7 @@ case ${image} in
    manylinux2_28-builder:xpu)
        TARGET=xpu_final
        GPU_IMAGE=amd64/almalinux:8
-        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13"
        MANY_LINUX_VERSION="2_28"
        ;;
    *)
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -1,15 +1,11 @@
-sphinx==5.3.0
+sphinx==7.2.6
 #Description: This is used to generate PyTorch docs
-#Pinned versions: 5.3.0
+#Pinned versions: 7.2.6

-standard-imghdr==3.13.0; python_version >= "3.13"
-#Description: This is needed by Sphinx, so it needs to be added here.
-# The reasons are as follows:
-# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
-# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
-# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
+pytorch_sphinx_theme2==0.2.0
+#Description: This is needed to generate PyTorch docs
+#Pinned versions: 0.2.0

-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
 # something related to Docker setup. We can investigate this later.
@ -36,17 +32,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 2.13.0

-breathe==4.34.0
+breathe==4.36.0
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 4.34.0
+#Pinned versions: 4.36.0

-exhale==0.2.3
+exhale==0.3.7
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 0.2.3
+#Pinned versions: 0.3.7

-docutils==0.16
+docutils==0.20
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 0.16
+#Pinned versions: 0.20

 bs4==0.0.1
 #Description: This is used to generate PyTorch C++ docs
@ -56,13 +52,13 @@ IPython==8.12.0
 #Description: This is used to generate PyTorch functorch docs
 #Pinned versions: 8.12.0

-myst-nb==0.17.2
+myst-nb==1.3.0
 #Description: This is used to generate PyTorch functorch and torch.compile docs.
-#Pinned versions: 0.17.2
+#Pinned versions: 1.3.0

 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
 python-etcd==0.4.5
 sphinx-copybutton==0.5.0
-sphinx-design==0.4.0
+sphinx-design==0.6.1
 sphinxcontrib-mermaid==1.0.0
-myst-parser==0.18.1
+myst-parser==4.0.1
--- a/.ci/pytorch/python_doc_push_script.sh
+++ b/.ci/pytorch/python_doc_push_script.sh
@ -89,20 +89,23 @@ if [ "$is_main_doc" = true ]; then

  make coverage
  # Now we have the coverage report, we need to make sure it is empty.
-  # Count the number of lines in the file and turn that number into a variable
-  # $lines. The `cut -f1 ...` is to only parse the number, not the filename
-  # Skip the report header by subtracting 2: the header will be output even if
-  # there are no undocumented items.
+  # Sphinx 7.2.6+ format: python.txt contains a statistics table with a TOTAL row
+  # showing the undocumented count in the third column.
+  # Example: | TOTAL | 99.83% | 2 |
  #
  # Also: see docs/source/conf.py for "coverage_ignore*" items, which should
  # be documented then removed from there.
-  lines=$(wc -l build/coverage/python.txt 2>/dev/null |cut -f1 -d' ')
-  undocumented=$((lines - 2))
-  if [ $undocumented -lt 0 ]; then
+
+  # Extract undocumented count from TOTAL row in Sphinx 7.2.6 statistics table
+  # The table format is: | Module | Coverage | Undocumented |
+  # Extract the third column (undocumented count) from the TOTAL row
+  undocumented=$(grep "| TOTAL" build/coverage/python.txt | awk -F'|' '{print $4}' | tr -d ' ')
+
+  if [ -z "$undocumented" ] || ! [[ "$undocumented" =~ ^[0-9]+$ ]]; then
    echo coverage output not found
    exit 1
-  elif [ $undocumented -gt 0 ]; then
-    echo undocumented objects found:
+  elif [ "$undocumented" -gt 0 ]; then
+    echo "undocumented objects found:"
    cat build/coverage/python.txt
    echo "Make sure you've updated relevant .rsts in docs/source!"
    echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
--- a/.clang-tidy
+++ b/.clang-tidy
@ -60,9 +60,11 @@ performance-*,
 readability-container-size-empty,
 readability-delete-null-pointer,
 readability-duplicate-include,
+readability-named-parameter,
 readability-misplaced-array-index,
 readability-redundant*,
 readability-simplify-subscript-expr,
+readability-static-definition-in-anonymous-namespace
 readability-string-compare,
 -readability-redundant-access-specifiers,
 -readability-redundant-control-flow,
--- a/.claude/skills/add-uint-support/SKILL.md
+++ b/.claude/skills/add-uint-support/SKILL.md
@ -0,0 +1,319 @@
+---
+name: add-uint-support
+description: Add unsigned integer (uint) type support to PyTorch operators by updating AT_DISPATCH macros. Use when adding support for uint16, uint32, uint64 types to operators, kernels, or when user mentions enabling unsigned types, barebones unsigned types, or uint support.
+---
+
+# Add Unsigned Integer (uint) Support to Operators
+
+This skill helps add support for unsigned integer types (uint16, uint32, uint64) to PyTorch operators by updating their AT_DISPATCH macros.
+
+## When to use this skill
+
+Use this skill when:
+- Adding uint16, uint32, or uint64 support to an operator
+- User mentions "unsigned types", "uint support", "barebones unsigned types"
+- Enabling support for kUInt16, kUInt32, kUInt64 in kernels
+- Working with operator implementations that need expanded type coverage
+
+## Quick reference
+
+**Add unsigned types to existing dispatch:**
+```cpp
+// Before
+AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
+  kernel<scalar_t>();
+}), AT_EXPAND(AT_ALL_TYPES));
+
+// After (method 1: add unsigned types explicitly)
+AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
+  kernel<scalar_t>();
+}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
+
+// After (method 2: use V2 integral types if AT_INTEGRAL_TYPES present)
+AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
+  kernel<scalar_t>();
+}), AT_EXPAND(AT_INTEGRAL_TYPES_V2), AT_EXPAND(AT_FLOATING_TYPES));
+```
+
+## Type group reference
+
+**Unsigned type groups:**
+- `AT_BAREBONES_UNSIGNED_TYPES`: kUInt16, kUInt32, kUInt64
+- `AT_INTEGRAL_TYPES_V2`: AT_INTEGRAL_TYPES + AT_BAREBONES_UNSIGNED_TYPES
+
+**Relationship:**
+```cpp
+AT_INTEGRAL_TYPES          // kByte, kChar, kInt, kLong, kShort
+AT_BAREBONES_UNSIGNED_TYPES  // kUInt16, kUInt32, kUInt64
+AT_INTEGRAL_TYPES_V2       // INTEGRAL_TYPES + BAREBONES_UNSIGNED_TYPES
+```
+
+## Instructions
+
+### Step 1: Determine if conversion to V2 is needed
+
+Check if the file uses AT_DISPATCH_V2:
+
+**If using old AT_DISPATCH:**
+- First convert to AT_DISPATCH_V2 using the at-dispatch-v2 skill
+- Then proceed with adding uint support
+
+**If already using AT_DISPATCH_V2:**
+- Proceed directly to Step 2
+
+### Step 2: Analyze the current dispatch macro
+
+Identify what type groups are currently in use:
+
+```cpp
+AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
+  // body
+}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
+    ^^^^^^^^^^^^^^^^^^^^^^^^^
+    Current type coverage
+```
+
+Common patterns:
+- `AT_EXPAND(AT_ALL_TYPES)` → includes AT_INTEGRAL_TYPES + AT_FLOATING_TYPES
+- `AT_EXPAND(AT_INTEGRAL_TYPES)` → signed integers only
+- `AT_EXPAND(AT_FLOATING_TYPES)` → floating point types
+
+### Step 3: Choose the uint addition method
+
+Two approaches:
+
+**Method 1: Add AT_BAREBONES_UNSIGNED_TYPES explicitly**
+- Use when: You want to be explicit about adding uint support
+- Add `AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)` to the type list
+
+**Method 2: Substitute AT_INTEGRAL_TYPES with AT_INTEGRAL_TYPES_V2**
+- Use when: The dispatch already uses `AT_EXPAND(AT_INTEGRAL_TYPES)`
+- More concise: replaces one type group with its superset
+- Only applicable if AT_INTEGRAL_TYPES is present
+
+### Step 4: Apply the transformation
+
+**Method 1 example:**
+```cpp
+// Before
+AT_DISPATCH_V2(
+    dtype,
+    "min_values_cuda",
+    AT_WRAP([&]() {
+      kernel_impl<scalar_t>(iter);
+    }),
+    AT_EXPAND(AT_ALL_TYPES),
+    kBFloat16, kHalf, kBool
+);
+
+// After (add unsigned types)
+AT_DISPATCH_V2(
+    dtype,
+    "min_values_cuda",
+    AT_WRAP([&]() {
+      kernel_impl<scalar_t>(iter);
+    }),
+    AT_EXPAND(AT_ALL_TYPES),
+    AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
+    kBFloat16, kHalf, kBool
+);
+```
+
+**Method 2 example:**
+```cpp
+// Before
+AT_DISPATCH_V2(
+    dtype,
+    "integral_op",
+    AT_WRAP([&]() {
+      kernel<scalar_t>();
+    }),
+    AT_EXPAND(AT_INTEGRAL_TYPES)
+);
+
+// After (substitute with V2)
+AT_DISPATCH_V2(
+    dtype,
+    "integral_op",
+    AT_WRAP([&]() {
+      kernel<scalar_t>();
+    }),
+    AT_EXPAND(AT_INTEGRAL_TYPES_V2)
+);
+```
+
+### Step 5: Handle AT_ALL_TYPES vs individual type groups
+
+If the dispatch uses `AT_EXPAND(AT_ALL_TYPES)`:
+- `AT_ALL_TYPES` = `AT_INTEGRAL_TYPES` + `AT_FLOATING_TYPES`
+- To add uint: add `AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)` to the list
+
+If the dispatch separately lists INTEGRAL and FLOATING:
+```cpp
+// Before
+AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES)
+
+// After (Method 2 preferred)
+AT_EXPAND(AT_INTEGRAL_TYPES_V2), AT_EXPAND(AT_FLOATING_TYPES)
+```
+
+### Step 6: Verify all dispatch sites
+
+Check the file for ALL dispatch macros that need uint support:
+- Some operators have multiple dispatch sites (CPU, CUDA, different functions)
+- Apply the transformation consistently across all sites
+- Ensure each gets the same type coverage updates
+
+### Step 7: Validate the changes
+
+Check that:
+- [ ] AT_DISPATCH_V2 format is used (not old AT_DISPATCH)
+- [ ] Unsigned types are added via one of the two methods
+- [ ] All relevant dispatch sites in the file are updated
+- [ ] Type groups use `AT_EXPAND()`
+- [ ] Arguments are properly formatted and comma-separated
+
+## Common patterns
+
+### Pattern 1: AT_ALL_TYPES + extras
+
+```cpp
+// Before
+AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
+  kernel<scalar_t>();
+}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
+
+// After
+AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
+  kernel<scalar_t>();
+}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
+```
+
+### Pattern 2: Separate INTEGRAL + FLOATING
+
+```cpp
+// Before
+AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
+  kernel<scalar_t>();
+}), AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES));
+
+// After
+AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
+  kernel<scalar_t>();
+}), AT_EXPAND(AT_INTEGRAL_TYPES_V2), AT_EXPAND(AT_FLOATING_TYPES));
+```
+
+### Pattern 3: Old dispatch needs conversion first
+
+```cpp
+// Before (needs v2 conversion first)
+AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "op", [&]() {
+  kernel<scalar_t>();
+});
+
+// After v2 conversion
+AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
+  kernel<scalar_t>();
+}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
+
+// After adding uint support
+AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
+  kernel<scalar_t>();
+}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
+```
+
+## Multiple dispatch sites example
+
+For a file with multiple functions:
+
+```cpp
+void min_values_kernel_cuda(TensorIterator& iter) {
+  AT_DISPATCH_V2(iter.dtype(), "min_values_cuda", AT_WRAP([&]() {
+    impl<scalar_t>(iter);
+  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf);
+  //                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  //                           Added uint support
+}
+
+void min_launch_kernel(TensorIterator &iter) {
+  AT_DISPATCH_V2(iter.input_dtype(), "min_cuda", AT_WRAP([&]() {
+    gpu_reduce_kernel<scalar_t>(iter);
+  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf);
+  //                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  //                           Added uint support here too
+}
+```
+
+## Decision tree
+
+Use this decision tree to determine the approach:
+
+```
+Is the file using AT_DISPATCH_V2?
+├─ No → Use at-dispatch-v2 skill first, then continue
+└─ Yes
+   └─ Does it use AT_EXPAND(AT_INTEGRAL_TYPES)?
+      ├─ Yes → Replace with AT_EXPAND(AT_INTEGRAL_TYPES_V2)
+      └─ No → Add AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES) to type list
+```
+
+## Edge cases
+
+### Case 1: Dispatch with only floating types
+
+If the operator only supports floating point types, don't add uint support:
+
+```cpp
+// Leave as-is - floating point only operator
+AT_DISPATCH_V2(dtype, "float_op", AT_WRAP([&]() {
+  kernel<scalar_t>();
+}), AT_EXPAND(AT_FLOATING_TYPES), kHalf);
+```
+
+### Case 2: Complex types present
+
+Unsigned types work alongside complex types:
+
+```cpp
+AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
+  kernel<scalar_t>();
+}), AT_EXPAND(AT_ALL_TYPES),
+    AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
+    AT_EXPAND(AT_COMPLEX_TYPES),
+    kHalf, kBFloat16);
+```
+
+### Case 3: Already has uint support
+
+Check if uint types are already present:
+- If `AT_INTEGRAL_TYPES_V2` is used → already has uint support
+- If `AT_BAREBONES_UNSIGNED_TYPES` is already in list → already has uint support
+- Skip the file if uint support is already present
+
+## Workflow
+
+When asked to add uint support:
+
+1. Read the target file
+2. Check if using AT_DISPATCH_V2:
+   - If not → use at-dispatch-v2 skill first
+3. Identify all dispatch macro sites
+4. For each dispatch:
+   - Analyze current type groups
+   - Choose method (add BAREBONES_UNSIGNED or upgrade to V2)
+   - Apply transformation with Edit tool
+5. Show the user the changes
+6. Explain what was modified
+
+## Important notes
+
+- Always check if v2 conversion is needed first
+- Apply changes consistently across all dispatch sites in the file
+- Method 2 (AT_INTEGRAL_TYPES_V2) is cleaner when applicable
+- Method 1 (explicit AT_BAREBONES_UNSIGNED_TYPES) is more explicit
+- Unsigned types are: kUInt16, kUInt32, kUInt64 (not kByte which is uint8)
+- Some operators may not semantically support unsigned types - use judgment
+
+## Testing
+
+After adding uint support, the operator should accept uint16, uint32, and uint64 tensors. The user is responsible for functional testing.
--- a/.claude/skills/at-dispatch-v2/SKILL.md
+++ b/.claude/skills/at-dispatch-v2/SKILL.md
@ -0,0 +1,305 @@
+---
+name: at-dispatch-v2
+description: Convert PyTorch AT_DISPATCH macros to AT_DISPATCH_V2 format in ATen C++ code. Use when porting AT_DISPATCH_ALL_TYPES_AND*, AT_DISPATCH_FLOATING_TYPES*, or other dispatch macros to the new v2 API. For ATen kernel files, CUDA kernels, and native operator implementations.
+---
+
+# AT_DISPATCH to AT_DISPATCH_V2 Converter
+
+This skill helps convert PyTorch's legacy AT_DISPATCH macros to the new AT_DISPATCH_V2 format, as defined in `aten/src/ATen/Dispatch_v2.h`.
+
+## When to use this skill
+
+Use this skill when:
+- Converting AT_DISPATCH_* macros to AT_DISPATCH_V2
+- Porting ATen kernels to use the new dispatch API
+- Working with files in `aten/src/ATen/native/` that use dispatch macros
+- User mentions "AT_DISPATCH", "dispatch v2", "Dispatch_v2.h", or macro conversion
+
+## Quick reference
+
+**Old format:**
+```cpp
+AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, dtype, "kernel_name", [&]() {
+  // lambda body
+});
+```
+
+**New format:**
+```cpp
+AT_DISPATCH_V2(dtype, "kernel_name", AT_WRAP([&]() {
+  // lambda body
+}), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, kBool);
+```
+
+## Key transformations
+
+1. **Reorder arguments**: `scalar_type` and `name` come first, then lambda, then types
+2. **Wrap the lambda**: Use `AT_WRAP(lambda)` to handle internal commas
+3. **Expand type groups**: Use `AT_EXPAND(AT_ALL_TYPES)` instead of implicit expansion
+4. **List individual types**: Add extra types (kHalf, kBFloat16, etc.) after expanded groups
+5. **Add include**: `#include <ATen/Dispatch_v2.h>` near other Dispatch includes
+
+## Instructions
+
+### Step 1: Add the Dispatch_v2.h include
+
+Add the v2 header near the existing `#include <ATen/Dispatch.h>`:
+
+```cpp
+#include <ATen/Dispatch.h>
+#include <ATen/Dispatch_v2.h>
+```
+
+Keep the old Dispatch.h include for now (other code may still need it).
+
+### Step 2: Identify the old dispatch pattern
+
+Common patterns to convert:
+
+- `AT_DISPATCH_ALL_TYPES_AND{2,3,4}(type1, type2, ..., scalar_type, name, lambda)`
+- `AT_DISPATCH_FLOATING_TYPES_AND{2,3}(type1, type2, ..., scalar_type, name, lambda)`
+- `AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND{2,3}(type1, ..., scalar_type, name, lambda)`
+- `AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND{2,3}(type1, ..., scalar_type, name, lambda)`
+
+### Step 3: Map the old macro to type groups
+
+Identify which type group macro corresponds to the base types:
+
+| Old macro base | AT_DISPATCH_V2 type group |
+|----------------|---------------------------|
+| `ALL_TYPES` | `AT_EXPAND(AT_ALL_TYPES)` |
+| `FLOATING_TYPES` | `AT_EXPAND(AT_FLOATING_TYPES)` |
+| `INTEGRAL_TYPES` | `AT_EXPAND(AT_INTEGRAL_TYPES)` |
+| `COMPLEX_TYPES` | `AT_EXPAND(AT_COMPLEX_TYPES)` |
+| `ALL_TYPES_AND_COMPLEX` | `AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX)` |
+
+For combined patterns, use multiple `AT_EXPAND()` entries:
+```cpp
+// Old: AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(...)
+// New: AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES), type1, type2
+```
+
+### Step 4: Extract the individual types
+
+From `AT_DISPATCH_*_AND2(type1, type2, ...)` or `AT_DISPATCH_*_AND3(type1, type2, type3, ...)`, extract the individual types (type1, type2, etc.).
+
+These become the trailing arguments after the type group:
+```cpp
+AT_DISPATCH_V2(..., AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, kBool)
+                                             ^^^^^^^^^^^^^^^^^^^^^^^^
+                                             Individual types from AND3
+```
+
+### Step 5: Transform to AT_DISPATCH_V2
+
+Apply the transformation:
+
+**Pattern:**
+```cpp
+AT_DISPATCH_V2(
+  scalar_type,           // 1st: The dtype expression
+  "name",                // 2nd: The debug string
+  AT_WRAP(lambda),       // 3rd: The lambda wrapped in AT_WRAP
+  type_groups,           // 4th+: Type groups with AT_EXPAND()
+  individual_types       // Last: Individual types
+)
+```
+
+**Example transformation:**
+```cpp
+// BEFORE
+AT_DISPATCH_ALL_TYPES_AND3(
+    kBFloat16, kHalf, kBool,
+    iter.dtype(),
+    "min_values_cuda",
+    [&]() {
+      min_values_kernel_cuda_impl<scalar_t>(iter);
+    }
+);
+
+// AFTER
+AT_DISPATCH_V2(
+    iter.dtype(),
+    "min_values_cuda",
+    AT_WRAP([&]() {
+      min_values_kernel_cuda_impl<scalar_t>(iter);
+    }),
+    AT_EXPAND(AT_ALL_TYPES),
+    kBFloat16, kHalf, kBool
+);
+```
+
+### Step 6: Handle multi-line lambdas
+
+For lambdas with internal commas or complex expressions, AT_WRAP is essential:
+
+```cpp
+AT_DISPATCH_V2(
+    dtype,
+    "complex_kernel",
+    AT_WRAP([&]() {
+      gpu_reduce_kernel<scalar_t, scalar_t>(
+        iter,
+        MinOps<scalar_t>{},
+        thrust::pair<scalar_t, int64_t>(upper_bound(), 0)  // Commas inside!
+      );
+    }),
+    AT_EXPAND(AT_ALL_TYPES)
+);
+```
+
+### Step 7: Verify the conversion
+
+Check that:
+- [ ] `AT_WRAP()` wraps the entire lambda
+- [ ] Type groups use `AT_EXPAND()`
+- [ ] Individual types don't have `AT_EXPAND()` (just `kBFloat16`, not `AT_EXPAND(kBFloat16)`)
+- [ ] Argument order is: scalar_type, name, lambda, types
+- [ ] Include added: `#include <ATen/Dispatch_v2.h>`
+
+## Type group reference
+
+Available type group macros (use with `AT_EXPAND()`):
+
+```cpp
+AT_INTEGRAL_TYPES      // kByte, kChar, kInt, kLong, kShort
+AT_FLOATING_TYPES      // kDouble, kFloat
+AT_COMPLEX_TYPES       // kComplexDouble, kComplexFloat
+AT_QINT_TYPES         // kQInt8, kQUInt8, kQInt32
+AT_ALL_TYPES          // INTEGRAL_TYPES + FLOATING_TYPES
+AT_ALL_TYPES_AND_COMPLEX  // ALL_TYPES + COMPLEX_TYPES
+AT_INTEGRAL_TYPES_V2  // INTEGRAL_TYPES + unsigned types
+AT_BAREBONES_UNSIGNED_TYPES  // kUInt16, kUInt32, kUInt64
+AT_FLOAT8_TYPES       // Float8 variants
+```
+
+## Common patterns
+
+### Pattern: AT_DISPATCH_ALL_TYPES_AND2
+
+```cpp
+// Before
+AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "op", [&]() {
+  kernel<scalar_t>(data);
+});
+
+// After
+AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
+  kernel<scalar_t>(data);
+}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
+```
+
+### Pattern: AT_DISPATCH_FLOATING_TYPES_AND3
+
+```cpp
+// Before
+AT_DISPATCH_FLOATING_TYPES_AND3(kHalf, kBFloat16, kFloat8_e4m3fn,
+    tensor.scalar_type(), "float_op", [&] {
+  process<scalar_t>(tensor);
+});
+
+// After
+AT_DISPATCH_V2(tensor.scalar_type(), "float_op", AT_WRAP([&] {
+  process<scalar_t>(tensor);
+}), AT_EXPAND(AT_FLOATING_TYPES), kHalf, kBFloat16, kFloat8_e4m3fn);
+```
+
+### Pattern: AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2
+
+```cpp
+// Before
+AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(
+    kComplexHalf, kHalf,
+    self.scalar_type(),
+    "complex_op",
+    [&] {
+      result = compute<scalar_t>(self);
+    }
+);
+
+// After
+AT_DISPATCH_V2(
+    self.scalar_type(),
+    "complex_op",
+    AT_WRAP([&] {
+      result = compute<scalar_t>(self);
+    }),
+    AT_EXPAND(AT_ALL_TYPES),
+    AT_EXPAND(AT_COMPLEX_TYPES),
+    kComplexHalf,
+    kHalf
+);
+```
+
+## Edge cases
+
+### Case 1: No extra types (rare)
+
+```cpp
+// Before
+AT_DISPATCH_ALL_TYPES(dtype, "op", [&]() { kernel<scalar_t>(); });
+
+// After
+AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
+  kernel<scalar_t>();
+}), AT_EXPAND(AT_ALL_TYPES));
+```
+
+### Case 2: Many individual types (AND4, AND5, etc.)
+
+```cpp
+// Before
+AT_DISPATCH_FLOATING_TYPES_AND4(kHalf, kBFloat16, kFloat8_e4m3fn, kFloat8_e5m2,
+    dtype, "float8_op", [&]() { kernel<scalar_t>(); });
+
+// After
+AT_DISPATCH_V2(dtype, "float8_op", AT_WRAP([&]() {
+  kernel<scalar_t>();
+}), AT_EXPAND(AT_FLOATING_TYPES), kHalf, kBFloat16, kFloat8_e4m3fn, kFloat8_e5m2);
+```
+
+### Case 3: Lambda with no captures
+
+```cpp
+// Before
+AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, dtype, "op", []() {
+  static_kernel<scalar_t>();
+});
+
+// After
+AT_DISPATCH_V2(dtype, "op", AT_WRAP([]() {
+  static_kernel<scalar_t>();
+}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBool);
+```
+
+## Benefits of AT_DISPATCH_V2
+
+1. **No arity in macro name**: Don't need different macros for AND2, AND3, AND4
+2. **Composable type sets**: Mix and match type groups with `AT_EXPAND()`
+3. **Extensible**: Easy to add more types without hitting macro limits
+4. **Clearer**: Type groups are explicit, not implicit in macro name
+
+## Important notes
+
+- Keep `#include <ATen/Dispatch.h>` - other code may need it
+- The `AT_WRAP()` is mandatory - prevents comma parsing issues in the lambda
+- Type groups need `AT_EXPAND()`, individual types don't
+- The v2 API is in `aten/src/ATen/Dispatch_v2.h` - refer to it for full docs
+- See the header file for the Python script to regenerate the macro implementation
+
+## Workflow
+
+When asked to convert AT_DISPATCH macros:
+
+1. Read the file to identify all AT_DISPATCH uses
+2. Add `#include <ATen/Dispatch_v2.h>` if not present
+3. For each dispatch macro:
+   - Identify the pattern and extract components
+   - Map the base type group
+   - Extract individual types
+   - Construct the AT_DISPATCH_V2 call
+   - Apply with Edit tool
+4. Show the user the complete converted file
+5. Explain what was changed
+
+Do NOT compile or test the code - focus on accurate conversion only.
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@ -1 +1 @@
-218d2ab791d437309f91e0486eb9fa7f00badc17
+cfbc5c2f1c798991715a6b06bb3ce46478c4487c
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-df6798dfb931ce7c7fe5bed2447cd1092a5981af
+c8b09f5f77d6bf6fb7ed7a9aa83e5d8156b3a5e9
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@ -159,12 +159,7 @@ jobs:
            WITH_CLANG_LDD="--with-clang-ldd"
          fi

-          if [[ "${BUILD_DEVICE}" == xpu ]]; then
-            docker exec -t "${container_name}" bash -c "dnf install -y gcc-toolset-13-gcc-c++"
-            docker exec -t "${container_name}" bash -c "source /opt/rh/gcc-toolset-13/enable && ${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE"
-          else
-            docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD"
-          fi
+          docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD"

          if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "xpu") ]]; then
            docker exec -t "${container_name}"  bash -c "auditwheel repair --plat ${PLATFORM} //artifacts/*.whl"
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@ -76,11 +76,12 @@ jobs:

  # NOTE: mypy needs its own job because it depends on --all-files, without assessing all files it sometimes
  #       fails to find types when it should
-  lintrunner-mypy:
+  # NOTE: We should be able to disable this and consolidate with Pyrefly
+  lintrunner-pyrefly:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    name: lintrunner-mypy-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }}
+    name: lintrunner-pyrefly-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }}
    needs: [get-label-type, get-changed-files]
-    # Only run if there are changed files relevant to mypy
+    # Only run if there are changed files relevant to pyrefly
    if: |
      github.repository_owner == 'pytorch' && (
        needs.get-changed-files.outputs.changed-files == '*' ||
@ -98,8 +99,8 @@ jobs:
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
-        echo "Running mypy"
-        ADDITIONAL_LINTRUNNER_ARGS="--take MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
+        echo "Running pyrefly"
+        ADDITIONAL_LINTRUNNER_ARGS="--take PYREFLY --all-files" .github/scripts/lintrunner.sh

  lintrunner-noclang:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -118,9 +119,9 @@ jobs:
        CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
        echo "Running all other linters"
        if [ "$CHANGED_FILES" = '*' ]; then
-          ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT,PYREFLY --all-files" .github/scripts/lintrunner.sh
+          ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,PYREFLY --all-files" .github/scripts/lintrunner.sh
        else
-          ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT,PYREFLY ${CHANGED_FILES}" .github/scripts/lintrunner.sh
+          ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,PYREFLY ${CHANGED_FILES}" .github/scripts/lintrunner.sh
        fi

  quick-checks:
--- a/.gitignore
+++ b/.gitignore
@ -398,3 +398,4 @@ CLAUDE.local.md
 /test_*.py
 /debug_*.py
 CLAUDE_CONTEXT/
+/.claude/settings.local.json
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -121,94 +121,6 @@ command = [
 ]
 is_formatter = true

-[[linter]]
-code = 'MYPY'
-include_patterns = [
-    'setup.py',
-    'functorch/dim/**/*.py',
-    'torch/**/*.py',
-    'torch/**/*.pyi',
-    'caffe2/**/*.py',
-    'caffe2/**/*.pyi',
-    'test/test_bundled_images.py',
-    'test/test_bundled_inputs.py',
-    'test/test_complex.py',
-    'test/test_datapipe.py',
-    'test/test_futures.py',
-    'test/test_numpy_interop.py',
-    'test/test_torch.py',
-    'test/test_type_hints.py',
-    'test/test_type_info.py',
-    'test/test_utils.py',
-]
-exclude_patterns = [
-    '**/fb/**',
-]
-command = [
-    'python3',
-    'tools/linter/adapters/mypy_linter.py',
-    '--config=mypy.ini',
-    '--',
-    '@{{PATHSFILE}}'
-]
-init_command = [
-    'python3',
-    'tools/linter/adapters/pip_init.py',
-    '--dry-run={{DRYRUN}}',
-    'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
-    'numpy==2.1.0 ; python_version >= "3.12"',
-    'expecttest==0.3.0',
-    'mypy==1.16.0',
-    'sympy==1.13.3',
-    'types-requests==2.27.25',
-    'types-pyyaml==6.0.2',
-    'types-tabulate==0.8.8',
-    'types-protobuf==5.29.1.20250403',
-    'types-setuptools==79.0.0.20250422',
-    'types-jinja2==2.11.9',
-    'types-colorama==0.4.6',
-    'filelock==3.18.0',
-    'junitparser==2.1.1',
-    'rich==14.1.0',
-    'pyyaml==6.0.2',
-    'optree==0.13.0',
-    'dataclasses-json==0.6.7',
-    'pandas==2.2.3',
-]
-
-[[linter]]
-code = 'MYPYSTRICT'
-include_patterns = [
-    '.github/**/*.py',
-    'benchmarks/instruction_counts/**/*.py',
-    'tools/**/*.py',
-    'torchgen/**/*.py',
-    'torch/utils/_pytree.py',
-    'torch/utils/_cxx_pytree.py',
-    'torch/utils/benchmark/utils/common.py',
-    'torch/utils/benchmark/utils/timer.py',
-    'torch/utils/benchmark/utils/valgrind_wrapper/**/*.py',
-]
-exclude_patterns = [
-    # (linbinyu) copied from internal repo
-    '**/fb/**',
-    'tools/code_analyzer/gen_operators_yaml.py',
-    'tools/dynamo/verify_dynamo.py',
-    'tools/gen_vulkan_spv.py',
-    'tools/test/gen_operators_yaml_test.py',
-    'tools/test/gen_oplist_test.py',
-    'tools/test/test_selective_build.py',
-    'tools/experimental/torchfuzz/**',
-]
-command = [
-    'python3',
-    'tools/linter/adapters/mypy_linter.py',
-    '--config=mypy-strict.ini',
-    '--code=MYPYSTRICT',
-    '--',
-    '@{{PATHSFILE}}'
-]
-

 [[linter]]
 code = 'PYREFLY'
@ -230,6 +142,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
+    'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
    'numpy==2.1.0 ; python_version >= "3.12"',
    'expecttest==0.3.0',
    'pyrefly==0.36.2',
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -11,7 +11,6 @@ aspects of contributing to PyTorch.
 <!-- toc -->

 - [Developing PyTorch](#developing-pytorch)
-  - [Setup the development environment](#setup-the-development-environment)
  - [Tips and Debugging](#tips-and-debugging)
 - [Nightly Checkout & Pull](#nightly-checkout--pull)
 - [Codebase structure](#codebase-structure)
@ -67,23 +66,6 @@ aspects of contributing to PyTorch.

 Follow the instructions for [installing PyTorch from source](https://github.com/pytorch/pytorch#from-source). If you get stuck when developing PyTorch on your machine, check out the [tips and debugging](#tips-and-debugging) section below for common solutions.

-### Setup the development environment
-
-First, you need to [fork the PyTorch project on GitHub](https://github.com/pytorch/pytorch/fork) and follow the instructions at [Connecting to GitHub with SSH](https://docs.github.com/en/authentication/connecting-to-github-with-ssh) to setup your SSH authentication credentials.
-
-Then clone the PyTorch project and setup the development environment:
-
-```bash
-git clone git@github.com:<USERNAME>/pytorch.git
-cd pytorch
-git remote add upstream git@github.com:pytorch/pytorch.git
-
-make setup-env
-# Or run `make setup-env-cuda` for pre-built CUDA binaries
-# Or run `make setup-env-rocm` for pre-built ROCm binaries
-source venv/bin/activate  # or `. .\venv\Scripts\activate` on Windows
-```
-
 ### Tips and Debugging

 * If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below.
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@ -260,7 +260,7 @@ IF(USE_FBGEMM_GENAI)
  if(USE_CUDA)
    # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
    # If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
-    set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped|f4f4bf16).*")
+    set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped).*")
    file(GLOB_RECURSE fbgemm_genai_native_cuda_cu
      "${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu"
      "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
--- a/aten/src/ATen/CPUGeneratorImpl.cpp
+++ b/aten/src/ATen/CPUGeneratorImpl.cpp
@ -181,7 +181,7 @@ c10::intrusive_ptr<c10::TensorImpl> CPUGeneratorImpl::get_state() const {
  static const size_t size = sizeof(CPUGeneratorImplState);
  static_assert(std::is_standard_layout_v<CPUGeneratorImplState>, "CPUGeneratorImplState is not a PODType");

-  auto state_tensor = at::detail::empty_cpu({(int64_t)size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
+  auto state_tensor = at::detail::empty_cpu({static_cast<int64_t>(size)}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
  auto rng_state = state_tensor.data_ptr();

  // accumulate generator data to be copied into byte tensor
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@ -223,7 +223,7 @@ void Context::setSDPPriorityOrder(const std::vector<int64_t>& order) {
    "setSDPPriority order expected ", sdp_priority_order.size() - 1, " but got ",
    at::num_sdp_backends, " unique backends specified in priority order.");
  for (uint32_t i = 0; i < order.size(); i++) {
-    sdp_priority_order[i] = (at::SDPBackend) order[i];
+    sdp_priority_order[i] = static_cast<at::SDPBackend>(order[i]);
  }
 }

@ -825,6 +825,14 @@ void Context::setDisplayVmapFallbackWarnings(bool enabled) {
  display_vmap_fallback_warnings_ = enabled;
 }

+bool Context::warnOnAccumulateGradStreamMismatch() const {
+  return warn_on_accumulate_grad_stream_mismatch_;
+}
+
+void Context::setWarnOnAccumulateGradStreamMismatch(bool enabled) {
+  warn_on_accumulate_grad_stream_mismatch_ = enabled;
+}
+
 bool Context::isDefaultMobileCPUAllocatorSet() {
  return prev_allocator_ptr_ != nullptr;
 }
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@ -404,6 +404,9 @@ class TORCH_API Context {
  void setDisplayVmapFallbackWarnings(bool enabled);
  bool areVmapFallbackWarningsEnabled() const;

+  void setWarnOnAccumulateGradStreamMismatch(bool enabled);
+  bool warnOnAccumulateGradStreamMismatch() const;
+
  bool isDefaultMobileCPUAllocatorSet();
  void setDefaultMobileCPUAllocator();
  void unsetDefaultMobileCPUAllocator();
@ -494,6 +497,7 @@ class TORCH_API Context {
  bool release_original_weights = false;
 #endif
  bool display_vmap_fallback_warnings_ = false;
+  bool warn_on_accumulate_grad_stream_mismatch_ = true;
  std::atomic<at::QEngine> quantized_engine = at::QEngine::NoQEngine;
  bool enable_sparse_tensor_invariant_checks = false;
  bool allow_fp16_reduction_cpu = false;
--- a/aten/src/ATen/Dispatch.h
+++ b/aten/src/ATen/Dispatch.h
@ -197,6 +197,7 @@ inline at::ScalarType scalar_type(at::ScalarType s) {
    /* don't use TYPE again in case it is an expensive or side-effect op */ \
    at::ScalarType _st = ::detail::scalar_type(the_type);                   \
    RECORD_KERNEL_FUNCTION_DTYPE(at_dispatch_name, _st);                    \
+    C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")             \
    switch (_st) {                                                          \
      __VA_ARGS__                                                           \
      default:                                                              \
@ -208,6 +209,7 @@ inline at::ScalarType scalar_type(at::ScalarType s) {
            toString(_st),                                                  \
            "'");                                                           \
    }                                                                       \
+    C10_DIAGNOSTIC_POP()                                                    \
  }()

 #define AT_DISPATCH_CASE_FLOATING_TYPES(...)            \
--- a/aten/src/ATen/MapAllocator.cpp
+++ b/aten/src/ATen/MapAllocator.cpp
@ -252,13 +252,13 @@ MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd,
    if (!(flags_ & ALLOCATOR_MAPPED_FROMFD)) {
      if (flags_ & ALLOCATOR_MAPPED_SHARED) {
        // NOLINTNEXTLINE(bugprone-assignment-in-if-condition)
-        if ((fd = open(filename_.c_str(), flags, (mode_t)0600)) == -1) {
+        if ((fd = open(filename_.c_str(), flags, static_cast<mode_t>(0600))) == -1) {
          TORCH_CHECK(false, "unable to open file <", filename_, "> in read-write mode: ", c10::utils::str_error(errno), " (", errno, ")");
        }
      } else if (flags_ & ALLOCATOR_MAPPED_SHAREDMEM) {
 #ifdef HAVE_SHM_OPEN
        // NOLINTNEXTLINE(bugprone-assignment-in-if-condition)
-        if((fd = shm_open(filename_.c_str(), flags, (mode_t)0600)) == -1) {
+        if((fd = shm_open(filename_.c_str(), flags, static_cast<mode_t>(0600))) == -1) {
          TORCH_CHECK(false, "unable to open shared memory object <", filename_, "> in read-write mode: ", c10::utils::str_error(errno), " (", errno, ")");
        }
 #else
@ -503,7 +503,7 @@ RefcountedMapAllocator::RefcountedMapAllocator(WithFd /*unused*/, const char *fi

 void RefcountedMapAllocator::initializeAlloc() {
  TORCH_CHECK(base_ptr_, "base_ptr_ is null");
-  MapInfo *map_info = (MapInfo*)base_ptr_;
+  MapInfo *map_info = static_cast<MapInfo*>(base_ptr_);

 #ifdef _WIN32
  ReleaseContext* r_ctx = new ReleaseContext;
@ -539,7 +539,7 @@ void RefcountedMapAllocator::close() {
  }
 #else /* _WIN32 */

-  MapInfo *info = (MapInfo*)(data);
+  MapInfo *info = static_cast<MapInfo*>(data);
  if (--info->refcount == 0) {
 #ifdef HAVE_SHM_UNLINK
    if (shm_unlink(filename_.c_str()) == -1) {
--- a/aten/src/ATen/TensorIterator.cpp
+++ b/aten/src/ATen/TensorIterator.cpp
@ -862,7 +862,7 @@ void TensorIteratorBase::narrow(int dim, int64_t start, int64_t size) {
  shape_[dim] = size;
  view_offsets_[dim] += start;
  for (auto& op : operands_) {
-    op.data = ((char*)op.data) + op.stride_bytes[dim] * start;
+    op.data = (static_cast<char*>(op.data)) + op.stride_bytes[dim] * start;
  }
  if (size == 1 && !is_reduction_) {
    coalesce_dimensions();
@ -873,7 +873,7 @@ void TensorIteratorBase::select_all_keeping_dim(int start_dim, IntArrayRef indic
  TORCH_INTERNAL_ASSERT(start_dim <= ndim());
  for (const auto i : c10::irange(start_dim, ndim())) {
    for (auto& op : operands_) {
-      op.data = ((char*)op.data) + op.stride_bytes[i] * indices[i - start_dim];
+      op.data = (static_cast<char*>(op.data)) + op.stride_bytes[i] * indices[i - start_dim];
    }
    shape_[i] = 1;
  }
--- a/aten/src/ATen/TensorIteratorInternal.h
+++ b/aten/src/ATen/TensorIteratorInternal.h
@ -41,7 +41,7 @@ inline void serial_for_each(
    IntArrayRef strides,
    char** base_ptrs,
    size_t ntensors,
-    typename TensorIteratorBase::loop2d_t loop,
+    TensorIteratorBase::loop2d_t loop,
    Range range) {
  const auto ndim = shape.size();
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
--- a/aten/src/ATen/core/IListRef.h
+++ b/aten/src/ATen/core/IListRef.h
@ -190,12 +190,14 @@ class IListRef;
 * it to a function (e.g. `ImplT::<dispatch-function>(this_)`).
 */
 #define TORCH_ILISTREF_UNWRAP(TAG, BODY)                         \
+  C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")    \
  switch (TAG) {                                                 \
    TORCH_ILISTREF_FORALL_TAGS(TORCH_ILISTREF_UNWRAP_CASE, BODY) \
    break;                                                       \
    default:                                                     \
      TORCH_INTERNAL_ASSERT(false, "invalid IListRef tag.");     \
-  }
+  } \
+  C10_DIAGNOSTIC_POP()

 enum class IListRefTag {
 #define DEFINE_TAG(tag, ...) tag,
--- a/aten/src/ATen/core/TransformationHelper.h
+++ b/aten/src/ATen/core/TransformationHelper.h
@ -56,7 +56,7 @@ C10_HOST_DEVICE inline T uniform_int_full_range(V val) {
 * in this overloaded version
 */
 template <typename T, typename V>
-C10_HOST_DEVICE inline std::enable_if_t<!(std::is_floating_point_v<T>), T>uniform_int(V val) {
+C10_HOST_DEVICE inline std::enable_if_t<!std::is_floating_point_v<T>, T>uniform_int(V val) {
  if constexpr (std::is_same_v<T, bool>) {
    return static_cast<bool>(val & 1);
  } else if constexpr (std::is_same_v<T, int64_t>) {
--- a/aten/src/ATen/core/boxing/KernelFunction_impl.h
+++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h
@ -114,25 +114,25 @@ inline typename remove_symint<T>::type unpackSymInt(T x) {
 }

 template <>
-inline typename remove_symint<c10::SymInt>::type unpackSymInt(c10::SymInt x) {
+inline remove_symint<c10::SymInt>::type unpackSymInt(c10::SymInt x) {
  return x.guard_int(__FILE__, __LINE__);
 }

 template <>
-inline typename remove_symint<c10::SymIntArrayRef>::type unpackSymInt(
+inline remove_symint<c10::SymIntArrayRef>::type unpackSymInt(
    c10::SymIntArrayRef x) {
  return C10_AS_INTARRAYREF_SLOW(x);
 }

 template <>
-inline typename remove_symint<std::optional<c10::SymInt>>::type unpackSymInt(
+inline remove_symint<std::optional<c10::SymInt>>::type unpackSymInt(
    std::optional<c10::SymInt> x) {
  return x.has_value() ? std::make_optional(x->guard_int(__FILE__, __LINE__))
                       : std::nullopt;
 }

 template <>
-inline typename remove_symint<at::OptionalSymIntArrayRef>::type unpackSymInt(
+inline remove_symint<at::OptionalSymIntArrayRef>::type unpackSymInt(
    at::OptionalSymIntArrayRef x) {
  return x.has_value() ? std::make_optional(C10_AS_INTARRAYREF_SLOW(*x))
                       : std::nullopt;
--- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
+++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
@ -631,8 +631,8 @@ call_functor_with_args_from_stack_(
    Stack* stack,
    std::index_sequence<ivalue_arg_indices...> /*unused*/,
    guts::typelist::typelist<ArgTypes...>* /*unused*/) {
-  (void)(stack); // when sizeof...(ivalue_arg_indices) == 0, this argument would
-                 // be unused and we have to silence the compiler warning.
+  (void)stack; // when sizeof...(ivalue_arg_indices) == 0, this argument would
+               // be unused and we have to silence the compiler warning.

  // We're explicitly filtering out DispatchKeySet from the argument list.
  // Some kernels take a DispatchKeySet as their first argument in order to
--- a/aten/src/ATen/core/enum_type.h
+++ b/aten/src/ATen/core/enum_type.h
@ -18,6 +18,7 @@ struct TORCH_API EnumType : public NamedType {
      TypePtr value,
      std::vector<EnumNameValue> enum_names_values,
      std::weak_ptr<::torch::jit::CompilationUnit> cu) {
+    C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
    switch (value->kind()) {
      case TypeKind::IntType:
      case TypeKind::FloatType:
@ -34,6 +35,7 @@ struct TORCH_API EnumType : public NamedType {
            value->str(),
            "', only int, float and string are supported");
    }
+    C10_DIAGNOSTIC_POP()
  }

  std::string str() const override {
--- a/aten/src/ATen/core/ivalue.cpp
+++ b/aten/src/ATen/core/ivalue.cpp
@ -601,8 +601,8 @@ std::ostream& IValue::repr(
      double d = v.toDouble();
      int c = std::fpclassify(d);
      if ((c == FP_NORMAL || c == FP_ZERO ) && std::abs(d) < 1e10) {
-        int64_t i = int64_t(d);
-        if (double(i) == d) {
+        int64_t i = static_cast<int64_t>(d);
+        if (static_cast<double>(i) == d) {
          // -0.0 (signed zero) needs to be parsed as -0.
          if (i == 0 && std::signbit(d)) {
            return out << "-" << i << ".";
@ -799,8 +799,8 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
      double d = v.toDouble();
      int c = std::fpclassify(d);
      if (c == FP_NORMAL || c == FP_ZERO) {
-        int64_t i = int64_t(d);
-        if (double(i) == d) {
+        int64_t i = static_cast<int64_t>(d);
+        if (static_cast<double>(i) == d) {
          return out << i << ".";
        }
      }
--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@ -41,7 +41,7 @@ void standardizeVectorForUnion(std::vector<TypePtr>* to_flatten);
 inline bool is_contiguous_strides(
    const IntArrayRef sizes,
    const IntArrayRef strides) {
-  int n_dim = static_cast<int>(sizes.size());
+  size_t n_dim = sizes.size();
  if (n_dim == 0) {
    return true;
  }
@ -50,7 +50,7 @@ inline bool is_contiguous_strides(
    return false;
  }

-  for (int i = n_dim - 2; i >= 0; i--) {
+  for (int i = static_cast<int>(n_dim) - 2; i >= 0; i--) {
    if (strides[i] != strides[i + 1] * sizes[i + 1]) {
      return false;
    }
@ -922,6 +922,7 @@ struct TORCH_API DictType : public SharedType {
    if (auto dyn = key->castRaw<DynamicType>()) {
      kind = dyn->dynamicKind();
    }
+    C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
    switch (kind) {
      case TypeKind::AnyType:
      case TypeKind::IntType:
@ -938,6 +939,7 @@ struct TORCH_API DictType : public SharedType {
            key->str(),
            "', only int, float, complex, Tensor, device and string keys are supported");
    }
+    C10_DIAGNOSTIC_POP()
  }

  // aligned with the format in FunctionSchema
@ -2371,7 +2373,7 @@ private:
 };

 template<>
-inline typename detail::CastReturnType<NamedType>::type Type::cast<NamedType>() {
+inline detail::CastReturnType<NamedType>::type Type::cast<NamedType>() {
  if (kind() == TypeKind::TupleType || kind() == TypeKind::FunctionType ||
      kind() == TypeKind::ClassType || kind() == TypeKind::InterfaceType) {
    return std::static_pointer_cast<NamedType>(static_cast<NamedType *>(this)->shared_from_this());
@ -2380,7 +2382,7 @@ inline typename detail::CastReturnType<NamedType>::type Type::cast<NamedType>()
 }

 template<>
-inline typename detail::CastConstReturnType<NamedType>::type Type::cast<NamedType>() const {
+inline detail::CastConstReturnType<NamedType>::type Type::cast<NamedType>() const {
  if (kind() == TypeKind::TupleType || kind() == TypeKind::FunctionType ||
      kind() == TypeKind::ClassType || kind() == TypeKind::InterfaceType) {
    return std::static_pointer_cast<const NamedType>(static_cast<const NamedType *>(this)->shared_from_this());
--- a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
@ -6,9 +6,9 @@ namespace at::vec {
 inline namespace CPU_CAPABILITY {
 #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))

-// Enable auto-vectorization for GCC-13+ and clang-17+
+// Enable auto-vectorization for clang-17+
 // GCC-12 has a bug: gcc.gnu.org/bugzilla/show_bug.cgi?id=117001
-#if __GNUC__ > 12 || (defined(__clang__) && (__clang_major__ >= 17))
+#if defined(__clang__) && (__clang_major__ >= 17)

 template <typename from_type, typename to_type>
 inline void convertImpl(
--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
@ -514,7 +514,7 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {

  using float_vec_return_type = std::array<Vectorized<float>, kFloatNumVecs>;
  using int_vec_return_type = std::array<Vectorized<c10::qint32>, kIntNumVecs>;
-  using value_type = typename c10::qint8::underlying;
+  using value_type = c10::qint8::underlying;

 public:
  using Vectorizedqi::Vectorizedqi;
@ -727,7 +727,7 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {

  using float_vec_return_type = std::array<Vectorized<float>, kFloatNumVecs>;
  using int_vec_return_type = std::array<Vectorized<c10::qint32>, kIntNumVecs>;
-  using value_type = typename c10::quint8::underlying;
+  using value_type = c10::quint8::underlying;

 public:
  using Vectorizedqi::Vectorizedqi;
--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h
@ -567,7 +567,7 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {

  using float_vec_return_type = std::array<Vectorized<float>, 4>;
  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
-  using value_type = typename c10::qint8::underlying;
+  using value_type = c10::qint8::underlying;

 public:
  using Vectorizedqi::Vectorizedqi;
@ -804,7 +804,7 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {

  using float_vec_return_type = std::array<Vectorized<float>, 4>;
  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
-  using value_type = typename c10::quint8::underlying;
+  using value_type = c10::quint8::underlying;

 public:
  using Vectorizedqi::Vectorizedqi;
--- a/aten/src/ATen/cpu/vec/vec_base.h
+++ b/aten/src/ATen/cpu/vec/vec_base.h
@ -672,7 +672,7 @@ struct Vectorized {
    return map(std::sqrt);
  }
  Vectorized<T> reciprocal() const {
-    return map([](T x) { return (T)(1) / x; });
+    return map([](T x) { return (T)1 / x; });
  }
  Vectorized<T> rsqrt() const {
    return map([](T x) { return (T)1 / std::sqrt(x); });
--- a/aten/src/ATen/cpu/vml.h
+++ b/aten/src/ATen/cpu/vml.h
@ -46,7 +46,7 @@ inline void vrsqrt(scalar_t* out, scalar_t* in, int64_t size) {
  parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) {
    map(
        [](const Vectorized<scalar_t>& x) {
-          return Vectorized<scalar_t>((scalar_t)(1)) / x.sqrt();
+          return Vectorized<scalar_t>((scalar_t)1) / x.sqrt();
        },
        out + begin,
        in + begin,
--- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
+++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
@ -194,8 +194,8 @@ void CUDAGeneratorState::unregister_graph(cuda::CUDAGraph* graph) {
 void CUDAGeneratorState::capture_prologue() {
  capturing_ = true;
  offset_intragraph_ = 0;
-  seed_extragraph_.fill_(int64_t(seed_));
-  offset_extragraph_.fill_(int64_t(0));
+  seed_extragraph_.fill_(static_cast<int64_t>(seed_));
+  offset_extragraph_.fill_(0);
 }

 /**
@ -216,8 +216,8 @@ void CUDAGeneratorState::replay_prologue(uint64_t wholegraph_increment) {
  at::cuda::assertNotCapturing(
      "Cannot prepare for replay during capturing stage.");
  if (wholegraph_increment) {
-      seed_extragraph_.fill_(int64_t(seed_));
-      offset_extragraph_.fill_(int64_t(philox_offset_per_thread_));
+      seed_extragraph_.fill_(static_cast<int64_t>(seed_));
+      offset_extragraph_.fill_(static_cast<int64_t>(philox_offset_per_thread_));
      // Applies the total increment achieved during previous captures to update the
      // offset.
      increase(wholegraph_increment);
@ -329,7 +329,7 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
  constexpr size_t offset_size = sizeof(int64_t);
  constexpr size_t total_size = seed_size + offset_size;

-  auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
+  auto state_tensor = at::detail::empty_cpu({static_cast<int64_t>(total_size)}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
  auto rng_state = state_tensor.data_ptr<uint8_t>();
  auto current_seed = this->current_seed();
  auto offset = static_cast<int64_t>(this->philox_offset_per_thread()); // Note that old THCGeneratorState had offset as std::atomic<int64_t>
--- a/aten/src/ATen/cuda/CUDAGreenContext.cpp
+++ b/aten/src/ATen/cuda/CUDAGreenContext.cpp
@ -1,78 +1,90 @@
 #include <ATen/cuda/CUDAGreenContext.h>

-namespace at::cuda {
-  GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
-#if CUDA_HAS_GREEN_CONTEXT
-    int driver_version;
-    C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
-    TORCH_CHECK(
-        driver_version >= 12080, "cuda driver too old to use green context!");
-    CUcontext pctx = nullptr;
-    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
-    if (C10_UNLIKELY(!pctx)) {
-      TORCH_WARN(
-          "Attempted to create a green context but"
-          " there was no primary context! Creating a primary context...");
-
-      cudaFree(0);
-    }
-
-    CUdevice device;
-    device_id_ = device_id;
-    C10_CUDA_DRIVER_CHECK(
-        c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
-
-    // Get device resources
-    CUdevResource device_resource;
-    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
-        device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
-
-    // Split resources
-    std::vector<CUdevResource> result(1);
-    auto result_data = result.data();
-    unsigned int nb_groups = 1;
-    CUdevResource remaining;
-
-    C10_CUDA_DRIVER_CHECK(
-        c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
-            result_data,
-            &nb_groups,
-            &device_resource,
-            &remaining,
-            0, // default flags
-            num_sms));
-
-    TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
-
-    // Generate resource descriptor
-    CUdevResourceDesc desc;
-    C10_CUDA_DRIVER_CHECK(
-        c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
-            &desc, result_data, 1));
-
-    // Create green context
-    // CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
-    // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
-    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
-        &green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
-
-    // Convert to regular context
-    C10_CUDA_DRIVER_CHECK(
-        c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
-    TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
+#include <c10/cuda/driver_api.h>
+#include <stdexcept>
+#include <vector>
+#define HAS_CUDA_GREEN_CONTEXT() 1
 #else
-    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
+#define HAS_CUDA_GREEN_CONTEXT() 0
+// Suppress unsued private field warnings as this class is not supposed to be called
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-private-field")
+#endif
+
+namespace at::cuda {
+
+GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
+#if HAS_CUDA_GREEN_CONTEXT()
+  int driver_version;
+  C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
+  TORCH_CHECK(
+      driver_version >= 12080, "cuda driver too old to use green context!");
+  CUcontext pctx = nullptr;
+  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
+  if (C10_UNLIKELY(!pctx)) {
+    TORCH_WARN(
+        "Attempted to create a green context but"
+        " there was no primary context! Creating a primary context...");
+
+    cudaFree(0);
+  }
+
+   CUdevice device;
+  device_id_ = device_id;
+  C10_CUDA_DRIVER_CHECK(
+      c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
+
+  // Get device resources
+  CUdevResource device_resource;
+  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
+      device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
+
+  // Split resources
+  std::vector<CUdevResource> result(1);
+  auto result_data = result.data();
+  unsigned int nb_groups = 1;
+  CUdevResource remaining;
+
+  C10_CUDA_DRIVER_CHECK(
+      c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
+          result_data,
+          &nb_groups,
+          &device_resource,
+          &remaining,
+          0, // default flags
+          num_sms));
+
+  TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
+
+  // Generate resource descriptor
+  CUdevResourceDesc desc;
+  C10_CUDA_DRIVER_CHECK(
+      c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
+          &desc, result_data, 1));
+
+  // Create green context
+  // CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
+  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
+      &green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
+
+  // Convert to regular context
+  C10_CUDA_DRIVER_CHECK(
+      c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
+  TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
+#else
+  TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
 #endif
  }

  std::unique_ptr<GreenContext> GreenContext::create(
      uint32_t num_sms,
      std::optional<uint32_t> device_id) {
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    if (!device_id.has_value()) {
      device_id = at::cuda::current_device();
    }
-    return std::make_unique<GreenContext>(device_id.value(), num_sms);
+    return std::unique_ptr<GreenContext>(new GreenContext(device_id.value(), num_sms));
 #else
    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
 #endif
@ -80,7 +92,7 @@ namespace at::cuda {

  // Implement move operations
  GreenContext::GreenContext(GreenContext&& other) noexcept{
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    device_id_ = std::exchange(other.device_id_, -1);
    green_ctx_ = std::exchange(other.green_ctx_, nullptr);
    context_ = std::exchange(other.context_, nullptr);
@ -91,7 +103,7 @@ namespace at::cuda {
  }

  GreenContext& GreenContext::operator=(GreenContext&& other) noexcept{
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    if (this != &other) {
      // Clean up current resources
      if (green_ctx_) {
@ -120,7 +132,7 @@ namespace at::cuda {
  }

  GreenContext::~GreenContext() noexcept{
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    C10_CUDA_DRIVER_CHECK(
        c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_));
 #else
@ -128,25 +140,9 @@ namespace at::cuda {
 #endif
  }

-  // Get the underlying CUDA context
-  CUcontext GreenContext::getContext() const {
-#if CUDA_HAS_GREEN_CONTEXT
-    return context_;
-#else
-    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
-#endif
-  }
-
-  // Get the underlying green context
-#if CUDA_HAS_GREEN_CONTEXT
-  CUgreenCtx GreenContext::getGreenContext() const {
-    return green_ctx_;
-  }
-#endif
-
  // Make this context current
  void GreenContext::setContext() {
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    auto current_stream = c10::cuda::getCurrentCUDAStream();
    parent_stream_ = current_stream.stream();

@ -175,7 +171,7 @@ namespace at::cuda {
  }

  void GreenContext::popContext() {
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
    // see above note about stream being hardcoded to the default stream
    at::cuda::CUDAEvent ev;
    ev.record(c10::cuda::getCurrentCUDAStream());
--- a/aten/src/ATen/cuda/CUDAGreenContext.h
+++ b/aten/src/ATen/cuda/CUDAGreenContext.h
@ -1,53 +1,38 @@
 #pragma once
 #include <ATen/cuda/CUDAEvent.h>
-
-#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
-#include <c10/cuda/driver_api.h>
 #include <cuda.h>
-#include <memory>
-#include <stdexcept>
-#include <vector>
-#define CUDA_HAS_GREEN_CONTEXT 1
-#else
-#define CUDA_HAS_GREEN_CONTEXT 0
-#endif
+
+// Forward declare green context as opaque ptr
+typedef struct CUgreenCtx_st* CUgreenCtx;

 namespace at::cuda {

 class TORCH_CUDA_CPP_API GreenContext {
 public:
-  GreenContext(uint32_t device_id, uint32_t num_sms);
-
-  static std::unique_ptr<GreenContext> create(uint32_t num_sms, std::optional<uint32_t> device_id);
+  // Green context creation
+  static std::unique_ptr<GreenContext> create(
+      uint32_t num_sms,
+      std::optional<uint32_t> device_id);
+  ~GreenContext() noexcept;

  // Delete copy constructor and assignment
  GreenContext(const GreenContext&) = delete;
  GreenContext& operator=(const GreenContext&) = delete;

-  // Implement move operations
-  GreenContext(GreenContext&& other) noexcept;
-  GreenContext& operator=(GreenContext&& other) noexcept;
-  ~GreenContext() noexcept;
-
-  // Get the underlying CUDA context
-  CUcontext getContext() const;
-
-  // Get the underlying green context
-#if CUDA_HAS_GREEN_CONTEXT
-  CUgreenCtx getGreenContext() const;
-#endif
-
  // Make this context current
  void setContext();

  void popContext();

 private:
-#if CUDA_HAS_GREEN_CONTEXT
+  GreenContext(uint32_t device_id, uint32_t num_sms);
+  // Implement move operations
+  GreenContext(GreenContext&& other) noexcept;
+  GreenContext& operator=(GreenContext&& other) noexcept;
+
  int32_t device_id_ = -1;
  CUgreenCtx green_ctx_ = nullptr;
  CUcontext context_ = nullptr;
  cudaStream_t parent_stream_ = nullptr;
-#endif
 };
 } // namespace at::cuda
--- a/aten/src/ATen/cuda/CublasHandlePool.cpp
+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp
@ -155,8 +155,8 @@ size_t parseChosenWorkspaceSize() {
    while (next != end) {
      std::smatch match = *next;
      TORCH_CHECK(match.size() == 3, "Expected CUBLAS_WORKSPACE_SPACE_CONFIG match of size 3 (Format :SIZE:COUNT)");
-      size_t curr_size = (size_t) std::stoi(match.str(1));
-      size_t count = (size_t) std::stoi(match.str(2));
+      size_t curr_size = std::stoull(match.str(1));
+      size_t count = std::stoull(match.str(2));
      total_size += curr_size * 1024 * count;
      next++;
    }
--- a/aten/src/ATen/cuda/detail/BLASConstants.cu
+++ b/aten/src/ATen/cuda/detail/BLASConstants.cu
@ -2,8 +2,6 @@
 #include <ATen/Tensor.h>
 #include <ATen/cuda/Exceptions.h>

-#include <mutex>
-
 namespace at {
 namespace cuda {
 namespace detail {
@ -12,39 +10,36 @@ __device__ __constant__ float cublas_one_device;
 __device__ __constant__ float cublas_zero_device;

 float *get_cublas_device_one() {
-  static c10::once_flag init_flag;
-
-  c10::call_once(init_flag, []() {
+  static float *ptr = nullptr;
+  static auto init_flag = [&]() {
    const float one = 1.f;
    AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_one_device, &one, sizeof(float)));
-  });
+    AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
+    return true;
+  }();

-  float *ptr;
-  AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
  return ptr;
 }

 float *get_cublas_device_zero() {
-  static c10::once_flag init_flag;
-
-  c10::call_once(init_flag, []() {
+  static float *ptr = nullptr;
+  static auto init_flag = [&]() {
    const float zero = 0.f;
    AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_zero_device, &zero, sizeof(float)));
-  });
+    AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
+    return true;
+  }();

-  float *ptr;
-  AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
  return ptr;
 }

 float *get_user_alpha_ptr() {
  static float *alpha_ptr;

-  static c10::once_flag init_flag;
-
-  c10::call_once(init_flag, []() {
+  static bool init_flag [[maybe_unused]] = []() {
    AT_CUDA_CHECK(cudaMalloc(&alpha_ptr, sizeof(float)));
-  });
+    return true;
+  }();

  return alpha_ptr;
 }
--- a/aten/src/ATen/cudnn/Descriptors.cpp
+++ b/aten/src/ATen/cudnn/Descriptors.cpp
@ -3,6 +3,7 @@
 #include <ATen/ATen.h>
 #include <c10/util/irange.h>

+#include <array>
 #include <iostream>
 #include <sstream>

@ -136,9 +137,9 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
    "Weight strides: ", t.strides(), "\n",
    "cuDNN suggested memory_format: ", memory_format);

-  int size[CUDNN_DIM_MAX];
+  std::array<int, CUDNN_DIM_MAX> size;
  for (const auto i : c10::irange(dim)) {
-    size[i] = (int) t.size(i);
+    size[i] = static_cast<int>(t.size(i));
  }
  for (const auto i : c10::irange(dim, pad)) {
    size[i] = 1;
@ -156,7 +157,7 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
    default:
      TORCH_INTERNAL_ASSERT(false, "unsupported memory_format for cuDNN filters");
  }
-  set(getDataType(t), static_cast<int>(dim), size, filter_format);
+  set(getDataType(t), static_cast<int>(dim), size.data(), filter_format);
 }

 std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) {
--- a/aten/src/ATen/functorch/ADInterpreters.cpp
+++ b/aten/src/ATen/functorch/ADInterpreters.cpp
@ -198,7 +198,7 @@ static void autogradBasedTransformSendToNext(
  }

  // Step 6
-  stack->erase(stack->end() - std::ptrdiff_t(args_size + ret_size), stack->end() - std::ptrdiff_t(ret_size));
+  stack->erase(stack->end() - static_cast<std::ptrdiff_t>(args_size + ret_size), stack->end() - static_cast<std::ptrdiff_t>(ret_size));
 }

 void GradInterpreterPtr::processImpl(
--- a/aten/src/ATen/functorch/BatchRulesNorm.cpp
+++ b/aten/src/ATen/functorch/BatchRulesNorm.cpp
@ -443,14 +443,14 @@ static bool has_same_shape(
  if (!tensor.defined()) {
    return true;
  }
-  if (rankWithoutBatchDim(tensor, tensor_bdim) != (int64_t) normalized_shape.size()) {
+  if (rankWithoutBatchDim(tensor, tensor_bdim) != static_cast<int64_t>(normalized_shape.size())) {
    return false;
  }
  const auto tensor_shape = tensor.sizes();
  for (const auto i : c10::irange(normalized_shape.size())) {
    auto j = i;
    // (0, 1, 2), 1 -> (0, 2, 3)
-    if (tensor_bdim.has_value() && (int64_t)i >= tensor_bdim.value()) {
+    if (tensor_bdim.has_value() && static_cast<int64_t>(i) >= tensor_bdim.value()) {
      j = j + 1;
    }
    if (normalized_shape[i] != tensor_shape[j]) {
--- a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp
@ -135,7 +135,7 @@ static void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit
    reduction_case = ReductionCase::DimArray;
    dims = arguments[dim_arg_pos].toIntList().vec();
    if (dims.empty()) {
-      auto all_dims = range(0, std::max((int64_t)1, logical_dim));
+      auto all_dims = range(0, std::max(static_cast<int64_t>(1), logical_dim));
      dims = std::vector<int64_t>(all_dims.begin(), all_dims.end());
    }
  } else if (arguments[dim_arg_pos].isInt()) {
--- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
@ -432,7 +432,7 @@ namespace {
    // Eg. Given `indexed_shape.size()` is 5 and
    // shape of `values` is (N, 2, 3), then following block
    // will reshape `values` to (N, 1, 1, 2, 3).
-    if ( (int64_t) indexed_shape.size() > values_.dim()) {
+    if ( static_cast<int64_t>(indexed_shape.size()) > values_.dim()) {
      auto values_sizes = values_.sym_sizes();

      // number of unit dims (for broadcasting value to indexed_shape)
--- a/aten/src/ATen/functorch/BatchRulesViews.cpp
+++ b/aten/src/ATen/functorch/BatchRulesViews.cpp
@ -109,7 +109,7 @@ std::tuple<Tensor, std::optional<int64_t>> repeat_batch_rule(
  SymDimVector sizes_with_bdim = { sizes.begin(), sizes.end() };
  sizes_with_bdim.insert(sizes_with_bdim.begin(), 1);
  auto self_ = moveBatchDimToFront(self, self_bdim);
-  while (self_.dim() < (int64_t)sizes_with_bdim.size()) {
+  while (self_.dim() < static_cast<int64_t>(sizes_with_bdim.size())) {
    self_ = self_.unsqueeze(1);
  }
  return std::make_tuple(self_.repeat_symint(sizes_with_bdim), 0);
@ -534,20 +534,20 @@ Tensor trace_decomp(const Tensor& tensor) {
 std::tuple<Tensor, std::optional<int64_t>> tril_batch_rule(
    const Tensor& self,
    std::optional<int64_t> self_bdim,
-    int64_t diagonal = 0) {
+    c10::SymInt diagonal = 0) {
  TORCH_CHECK(self.dim() >= 2, "tril: The input tensor must have at least 2 dimensions.");
  auto self_ = moveBatchDimToFront(self, self_bdim);
-  auto result = at::tril(self_, diagonal);
+  auto result = at::tril_symint(self_, std::move(diagonal));
  return std::make_tuple(std::move(result), 0);
 }

 std::tuple<Tensor, std::optional<int64_t>> triu_batch_rule(
    const Tensor& self,
    std::optional<int64_t> self_bdim,
-    int64_t diagonal = 0) {
+    c10::SymInt diagonal = 0) {
  TORCH_CHECK(self.dim() >= 2, "triu: The input tensor must have at least 2 dimensions.");
  auto self_ = moveBatchDimToFront(self, self_bdim);
-  auto result = at::triu(self_, diagonal);
+  auto result = at::triu_symint(self_, std::move(diagonal));
  return std::make_tuple(std::move(result), 0);
 }

--- a/aten/src/ATen/functorch/BatchedFallback.cpp
+++ b/aten/src/ATen/functorch/BatchedFallback.cpp
@ -191,7 +191,7 @@ static void batchedTensorInplaceForLoopFallback(const c10::OperatorHandle& op, t
      // simplicity. When that is not the case, this code should be updated.
      const auto& argument = (*stack)[arguments_begin + arg_idx];
      if (batched_tensor_inputs_pos_iter == batched_tensor_inputs_position.end()
-          || (int64_t)arg_idx != *batched_tensor_inputs_pos_iter) {
+          || static_cast<int64_t>(arg_idx) != *batched_tensor_inputs_pos_iter) {
        // argument isn't a BatchedTensor
        torch::jit::push(stack, argument);
        continue;
@ -345,7 +345,7 @@ void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Sta
      // simplicity. When that is not the case, this code should be updated.
      const auto& argument = (*stack)[arguments_begin + arg_idx];
      if (batched_tensor_inputs_pos_iter == batched_tensor_inputs_position.end()
-          || (int64_t)arg_idx != *batched_tensor_inputs_pos_iter) {
+          || static_cast<int64_t>(arg_idx) != *batched_tensor_inputs_pos_iter) {
        // argument isn't a BatchedTensor
        torch::jit::push(stack, argument);
        continue;
@ -473,7 +473,7 @@ void batchedNestedTensorForLoopFallback(const c10::OperatorHandle& op, torch::ji
      // simplicity. When that is not the case, this code should be updated.
      const auto& argument = (*stack)[arguments_begin + arg_idx];
      if (batched_tensor_inputs_pos_iter == batched_tensor_inputs_position.end()
-          || (int64_t)arg_idx != *batched_tensor_inputs_pos_iter) {
+          || static_cast<int64_t>(arg_idx) != *batched_tensor_inputs_pos_iter) {
        // argument isn't a BatchedTensor
        torch::jit::push(stack, argument);
        continue;
--- a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp
+++ b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp
@ -157,7 +157,7 @@ Tensor& squeeze__batching_rule(Tensor& self) {
  const auto physical_shape = batched->value().sizes();
  auto how_many_dims_of_size_1_before_bdim = 0;
  for (const auto i : c10::irange(0, physical_shape.size())) {
-    if ((int64_t)i == bdim) {
+    if (static_cast<int64_t>(i) == bdim) {
      break;
    }
    if (physical_shape[i] == 1) {
@ -573,7 +573,7 @@ Tensor cat_batching_rule(const ITensorListRef& tensors, int64_t dim) {
  }

  auto new_dim = bdim_size.has_value() ? dim + 1 : dim;
-  std::optional<int64_t> new_bdim = bdim_size.has_value() ? std::make_optional((int64_t)0) : std::nullopt;
+  std::optional<int64_t> new_bdim = bdim_size.has_value() ? std::make_optional(static_cast<int64_t>(0)) : std::nullopt;
  auto result = at::cat(tensors_to_cat, new_dim);
  return makeBatched(result, new_bdim, get_current_level());
 }
--- a/aten/src/ATen/mps/MPSDevice.mm
+++ b/aten/src/ATen/mps/MPSDevice.mm
@ -1,7 +1,5 @@
 //  Copyright © 2022 Apple Inc.

-#include <c10/util/CallOnce.h>
-
 #include <ATen/mps/IndexKernels.h>
 #include <ATen/mps/MPSAllocatorInterface.h>
 #include <ATen/mps/MPSDevice.h>
@ -10,9 +8,6 @@

 namespace at::mps {

-static std::unique_ptr<MPSDevice> mps_device;
-static c10::once_flag mpsdev_init;
-
 static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& device) {
  // MPS Advanced Indexing needs at least Metal 2.0 (support for Argument Buffers and function constants)
  // host_name attribute needs at least Metal 2.2 and ulong needs Metal 2.3 (supported on MacOS 11+
@ -21,8 +16,8 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
 }

 MPSDevice* MPSDevice::getInstance() {
-  c10::call_once(mpsdev_init, [] { mps_device = std::unique_ptr<MPSDevice>(new MPSDevice()); });
-  return mps_device.get();
+  static MPSDevice mps_device;
+  return &mps_device;
 }

 MPSDevice::~MPSDevice() {
--- a/aten/src/ATen/native/AveragePool2d.cpp
+++ b/aten/src/ATen/native/AveragePool2d.cpp
@ -25,18 +25,19 @@ TORCH_PRECOMPUTE_META_FUNC(avg_pool2d)
  // #20866, #22032: Guarantee this for the official C++ API?
  TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
    "avg_pool2d: kernel_size must either be a single int, or a tuple of two ints");
-  const int64_t kH = kernel_size[0];
-  const int64_t kW = kernel_size.size() == 1 ? kH : kernel_size[1];
+  const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
+  const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);

  TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
    "avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints");
-  const int64_t dH = stride.empty() ? kH : stride[0];
-  const int64_t dW = stride.empty() ? kW : stride.size() == 1 ? dH : stride[1];
+  const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
+  const int dW = stride.empty() ? kW :
+                 stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);

  TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
    "avg_pool2d: padding must either be a single int, or a tuple of two ints");
-  const int64_t padH = padding[0];
-  const int64_t padW = padding.size() == 1 ? padH : padding[1];
+  const int padH = safe_downcast<int, int64_t>(padding[0]);
+  const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);

  TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0,
    "divisor must be not zero");
--- a/aten/src/ATen/native/AveragePool3d.cpp
+++ b/aten/src/ATen/native/AveragePool3d.cpp
@ -198,9 +198,9 @@ void avg_pool3d_out_frame(
            int64_t hend = std::min(hstart + kH, iheight + padH);
            int64_t wend = std::min(wstart + kW, iwidth + padW);
            int64_t pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart);
-            tstart = std::max(tstart, (int64_t) 0);
-            hstart = std::max(hstart, (int64_t) 0);
-            wstart = std::max(wstart, (int64_t) 0);
+            tstart = std::max(tstart, static_cast<int64_t>(0));
+            hstart = std::max(hstart, static_cast<int64_t>(0));
+            wstart = std::max(wstart, static_cast<int64_t>(0));
            tend = std::min(tend, itime);
            hend = std::min(hend, iheight);
            wend = std::min(wend, iwidth);
@ -377,9 +377,9 @@ void avg_pool3d_backward_out_frame(
            int64_t hend = std::min(hstart + kH, iheight + padH);
            int64_t wend = std::min(wstart + kW, iwidth + padW);
            int64_t pool_size = (tend -tstart) * (hend - hstart) * (wend - wstart);
-            tstart = std::max(tstart, (int64_t) 0);
-            hstart = std::max(hstart, (int64_t) 0);
-            wstart = std::max(wstart, (int64_t) 0);
+            tstart = std::max(tstart, static_cast<int64_t>(0));
+            hstart = std::max(hstart, static_cast<int64_t>(0));
+            wstart = std::max(wstart, static_cast<int64_t>(0));
            tend = std::min(tend, itime);
            hend = std::min(hend, iheight);
            wend = std::min(wend, iwidth);
--- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp
+++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp
@ -946,10 +946,10 @@ void apply_lu_factor(const Tensor& input, const Tensor& pivots, const Tensor& in
    }
  };
  // avoid overflow
-  float matrix_rank = float(std::min(m, n));
+  auto matrix_rank = std::min(m, n);
  // A heuristic tested on a 32 core/socket ICX system
  // https://github.com/pytorch/pytorch/pull/93037#discussion_r1090112948
-  int64_t chunk_size_per_thread = int64_t(
+  int64_t chunk_size_per_thread = static_cast<int64_t>(
      std::min(1.0, 3200.0 / (matrix_rank * matrix_rank * matrix_rank)));
  int64_t grain_size = chunk_size_per_thread * at::get_num_threads();
  at::parallel_for(0, batch_size, grain_size, loop);
--- a/aten/src/ATen/native/Blas.cpp
+++ b/aten/src/ATen/native/Blas.cpp
@ -267,7 +267,7 @@ _scaled_mm_out_cpu_emulated(const Tensor& mat1, const Tensor& mat2,

  float input_scale = scale_a.item<float>();
  float weight_scale = scale_b.item<float>();
-  float output_scale = float(1.0);
+  float output_scale = 1.0f;
  if (scale_result.has_value() &&
      (*out_dtype == ScalarType::Float8_e4m3fn ||
       *out_dtype == ScalarType::Float8_e5m2)) {
--- a/aten/src/ATen/native/BlasKernel.cpp
+++ b/aten/src/ATen/native/BlasKernel.cpp
@ -331,7 +331,7 @@ bool gemv_use_fast_path<double>(
    [[maybe_unused]] double beta,
    int64_t incy) {
  return gemv_use_fast_path<float>(
-      trans, m, n, (float)alpha, lda, incx, (float)beta, incy);
+      trans, m, n, static_cast<float>(alpha), lda, incx, static_cast<float>(beta), incy);
 }

 template <>
@ -523,8 +523,8 @@ static inline void scal(int64_t n, scalar_t a, scalar_t *x, int64_t incx)
  if (n == 1) incx = 1;
 #if AT_BUILD_WITH_BLAS()
  if (blas_impl::scal_use_fast_path<scalar_t>(n, incx)) {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
+    int i_n = static_cast<int>(n);
+    int i_incx = static_cast<int>(incx);
    blas_impl::scal_fast_path<scalar_t>(&i_n, &a, x, &i_incx);
    return;
  }
@ -545,11 +545,11 @@ void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, const scalar_t *a, i
 #if AT_BUILD_WITH_BLAS()
  if (blas_impl::gemv_use_fast_path<scalar_t>(trans, m, n, alpha, lda, incx, beta, incy)) {
    TORCH_CHECK(lda >= std::max<int64_t>(1L, m), "lda should be at least max(1,", m, "), but have ", lda);
-    int i_m = (int)m;
-    int i_n = (int)n;
-    int i_lda = (int)lda;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    int i_m = static_cast<int>(m);
+    int i_n = static_cast<int>(n);
+    int i_lda = static_cast<int>(lda);
+    int i_incx = static_cast<int>(incx);
+    int i_incy = static_cast<int>(incy);
    blas_impl::gemv_fast_path<scalar_t>(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy);
    return;
  }
--- a/aten/src/ATen/native/CPUBlas.cpp
+++ b/aten/src/ATen/native/CPUBlas.cpp
@ -680,9 +680,9 @@ void axpy(int64_t n, double a, const double *x, int64_t incx, double *y, int64_t
  #if AT_BUILD_WITH_BLAS()
  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
  {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    int i_n = static_cast<int>(n);
+    int i_incx = static_cast<int>(incx);
+    int i_incy = static_cast<int>(incy);
    #if C10_IOS
    cblas_daxpy(i_n, a, x, i_incx, y, i_incy);
    #else
@ -705,9 +705,9 @@ void axpy(int64_t n, float a, const float *x, int64_t incx, float *y, int64_t in
  #if AT_BUILD_WITH_BLAS()
  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
  {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    int i_n = static_cast<int>(n);
+    int i_incx = static_cast<int>(incx);
+    int i_incy = static_cast<int>(incy);
    #if C10_IOS
    cblas_saxpy(i_n, a, x, i_incx, y, i_incy);
    #else
@ -730,9 +730,9 @@ void axpy(int64_t n, c10::complex<double> a, const c10::complex<double> *x, int6
  #if AT_BUILD_WITH_BLAS()
  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
  {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    int i_n = static_cast<int>(n);
+    int i_incx = static_cast<int>(incx);
+    int i_incy = static_cast<int>(incy);
    #if C10_IOS
    cblas_zaxpy(i_n, &a, x, i_incx, y, i_incy);
    #else
@ -755,9 +755,9 @@ void axpy(int64_t n, c10::complex<float> a, const c10::complex<float> *x, int64_
  #if AT_BUILD_WITH_BLAS()
  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
  {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    int i_n = static_cast<int>(n);
+    int i_incx = static_cast<int>(incx);
+    int i_incy = static_cast<int>(incy);
    #if C10_IOS
    cblas_caxpy(i_n, &a, x, i_incx, y, i_incy);
    #else
@ -781,9 +781,9 @@ void copy(int64_t n, const double *x, int64_t incx, double *y, int64_t incy) {
  }
  #if AT_BUILD_WITH_BLAS()
  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    int i_n = static_cast<int>(n);
+    int i_incx = static_cast<int>(incx);
+    int i_incy = static_cast<int>(incy);
    #if C10_IOS
    cblas_dcopy(i_n, x, i_incx, y, i_incy);
    #else
@ -805,9 +805,9 @@ void copy(int64_t n, const float *x, int64_t incx, float *y, int64_t incy) {
  }
  #if AT_BUILD_WITH_BLAS()
  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    int i_n = static_cast<int>(n);
+    int i_incx = static_cast<int>(incx);
+    int i_incy = static_cast<int>(incy);
    #if C10_IOS
    cblas_scopy(i_n, x, i_incx, y, i_incy);
    #else
@ -829,9 +829,9 @@ void copy(int64_t n, const c10::complex<double> *x, int64_t incx, c10::complex<d
  }
  #if AT_BUILD_WITH_BLAS()
  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    int i_n = static_cast<int>(n);
+    int i_incx = static_cast<int>(incx);
+    int i_incy = static_cast<int>(incy);
    #if C10_IOS
    cblas_zcopy(i_n, x, i_incx, y, i_incy);
    #else
@ -853,9 +853,9 @@ void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<fl
  }
  #if AT_BUILD_WITH_BLAS()
  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    int i_n = static_cast<int>(n);
+    int i_incx = static_cast<int>(incx);
+    int i_incy = static_cast<int>(incy);
    #if C10_IOS
    cblas_ccopy(i_n, &x, i_incx, y, i_incy);
    #else
@ -1082,7 +1082,7 @@ struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
        M,
        N,
        K,
-        int64_t(1),
+        1,
        ld_a,
        ld_b,
        ld_c,
@ -1096,7 +1096,7 @@ struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
          M,
          N,
          K,
-          int64_t(1),
+          1,
          ld_a,
          ld_b,
          ld_c,
--- a/aten/src/ATen/native/GridSampler.cpp
+++ b/aten/src/ATen/native/GridSampler.cpp
@ -487,17 +487,17 @@ static Tensor _grid_sampler_2d_cpu_quantized(
  int64_t out_sC = output.stride(1);
  int64_t out_sH = output.stride(2);
  int64_t out_sW = output.stride(3);
-  uint8_t* inp_ptr = (uint8_t*)input.data_ptr<quint8>();
-  uint8_t* out_ptr = (uint8_t*)output.data_ptr<quint8>();
-  float* grid_ptr = grid.data_ptr<float>();
+  const uint8_t* inp_ptr = input.const_data_ptr<uint8_t>();
+  uint8_t* out_ptr = output.data_ptr<uint8_t>();
+  const float* grid_ptr = grid.const_data_ptr<float>();
  at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
    for (const auto n : c10::irange(start, end)) {
-      float* grid_ptr_N = grid_ptr + n * grid_sN;
-      uint8_t* inp_ptr_N = inp_ptr + n * inp_sN;
+      const float* grid_ptr_N = grid_ptr + n * grid_sN;
+      const uint8_t* inp_ptr_N = inp_ptr + n * inp_sN;
      for (const auto h : c10::irange(out_H)) {
        for (const auto w : c10::irange(out_W)) {
          // get the corresponding input x, y, z coordinates from grid
-          float* grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
+          const float* grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
          float x = *grid_ptr_NHW;
          float y = grid_ptr_NHW[grid_sCoor];

@ -527,7 +527,7 @@ static Tensor _grid_sampler_2d_cpu_quantized(
          float se = (ix - ix_nw) * (iy - iy_nw);

          // calculate bilinear weighted pixel value and set output pixel
-          uint8_t* inp_ptr_NC = inp_ptr_N;
+          const uint8_t* inp_ptr_NC = inp_ptr_N;
          uint8_t* out_ptr_NCHW =
              out_ptr + n * out_sN + h * out_sH + w * out_sW;
          for (int64_t c = 0; c < C;
--- a/aten/src/ATen/native/Histogram.cpp
+++ b/aten/src/ATen/native/Histogram.cpp
@ -318,7 +318,7 @@ static std::vector<Tensor>& histogramdd_bin_edges_out(const Tensor& self, IntArr

    const int64_t N = self.size(-1);
    const int64_t M = std::accumulate(self.sizes().begin(), self.sizes().end() - 1,
-            (int64_t)1, std::multiplies<int64_t>());
+            static_cast<int64_t>(1), std::multiplies<int64_t>());
    Tensor reshaped_self = self.reshape({ M, N });

    auto outer_bin_edges = select_outer_bin_edges(reshaped_self, range);
--- a/aten/src/ATen/native/Integration.cpp
+++ b/aten/src/ATen/native/Integration.cpp
@ -40,7 +40,7 @@ Tensor do_trapezoid(const Tensor& y, const Tensor& dx, int64_t dim) {
 // When dx is constant, the above formula simplifies
 // to dx * [(\sum_{i=1}^n y_i) - (y_1 + y_n)/2]
 Tensor do_trapezoid(const Tensor& y, double dx, int64_t dim) {
-    return (y.sum(dim) - (y.select(dim, 0) + y.select(dim, -1)) * (0.5)) * dx;
+    return (y.sum(dim) - (y.select(dim, 0) + y.select(dim, -1)) * 0.5) * dx;
 }

 Tensor zeros_like_except(const Tensor& y, int64_t dim) {
--- a/aten/src/ATen/native/Linear.cpp
+++ b/aten/src/ATen/native/Linear.cpp
@ -201,7 +201,7 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
  out_size.reserve(out_num_dim);
  for (auto& d : lro) out_size.push_back(left.sym_size(d));
  for (auto& d : lo) out_size.push_back(left.sym_size(d));
-  for (auto& d : sum_dims_) { out_size.emplace_back(1); (void)(d); }; // avoid warning about not using d
+  for (auto& d : sum_dims_) { out_size.emplace_back(1); (void)d; }; // avoid warning about not using d
  for (auto& d : ro) out_size.push_back(right.sym_size(d));

  std::vector<int64_t> lpermutation(lro);
@ -640,7 +640,7 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr
    }
  }

-  return ops[0];
+  return std::move(ops[0]);
 }

 // _trilinear computes a trilinear einstein sum with an unrolled dimension
@ -805,7 +805,7 @@ Tensor tensordot(const Tensor& input1, const Tensor& input2, IntArrayRef dims1,
  std::vector<SymInt> rsizes;  // rsizes: sizes of the result
  p1.reserve(input1.dim());
  p2.reserve(input2.dim());
-  rsizes.reserve(input1.dim() + input2.dim() - (int64_t) dims1.size());
+  rsizes.reserve(input1.dim() + input2.dim() - static_cast<int64_t>(dims1.size()));
  SymInt size1 = 1; // number of non-contracted elements in input1
  SymInt size2 = 1; // number of non-contracted elements in input2

--- a/aten/src/ATen/native/LinearAlgebra.cpp
+++ b/aten/src/ATen/native/LinearAlgebra.cpp
@ -1655,7 +1655,7 @@ static inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self,
  auto s0 = self.accessor<const scalar_t, 3>();
  auto m0 = mat2.accessor<const scalar_t, 3>();

-  int64_t grain_size = std::max(internal::GRAIN_SIZE / (is * js * ks), (int64_t)1);
+  int64_t grain_size = std::max(internal::GRAIN_SIZE / (is * js * ks), static_cast<int64_t>(1));
  using opmath_t = at::opmath_type<scalar_t>;
  parallel_for(0, bs, grain_size, [&](int64_t b_begin, int64_t b_end) {
      for (const auto b : c10::irange(b_begin, b_end)) {
--- a/aten/src/ATen/native/LossNLL.cpp
+++ b/aten/src/ATen/native/LossNLL.cpp
@ -235,7 +235,7 @@ void nll_loss_out_frame(

  constexpr int64_t cascade_sum_num_levels = 8;
  const int64_t level_power =
-      std::max(int64_t(4), utils::CeilLog2(batch_size) / cascade_sum_num_levels);
+      std::max(static_cast<int64_t>(4), utils::CeilLog2(batch_size) / cascade_sum_num_levels);
  const int64_t level_step = (1 << level_power);
  const int64_t level_mask = level_step - 1;

--- a/aten/src/ATen/native/LossNLL2d.cpp
+++ b/aten/src/ATen/native/LossNLL2d.cpp
@ -129,7 +129,7 @@ void nll_loss2d_forward_out_frame(
      for (const auto b : c10::irange(start, end)) {
        for (const auto h : c10::irange(H)) {
          for (const auto w : c10::irange(W)) {
-            const int64_t cur_target = (int64_t)target_acc[b][h][w];
+            const int64_t cur_target = target_acc[b][h][w];

            if (cur_target == ignore_index) {
              output_acc[b][h][w] = static_cast<scalar_t>(0);
@ -188,7 +188,7 @@ void nll_loss2d_forward_out_frame(
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
  scalar_t loss_partial_sums[cascade_sum_num_levels] = {0};
  const int64_t level_power =
-      std::max(int64_t(4), utils::CeilLog2(numiter) / cascade_sum_num_levels);
+      std::max(static_cast<int64_t>(4), utils::CeilLog2(numiter) / cascade_sum_num_levels);
  const int64_t level_step = (1 << level_power);
  const int64_t level_mask = level_step - 1;

--- a/aten/src/ATen/native/Math.h
+++ b/aten/src/ATen/native/Math.h
@ -192,7 +192,7 @@ Date:  February 1996
  x = x - (std::erf(x) - y) / ((static_cast<T>(2.0)/static_cast<T>(std::sqrt(c10::pi<double>)))*std::exp(-x*x));
  x = x - (std::erf(x) - y) / ((static_cast<T>(2.0)/static_cast<T>(std::sqrt(c10::pi<double>)))*std::exp(-x*x));

-  return(x);
+  return x;
 }

 #undef CENTRAL_RANGE
@ -3819,7 +3819,7 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_v_forward(T x, int64_t n)

    if ((n > 6) && (std::abs(x + x - T(1.0)) < T(1.0))) {
        if (std::sin(std::acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) {
-            return std::cos(((n) + T(0.5)) * std::acos(x + x - T(1.0))) / std::cos(std::acos(x + x - T(1.0)) / T(2.0));
+            return std::cos((n + T(0.5)) * std::acos(x + x - T(1.0))) / std::cos(std::acos(x + x - T(1.0)) / T(2.0));
        }

        if (n % 2 == 0) {
--- a/aten/src/ATen/native/NNPACK.cpp
+++ b/aten/src/ATen/native/NNPACK.cpp
@ -193,22 +193,22 @@ Tensor _nnpack_spatial_convolution(
  const size_t input_channels = input.size(1);
  const size_t output_channels = weight.size(0);
  const struct nnp_size input_size = {
-      .width = (size_t)input.size(3),
-      .height = (size_t)input.size(2),
+      .width = static_cast<size_t>(input.size(3)),
+      .height = static_cast<size_t>(input.size(2)),
  };
  const struct nnp_padding input_padding = {
-      .top = (size_t)padding[0],
-      .right = (size_t)padding[1],
-      .bottom = (size_t)padding[0],
-      .left = (size_t)padding[1],
+      .top = static_cast<size_t>(padding[0]),
+      .right = static_cast<size_t>(padding[1]),
+      .bottom = static_cast<size_t>(padding[0]),
+      .left = static_cast<size_t>(padding[1]),
  };
  const struct nnp_size kernel_size = {
-      .width = (size_t)weight.size(3),
-      .height = (size_t)weight.size(2),
+      .width = static_cast<size_t>(weight.size(3)),
+      .height = static_cast<size_t>(weight.size(2)),
  };
  const struct nnp_size output_size = {
-      .width = (size_t)output.size(3),
-      .height = (size_t)output.size(2),
+      .width = static_cast<size_t>(output.size(3)),
+      .height = static_cast<size_t>(output.size(2)),
  };
  const nnp_size output_subsample = {
      .width = static_cast<std::size_t>(stride[1]),
--- a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp
+++ b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp
@ -248,8 +248,8 @@ void slow_conv_transpose3d_out_cpu_template(
  Tensor weight = weight_.contiguous();
  Tensor bias = bias_.defined() ? bias_.contiguous() : bias_;

-  const int n_input_plane = (int)weight.size(0);
-  const int n_output_plane = (int)weight.size(1);
+  const auto n_input_plane = weight.size(0);
+  const auto n_output_plane = weight.size(1);

  bool is_batch = false;
  if (input.dim() == 4) {
--- a/aten/src/ATen/native/NamedTensor.cpp
+++ b/aten/src/ATen/native/NamedTensor.cpp
@ -84,8 +84,8 @@ static std::vector<int64_t> aligned_size(
    DimnameList aligned_names,
    bool is_aligning_two_tensors) {
  std::vector<int64_t> expanded_sizes(aligned_names.size(), 1);
-  ptrdiff_t dim = (ptrdiff_t)tensor_sizes.size() - 1;
-  ptrdiff_t idx = (ptrdiff_t)aligned_names.size() - 1;
+  ptrdiff_t dim = static_cast<ptrdiff_t>(tensor_sizes.size()) - 1;
+  ptrdiff_t idx = static_cast<ptrdiff_t>(aligned_names.size()) - 1;
  for (; idx >= 0 && dim >= 0; --idx) {
    if (tensor_names[dim] != aligned_names[idx]) {
      continue;
--- a/aten/src/ATen/native/RowwisePrune.cpp
+++ b/aten/src/ATen/native/RowwisePrune.cpp
@ -25,7 +25,7 @@ std::tuple<Tensor, Tensor> _rowwise_prune_helper(
  auto mask_contig = mask.contiguous();
  auto mask_data = mask_contig.data_ptr<bool>();
  for (const auto i : c10::irange(mask.numel())) {
-    num_non_masked_rows += (((mask_data[i] == true)) ? 1 : 0);
+    num_non_masked_rows += ((mask_data[i] == true) ? 1 : 0);
  }
  int num_cols = weights.size(1);
  auto pruned_2d_tensor = at::empty({num_non_masked_rows, num_cols},
--- a/aten/src/ATen/native/SoftMax.cpp
+++ b/aten/src/ATen/native/SoftMax.cpp
@ -176,7 +176,7 @@ void host_softmax(
  scalar_t* input_data_base = input.data_ptr<scalar_t>();
  scalar_t* output_data_base = output.data_ptr<scalar_t>();
  bool* mask_data_base = mask;
-  int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1);
+  int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, static_cast<int64_t>(1));
  parallel_for(
      0, outer_size * inner_size, grain_size,
      [&](int64_t begin, int64_t end) {
@ -265,7 +265,7 @@ void host_softmax_backward(
  scalar_t* output_data_base = output.data_ptr<scalar_t>();
  scalar_t* gradOutput_data_base = grad.data_ptr<scalar_t>();
  bool* mask_data_base = mask;
-  int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1);
+  int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, static_cast<int64_t>(1));
  parallel_for(
      0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) {
        for (const auto i : c10::irange(begin, end)) {
--- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp
+++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
@ -1701,13 +1701,13 @@ Tensor& index_select_out_cpu_(
                  TORCH_CHECK_INDEX(
                      (self_i >= 0) && (self_i < self_dim_size),
                      "index out of range in self");
-                  auto self_data = static_cast<const char*>(selfSlice_data) +
+                  auto self_data = const_cast<char*>(static_cast<const char*>(
+                                       selfSlice_data)) +
                      self_i * self_stride_bytes;
                  auto result_data = static_cast<char*>(resultSlice_data) +
                      i * result_stride_bytes;
                  sub_iter.unsafe_replace_operand(0, result_data);
-                  sub_iter.unsafe_replace_operand(
-                      1, const_cast<char*>(self_data));
+                  sub_iter.unsafe_replace_operand(1, self_data);
                  copy_stub(sub_iter.device_type(), sub_iter, false);
                };
              });
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@ -1382,7 +1382,7 @@ void randperm_cpu(Tensor& result, int64_t n, CPUGeneratorImpl* generator) {
  // use no-initialization Fischer-Yates variant
  // https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_.22inside-out.22_algorithm
  for (int64_t i = 0; i < n; i++) {
-    int64_t z = (int64_t)(generator->random64() % (i + 1));
+    int64_t z = static_cast<int64_t>(generator->random64() % (i + 1));
    r__data[i * r__stride_0] = i;
    r__data[i * r__stride_0] = r__data[z * r__stride_0];
    r__data[z * r__stride_0] = i;
--- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_dynamic.cpp
+++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_dynamic.cpp
@ -40,7 +40,7 @@ at::Tensor PackedLinearWeightQnnp::apply_dynamic_impl<false>(
      "quantized_sparse_linear(): Input tensor rank should be >= 2");

  const auto rows_input = c10::multiply_integers(input.sizes().begin(), input.sizes().end() - 1);
-  const auto cols_input = static_cast<int64_t>(input.size(input.dim() - 1));
+  const auto cols_input = input.size(input.dim() - 1);
  TORCH_CHECK(
      cols_input == input_channels_,
      "quantized_sparse_linear: Input tensor's last and weight tensor's"
--- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp
+++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp
@ -65,8 +65,8 @@ LinearPackedSerializationType PackedLinearWeight::unpack() {
 #ifdef USE_PYTORCH_QNNPACK

 LinearPackedSerializationType PackedLinearWeightQnnp::unpack() {
-  const int64_t N = static_cast<int64_t>(output_channels_);
-  const int64_t K = static_cast<int64_t>(input_channels_);
+  const int64_t N = output_channels_;
+  const int64_t K = input_channels_;

  float* w_scales_ptr = w_scales_.data_ptr<float>();

--- a/aten/src/ATen/native/cpu/Activation.cpp
+++ b/aten/src/ATen/native/cpu/Activation.cpp
@ -998,7 +998,7 @@ void softplus_backward_kernel(TensorIteratorBase& iter, const Scalar& beta_, con
    auto threshold = threshold_.to<float>();
    const Vec beta_vec(beta);
    const Vec threshold_vec(threshold);
-    const Vec one_vec(static_cast<float>(1.0));
+    const Vec one_vec(1.0f);
    cpu_kernel_vec(
        iter,
        [beta, threshold](scalar_t a, scalar_t b) -> scalar_t {
--- a/aten/src/ATen/native/cpu/AtomicAddFloat.h
+++ b/aten/src/ATen/native/cpu/AtomicAddFloat.h
@ -17,7 +17,7 @@ static inline void cpu_atomic_add_float(float* dst, float fvalue)
  } uf32_t;

  uf32_t new_value, old_value;
-  std::atomic<unsigned>* dst_intV = (std::atomic<unsigned>*)(dst);
+  std::atomic<unsigned>* dst_intV = (std::atomic<unsigned>*)dst;

  old_value.floatV = *dst;
  new_value.floatV = old_value.floatV + fvalue;
--- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@ -851,7 +851,7 @@ void sigmoid_backward_kernel(TensorIteratorBase& iter) {
          });
    });
  } else if (iter.dtype() == kBFloat16) {
-    auto one_vec = Vectorized<float>((float)(1));
+    auto one_vec = Vectorized<float>((float)1);
    cpu_kernel_vec(
        iter,
        [=](BFloat16 a, BFloat16 b) -> BFloat16 {
--- a/aten/src/ATen/native/cpu/CopyKernel.cpp
+++ b/aten/src/ATen/native/cpu/CopyKernel.cpp
@ -77,9 +77,7 @@ static void reduced_float_copy_kernel(TensorIteratorBase &iter, bool requires_ne

      int64_t grain_size = at::internal::GRAIN_SIZE;

-      auto loop = [strides_in, requires_neg](char** base, const int64_t* strides, int64_t size0, int64_t size1) {
-        std::array<char*, 2> data;
-        std::copy_n(base, 2, data.data());
+      auto loop = [strides_in, requires_neg](char** data, const int64_t* strides, int64_t size0, int64_t size1) {
        const int64_t *outer_strides = &strides[2];

        for ([[maybe_unused]] const auto it : c10::irange(size1)) {
@ -146,9 +144,7 @@ static void reduced_float_copy_kernel(TensorIteratorBase &iter, bool requires_ne

      int64_t grain_size = at::internal::GRAIN_SIZE;

-      auto loop = [strides_in, requires_neg](char** base, const int64_t* strides, int64_t size0, int64_t size1) {
-        std::array<char*, 2> data;
-        std::copy_n(base, 2, data.data());
+      auto loop = [strides_in, requires_neg](char** data, const int64_t* strides, int64_t size0, int64_t size1) {
        const int64_t *outer_strides = &strides[2];

        for ([[maybe_unused]] const auto it : c10::irange(size1)) {
--- a/aten/src/ATen/native/cpu/IndexKernel.cpp
+++ b/aten/src/ATen/native/cpu/IndexKernel.cpp
@ -493,40 +493,33 @@ void cpu_hflip_vec(at::TensorIterator& iter) {

    for ([[maybe_unused]] const auto j : c10::irange(size1)) {
      // vectorized loop with negative stride for output
-      char** C10_RESTRICT data_ = data_arr.data();
      int64_t n = size0;
-
-      char* C10_RESTRICT data[ntensors];
-      for (const auto arg : c10::irange(ntensors)) {
-        data[arg] = data_[arg];
-      }
-
      int64_t i = 0;

-      // data[0] unaligned pre-pass
+      // data_arr[0] unaligned pre-pass
      int64_t offset = (j * n + (n - i - Vec::size())) % 32;
      offset = (offset >= n) ? n : offset;
      for (; i < offset; i++) {
-        scalar_t* out_ptr = (scalar_t*)(data[0] - i * stride);
-        *out_ptr = c10::load((scalar_t *)(data[1] + i * stride));
+        scalar_t* out_ptr = (scalar_t*)(data_arr[0] - i * stride);
+        *out_ptr = c10::load((scalar_t *)(data_arr[1] + i * stride));
      }
      // Empirically found that it is faster to process 3 data items together vs 2 or 4
      for (; i <= n - 3 * Vec::size(); i += 3 * Vec::size()) {
-        auto out1 = Vec::loadu(data[1] + i * stride);
-        auto out2 = Vec::loadu(data[1] + (i + Vec::size()) * stride);
-        auto out3 = Vec::loadu(data[1] + (i + 2 * Vec::size()) * stride);
+        auto out1 = Vec::loadu(data_arr[1] + i * stride);
+        auto out2 = Vec::loadu(data_arr[1] + (i + Vec::size()) * stride);
+        auto out3 = Vec::loadu(data_arr[1] + (i + 2 * Vec::size()) * stride);
        // flip the vector: 1234 -> 4321
        out1 = flip(out1);
        out2 = flip(out2);
        out3 = flip(out3);
-        out1.store(data[0] - (i + Vec::size() - 1) * stride);
-        out2.store(data[0] - (i + 2 * Vec::size() - 1) * stride);
-        out3.store(data[0] - (i + 3 * Vec::size() - 1) * stride);
+        out1.store(data_arr[0] - (i + Vec::size() - 1) * stride);
+        out2.store(data_arr[0] - (i + 2 * Vec::size() - 1) * stride);
+        out3.store(data_arr[0] - (i + 3 * Vec::size() - 1) * stride);
      }
      if (i < n) {
        for (; i < n; i++) {
-          scalar_t* out_ptr = (scalar_t*)(data[0] - i * stride);
-          *out_ptr = c10::load((scalar_t *)(data[1] + i * stride));
+          scalar_t* out_ptr = (scalar_t*)(data_arr[0] - i * stride);
+          *out_ptr = c10::load((scalar_t *)(data_arr[1] + i * stride));
        }
      }

@ -560,15 +553,8 @@ void cpu_vflip_memcpy(at::TensorIterator& iter) {
    const int64_t stride = strides[0];

    for ([[maybe_unused]] const auto j : c10::irange(size1)) {
-      char** C10_RESTRICT data_ = data_arr.data();
      int64_t n = size0;
-
-      char* C10_RESTRICT data[ntensors];
-      for (const auto arg : c10::irange(ntensors)) {
-        data[arg] = data_[arg];
-      }
-
-      memcpy(data[0], data[1], n * stride);
+      memcpy(data_arr[0], data_arr[1], n * stride);

      // advance:
      for (const auto arg : c10::irange(data_arr.size())) {
--- a/aten/src/ATen/native/cpu/Unfold2d.cpp
+++ b/aten/src/ATen/native/cpu/Unfold2d.cpp
@ -298,7 +298,7 @@ void unfolded2d_copy(
                      memcpy(
                          dst + (size_t)y * output_width + x,
                          src + (size_t)iy * input_width + ix,
-                          sizeof(scalar_t) * (1));
+                          sizeof(scalar_t) * 1);
                  }
                }
              }
@ -317,7 +317,7 @@ void unfolded2d_copy(
                  memcpy(
                      dst + (size_t)y * output_width + x,
                      src + (size_t)iy * input_width + ix + x * dW,
-                      sizeof(scalar_t) * (1));
+                      sizeof(scalar_t) * 1);
              }
            }
          }
--- a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h
+++ b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h
@ -342,7 +342,7 @@ void upsample_avx_bilinear_bicubic_uint8(

  if (need_horizontal) {
    int interp_dim = 3;
-    auto stride = (skip_unpacking) ? num_channels : 4;
+    auto stride = skip_unpacking ? num_channels : 4;
    std::tie(horiz_indices_weights, ksize_horiz, horiz_weights_precision) =
        F::compute_index_ranges_int16_weights(
            /*input_size=*/xin,
@ -358,7 +358,7 @@ void upsample_avx_bilinear_bicubic_uint8(

  if (need_vertical) {
    int interp_dim = 2;
-    auto stride = (skip_unpacking) ? num_channels * xout : 4 * xout;
+    auto stride = skip_unpacking ? num_channels * xout : 4 * xout;
    std::tie(vert_indices_weights, ksize_vert, vert_weights_precision) =
        F::compute_index_ranges_int16_weights(
            /*input_size=*/yin,
@ -377,17 +377,17 @@ void upsample_avx_bilinear_bicubic_uint8(
  // horizontal-only or vertical-only interpolation, and if the tensor doesn't
  // need repacking
  if (need_horizontal && (need_vertical || !skip_packing)) {
-    auto c = (skip_unpacking) ? num_channels : 4;
+    auto c = skip_unpacking ? num_channels : 4;
    buffer_horiz = at::empty({c, yin, xout}, input.options());
  }
  if (need_vertical && !skip_packing) {
-    auto c = (skip_unpacking) ? num_channels : 4;
+    auto c = skip_unpacking ? num_channels : 4;
    buffer_vert = at::empty({c, yout, xout}, input.options());
  }

  for (const auto i : c10::irange(batch_size)) {

-    at::Tensor unpacked_input = (skip_unpacking) ? input[i] : unpack_rgb(input[i]);
+    at::Tensor unpacked_input = skip_unpacking ? input[i] : unpack_rgb(input[i]);
    at::Tensor unpacked_output;

    if (need_horizontal) {
@ -411,7 +411,7 @@ void upsample_avx_bilinear_bicubic_uint8(
      unpacked_output = unpacked_input = unpacked_output_temp;
    }
    if (need_vertical) {
-      unpacked_output = (skip_packing) ? output[i] : buffer_vert;
+      unpacked_output = skip_packing ? output[i] : buffer_vert;

      ImagingResampleVertical(
          unpacked_output,
@ -502,7 +502,7 @@ void ImagingResampleHorizontalConvolution8u4x(
  // RGBA: b4_delta = b4_delta_soft = 3
  // RGB : b4_delta = 5
  // RGB : b4_delta_soft = 4
-  const auto b4_delta = (stride == 4) ? 3 : ((is_last_line) ? 5 : 4);
+  const auto b4_delta = (stride == 4) ? 3 : (is_last_line ? 5 : 4);

  // In block 2 (2 means we process 2 weights values together), we read input data
  // with _mm_loadl_epi64, i.e. 8 bytes, per one line:
@ -515,7 +515,7 @@ void ImagingResampleHorizontalConvolution8u4x(
  // RGBA: b2_delta = b2_delta_soft = 1
  // RGB : b2_delta = 2
  // RGB : b2_delta_soft = 1
-  const auto b2_delta = (stride == 4) ? 1 : ((is_last_line) ? 2 : 1);
+  const auto b2_delta = (stride == 4) ? 1 : (is_last_line ? 2 : 1);

  const auto max_out_x_strided = out_xsize * stride;
  const auto max_in_x_strided = in_xsize * stride;
@ -819,7 +819,7 @@ void ImagingResampleHorizontalConvolution8u(
  // RGBA: b8_delta = b8_delta_soft = 7
  // RGB : b8_delta = 10
  // RGB : b8_delta_soft = 9
-  const auto b8_delta = (stride == 4) ? 7 : ((is_last_line) ? 10 : 9);
+  const auto b8_delta = (stride == 4) ? 7 : (is_last_line ? 10 : 9);

  // In block 4 (4 means we process 4 weight values together), we read
  // 16 bytes of input data.
@ -832,7 +832,7 @@ void ImagingResampleHorizontalConvolution8u(
  // RGBA: b4_delta = b4_delta_soft = 3
  // RGB : b4_delta = 5
  // RGB : b4_delta_soft = 4
-  const auto b4_delta = (stride == 4) ? 3 : ((is_last_line) ? 5 : 4);
+  const auto b4_delta = (stride == 4) ? 3 : (is_last_line ? 5 : 4);

  // In block 2 (2 means we process 2 weight values together), we read
  // 8 bytes of input data.
@ -845,7 +845,7 @@ void ImagingResampleHorizontalConvolution8u(
  // RGBA: b2_delta = b2_delta_soft = 1
  // RGB : b2_delta = 2
  // RGB : b2_delta_soft = 1
-  const auto b2_delta = (stride == 4) ? 1 : ((is_last_line) ? 2 : 1);
+  const auto b2_delta = (stride == 4) ? 1 : (is_last_line ? 2 : 1);

  const auto max_out_x_strided = out_xsize * stride;
  const auto max_in_x_strided = in_xsize * stride;
--- a/aten/src/ATen/native/cpu/int4mm_kernel.cpp
+++ b/aten/src/ATen/native/cpu/int4mm_kernel.cpp
@ -644,8 +644,8 @@ void weight_to_int4pack_kernel(
            int32_t val2 = src[(d + 32) * K + k];
            int32_t val3 = src[(d + 48) * K + k];

-            uint8_t packed02 = (((uint8_t)(val2) << 4)) | ((uint8_t)(val0));
-            uint8_t packed13 = (((uint8_t)(val3) << 4)) | ((uint8_t)(val1));
+            uint8_t packed02 = ((uint8_t)val2 << 4) | ((uint8_t)val0);
+            uint8_t packed13 = ((uint8_t)val3 << 4) | ((uint8_t)val1);

            dst[k * 32 + d] = packed02;
            dst[k * 32 + 16 + d] = packed13;
@ -656,7 +656,7 @@ void weight_to_int4pack_kernel(
            int32_t val0 = src[n * K + k];
            int32_t val1 = src[n * K + K + k];

-            uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0));
+            uint8_t packed = ((uint8_t)val1 << 4) | ((uint8_t)val0);
            dst[k * nb_size / 2 + n / 2] = packed;
          }
        }
@ -667,7 +667,7 @@ void weight_to_int4pack_kernel(
            int32_t val0 = src[(d + 0) * K + k];
            int32_t val1 = src[(d + 16) * K + k];

-            uint8_t packed01 = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0));
+            uint8_t packed01 = ((uint8_t)val1 << 4) | ((uint8_t)val0);
            dst[k * 16 + d] = packed01;
          }
        } else {
@ -676,7 +676,7 @@ void weight_to_int4pack_kernel(
            int32_t val0 = src[n * K + k];
            int32_t val1 = src[n * K + K + k];

-            uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0));
+            uint8_t packed = ((uint8_t)val1 << 4) | ((uint8_t)val0);
            dst[k * nb_size / 2 + n / 2] = packed;
          }
        }
@ -685,7 +685,7 @@ void weight_to_int4pack_kernel(
          int32_t val0 = src[n * K + k];
          int32_t val1 = src[n * K + K + k];

-          uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0));
+          uint8_t packed = ((uint8_t)val1 << 4) | ((uint8_t)val0);
          dst[k * nb_size / 2 + n / 2] = packed;
        }
 #endif
@ -872,16 +872,16 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
          for (size_t k_idx = 0; k_idx < k; ++k_idx) {
            const float src0_0 = src_ptr[k_idx];

-            max0 = (std::max)(src0_0, max0);
-            min0 = (std::min)(src0_0, min0);
+            max0 = std::max(src0_0, max0);
+            min0 = std::min(src0_0, min0);
          }

          // Maximum/minimum int8 values
          const float qmin = (float)INT8_MIN;
          const float qmax = (float)INT8_MAX;

-          const float rmin0 = (std::min)(0.0f, min0);
-          const float rmax0 = (std::max)(0.0f, max0);
+          const float rmin0 = std::min(0.0f, min0);
+          const float rmax0 = std::max(0.0f, max0);

          const float scale0 =
              rmin0 == rmax0 ? 1.f : (qmax - qmin) / (rmax0 - rmin0);
@ -900,8 +900,8 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
              ? qmin - descaled_min0
              : qmax - descaled_max0;

-          zero_point0 = (std::max)(zero_point0, qmin);
-          zero_point0 = (std::min)(zero_point0, qmax);
+          zero_point0 = std::max(zero_point0, qmin);
+          zero_point0 = std::min(zero_point0, qmax);

          // Round to nearest integer
          const int32_t nudged_zero_point0 = lrintf(zero_point0);
@ -909,9 +909,9 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
          int8_t* dst_ptr = lhs_qa8dx + m_idx * dst_stride;

          // LHS offset at the beginning of the row
-          *((float*)(dst_ptr)) = recip_scale0;
+          *((float*)dst_ptr) = recip_scale0;
          dst_ptr += sizeof(float);
-          *((int32_t*)(dst_ptr)) = -nudged_zero_point0;
+          *((int32_t*)dst_ptr) = -nudged_zero_point0;
          dst_ptr += sizeof(int32_t);

          // Quantize the channels
@ -922,8 +922,8 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
            int32_t v0_s32 = (int32_t)(std::round(src0_0 * scale0));

            v0_s32 = v0_s32 + nudged_zero_point0;
-            v0_s32 = (std::max)(v0_s32, static_cast<int32_t>(INT8_MIN));
-            v0_s32 = (std::min)(v0_s32, static_cast<int32_t>(INT8_MAX));
+            v0_s32 = std::max(v0_s32, static_cast<int32_t>(INT8_MIN));
+            v0_s32 = std::min(v0_s32, static_cast<int32_t>(INT8_MAX));
            dst_ptr[0] = (int8_t)v0_s32;
            dst_ptr += sizeof(int8_t);
          }
@ -988,8 +988,8 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
      main_acc = main_acc * lhs_scale;

      // Clamp (min-max) operation
-      main_acc = (std::max)(main_acc, scalar_min);
-      main_acc = (std::min)(main_acc, scalar_max);
+      main_acc = std::max(main_acc, scalar_min);
+      main_acc = std::min(main_acc, scalar_max);

      dst_f32[0] = main_acc;
      dst_f32 += 1;
@ -1024,15 +1024,15 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel(

      for (size_t k_idx = 0; k_idx < k; ++k_idx) {
        const float src0_0 = src_ptr[k_idx];
-        max0 = (std::max)(src0_0, max0);
-        min0 = (std::min)(src0_0, min0);
+        max0 = std::max(src0_0, max0);
+        min0 = std::min(src0_0, min0);
      }

      const float qmin = (float)INT8_MIN;
      const float qmax = (float)INT8_MAX;

-      const float rmin0 = (std::min)(0.0f, min0);
-      const float rmax0 = (std::max)(0.0f, max0);
+      const float rmin0 = std::min(0.0f, min0);
+      const float rmax0 = std::max(0.0f, max0);
      const float scale0 =
          (rmin0 == rmax0) ? 1.f : (qmax - qmin) / (rmax0 - rmin0);
      const float recip_scale0 = scale0 ? 1.0f / scale0 : 0.0f;
@ -1044,22 +1044,22 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel(
          ? qmin - descaled_min0
          : qmax - descaled_max0;

-      zero_point0 = (std::max)(zero_point0, qmin);
-      zero_point0 = (std::min)(zero_point0, qmax);
+      zero_point0 = std::max(zero_point0, qmin);
+      zero_point0 = std::min(zero_point0, qmax);
      const int32_t nudged_zero_point0 = lrintf(zero_point0);

      int8_t* dst_ptr = lhs_qa8dx + row_idx * dst_stride;

-      *((float*)(dst_ptr)) = recip_scale0;
+      *((float*)dst_ptr) = recip_scale0;
      dst_ptr += sizeof(float);
-      *((int32_t*)(dst_ptr)) = -nudged_zero_point0;
+      *((int32_t*)dst_ptr) = -nudged_zero_point0;
      dst_ptr += sizeof(int32_t);

      for (size_t k_idx = 0; k_idx < k; ++k_idx) {
        const float src0_0 = src_ptr[k_idx];
        int32_t v0_s32 = (int32_t)(std::round(src0_0 * scale0));
-        v0_s32 = (std::max)(
-            (std::min)(
+        v0_s32 = std::max(
+            std::min(
                v0_s32 + nudged_zero_point0, static_cast<int32_t>(INT8_MAX)),
            static_cast<int32_t>(INT8_MIN));
        dst_ptr[0] = (int8_t)v0_s32;
@ -1118,8 +1118,8 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel(
      }

      main_acc = main_acc * lhs_scale;
-      main_acc = (std::max)(main_acc, scalar_min);
-      main_acc = (std::min)(main_acc, scalar_max);
+      main_acc = std::max(main_acc, scalar_min);
+      main_acc = std::min(main_acc, scalar_max);

      dst_f32[0] = main_acc;
      dst_f32 += 1;
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -4,7 +4,6 @@
 #include <c10/util/SmallVector.h>
 #include <c10/core/Scalar.h>
 #include <c10/core/ScalarType.h>
-#include <c10/util/Exception.h>
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/core/Tensor.h>
 #include <ATen/core/NamedTensor.h>
--- a/aten/src/ATen/native/cuda/ScaledBlas.cpp
+++ b/aten/src/ATen/native/cuda/ScaledBlas.cpp
@ -59,22 +59,6 @@
 // forward declare
 class cublasCommonArgs;

-namespace fbgemm_gpu {
-
-// NOTE(slayton58): FBGemm_GPU kernels come from <fbgemm_gpu/torch_ops.h> within the FBGemm repo.
-//                  To update supported ops means a submodule bump, which is.. painful. Instead, we
-//                  can simply forward-declare the methods we want to use.. Works at least as a short-term
-//                  thing, but should still be fixed somewhere/somehow.
-at::Tensor f4f4bf16(
-    at::Tensor,
-    at::Tensor,
-    at::Tensor,
-    at::Tensor,
-    std::optional<at::Tensor>,
-    bool use_mx);
-
-} // namespace fbgemm_gpu
-
 using at::blas::ScalingType;
 using at::blas::SwizzleType;

@ -1013,47 +997,26 @@ _scaled_mxfp4_mxfp4(
          const std::optional<Tensor>& bias,
          const c10::ScalarType out_dtype,
          Tensor& out) {
-#if !defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI)
-  TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only");
+#ifndef USE_ROCM
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM only");
 #endif
  // Restrictions:
  // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
  TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",
      mat_a.scalar_type(), mat_b.scalar_type());

-  // Packed FP4 format means actual-K = 2 * reported-K -- adjust
-  auto K_multiplier = 2;
-#ifdef USE_ROCM
-  // AMD
-  auto scale_a_elems = ceil_div<int64_t>(K_multiplier * mat_a.size(0), 32) * mat_a.size(1);
-  auto scale_b_elems = ceil_div<int64_t>(K_multiplier * mat_b.size(1), 32) * mat_b.size(0);
-#else
-  // NVIDIA
-  auto scale_a_elems = round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(K_multiplier * mat_a.size(1), 32), 4);
-  auto scale_b_elems = round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(K_multiplier * mat_b.size(0), 32), 4);
-#endif
+  auto scale_a_elems = ceil_div<int64_t>(2 * mat_a.size(0), 32) * mat_a.size(1);
+  auto scale_b_elems = ceil_div<int64_t>(2 * mat_b.size(1), 32) * mat_b.size(0);
  TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(),
         "For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel());
  TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(),
         "For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel());

-#ifdef USE_ROCM
-  // AMD
-  TORCH_CHECK_VALUE(swizzle_a == SwizzleType::NO_SWIZZLE, "scale_a must not be swizzled (NO_SWIZZLE format)");
-  TORCH_CHECK_VALUE(swizzle_b == SwizzleType::NO_SWIZZLE, "scale_b must not be swizzled (NO_SWIZZLE format)");
-#else
-  // NVIDIA
-  TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format");
-  TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format");
-#endif
-
  TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(),
        "For Blockwise scaling both scales should be contiguous");

  TORCH_CHECK_VALUE(out.scalar_type() == out_dtype, "expected out.scalar_type() to be ", out_dtype, ", but got ", out_dtype);

-#ifdef USE_ROCM
-  // AMD
  auto scaling_choice_a = ScalingType::BlockWise1x32;
  auto scaling_choice_b = ScalingType::BlockWise1x32;

@ -1068,29 +1031,11 @@ _scaled_mxfp4_mxfp4(
  TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 ||
              out.scalar_type() == ScalarType::Half,
              "Block-wise scaling only supports BFloat16 or Half output types");
+#else
+    TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
 #endif

  return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
-#else
-  // NVIDIA
-  // NOTE(slayton58): fbgemm_gpu::f4f4bf16 does *not* allow passing an output tensor,
-  //                  but we have one we need to use. Two clear options are to copy into
-  //                  our output (slow), or use a move-assignment-operator (faster).
-  //                  However, the compiler can complain about the explicit move preventing
-  //                  copy elision because the return from f4f4bf16 is a temporary object.
-  //                  So we don't explicitly move, and trust the compiler here...
-  //                  In the longer term this should be fixed on the FBGemm side.
-  out = fbgemm_gpu::f4f4bf16(
-      mat_a,
-      mat_b.transpose(-2, -1),
-      scale_a,
-      scale_b,
-      std::nullopt, /* global_scale */
-      true          /* use_mx */
-  );
-
-  return out;
-#endif
 }

 Tensor&
@ -1215,20 +1160,17 @@ _scaled_mm_cuda_v2_out(
        mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ")");
  }

-  // Handle fp4 packed-K dimension
-  int K_multiplier = (mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2) ? 2 : 1;
-
  TORCH_CHECK_VALUE(!bias || bias->numel() == mat_b.sizes()[1], "Bias must be size ", mat_b.sizes()[1],
       " but got ", bias->numel());
  TORCH_CHECK_VALUE(
-      K_multiplier * mat_a.sizes()[1] % 16 == 0,
+      mat_a.sizes()[1] % 16 == 0,
      "Expected trailing dimension of mat1 to be divisible by 16 ",
      "but got mat1 shape: (",
      mat_a.sizes()[0],
      "x",
-      K_multiplier * mat_a.sizes()[1],
+      mat_a.sizes()[1],
      ").");
-  TORCH_CHECK_VALUE(K_multiplier * mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x",
+  TORCH_CHECK_VALUE(mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x",
       mat_b.sizes()[1], ") must be divisible by 16");

  // TODO(slayton): Existing checks, not sure if they should really be here.
--- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp
+++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp
@ -753,8 +753,8 @@ static void apply_cholesky_cusolver_potrf_looped(const Tensor& self_working_copy
      handle, params, uplo, n, datatype,
      self_working_copy_ptr + i * matrix_stride,
      lda, datatype,
-      (char*)workdata_device_ptr + i * worksize_device, worksize_device,
-      (char*)workdata_host_ptr + i * worksize_host, worksize_host,
+      static_cast<char*>(workdata_device_ptr) + i * worksize_device, worksize_device,
+      static_cast<char*>(workdata_host_ptr) + i * worksize_host, worksize_host,
      infos_ptr + i
    );
  }
--- a/aten/src/ATen/native/cudnn/ConvShared.cpp
+++ b/aten/src/ATen/native/cudnn/ConvShared.cpp
@ -119,8 +119,8 @@ void setConvolutionParams(
  params->input_dim = input.dim();
  params->memory_format = memory_format;
  for (int i = 0; i != params->input_dim; ++i) {
-    params->input_size[i] = (int)input.sizes()[i];
-    params->weight_size[i] = (int)weight.sizes()[i];
+    params->input_size[i] = static_cast<int>(input.sizes()[i]);
+    params->weight_size[i] = static_cast<int>(weight.sizes()[i]);
  }
  // ASSERT(padding.size() == stride.size())
  // ASSERT(padding.size() == dilation.size())
--- a/aten/src/ATen/native/cudnn/Conv_v7.cpp
+++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp
@ -64,7 +64,7 @@
 // fastest algorithm combination with a sub optimal mathType.

 constexpr size_t operator"" _TiB(unsigned long long n) {
-  return size_t(n) * 1024 * 1024 * 1024 * 1024;
+  return static_cast<size_t>(n) * 1024 * 1024 * 1024 * 1024;
 }

 namespace at {
--- a/aten/src/ATen/native/cudnn/Conv_v8.cpp
+++ b/aten/src/ATen/native/cudnn/Conv_v8.cpp
@ -46,7 +46,7 @@ namespace {

 // TODO: remove duplicate code in Conv_v7.cpp
 constexpr int64_t operator"" _TiB(unsigned long long n) {
-  return size_t(n) << 40;
+  return static_cast<size_t>(n) << 40;
 }

 uint8_t getAlignment(const Tensor& t) {
@ -93,7 +93,10 @@ cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(

  std::vector<int64_t> strides_copy(std::begin(strides), std::end(strides));
  fixSizeOneDimStride<int64_t>(
-      sizes.size(), &sizes[0], (int64_t*)&strides_copy[0], channels_last);
+      sizes.size(),
+      &sizes[0],
+      static_cast<int64_t*>(&strides_copy[0]),
+      channels_last);
  auto r = cudnn_frontend::TensorBuilder()
               .setDim(sizes.size(), sizes.data())
               .setStrides(strides_copy.size(), strides_copy.data())
--- a/aten/src/ATen/native/cudnn/GridSampler.cpp
+++ b/aten/src/ATen/native/cudnn/GridSampler.cpp
@ -44,6 +44,7 @@ std::tuple<Tensor, Tensor> cudnn_grid_sampler_backward(
 #include <ATen/cudnn/Descriptors.h>
 #include <ATen/cudnn/Types.h>
 #include <ATen/cudnn/Utils.h>
+#include <array>

 #include <ATen/TensorUtils.h>
 #include <c10/util/irange.h>
@ -59,11 +60,11 @@ void setSamplerDescriptor(
    SpatialTransformerDescriptor& desc,
    cudnnDataType_t dataType,
    const at::Tensor& tensor) {
-  int inputSize[4] = {0};
+  std::array<int, 4> inputSize{0};
  for (const auto i : c10::irange(tensor.dim())) {
-    inputSize[i] = (int)tensor.size(i);
+    inputSize[i] = static_cast<int>(tensor.size(i));
  }
-  desc.set(dataType, 4, inputSize);
+  desc.set(dataType, 4, inputSize.data());
 }

 void checkGridSize(CheckedFrom c, TensorArg grid, TensorArg input) {
--- a/aten/src/ATen/native/cudnn/RNN.cpp
+++ b/aten/src/ATen/native/cudnn/RNN.cpp
@ -656,7 +656,8 @@ void add_projection_weights(
  TORCH_INTERNAL_ASSERT(
      nb_dims <= min_dim, "nb_dims = ", nb_dims, "; min_dim  = ", min_dim);
  auto elem_size = dataSize(getCudnnDataType(weight_buf));
-  auto offset_bytes = (char*)matrix_pointer - (char*)weight_buf.data_ptr();
+  auto offset_bytes = static_cast<const char*>(matrix_pointer) -
+      static_cast<const char*>(weight_buf.data_ptr());
  TORCH_INTERNAL_ASSERT(
      offset_bytes % elem_size == 0,
      "offset_bytes = ",
@ -794,8 +795,8 @@ get_parameters(
            "; min_dim  = ",
            min_dim);
        auto elem_size = dataSize(getCudnnDataType(weight_buf));
-        auto offset_bytes =
-            (char*)matrix_pointer - (char*)weight_buf.data_ptr();
+        auto offset_bytes = static_cast<const char*>(matrix_pointer) -
+            static_cast<const char*>(weight_buf.data_ptr());
        TORCH_INTERNAL_ASSERT(
            offset_bytes % elem_size == 0,
            "offset_bytes = ",
--- a/aten/src/ATen/native/mkl/SpectralOps.cpp
+++ b/aten/src/ATen/native/mkl/SpectralOps.cpp
@ -330,7 +330,6 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization,
 }

 #elif AT_MKL_ENABLED()
-#include <ATen/Dispatch.h>

 #include <algorithm>
 #include <numeric>
--- a/aten/src/ATen/native/mkldnn/Linear.cpp
+++ b/aten/src/ATen/native/mkldnn/Linear.cpp
@ -535,7 +535,7 @@ mkldnn_scaled_mm(const Tensor& mat1, const Tensor& mat2,

  float input_scale = scale_a.item<float>();
  float weight_scale = scale_b.item<float>();
-  float output_scale = float(1.0);
+  float output_scale = 1.0f;
  if (scale_result.has_value() &&
      (*out_dtype == ScalarType::Float8_e4m3fn ||
       *out_dtype == ScalarType::Float8_e5m2)) {
--- a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp
+++ b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp
@ -530,7 +530,7 @@ static Tensor get_mkldnn_serialized_md(const Tensor& self) {
 #else
      TORCH_CHECK(false, "Unexpected IDeep version to do weight serialization.");
 #endif
-  Tensor serialized_md = at::from_blob((void*)serialized_wei_desc.data(), {(int64_t)serialized_wei_desc.size()}, at::TensorOptions(at::kByte));
+  Tensor serialized_md = at::from_blob((void*)serialized_wei_desc.data(), {static_cast<int64_t>(serialized_wei_desc.size())}, at::TensorOptions(at::kByte));
  auto res = at::empty_like(serialized_md);
  // serialized_md shares the buffer with serialized_wei_desc,
  // which will be released outside of this function thus invalidating the buffer of serialized_md.
--- a/aten/src/ATen/native/mkldnn/Matmul.cpp
+++ b/aten/src/ATen/native/mkldnn/Matmul.cpp
@ -576,14 +576,14 @@ static void _mkldnn_gemm_i8i8i32_with_blas(
        n,
        k,
        alpha,
-        (int8_t*)self.data_ptr(),
+        static_cast<int8_t*>(self.data_ptr()),
        lda,
        ao,
-        (int8_t*)mat2.data_ptr(),
+        static_cast<int8_t*>(mat2.data_ptr()),
        ldb,
        bo,
        beta,
-        (int32_t*)result.data_ptr(),
+        static_cast<int32_t*>(result.data_ptr()),
        ldc,
        &co);
  }
--- a/aten/src/ATen/native/mkldnn/xpu/Attention.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/Attention.cpp
@ -40,37 +40,14 @@ bool check_head_dim_size_xpu(sdp::sdp_params const& params, bool debug) {
  return true;
 }

-bool input_require_grad(
-    const at::Tensor& query,
-    const at::Tensor& key,
-    const at::Tensor& value,
-    const std::optional<at::Tensor>& attn_mask) {
-  return at::GradMode::is_enabled() &&
-      (query.requires_grad() || key.requires_grad() || value.requires_grad() ||
-       (attn_mask.has_value() && attn_mask.value().requires_grad()));
-}
-
-bool check_grad(sdp::sdp_params const& params, bool debug) {
-  if (!input_require_grad(
-          params.query, params.key, params.value, params.attn_mask))
-    return true;
-
-  auto q_num_heads = params.query.sym_size(-3);
-  auto k_num_heads = params.key.sym_size(-3);
-  auto v_num_heads = params.value.sym_size(-3);
-  bool is_gqa = q_num_heads != k_num_heads || q_num_heads != v_num_heads;
-  if (debug && is_gqa)
-    TORCH_WARN(
-        "scale_dot_product_attention with gqa is not supported for gradient computation on xpu.");
-
-  bool attn_mask_needs_grad =
-      params.attn_mask.has_value() && params.attn_mask.value().requires_grad();
-  if (debug && attn_mask_needs_grad) {
-    TORCH_WARN(
-        "scale_dot_product_attention on xpu is not supported when attn_mask.requires_grad() == True.");
+bool check_no_grad(sdp::sdp_params const& params, bool debug) {
+  const bool any_inputs_require_grad = params.query.requires_grad() ||
+      params.key.requires_grad() || params.value.requires_grad();
+  const bool gradmode_enabled = at::GradMode::is_enabled();
+  if (debug && any_inputs_require_grad && gradmode_enabled) {
+    TORCH_WARN("Backward or grad to be supported.");
  }
-
-  return !is_gqa && !attn_mask_needs_grad;
+  return !any_inputs_require_grad || !gradmode_enabled;
 }

 bool can_use_overrideable_attention(sdp::sdp_params const& params, bool debug) {
@ -88,7 +65,7 @@ bool can_use_overrideable_attention(sdp::sdp_params const& params, bool debug) {
      sdp::check_nonzero_sequence_lengths_dense,
      sdp::check_last_dim_stride_equals_1_dense<false /*ignore_singleton_dim*/>,
      check_head_dim_size_xpu,
-      check_grad);
+      check_no_grad);
  for (auto& constraint : constraints) {
    if (!constraint(params, debug)) {
      return false;
@ -248,11 +225,10 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
    double dropout_p,
    bool is_causal,
    bool return_debug_mask,
-    std::optional<double> scale,
-    bool compute_logsumexp) {
+    std::optional<double> scale) {
  TORCH_INTERNAL_ASSERT(
      query.dim() == 4 && key.dim() == 4 && value.dim() == 4,
-      "scaled_dot_product_fused_attention_overrideable_xpu: Accept only 4 dims inputs shape of {B, H, T, K}");
+      "scaled_dot_product_fused_attention_overrideable_xpu: Accept only 4 dims inputs shape of {(B), H, T, K}");
  TORCH_INTERNAL_ASSERT(
      (key.size(0) == value.size(0)) && (key.size(1) == value.size(1)) &&
          (key.size(2) == value.size(2)),
@ -269,9 +245,6 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
  TORCH_INTERNAL_ASSERT(
      !(attn_bias.has_value() && is_causal),
      "scaled_dot_product_fused_attention_overrideable_xpu: attn_bias cannot present with is_causal");
-  TORCH_INTERNAL_ASSERT(
-      !(attn_bias.has_value() && attn_bias.value().requires_grad()),
-      "scaled_dot_product_fused_attention_overrideable_xpu: attn_bias cannot have requires_grad=True");

  const int64_t batch_size = query.size(0);
  const int64_t num_head_q = query.size(1);
@ -281,14 +254,11 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
  const int64_t seq_len_q = query.size(2);
  const int64_t seq_len_kv = key.size(2);

-  at::Tensor attention;
-  std::vector<int64_t> attention_shape = {
+  at::Tensor output;
+  std::vector<int64_t> output_shape = {
      batch_size, num_head_q, seq_len_q, head_dim_v};
-  alloc_with_matching_layout(query, attention, attention_shape);
-
-  auto opts = query.options();
-  at::Tensor logsumexp =
-      at::empty({batch_size, num_head_q, seq_len_q}, opts.dtype(at::kFloat));
+  alloc_with_matching_layout(query, output, output_shape);
+  at::Tensor logsumexp, debug_attn_mask; // not supported

  at::native::onednn::sdpa(
      batch_size,
@ -304,15 +274,15 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
      attn_bias,
      is_causal,
      scale.has_value() ? scale.value() : (1.0 / std::sqrt(head_dim_qk)),
-      attention,
-      compute_logsumexp,
+      output,
+      false,
      logsumexp);

  // rng not used
  auto philox_seed = at::empty({}, at::dtype(at::kLong));
  auto philox_offset = at::empty({}, at::dtype(at::kLong));
  return std::make_tuple(
-      attention,
+      output,
      logsumexp,
      /* cum_seq_q */ at::Tensor(),
      /* cum_seq_k */ at::Tensor(),
@ -320,106 +290,7 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
      seq_len_kv,
      philox_seed,
      philox_offset,
-      /*debug_attn_mask */ at::Tensor());
-}
-
-std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
-_scaled_dot_product_fused_attention_overrideable_backward_xpu(
-    const at::Tensor& grad_out,
-    const at::Tensor& query,
-    const at::Tensor& key,
-    const at::Tensor& value,
-    const at::Tensor& attn_bias,
-    std::array<bool, 4> grad_input_mask,
-    const at::Tensor& out,
-    const at::Tensor& logsumexp,
-    const at::Tensor& cum_seq_q,
-    const at::Tensor& cum_seq_k,
-    int64_t max_q,
-    int64_t max_k,
-    double dropout_p,
-    bool is_causal,
-    const at::Tensor& philox_seed,
-    const at::Tensor& philox_offset,
-    std::optional<double> scale) {
-  TORCH_INTERNAL_ASSERT(
-      grad_out.dim() == 4 && out.dim() == 4 &&
-          grad_out.size(0) == out.size(0) && grad_out.size(1) == out.size(1) &&
-          grad_out.size(2) == out.size(2) && grad_out.size(3) == out.size(3),
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: grad_out and out should have the same shape of {B, H, T, K}");
-  TORCH_INTERNAL_ASSERT(
-      query.dim() == 4 && key.dim() == 4 && value.dim() == 4,
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: Accept only 4 dims inputs shape of {B, H, T, K}");
-  TORCH_INTERNAL_ASSERT(
-      (key.size(0) == value.size(0)) && (key.size(1) == value.size(1)) &&
-          (key.size(2) == value.size(2)),
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: K/V should have the same batch / seq / num_head");
-  TORCH_INTERNAL_ASSERT(
-      query.size(0) == grad_out.size(0) && query.size(1) == grad_out.size(1) &&
-          query.size(2) == grad_out.size(2),
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: Q should have the same batch / num_head / seq_len as grad_out");
-  TORCH_INTERNAL_ASSERT(
-      query.size(3) == key.size(3),
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: Q/K should have the same head_dim");
-  TORCH_INTERNAL_ASSERT(
-      value.size(3) == grad_out.size(3),
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: V should have the same head_dim as grad_out");
-  TORCH_INTERNAL_ASSERT(
-      query.size(1) == key.size(1),
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: number of heads in K/V must equal to number of heads in Q");
-  TORCH_INTERNAL_ASSERT(
-      dropout_p == 0.0,
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: Currently do not support dropout > 0");
-  TORCH_INTERNAL_ASSERT(
-      logsumexp.dim() == 3 && logsumexp.size(0) == query.size(0) &&
-      logsumexp.size(1) == query.size(1) &&
-      logsumexp.size(2) == query.size(2) &&
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: logsumexp should have the shape of {B, H, T}");
-
-  std::optional<Tensor> attn_bias_opt;
-  if (attn_bias.defined()) {
-    attn_bias_opt = attn_bias;
-  }
-
-  const int64_t batch_size = query.size(0);
-  const int64_t num_head_q = query.size(1);
-  const int64_t num_head_kv = key.size(1);
-  const int64_t seq_len_q = query.size(2);
-  const int64_t seq_len_kv = key.size(2);
-  const int64_t head_dim_qk = query.size(3);
-  const int64_t head_dim_v = value.size(3);
-
-  auto grad_q = at::empty_like(query);
-  auto grad_k = at::empty_like(key);
-  auto grad_v = at::empty_like(value);
-  auto grad_attn_bias = attn_bias_opt.has_value()
-      ? at::empty_like(attn_bias_opt.value())
-      : at::Tensor();
-  at::native::onednn::sdpa_backward(
-      batch_size,
-      num_head_q,
-      num_head_kv,
-      seq_len_q,
-      seq_len_kv,
-      head_dim_qk,
-      head_dim_v,
-      grad_out,
-      query,
-      key,
-      value,
-      out,
-      logsumexp,
-      attn_bias_opt,
-      is_causal,
-      scale.has_value() ? scale.value() : (1.0 / std::sqrt(query.size(3))),
-      grad_q,
-      grad_k,
-      grad_v);
-  return std::make_tuple(
-      std::move(grad_q),
-      std::move(grad_k),
-      std::move(grad_v),
-      std::move(grad_attn_bias));
+      debug_attn_mask);
 }

 REGISTER_XPU_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_xpu);
--- a/aten/src/ATen/native/mkldnn/xpu/detail/WoQMatmul.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/WoQMatmul.cpp
@ -41,7 +41,7 @@ void woq_matmul_int4_impl(
      dst_usr_dims;
  dnnl::memory::dims m1_usr_strides, m2_usr_strides, scale_usr_strides,
      zp_usr_strides, dst_usr_strides;
-  int compressed_k = (int)(k / 8);
+  int compressed_k = k / 8;
  int num_groups = (int)(k / group_size);
  m1_usr_dims = {m, k};
  m1_usr_strides = {m1.stride(0), m1.stride(1)};
--- a/aten/src/ATen/native/mps/operations/Normalization.mm
+++ b/aten/src/ATen/native/mps/operations/Normalization.mm
@ -84,6 +84,9 @@ std::tuple<Tensor&, Tensor&, Tensor&> batch_norm_mps_out(const Tensor& self,
                                                         Tensor& output,
                                                         Tensor& save_mean,
                                                         Tensor& save_var) {
+  TORCH_CHECK_NOT_IMPLEMENTED(self.scalar_type() != kLong, "Long batch norm is not supported with MPS");
+  TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()),
+                              "Batch norm for complex is not supported for MPS");
  using namespace at::native::mps;
  struct CachedGraph : public MPSCachedGraph {
    CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
@ -918,6 +921,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_mps(const Tensor& input,
  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  const int axis = input_ndim - normalized_ndim;
  MPSStream* stream = getCurrentMPSStream();
+  TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "Not implemented for long on MPS");
  @autoreleasepool {
    mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
      // which kernel variant to use based on the normalized axis N size
--- a/aten/src/ATen/native/mps/operations/Pooling.mm
+++ b/aten/src/ATen/native/mps/operations/Pooling.mm
@ -10,6 +10,7 @@
 #include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
 #else
+#include <ATen/ops/aminmax.h>
 #include <ATen/ops/avg_pool2d.h>
 #include <ATen/ops/avg_pool2d_backward.h>
 #include <ATen/ops/avg_pool2d_backward_native.h>
@ -544,8 +545,9 @@ static void max_unpool_out_mps_template(const Tensor& input,
  if (indices.defined() && indices.numel() > 0) {
    auto output_image_size = c10::multiply_integers(output_size_);

-    int64_t min_idx = indices.min().item<int64_t>();
-    int64_t max_idx = indices.max().item<int64_t>();
+    auto [min_idx_tensor, max_idx_tensor] = indices.aminmax();
+    int64_t min_idx = min_idx_tensor.item<int64_t>();
+    int64_t max_idx = max_idx_tensor.item<int64_t>();

    if (min_idx < 0 || max_idx >= output_image_size) {
      int64_t error_idx = (min_idx < 0) ? min_idx : max_idx;
--- a/aten/src/ATen/native/mps/operations/Shape.mm
+++ b/aten/src/ATen/native/mps/operations/Shape.mm
@ -83,6 +83,31 @@ std::string get_type_str<int32_t>() {
  return "int32_t";
 }

+// If all tensors are contiguous with the same dtype and the cat dimension is 0,
+// then we can simply copy each tensor's underlying buffer contiguously into the
+// output.
+static void cat_out_mps_contiguous_impl(const ITensorListRef& inputs, const Tensor& output) {
+  MPSStream* stream = getCurrentMPSStream();
+  id<MTLBuffer> output_buffer = getMTLBufferStorage(output);
+  size_t output_offset = output.storage_offset() * output.itemsize();
+
+  for (const Tensor& input : inputs) {
+    if (cat_should_skip_tensor(input)) {
+      continue;
+    }
+
+    id<MTLBuffer> input_buffer = getMTLBufferStorage(input);
+    size_t input_offset = input.storage_offset() * input.itemsize();
+    auto nbytes = input.nbytes();
+    auto profile_id =
+        getMPSProfiler().beginProfileCopy(input_buffer, output_buffer, input, output, nbytes, /*non_blocking=*/true);
+
+    stream->copy(input_buffer, output_buffer, nbytes, input_offset, output_offset, profile_id, SyncType::NONE);
+
+    output_offset += nbytes;
+  }
+}
+
 // NOTE: `output` is expected to already have the correct size.
 template <typename idx_type_t>
 static void cat_out_mps_impl(const ITensorListRef& inputs, int64_t dimension, const Tensor& output) {
@ -105,7 +130,7 @@ static void cat_out_mps_impl(const ITensorListRef& inputs, int64_t dimension, co
  // copy all the input tensor data into a packed buffer, which would not be
  // ideal.
  for (const Tensor& input : inputs) {
-    if (input.numel() == 0) {
+    if (cat_should_skip_tensor(input)) {
      continue;
    }

@ -243,101 +268,16 @@ TORCH_IMPL_FUNC(cat_out_mps)
  if (out.numel() == 0) {
    return;
  }
+
  auto materialized_inputs = inputs.materialize();
-  auto out_dtype = at::native::result_type(inputs);
+  bool has_large_tensor =
+      isTooLargeForMPSGraph(out) || std::any_of(materialized_inputs.begin(), materialized_inputs.end(), [](auto& t) {
+        return !cat_should_skip_tensor(t) && isTooLargeForMPSGraph(t);
+      });

-  int idx = 0;
-  for (const Tensor& t : materialized_inputs) {
-    TORCH_CHECK(t.dim() > 0, "zero-dimensional tensor (at position ", idx, ") cannot be concatenated");
-    auto lap = at::get_overlap_status(out, t);
-    TORCH_CHECK(lap != at::MemOverlapStatus::Partial && lap != at::MemOverlapStatus::Full,
-                "torch.cat(): unsupported operation: the input tensors cannot refer to any "
-                "of the output memory locations. Found overlap in input tensor ",
-                idx);
-    idx++;
-  }
-  // Check for type promotion
-  TORCH_CHECK(canCast(out_dtype, out.scalar_type()),
-              "torch.cat(): input types can't be cast to the desired output type ",
-              out.scalar_type());
-  TORCH_CHECK(!inputs.empty(), "torch.cat(): invalid number of inputs ", inputs.size());
-
-  dimension = legacy_cat_wrap_dim(dimension, materialized_inputs);
-  TORCH_CHECK(dimension >= 0, "torch.cat(): invalid dimension ", dimension);
-
-  // previously, size [0] tensors were the only possible empty tensors; thus, it
-  // wasn't possible to cat empty tensors unless all the other tensors were
-  // 1-dimensional, so we allowed these tensors to be "skipped".  We maintain
-  // this behavior for backwards compatibility, but only for this specific size
-  // (i.e. other empty sizes are not skipped).
-  // FIXME: warn if this is the case
-  auto should_skip = [](const Tensor& t) { return t.dim() == 1 && t.size(0) == 0; };
-  at::assert_no_internal_overlap(out);
-
-  Tensor notSkippedTensor;
-  // Indices of tensors to be skipped because they're empty
-  std::vector<int64_t> skipped_tensor_indices;
-  // Tensors to be read
-  std::vector<Tensor> input_tensors;
-  int tensor_idx = 0;
-  for (const Tensor& t : materialized_inputs) {
-    if (t.numel() == 0 || should_skip(t)) {
-      skipped_tensor_indices.push_back(tensor_idx);
-      tensor_idx++;
-      continue;
-    }
-    input_tensors.push_back(t);
-    // TODO: Is this OK?
-    notSkippedTensor = t;
-    tensor_idx++;
-  }
-  // If all inputs are empty tensors, return an empty tensor
-  if (!notSkippedTensor.defined()) {
-    return;
-  }
-  for (const Tensor& t : inputs) {
-    TORCH_CHECK(t.device() == notSkippedTensor.device(),
-                "torch.cat(): all input tensors must be on the same device. Received ",
-                t.device(),
-                " and ",
-                notSkippedTensor.device());
-  }
-  TORCH_CHECK(out.device() == notSkippedTensor.device(),
-              "torch.cat(): all input tensors and out must be on the same device, but inputs are on ",
-              notSkippedTensor.device(),
-              " and out is on ",
-              out.device());
-
-  std::vector<int64_t> size(notSkippedTensor.sizes().vec());
-
-  // Compute size of the result in the cat dimension
-  int64_t cat_dim_size = 0;
-  idx = 0;
-  bool has_large_tensor = false;
-  for (const Tensor& tensor : materialized_inputs) {
-    if (isTooLargeForMPSGraph(tensor)) {
-      has_large_tensor |= true;
-    }
-    if (!should_skip(tensor)) {
-      // TODO: Factor out `check_shape_except_dim`
-      check_shape_except_dim(notSkippedTensor, tensor, dimension, idx);
-      cat_dim_size += tensor.size(dimension);
-      idx++;
-    }
-  }
-  // Compute the size of the result
-  size[dimension] = cat_dim_size;
-  // skip resizing if size of result is same as expected
-  if (out.sizes() != size) {
-    out.resize_(size, MemoryFormat::Contiguous);
-  }
-  if (out.numel() == 0) {
-    return;
-  }
-
-  has_large_tensor |= isTooLargeForMPSGraph(out);
-
-  if (has_large_tensor) {
+  if (all_contiguous && all_same_dtype && (memory_format == MemoryFormat::Contiguous) && (dimension == 0)) {
+    return mps::cat_out_mps_contiguous_impl(materialized_inputs, out);
+  } else if (has_large_tensor) {
    return mps::cat_out_mps_impl<int64_t>(materialized_inputs, dimension, out);
  } else {
    return mps::cat_out_mps_impl<int32_t>(materialized_inputs, dimension, out);
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -2602,12 +2602,16 @@
  device_check: NoCheck   # TensorIterator
  structured_delegate: exp.out
  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA, SparseMPS: exp_sparse
  tags: [core, pointwise]

 - func: exp_(Tensor(a!) self) -> Tensor(a!)
  device_check: NoCheck   # TensorIterator
  structured_delegate: exp.out
  variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA, SparseMPS: exp_sparse_
  tags: pointwise

 - func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -2616,6 +2620,7 @@
  structured_inherits: TensorIteratorBase
  dispatch:
    CPU, CUDA, MPS, MTIA: exp_out
+    SparseCPU, SparseCUDA, SparseMPS: exp_sparse_out
  tags: pointwise

 - func: exp2(Tensor self) -> Tensor
@ -8865,11 +8870,11 @@
  autogen: bitwise_right_shift.Scalar_Tensor_out
  tags: pointwise

- func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
+- func: tril_(Tensor(a!) self, SymInt diagonal=0) -> Tensor(a!)
  structured_delegate: tril.out
  variants: method

- func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
+- func: triu_(Tensor(a!) self, SymInt diagonal=0) -> Tensor(a!)
  structured_delegate: triu.out
  variants: method

@ -8993,25 +8998,25 @@
 - func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor
  variants: method, function

- func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+- func: triu.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
  structured: True
  dispatch:
    CPU: triu_cpu
    CUDA: triu_cuda
    MPS: triu_mps_out

- func: triu(Tensor self, int diagonal=0) -> Tensor
+- func: triu(Tensor self, SymInt diagonal=0) -> Tensor
  structured_delegate: triu.out
  variants: method, function

- func: tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+- func: tril.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
  structured: True
  dispatch:
    CPU: tril_cpu
    CUDA: tril_cuda
    MPS: tril_mps_out

- func: tril(Tensor self, int diagonal=0) -> Tensor
+- func: tril(Tensor self, SymInt diagonal=0) -> Tensor
  structured_delegate: tril.out
  variants: method, function

@ -15095,7 +15100,7 @@
    CPU: _scaled_dot_product_flash_attention_cpu
  tags: nondeterministic_seeded

- func: _scaled_dot_product_fused_attention_overrideable(Tensor query, Tensor key, Tensor value, Tensor? attn_bias=None, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None, bool compute_log_sumexp=True) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
+- func: _scaled_dot_product_fused_attention_overrideable(Tensor query, Tensor key, Tensor value, Tensor? attn_bias=None, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
  dispatch:
    CompositeExplicitAutograd: _scaled_dot_product_fused_attention_overrideable
    XPU: _scaled_dot_product_fused_attention_overrideable_xpu
@ -15119,7 +15124,6 @@
  variants: function
  dispatch:
    CompositeExplicitAutograd: _scaled_dot_product_fused_attention_overrideable_backward
-    XPU: _scaled_dot_product_fused_attention_overrideable_backward_xpu

 - func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
  dispatch:
--- a/aten/src/ATen/native/quantized/AffineQuantizerBase.cpp
+++ b/aten/src/ATen/native/quantized/AffineQuantizerBase.cpp
@ -65,7 +65,7 @@ void quantize_vec(
      (typename T::underlying*)dst,
      count,
      fbgemm::TensorQuantizationParams{
-          (float)scale, (int32_t)zero_point, precision});
+          static_cast<float>(scale), static_cast<int32_t>(zero_point), precision});
 }

 #if defined(__ARM_NEON__) || defined(__aarch64__)
--- a/aten/src/ATen/native/quantized/cpu/AdaptiveAveragePooling.cpp
+++ b/aten/src/ATen/native/quantized/cpu/AdaptiveAveragePooling.cpp
@ -40,7 +40,7 @@ inline int start_index(int out_idx, int out_len, int in_len) {
   * This function computes the start index on input matrix.
   */
  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
-  return (int)std::floor((float)(out_idx * in_len) / out_len);
+  return static_cast<int>(std::floor(static_cast<float>(out_idx * in_len) / out_len));
 }

 inline int end_index(int out_idx, int out_len, int in_len) {
@ -49,7 +49,7 @@ inline int end_index(int out_idx, int out_len, int in_len) {
   * This function computes the end index on input matrix.
   */
  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
-  return (int)std::ceil((float)((out_idx + 1) * in_len) / out_len);
+  return static_cast<int>(std::ceil(static_cast<float>((out_idx + 1) * in_len) / out_len));
 }

 // adaptive avg pool for 2D and 3D inputs
--- a/Show More
+++ b/Show More