Compare commits

..

1 Commit

Author SHA1 Message Date
98826fd37b [annotate] add annotate_fn function decorator 2025-10-17 09:23:55 -07:00
247 changed files with 1413 additions and 4924 deletions

View File

@ -113,7 +113,6 @@ case "$tag" in
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INSTALL_MINGW=yes
;;
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
CUDA_VERSION=13.0.0
@ -362,7 +361,6 @@ docker build \
--build-arg "OPENBLAS=${OPENBLAS:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
--build-arg "INSTALL_MINGW=${INSTALL_MINGW:-}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \

View File

@ -1,10 +0,0 @@
#!/bin/bash
set -ex
# Install MinGW-w64 for Windows cross-compilation
apt-get update
apt-get install -y g++-mingw-w64-x86-64-posix
echo "MinGW-w64 installed successfully"
x86_64-w64-mingw32-g++ --version

View File

@ -20,7 +20,7 @@ pip_install \
pip_install coloredlogs packaging
pip_install onnxruntime==1.23.0
pip_install onnxscript==0.5.4
pip_install onnxscript==0.5.3
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
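The comment above explains why the image pre-downloads a transformers model: later ONNX tests read it from the local Hugging Face cache instead of the network. A minimal sketch of that warm-up step, assuming the standard transformers API and using an illustrative model name rather than the one this script actually pins:

from transformers import AutoModel, AutoTokenizer

# Hypothetical model name for illustration only; the CI script pins its own model.
model_name = "bert-base-uncased"
AutoTokenizer.from_pretrained(model_name)  # fetches tokenizer files into ~/.cache/huggingface/hub/
AutoModel.from_pretrained(model_name)      # fetches and caches the weights alongside them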

View File

@ -103,11 +103,6 @@ COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
ARG INSTALL_MINGW
COPY ./common/install_mingw.sh install_mingw.sh
RUN if [ -n "${INSTALL_MINGW}" ]; then bash ./install_mingw.sh; fi
RUN rm install_mingw.sh
ARG TRITON
ARG TRITON_CPU

View File

@ -485,22 +485,6 @@ test_inductor_aoti() {
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
}
test_inductor_aoti_cross_compile_for_windows() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
# Set WINDOWS_CUDA_HOME environment variable
WINDOWS_CUDA_HOME="$(pwd)/win-torch-wheel-extracted"
export WINDOWS_CUDA_HOME
echo "WINDOWS_CUDA_HOME is set to: $WINDOWS_CUDA_HOME"
echo "Contents:"
ls -lah "$(pwd)/win-torch-wheel-extracted/lib/x64/" || true
python test/inductor/test_aoti_cross_compile_windows.py -k compile --package-dir "$TEST_REPORTS_DIR" --win-torch-lib-dir "$(pwd)/win-torch-wheel-extracted/torch/lib"
}
test_inductor_cpp_wrapper_shard() {
if [[ -z "$NUM_TEST_SHARDS" ]]; then
echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
@ -916,7 +900,7 @@ test_inductor_set_cpu_affinity(){
export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD"
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
if [[ "$(uname -m)" != "aarch64" ]]; then
if [[ "${TEST_CONFIG}" != *aarch64* ]]; then
# Use Intel OpenMP for x86
IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD"
@ -930,7 +914,7 @@ test_inductor_set_cpu_affinity(){
cores=$((cpus / thread_per_core))
# Set number of cores to 16 on aarch64 for performance runs
if [[ "$(uname -m)" == "aarch64" && $cores -gt 16 ]]; then
if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then
cores=16
fi
export OMP_NUM_THREADS=$cores
@ -1683,7 +1667,7 @@ if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then
python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0
fi
python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py
elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]]; then
elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then
test_linux_aarch64
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
test_forward_backward_compatibility
@ -1734,8 +1718,6 @@ elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
test_inductor_triton_cpu
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *aoti_cross_compile_for_windows* ]]; then
test_inductor_aoti_cross_compile_for_windows
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
install_torchvision
id=$((SHARD_NUMBER-1))

View File

@ -3,7 +3,6 @@ ciflow_tracking_issue: 64124
ciflow_push_tags:
- ciflow/b200
- ciflow/b200-symm-mem
- ciflow/b200-distributed
- ciflow/binaries
- ciflow/binaries_libtorch
- ciflow/binaries_wheel

View File

@ -1092,7 +1092,7 @@ class GitHubPR:
editor = node["editor"]
return GitHubComment(
body_text=node["bodyText"],
created_at=node.get("createdAt", ""),
created_at=node["createdAt"] if "createdAt" in node else "",
author_login=node["author"]["login"],
author_url=node["author"].get("url", None),
author_association=node["authorAssociation"],
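For a plain dict the two spellings in this hunk are interchangeable: dict.get("createdAt", "") returns the empty-string default when the key is absent, which is exactly what the explicit membership check does by hand. A small self-contained check (the node contents are illustrative):

node = {"bodyText": "LGTM", "authorAssociation": "MEMBER"}
assert node.get("createdAt", "") == (node["createdAt"] if "createdAt" in node else "")
node["createdAt"] = "2025-10-17T16:23:55Z"
assert node.get("createdAt", "") == node["createdAt"]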

View File

@ -224,46 +224,6 @@ jobs:
continue-on-error: true
uses: ./.github/actions/download-td-artifacts
- name: Download Windows torch wheel for cross-compilation
if: matrix.win_torch_wheel_artifact != ''
uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # v4.2.0
with:
name: ${{ matrix.win_torch_wheel_artifact }}
path: win-torch-wheel
- name: Extract Windows wheel and setup CUDA libraries
if: matrix.win_torch_wheel_artifact != ''
shell: bash
run: |
set -x
# Find the wheel file
WHEEL_FILE=$(find win-torch-wheel -name "*.whl" -type f | head -n 1)
if [ -z "$WHEEL_FILE" ]; then
echo "Error: No wheel file found in win-torch-wheel directory"
exit 1
fi
echo "Found wheel file: $WHEEL_FILE"
# Unzip the wheel file
unzip -q "$WHEEL_FILE" -d win-torch-wheel-extracted
echo "Extracted wheel contents"
# Setup CUDA libraries (cuda.lib and cudart.lib) directory
mkdir -p win-torch-wheel-extracted/lib/x64
if [ -f "win-torch-wheel/cuda.lib" ]; then
mv win-torch-wheel/cuda.lib win-torch-wheel-extracted/lib/x64/
echo "Moved cuda.lib to win-torch-wheel-extracted/lib/x64/"
fi
if [ -f "win-torch-wheel/cudart.lib" ]; then
mv win-torch-wheel/cudart.lib win-torch-wheel-extracted/lib/x64/
echo "Moved cudart.lib to win-torch-wheel-extracted/lib/x64/"
fi
# Verify CUDA libraries are present
echo "CUDA libraries:"
ls -la win-torch-wheel-extracted/lib/x64/ || echo "No CUDA libraries found"
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
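The removed "Extract Windows wheel" step earlier in this hunk treats the Windows torch wheel as what it is, a zip archive: it extracts the wheel and stages cuda.lib and cudart.lib under lib/x64 for the cross-compile test. A hedged Python sketch of the same extraction logic, reusing the directory names from that step:

import glob
import os
import shutil
import zipfile

wheels = glob.glob("win-torch-wheel/*.whl")
if not wheels:
    raise FileNotFoundError("No wheel file found in win-torch-wheel directory")
with zipfile.ZipFile(wheels[0]) as zf:
    zf.extractall("win-torch-wheel-extracted")        # a .whl is a zip archive

os.makedirs("win-torch-wheel-extracted/lib/x64", exist_ok=True)
for lib in ("cuda.lib", "cudart.lib"):
    src = os.path.join("win-torch-wheel", lib)
    if os.path.isfile(src):
        shutil.move(src, "win-torch-wheel-extracted/lib/x64/")  # stage CUDA import libs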

View File

@ -168,31 +168,6 @@ jobs:
run: |
.ci/pytorch/win-build.sh
# Collect Windows torch libs and CUDA libs for cross-compilation
- name: Collect Windows CUDA libs for cross-compilation
if: steps.build.outcome != 'skipped' && inputs.cuda-version != 'cpu'
shell: bash
run: |
set -ex
# Create directory structure if does not exist
mkdir -p /c/${{ github.run_id }}/build-results
# Copy CUDA libs
CUDA_PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${{ inputs.cuda-version }}"
if [ -f "${CUDA_PATH}/lib/x64/cuda.lib" ]; then
cp "${CUDA_PATH}/lib/x64/cuda.lib" /c/${{ github.run_id }}/build-results/
fi
if [ -f "${CUDA_PATH}/lib/x64/cudart.lib" ]; then
cp "${CUDA_PATH}/lib/x64/cudart.lib" /c/${{ github.run_id }}/build-results/
fi
# List collected files
echo "Collected CUDA libs:"
ls -lah /c/${{ github.run_id }}/build-results/*.lib
# Upload to github so that people can click and download artifacts
- name: Upload artifacts to s3
if: steps.build.outcome != 'skipped'

View File

@ -1,62 +0,0 @@
name: CI for distributed tests on B200
on:
pull_request:
paths:
- .github/workflows/b200-distributed.yml
workflow_dispatch:
push:
tags:
- ciflow/b200-distributed/*
schedule:
- cron: 46 8 * * * # about 1:46am PDT
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200:
name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed-b200
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 2, runner: "linux.dgx.b200.8" },
{ config: "distributed", shard: 2, num_shards: 2, runner: "linux.dgx.b200.8" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-test-distributed-b200:
name: linux-jammy-cuda12.8-py3.10-gcc11-test-b200
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200
with:
timeout-minutes: 1200
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit

View File

@ -88,27 +88,27 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 1, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 2, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 3, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 4, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 5, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 6, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 7, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 2, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 1, num_shards: 7, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 2, num_shards: 7, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 3, num_shards: 7, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 4, num_shards: 7, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 5, num_shards: 7, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 6, num_shards: 7, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 7, num_shards: 7, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 2, num_shards: 9, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.mi355.2" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.mi355.2" },
]}
secrets: inherit

View File

@ -52,27 +52,3 @@ jobs:
docker-image: ${{ needs.x86-opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.x86-opbenchmark-build.outputs.test-matrix }}
secrets: inherit
aarch64-opbenchmark-build:
if: github.repository_owner == 'pytorch'
name: aarch64-opbenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-aarch64-py3.10
runner: linux.arm64.m7g.4xlarge
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" },
]}
secrets: inherit
aarch64-opbenchmark-test:
name: aarch64-opbenchmark-test
uses: ./.github/workflows/_linux-test.yml
needs: aarch64-opbenchmark-build
with:
build-environment: linux-jammy-aarch64-py3.10
docker-image: ${{ needs.aarch64-opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.aarch64-opbenchmark-build.outputs.test-matrix }}
secrets: inherit

View File

@ -45,12 +45,12 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
]}
secrets: inherit

View File

@ -200,23 +200,6 @@ jobs:
cuda-arch-list: '8.0'
secrets: inherit
# Test cross-compiled models with Windows libs extracted from wheel
cross-compile-linux-test:
name: cross-compile-linux-test
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-build
- get-label-type
- win-vs2022-cuda12_8-py3-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: |
{ include: [
{ config: "aoti_cross_compile_for_windows", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", win_torch_wheel_artifact: "win-vs2022-cuda12.8-py3" },
]}
secrets: inherit
verify-cachebench-cpu-build:
name: verify-cachebench-cpu-build
uses: ./.github/workflows/_linux-build.yml

View File

@ -2,6 +2,7 @@
#include <mutex>
#include <ATen/CachedTensorUtils.h>
#include <c10/core/GradMode.h>
#include <c10/util/flat_hash_map.h>
namespace at::autocast {
@ -36,10 +37,29 @@ namespace {
using weakref_type = c10::weak_intrusive_ptr<TensorImpl, UndefinedTensorImpl>;
using val_type = std::tuple<weakref_type, Tensor>;
ska::flat_hash_map<TensorImpl*, val_type>& get_cached_casts() {
static ska::flat_hash_map<TensorImpl*, val_type> cached_casts;
return cached_casts;
// We maintain separate caches for gradient-enabled and gradient-disabled modes.
// This ensures that tensors cached in torch.no_grad() (with requires_grad=False)
// are not incorrectly reused in gradient-enabled contexts.
// This fixes issue #158232 while maintaining optimal performance for both modes.
static ska::flat_hash_map<TensorImpl*, val_type>& get_cached_casts_grad_enabled() {
static ska::flat_hash_map<TensorImpl*, val_type> cached_casts_grad_enabled;
return cached_casts_grad_enabled;
}
static ska::flat_hash_map<TensorImpl*, val_type>& get_cached_casts_grad_disabled() {
static ska::flat_hash_map<TensorImpl*, val_type> cached_casts_grad_disabled;
return cached_casts_grad_disabled;
}
// Helper function to get the appropriate cache based on current gradient mode.
// This allows us to cache tensors separately for grad-enabled and grad-disabled contexts,
// preventing incorrect cache hits when gradient mode changes.
static ska::flat_hash_map<TensorImpl*, val_type>& get_cached_casts() {
return at::GradMode::is_enabled() ?
get_cached_casts_grad_enabled() :
get_cached_casts_grad_disabled();
}
std::mutex cached_casts_mutex;
@ -86,7 +106,9 @@ thread_local bool cache_enabled = true;
void clear_cache() {
const std::lock_guard<std::mutex> lock(cached_casts_mutex);
get_cached_casts().clear();
// Clear both caches to ensure consistent behavior regardless of current gradient mode
get_cached_casts_grad_enabled().clear();
get_cached_casts_grad_disabled().clear();
}
int increment_nesting() {
@ -121,6 +143,11 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_
if (is_eligible(arg, device_type) && (arg.scalar_type() != to_type)) {
// Heuristic: Do what Apex does, and cache lower_precision_fp casts of fp32 model weights (leaves).
// See cached_casts declaration above for detailed strategy.
//
// We maintain separate caches for gradient-enabled and gradient-disabled modes
// (see get_cached_casts() above). This ensures correctness when mixing torch.no_grad()
// with torch.autocast(), while maintaining optimal performance for both training and inference.
// This fixes issue #158232 without any performance regression.
bool can_try_cache = (to_type == get_lower_precision_fp_from_device_type(device_type) &&
arg.scalar_type() == at::kFloat && arg.requires_grad() &&
arg.is_leaf() && !arg.is_view() && cache_enabled &&
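The comments added in this hunk describe the failure mode behind the split cache: a weight cast under torch.no_grad() is cached with requires_grad=False and must not be served to a later grad-enabled cast. A hedged sketch of that usage pattern against the public autocast API (device, dtype, and shapes are illustrative):

import torch

w = torch.randn(4, 4, requires_grad=True)   # fp32 leaf weight, eligible for cast caching
x = torch.randn(4, 4)

with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    with torch.no_grad():
        y_eval = x @ w    # the cast of w lands in the grad-disabled cache
    y_train = x @ w       # grad-enabled matmul should not reuse that entry

y_train.sum().backward()  # with per-mode caches, gradients flow back to w
assert w.grad is not None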

View File

@ -229,10 +229,10 @@ private:
}
static constexpr uint32_t kPhilox10A = 0x9E3779B9;
static constexpr uint32_t kPhilox10B = 0xBB67AE85;
static constexpr uint32_t kPhiloxSA = 0xD2511F53;
static constexpr uint32_t kPhiloxSB = 0xCD9E8D57;
static const uint32_t kPhilox10A = 0x9E3779B9;
static const uint32_t kPhilox10B = 0xBB67AE85;
static const uint32_t kPhiloxSA = 0xD2511F53;
static const uint32_t kPhiloxSB = 0xCD9E8D57;
};
typedef philox_engine Philox4_32;

View File

@ -8,7 +8,6 @@
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
#include <ATen/cpu/vec/vec128/vec128_int_aarch64.h>
#endif
#include <ATen/cpu/vec/vec128/vec128_convert.h>

View File

@ -1,794 +0,0 @@
#pragma once
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/macros/Macros.h>
#include <c10/util/irange.h>
namespace at::vec {
// Note [CPU_CAPABILITY namespace]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// This header, and all of its subheaders, will be compiled with
// different architecture flags for each supported set of vector
// intrinsics. So we need to make sure they aren't inadvertently
// linked together. We do this by declaring objects in an `inline
// namespace` which changes the name mangling, but can still be
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {
#define VEC_INT_NEON_TEMPLATE(vl, bit) \
template <> \
struct is_vec_specialized_for<int##bit##_t> : std::bool_constant<true> {}; \
\
template <> \
class Vectorized<int##bit##_t> { \
using neon_type = int##bit##x##vl##_t; \
\
private: \
neon_type values; \
\
public: \
using value_type = int##bit##_t; \
using size_type = int; \
static constexpr size_type size() { \
return vl; \
} \
Vectorized() { \
values = vdupq_n_s##bit(0); \
} \
Vectorized(neon_type v) : values(v) {} \
Vectorized(int##bit##_t val); \
template < \
typename... Args, \
typename = std::enable_if_t<(sizeof...(Args) == size())>> \
Vectorized(Args... vals) { \
__at_align__ int##bit##_t buffer[size()] = {vals...}; \
values = vld1q_s##bit(buffer); \
} \
operator neon_type() const { \
return values; \
} \
static Vectorized<int##bit##_t> loadu( \
const void* ptr, \
int64_t count = size()); \
void store(void* ptr, int64_t count = size()) const; \
template <int64_t mask> \
static Vectorized<int##bit##_t> blend( \
const Vectorized<int##bit##_t>& a, \
const Vectorized<int##bit##_t>& b); \
static Vectorized<int##bit##_t> blendv( \
const Vectorized<int##bit##_t>& a, \
const Vectorized<int##bit##_t>& b, \
const Vectorized<int##bit##_t>& mask_) { \
return vbslq_s##bit(vreinterpretq_u##bit##_s##bit(mask_.values), b, a); \
} \
template <typename step_t> \
static Vectorized<int##bit##_t> arange( \
value_type base = 0, \
step_t step = static_cast<step_t>(1)); \
static Vectorized<int##bit##_t> set( \
const Vectorized<int##bit##_t>& a, \
const Vectorized<int##bit##_t>& b, \
int64_t count = size()); \
const int##bit##_t& operator[](int idx) const = delete; \
int##bit##_t& operator[](int idx) = delete; \
Vectorized<int##bit##_t> abs() const { \
return vabsq_s##bit(values); \
} \
Vectorized<int##bit##_t> real() const { \
return values; \
} \
Vectorized<int##bit##_t> imag() const { \
return vdupq_n_s##bit(0); \
} \
Vectorized<int##bit##_t> conj() const { \
return values; \
} \
Vectorized<int##bit##_t> neg() const { \
return vnegq_s##bit(values); \
} \
int##bit##_t reduce_add() const { \
return vaddvq_s##bit(values); \
} \
int##bit##_t reduce_max() const; \
Vectorized<int##bit##_t> operator==( \
const Vectorized<int##bit##_t>& other) const { \
return Vectorized<value_type>( \
vreinterpretq_s##bit##_u##bit(vceqq_s##bit(values, other.values))); \
} \
Vectorized<int##bit##_t> operator!=( \
const Vectorized<int##bit##_t>& other) const; \
Vectorized<int##bit##_t> operator<( \
const Vectorized<int##bit##_t>& other) const { \
return Vectorized<value_type>( \
vreinterpretq_s##bit##_u##bit(vcltq_s##bit(values, other.values))); \
} \
Vectorized<int##bit##_t> operator<=( \
const Vectorized<int##bit##_t>& other) const { \
return Vectorized<value_type>( \
vreinterpretq_s##bit##_u##bit(vcleq_s##bit(values, other.values))); \
} \
Vectorized<int##bit##_t> operator>( \
const Vectorized<int##bit##_t>& other) const { \
return Vectorized<value_type>( \
vreinterpretq_s##bit##_u##bit(vcgtq_s##bit(values, other.values))); \
} \
Vectorized<int##bit##_t> operator>=( \
const Vectorized<int##bit##_t>& other) const { \
return Vectorized<value_type>( \
vreinterpretq_s##bit##_u##bit(vcgeq_s##bit(values, other.values))); \
} \
Vectorized<int##bit##_t> eq(const Vectorized<int##bit##_t>& other) const; \
Vectorized<int##bit##_t> ne(const Vectorized<int##bit##_t>& other) const; \
Vectorized<int##bit##_t> gt(const Vectorized<int##bit##_t>& other) const; \
Vectorized<int##bit##_t> ge(const Vectorized<int##bit##_t>& other) const; \
Vectorized<int##bit##_t> lt(const Vectorized<int##bit##_t>& other) const; \
Vectorized<int##bit##_t> le(const Vectorized<int##bit##_t>& other) const; \
}; \
template <> \
Vectorized<int##bit##_t> inline operator+( \
const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
return vaddq_s##bit(a, b); \
} \
template <> \
Vectorized<int##bit##_t> inline operator-( \
const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
return vsubq_s##bit(a, b); \
} \
template <> \
Vectorized<int##bit##_t> inline operator&( \
const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
return vandq_s##bit(a, b); \
} \
template <> \
Vectorized<int##bit##_t> inline operator|( \
const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
return vorrq_s##bit(a, b); \
} \
template <> \
Vectorized<int##bit##_t> inline operator^( \
const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
return veorq_s##bit(a, b); \
} \
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::eq( \
const Vectorized<int##bit##_t>& other) const { \
return (*this == other) & Vectorized<int##bit##_t>(1); \
} \
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::ne( \
const Vectorized<int##bit##_t>& other) const { \
return (*this != other) & Vectorized<int##bit##_t>(1); \
} \
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::gt( \
const Vectorized<int##bit##_t>& other) const { \
return (*this > other) & Vectorized<int##bit##_t>(1); \
} \
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::ge( \
const Vectorized<int##bit##_t>& other) const { \
return (*this >= other) & Vectorized<int##bit##_t>(1); \
} \
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::lt( \
const Vectorized<int##bit##_t>& other) const { \
return (*this < other) & Vectorized<int##bit##_t>(1); \
} \
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::le( \
const Vectorized<int##bit##_t>& other) const { \
return (*this <= other) & Vectorized<int##bit##_t>(1); \
}
VEC_INT_NEON_TEMPLATE(2, 64)
VEC_INT_NEON_TEMPLATE(4, 32)
VEC_INT_NEON_TEMPLATE(8, 16)
VEC_INT_NEON_TEMPLATE(16, 8)
inline int32_t Vectorized<int32_t>::reduce_max() const {
return vmaxvq_s32(values);
}
inline int16_t Vectorized<int16_t>::reduce_max() const {
return vmaxvq_s16(values);
}
inline int8_t Vectorized<int8_t>::reduce_max() const {
return vmaxvq_s8(values);
}
template <>
Vectorized<int32_t> inline operator*(
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& b) {
return vmulq_s32(a, b);
}
template <>
Vectorized<int16_t> inline operator*(
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& b) {
return vmulq_s16(a, b);
}
template <>
Vectorized<int8_t> inline operator*(
const Vectorized<int8_t>& a,
const Vectorized<int8_t>& b) {
return vmulq_s8(a, b);
}
template <>
inline Vectorized<int64_t> operator~(const Vectorized<int64_t>& a) {
int64x2_t val = a;
return ~val;
}
template <>
inline Vectorized<int32_t> operator~(const Vectorized<int32_t>& a) {
return vmvnq_s32(a);
}
template <>
inline Vectorized<int16_t> operator~(const Vectorized<int16_t>& a) {
return vmvnq_s16(a);
}
template <>
inline Vectorized<int8_t> operator~(const Vectorized<int8_t>& a) {
return vmvnq_s8(a);
}
inline Vectorized<int64_t> Vectorized<int64_t>::operator!=(
const Vectorized<int64_t>& other) const {
return ~(*this == other);
}
inline Vectorized<int32_t> Vectorized<int32_t>::operator!=(
const Vectorized<int32_t>& other) const {
return ~(*this == other);
}
inline Vectorized<int16_t> Vectorized<int16_t>::operator!=(
const Vectorized<int16_t>& other) const {
return ~(*this == other);
}
inline Vectorized<int8_t> Vectorized<int8_t>::operator!=(
const Vectorized<int8_t>& other) const {
return ~(*this == other);
}
template <>
Vectorized<int32_t> inline minimum(
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& b) {
return vminq_s32(a, b);
}
template <>
Vectorized<int16_t> inline minimum(
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& b) {
return vminq_s16(a, b);
}
template <>
Vectorized<int8_t> inline minimum(
const Vectorized<int8_t>& a,
const Vectorized<int8_t>& b) {
return vminq_s8(a, b);
}
template <>
Vectorized<int32_t> inline maximum(
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& b) {
return vmaxq_s32(a, b);
}
template <>
Vectorized<int16_t> inline maximum(
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& b) {
return vmaxq_s16(a, b);
}
template <>
Vectorized<int8_t> inline maximum(
const Vectorized<int8_t>& a,
const Vectorized<int8_t>& b) {
return vmaxq_s8(a, b);
}
template <int64_t mask>
Vectorized<int64_t> Vectorized<int64_t>::blend(
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b) {
// Build an array of flags: each bit of element is 1 if the corresponding bit
// in 'mask' is set, 0 otherwise.
uint64x2_t maskArray = {
(mask & 1LL) ? 0xFFFFFFFFFFFFFFFF : 0,
(mask & 2LL) ? 0xFFFFFFFFFFFFFFFF : 0};
// Use BSL to select elements from b where the mask is 1, else from a
return vbslq_s64(maskArray, b.values, a.values);
}
template <int64_t mask>
Vectorized<int32_t> Vectorized<int32_t>::blend(
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& b) {
// Build an array of flags: each bit of element is 1 if the corresponding bit
// in 'mask' is set, 0 otherwise.
uint32x4_t maskArray = {
(mask & 1LL) ? 0xFFFFFFFF : 0,
(mask & 2LL) ? 0xFFFFFFFF : 0,
(mask & 4LL) ? 0xFFFFFFFF : 0,
(mask & 8LL) ? 0xFFFFFFFF : 0};
// Use BSL to select elements from b where the mask is 1, else from a
return vbslq_s32(maskArray, b.values, a.values);
}
template <int64_t mask>
Vectorized<int16_t> Vectorized<int16_t>::blend(
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& b) {
// Build an array of flags: each bit of element is 1 if the corresponding bit
// in 'mask' is set, 0 otherwise.
uint16x8_t maskArray = {
(mask & 1LL) ? 0xFFFF : 0,
(mask & 2LL) ? 0xFFFF : 0,
(mask & 4LL) ? 0xFFFF : 0,
(mask & 8LL) ? 0xFFFF : 0,
(mask & 16LL) ? 0xFFFF : 0,
(mask & 32LL) ? 0xFFFF : 0,
(mask & 64LL) ? 0xFFFF : 0,
(mask & 128LL) ? 0xFFFF : 0};
// Use BSL to select elements from b where the mask is 1, else from a
return vbslq_s16(maskArray, b.values, a.values);
}
template <int64_t mask>
Vectorized<int8_t> Vectorized<int8_t>::blend(
const Vectorized<int8_t>& a,
const Vectorized<int8_t>& b) {
// Build an array of flags: each bit of element is 1 if the corresponding bit
// in 'mask' is set, 0 otherwise.
uint8x16_t maskArray = {
(mask & 1LL) ? 0xFF : 0,
(mask & 2LL) ? 0xFF : 0,
(mask & 4LL) ? 0xFF : 0,
(mask & 8LL) ? 0xFF : 0,
(mask & 16LL) ? 0xFF : 0,
(mask & 32LL) ? 0xFF : 0,
(mask & 64LL) ? 0xFF : 0,
(mask & 128LL) ? 0xFF : 0,
(mask & 256LL) ? 0xFF : 0,
(mask & 512LL) ? 0xFF : 0,
(mask & 1024LL) ? 0xFF : 0,
(mask & 2048LL) ? 0xFF : 0,
(mask & 4096LL) ? 0xFF : 0,
(mask & 8192LL) ? 0xFF : 0,
(mask & 16384LL) ? 0xFF : 0,
(mask & 32768LL) ? 0xFF : 0};
// Use BSL to select elements from b where the mask is 1, else from a
return vbslq_s8(maskArray, b.values, a.values);
}
#define VEC_INT_NEON_OPS(vl, bit) \
inline Vectorized<int##bit##_t>::Vectorized(int##bit##_t val) { \
values = vdupq_n_s##bit(val); \
} \
inline Vectorized<int##bit##_t> Vectorized<int##bit##_t>::loadu( \
const void* ptr, int64_t count) { \
if (count == size()) { \
return vld1q_s##bit(reinterpret_cast<const int##bit##_t*>(ptr)); \
} else { \
__at_align__ int##bit##_t tmp_values[size()]; \
for (const auto i : c10::irange(size())) { \
tmp_values[i] = 0; \
} \
std::memcpy( \
tmp_values, \
reinterpret_cast<const int##bit##_t*>(ptr), \
count * sizeof(int##bit##_t)); \
return vld1q_s##bit(reinterpret_cast<const int##bit##_t*>(tmp_values)); \
} \
} \
inline void Vectorized<int##bit##_t>::store(void* ptr, int64_t count) \
const { \
if (count == size()) { \
vst1q_s##bit(reinterpret_cast<int##bit##_t*>(ptr), values); \
} else { \
int##bit##_t tmp_values[size()]; \
vst1q_s##bit(reinterpret_cast<int##bit##_t*>(tmp_values), values); \
std::memcpy(ptr, tmp_values, count * sizeof(int##bit##_t)); \
} \
}
VEC_INT_NEON_OPS(2, 64)
VEC_INT_NEON_OPS(4, 32)
VEC_INT_NEON_OPS(8, 16)
VEC_INT_NEON_OPS(16, 8)
template <>
Vectorized<int64_t> inline operator*(
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b) {
int64x2_t x = a;
int64x2_t y = b;
return x * y;
}
template <>
Vectorized<int64_t> inline operator/(
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b) {
int64x2_t x = a;
int64x2_t y = b;
return x / y;
}
template <>
Vectorized<int32_t> inline operator/(
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& b) {
int32x4_t x = a;
int32x4_t y = b;
return x / y;
}
inline int64_t Vectorized<int64_t>::reduce_max() const {
return std::max(values[0], values[1]);
}
template <>
Vectorized<int64_t> inline minimum(
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b) {
int64x2_t x = a;
int64x2_t y = b;
return {std::min(x[0], y[0]), std::min(x[1], y[1])};
}
template <>
Vectorized<int64_t> inline maximum(
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b) {
int64x2_t x = a;
int64x2_t y = b;
return {std::max(x[0], y[0]), std::max(x[1], y[1])};
}
template <typename step_t>
inline Vectorized<int64_t> Vectorized<int64_t>::arange(
int64_t base,
step_t step) {
const Vectorized<int64_t> base_vec(base);
const Vectorized<int64_t> step_vec(step);
const int64x2_t step_sizes = {0, 1};
return base_vec.values + step_sizes * step_vec.values;
}
template <typename step_t>
inline Vectorized<int32_t> Vectorized<int32_t>::arange(
int32_t base,
step_t step) {
const Vectorized<int32_t> base_vec(base);
const Vectorized<int32_t> step_vec(step);
const int32x4_t step_sizes = {0, 1, 2, 3};
return vmlaq_s32(base_vec, step_sizes, step_vec);
}
template <typename step_t>
inline Vectorized<int16_t> Vectorized<int16_t>::arange(
int16_t base,
step_t step) {
const Vectorized<int16_t> base_vec(base);
const Vectorized<int16_t> step_vec(step);
const int16x8_t step_sizes = {0, 1, 2, 3, 4, 5, 6, 7};
return vmlaq_s16(base_vec, step_sizes, step_vec);
}
template <typename step_t>
inline Vectorized<int8_t> Vectorized<int8_t>::arange(int8_t base, step_t step) {
const Vectorized<int8_t> base_vec(base);
const Vectorized<int8_t> step_vec(step);
const int8x16_t step_sizes = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
return vmlaq_s8(base_vec, step_sizes, step_vec);
}
template <>
Vectorized<int64_t> inline operator>>(
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b) {
int64x2_t x = a;
int64x2_t y = b;
uint64x2_t u = vreinterpretq_u64_s64(y);
uint64x2_t z = {std::min(u[0], (uint64_t)63), std::min(u[1], (uint64_t)63)};
return x >> vreinterpretq_s64_u64(z);
}
template <>
Vectorized<int32_t> inline operator>>(
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& b) {
int32x4_t x = a;
int32x4_t y = b;
uint32x4_t bound = vdupq_n_u32(31);
uint32x4_t z = vminq_u32(vreinterpretq_u32_s32(y), bound);
return x >> vreinterpretq_s32_u32(z);
}
template <>
Vectorized<int16_t> inline operator>>(
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& b) {
int16x8_t x = a;
int16x8_t y = b;
uint16x8_t bound = vdupq_n_u16(15);
uint16x8_t z = vminq_u16(vreinterpretq_u16_s16(y), bound);
return x >> vreinterpretq_s16_u16(z);
}
template <>
Vectorized<int8_t> inline operator>>(
const Vectorized<int8_t>& a,
const Vectorized<int8_t>& b) {
int8x16_t x = a;
int8x16_t y = b;
uint8x16_t bound = vdupq_n_u8(7);
int8x16_t z = vreinterpretq_s8_u8(vminq_u8(vreinterpretq_u8_s8(y), bound));
return x >> z;
}
template <>
Vectorized<int64_t> inline operator<<(
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b) {
int64x2_t y = b;
uint64x2_t u = vreinterpretq_u64_s64(y);
uint64x2_t z = {std::min(u[0], (uint64_t)64), std::min(u[1], (uint64_t)64)};
return vshlq_s64(a, vreinterpretq_s64_u64(z));
}
template <>
Vectorized<int32_t> inline operator<<(
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& b) {
int32x4_t y = b;
uint32x4_t bound = vdupq_n_u32(32);
uint32x4_t z = vminq_u32(vreinterpretq_u32_s32(y), bound);
return vshlq_s32(a, vreinterpretq_s32_u32(z));
}
template <>
Vectorized<int16_t> inline operator<<(
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& b) {
int16x8_t y = b;
uint16x8_t bound = vdupq_n_u16(16);
uint16x8_t z = vminq_u16(vreinterpretq_u16_s16(y), bound);
return vshlq_s16(a, vreinterpretq_s16_u16(z));
}
template <>
Vectorized<int8_t> inline operator<<(
const Vectorized<int8_t>& a,
const Vectorized<int8_t>& b) {
int8x16_t y = b;
uint8x16_t bound = vdupq_n_u8(8);
int8x16_t z = vreinterpretq_s8_u8(vminq_u8(vreinterpretq_u8_s8(y), bound));
return vshlq_s8(a, z);
}
inline Vectorized<int64_t> Vectorized<int64_t>::set(
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b,
int64_t count) {
if (count == 0) {
return a;
} else if (count >= 2) {
return b;
} else {
int64x2_t c = {b.values[0], a.values[1]};
return c;
}
}
inline Vectorized<int32_t> Vectorized<int32_t>::set(
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& b,
int64_t count) {
if (count == 0) {
return a;
} else if (count >= 4) {
return b;
} else {
// Build an array of flags: each bit of element is 1 if the corresponding
// bit in 'mask' is set, 0 otherwise.
uint32x4_t maskArray = {
(count >= 1LL) ? 0xFFFFFFFF : 0,
(count >= 2LL) ? 0xFFFFFFFF : 0,
(count >= 3LL) ? 0xFFFFFFFF : 0,
0};
// Use BSL to select elements from b where the mask is 1, else from a
return vbslq_s32(maskArray, b.values, a.values);
}
}
inline Vectorized<int16_t> Vectorized<int16_t>::set(
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& b,
int64_t count) {
if (count == 0) {
return a;
} else if (count >= 8) {
return b;
} else {
// Build an array of flags: each bit of element is 1 if the corresponding
// bit in 'mask' is set, 0 otherwise.
uint16x8_t maskArray = {
static_cast<uint16_t>((count >= 1LL) ? 0xFFFF : 0),
static_cast<uint16_t>((count >= 2LL) ? 0xFFFF : 0),
static_cast<uint16_t>((count >= 3LL) ? 0xFFFF : 0),
static_cast<uint16_t>((count >= 4LL) ? 0xFFFF : 0),
static_cast<uint16_t>((count >= 5LL) ? 0xFFFF : 0),
static_cast<uint16_t>((count >= 6LL) ? 0xFFFF : 0),
static_cast<uint16_t>((count >= 7LL) ? 0xFFFF : 0),
0};
// Use BSL to select elements from b where the mask is 1, else from a
return vbslq_s16(maskArray, b.values, a.values);
}
}
inline Vectorized<int8_t> Vectorized<int8_t>::set(
const Vectorized<int8_t>& a,
const Vectorized<int8_t>& b,
int64_t count) {
if (count == 0) {
return a;
} else if (count >= 16) {
return b;
} else {
// Build an array of flags: each bit of element is 1 if the corresponding
// bit in 'mask' is set, 0 otherwise.
uint8x16_t maskArray = {
static_cast<uint8_t>((count >= 1LL) ? 0xFF : 0),
static_cast<uint8_t>((count >= 2LL) ? 0xFF : 0),
static_cast<uint8_t>((count >= 3LL) ? 0xFF : 0),
static_cast<uint8_t>((count >= 4LL) ? 0xFF : 0),
static_cast<uint8_t>((count >= 5LL) ? 0xFF : 0),
static_cast<uint8_t>((count >= 6LL) ? 0xFF : 0),
static_cast<uint8_t>((count >= 7LL) ? 0xFF : 0),
static_cast<uint8_t>((count >= 8LL) ? 0xFF : 0),
static_cast<uint8_t>((count >= 9LL) ? 0xFF : 0),
static_cast<uint8_t>((count >= 10LL) ? 0xFF : 0),
static_cast<uint8_t>((count >= 11LL) ? 0xFF : 0),
static_cast<uint8_t>((count >= 12LL) ? 0xFF : 0),
static_cast<uint8_t>((count >= 13LL) ? 0xFF : 0),
static_cast<uint8_t>((count >= 14LL) ? 0xFF : 0),
static_cast<uint8_t>((count >= 15LL) ? 0xFF : 0),
0};
// Use BSL to select elements from b where the mask is 1, else from a
return vbslq_s8(maskArray, b.values, a.values);
}
}
template <>
Vectorized<int16_t> inline operator/(
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& b) {
Vectorized<int32_t> highBitsA = vmovl_high_s16(a);
Vectorized<int32_t> highBitsB = vmovl_high_s16(b);
Vectorized<int32_t> lowBitsA = vmovl_s16(vget_low_s16(a));
Vectorized<int32_t> lowBitsB = vmovl_s16(vget_low_s16(b));
int32x4_t highBitsResult = highBitsA / highBitsB;
int32x4_t lowBitsResult = lowBitsA / lowBitsB;
return vuzp1q_s16(
vreinterpretq_s16_s32(lowBitsResult),
vreinterpretq_s16_s32(highBitsResult));
}
template <>
Vectorized<int8_t> inline operator/(
const Vectorized<int8_t>& a,
const Vectorized<int8_t>& b) {
Vectorized<int16_t> highBitsA = vmovl_high_s8(a);
Vectorized<int16_t> highBitsB = vmovl_high_s8(b);
Vectorized<int16_t> lowBitsA = vmovl_s8(vget_low_s8(a));
Vectorized<int16_t> lowBitsB = vmovl_s8(vget_low_s8(b));
int16x8_t highBitsResult = highBitsA / highBitsB;
int16x8_t lowBitsResult = lowBitsA / lowBitsB;
return vuzp1q_s8(
vreinterpretq_s8_s16(lowBitsResult),
vreinterpretq_s8_s16(highBitsResult));
}
template <>
Vectorized<int64_t> inline clamp(
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& min,
const Vectorized<int64_t>& max) {
return minimum(max, maximum(min, a));
}
template <>
Vectorized<int32_t> inline clamp(
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& min,
const Vectorized<int32_t>& max) {
return minimum(max, maximum(min, a));
}
template <>
Vectorized<int16_t> inline clamp(
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& min,
const Vectorized<int16_t>& max) {
return minimum(max, maximum(min, a));
}
template <>
Vectorized<int8_t> inline clamp(
const Vectorized<int8_t>& a,
const Vectorized<int8_t>& min,
const Vectorized<int8_t>& max) {
return minimum(max, maximum(min, a));
}
template <>
Vectorized<int64_t> inline clamp_max(
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& max) {
return minimum(max, a);
}
template <>
Vectorized<int32_t> inline clamp_max(
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& max) {
return minimum(max, a);
}
template <>
Vectorized<int16_t> inline clamp_max(
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& max) {
return minimum(max, a);
}
template <>
Vectorized<int8_t> inline clamp_max(
const Vectorized<int8_t>& a,
const Vectorized<int8_t>& max) {
return minimum(max, a);
}
template <>
Vectorized<int64_t> inline clamp_min(
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& min) {
return maximum(min, a);
}
template <>
Vectorized<int32_t> inline clamp_min(
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& min) {
return maximum(min, a);
}
template <>
Vectorized<int16_t> inline clamp_min(
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& min) {
return maximum(min, a);
}
template <>
Vectorized<int8_t> inline clamp_min(
const Vectorized<int8_t>& a,
const Vectorized<int8_t>& min) {
return maximum(min, a);
}
} // namespace CPU_CAPABILITY
} // namespace at::vec

View File

@ -1377,7 +1377,7 @@ Vectorized<c10::quint8> inline maximum(
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
at::vec::Vectorized<int8_t> src) {
auto s8x8 = vget_low_s8(src);
auto s8x8 = vld1_s8(src.operator const int8_t*());
auto s16x8 = vmovl_s8(s8x8);
auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8));
@ -1402,7 +1402,7 @@ std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
Vectorized<float> inline convert_int8_half_register_to_float(
at::vec::Vectorized<int8_t> src) {
auto s8x8 = vget_low_s8(src);
auto s8x8 = vld1_s8(src.operator const int8_t*());
auto s16x8 = vmovl_s8(s8x8);
auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));

View File

@ -16,8 +16,6 @@
#include <c10/util/irange.h>
#include <c10/core/ScalarType.h>
#include <ATen/cuda/detail/BLASConstants.h>
#ifdef USE_ROCM
#include <c10/cuda/CUDAStream.h>
#include <hipblaslt/hipblaslt-ext.hpp>
@ -1956,15 +1954,13 @@ void scaled_gemm(
const void *result_scale_ptr,
int64_t result_ld,
ScalarType result_dtype,
bool use_fast_accum,
const std::optional<Tensor>& alpha) {
bool use_fast_accum) {
// Note: see `cublasCommonArgs` for various non-intuitive manupulations
// of input arguments to this function.
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
// Note: alpha_val may change later depending on user-passed argument
float alpha_val = 1.0;
float beta_val = 0.0;
const float alpha_val = 1.0;
const float beta_val = 0.0;
CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, _cublasOpFromChar(transa));
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
@ -2035,33 +2031,6 @@ void scaled_gemm(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, CUBLASLT_EPILOGUE_BIAS);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype));
}
// Handle user-passed alpha
float *alpha_ptr = &alpha_val;
float *beta_ptr = &beta_val;
if (alpha.has_value()) {
auto& a = alpha.value();
// if device-tensor
if (a.is_cuda()) {
// NOTE: there are lifetime requirements on device-side pointers for alpha/beta -- the value must be
// valid & correct until the cublas call finishes (not is scheduled like host-side values). Thus
// we need to use allocations for alpha/beta that have some guarantees on lifetime - a statically
// managed 4B buffer for alpha that we'll copy the passed alpha value into, and constant memory
// for beta respectively.
float *user_alpha_ptr = at::cuda::detail::get_user_alpha_ptr();
at::Tensor user_alpha = at::from_blob(user_alpha_ptr, {1}, TensorOptions().device(kCUDA).dtype(kFloat));
user_alpha.copy_(a);
// Tell cublasLt we're using device-side pointers for alpha/beta
auto pointer_mode = CUBLASLT_POINTER_MODE_DEVICE;
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_POINTER_MODE, pointer_mode);
alpha_ptr = user_alpha.data_ptr<float>();
beta_ptr = at::cuda::detail::get_cublas_device_zero();
} else {
alpha_val = a.item<float>();
}
}
// For other data types, use the get_scale_mode function based on scaling type
// The SCALE_MODE attrs only exist in cuBLAS 12.8+/ROCm 7.0 or in recent hipblaslt,
// but we must invoke get_scale_mode anyways to trigger the version checks.
@ -2079,7 +2048,6 @@ void scaled_gemm(
cublasLtMatmulHeuristicResult_t heuristicResult = {};
int returnedResult = 0;
cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle();
TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
ltHandle,
computeDesc.descriptor(),
@ -2120,10 +2088,10 @@ void scaled_gemm(
auto is_valid_status = hipblaslt_ext::matmulIsAlgoSupported(
ltHandle,
computeDesc.descriptor(),
alpha_ptr,
&alpha_val,
Adesc.descriptor(),
Bdesc.descriptor(),
beta_ptr,
&beta_val,
Cdesc.descriptor(),
Ddesc.descriptor(),
all_algos[i].algo,
@ -2142,14 +2110,17 @@ void scaled_gemm(
cublasStatus_t cublasStatus = cublasLtMatmul(
ltHandle,
computeDesc.descriptor(),
alpha_ptr,
&alpha_val,
mat1_ptr,
Adesc.descriptor(),
mat2_ptr,
Bdesc.descriptor(),
beta_ptr,
// NOTE: always use result_ptr here, because cuBLASLt w/device beta=0 can't handle nullptr either
&beta_val,
#ifdef USE_ROCM
result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr
#else
nullptr,
#endif // ifdef USE_ROCM
Cdesc.descriptor(),
result_ptr,
Ddesc.descriptor(),

View File

@ -161,8 +161,7 @@ void scaled_gemm(
const void* result_scale_ptr,
int64_t result_ld,
ScalarType result_dtype,
bool use_fast_accum,
const std::optional<Tensor>& alpha);
bool use_fast_accum);
#define CUDABLAS_BGEMM_ARGTYPES(Dtype) CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype)

View File

@ -325,9 +325,9 @@ uint64_t CUDAGeneratorImpl::seed() {
*/
c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
// The RNG state comprises the seed, and an offset used for Philox.
constexpr size_t seed_size = sizeof(uint64_t);
constexpr size_t offset_size = sizeof(int64_t);
constexpr size_t total_size = seed_size + offset_size;
static const size_t seed_size = sizeof(uint64_t);
static const size_t offset_size = sizeof(int64_t);
static const size_t total_size = seed_size + offset_size;
auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
auto rng_state = state_tensor.data_ptr<uint8_t>();
@ -346,9 +346,9 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
* and size of the internal state.
*/
void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
constexpr size_t seed_size = sizeof(uint64_t);
constexpr size_t offset_size = sizeof(int64_t);
constexpr size_t total_size = seed_size + offset_size;
static const size_t seed_size = sizeof(uint64_t);
static const size_t offset_size = sizeof(int64_t);
static const size_t total_size = seed_size + offset_size;
detail::check_rng_state(new_state);
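As the surrounding code spells out, the serialized CUDA generator state is a 16-byte buffer: an 8-byte seed followed by an 8-byte Philox offset. A hedged sketch of inspecting that layout from Python, assuming a CUDA device is present and native little-endian byte order:

import torch

if torch.cuda.is_available():
    state = torch.cuda.get_rng_state()       # uint8 tensor holding seed + offset bytes
    seed_bytes, offset_bytes = state[:8], state[8:16]
    seed = int.from_bytes(bytes(seed_bytes.tolist()), "little")
    offset = int.from_bytes(bytes(offset_bytes.tolist()), "little")
    assert seed == torch.cuda.initial_seed() # expected to match the generator's seed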

View File

@ -1,54 +0,0 @@
#include <ATen/Functions.h>
#include <ATen/Tensor.h>
#include <ATen/cuda/Exceptions.h>
#include <mutex>
namespace at {
namespace cuda {
namespace detail {
__device__ __constant__ float cublas_one_device;
__device__ __constant__ float cublas_zero_device;
float *get_cublas_device_one() {
static c10::once_flag init_flag;
c10::call_once(init_flag, []() {
const float one = 1.f;
AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_one_device, &one, sizeof(float)));
});
float *ptr;
AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
return ptr;
}
float *get_cublas_device_zero() {
static c10::once_flag init_flag;
c10::call_once(init_flag, []() {
const float zero = 0.f;
AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_zero_device, &zero, sizeof(float)));
});
float *ptr;
AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
return ptr;
}
float *get_user_alpha_ptr() {
static float *alpha_ptr;
static c10::once_flag init_flag;
c10::call_once(init_flag, []() {
AT_CUDA_CHECK(cudaMalloc(&alpha_ptr, sizeof(float)));
});
return alpha_ptr;
}
} // namespace detail
} // namespace cuda
} // namespace at

View File

@ -1,11 +0,0 @@
#pragma once
#include <ATen/core/TensorBase.h>
namespace at::cuda::detail {
float *get_cublas_device_one();
float *get_cublas_device_zero();
float *get_user_alpha_ptr();
} // namespace at::cuda::detail

View File

@ -109,8 +109,7 @@ class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
params->c_scale_ptr,
params->ldc,
params->c_dtype,
params->use_fast_accum,
std::nullopt /* alpha */);
params->use_fast_accum);
return OK;
}
};

View File

@ -240,8 +240,8 @@ TORCH_META_FUNC(gelu_backward) (
namespace at::native {
static constexpr double SELU_ALPHA = 1.6732632423543772848170429916717;
static constexpr double SELU_SCALE = 1.0507009873554804934193349852946;
static const double SELU_ALPHA = 1.6732632423543772848170429916717;
static const double SELU_SCALE = 1.0507009873554804934193349852946;
DEFINE_DISPATCH(elu_stub);
DEFINE_DISPATCH(elu_backward_stub);

View File

@ -286,7 +286,7 @@ template void scal_fast_path<scalar_t>(int *n, scalar_t *a, scalar_t *x, int *in
#if AT_BUILD_WITH_BLAS()
template <>
bool scal_use_fast_path<double>(int64_t n, int64_t incx) {
auto constexpr intmax = std::numeric_limits<int>::max();
auto intmax = std::numeric_limits<int>::max();
return n <= intmax && incx <= intmax;
}
@ -315,7 +315,7 @@ bool gemv_use_fast_path<float>(
int64_t incx,
[[maybe_unused]] float beta,
int64_t incy) {
auto constexpr intmax = std::numeric_limits<int>::max();
auto intmax = std::numeric_limits<int>::max();
return (m <= intmax) && (n <= intmax) && (lda <= intmax) &&
(incx > 0) && (incx <= intmax) && (incy > 0) && (incy <= intmax);
}

View File

@ -1,6 +1,5 @@
#pragma once
#include <array>
#include <ATen/native/Math.h>
#include <c10/macros/Macros.h>
#include <c10/util/MathConstants.h>
@ -128,7 +127,7 @@ C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler<accscalar_t, unifor
template<typename scalar_t>
C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) {
constexpr static scalar_t kTailValues[] = {
const static scalar_t kTailValues[] = {
0.0810614667953272,
0.0413406959554092,
0.0276779256849983,
@ -140,7 +139,7 @@ C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) {
0.00925546218271273,
0.00833056343336287
};
if (k < std::size(kTailValues)) {
if (k <= 9) {
return kTailValues[static_cast<size_t>(k)];
}
scalar_t kp1sq = (k + 1) * (k + 1);

View File

@ -581,7 +581,7 @@ scalar_t ratevl(scalar_t x, const scalar_t num[], int64_t M,
template <typename scalar_t>
static scalar_t lanczos_sum_expg_scaled(scalar_t x) {
// lanczos approximation
static constexpr scalar_t lanczos_sum_expg_scaled_num[13] = {
static const scalar_t lanczos_sum_expg_scaled_num[13] = {
0.006061842346248906525783753964555936883222,
0.5098416655656676188125178644804694509993,
19.51992788247617482847860966235652136208,
@ -596,7 +596,7 @@ static scalar_t lanczos_sum_expg_scaled(scalar_t x) {
103794043.1163445451906271053616070238554,
56906521.91347156388090791033559122686859
};
static constexpr scalar_t lanczos_sum_expg_scaled_denom[13] = {
static const scalar_t lanczos_sum_expg_scaled_denom[13] = {
1.,
66.,
1925.,
@ -712,7 +712,7 @@ static scalar_t _igamc_helper_series(scalar_t a, scalar_t x) {
template <typename scalar_t>
static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) {
// Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1]
static constexpr scalar_t d[25][25] =
static const scalar_t d[25][25] =
{{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2,
1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4,
3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6,

View File

@ -62,7 +62,7 @@
#include <utility>
#include <vector>
static constexpr int MIOPEN_DIM_MAX = 5;
static const int MIOPEN_DIM_MAX = 5;
namespace at::meta {

View File

@ -77,7 +77,7 @@ inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) {
// next broadcast all index tensors together
try {
indices = expand_outplace(indices);
} catch (std::exception&) {
} catch (std::exception& e) {
TORCH_CHECK_INDEX(
false,
"shape mismatch: indexing tensors could not be broadcast together"

View File

@ -1038,7 +1038,7 @@ struct HelperInterpNearest : public HelperInterpBase {
// We keep this structure for BC and consider as deprecated.
// See HelperInterpNearestExact as replacement
static constexpr int interp_size = 1;
static const int interp_size = 1;
static inline void init_indices_weights(
at::ScalarType output_type,
@ -1155,7 +1155,7 @@ struct HelperInterpNearestExact : public HelperInterpNearest {
struct HelperInterpLinear : public HelperInterpBase {
static constexpr int interp_size = 2;
static const int interp_size = 2;
// Compute indices and weights for each interpolated dimension
// indices_weights = {
@ -1275,7 +1275,7 @@ struct HelperInterpLinear : public HelperInterpBase {
struct HelperInterpCubic : public HelperInterpBase {
static constexpr int interp_size = 4;
static const int interp_size = 4;
// Compute indices and weights for each interpolated dimension
// indices_weights = {

View File

@ -1359,8 +1359,7 @@ _scaled_gemm(
const ScalingType scaling_choice_a, const ScalingType scaling_choice_b,
const std::optional<Tensor>& bias,
const bool use_fast_accum,
Tensor& out,
const std::optional<Tensor>& alpha = std::nullopt) {
Tensor& out) {
cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, std::nullopt, scaling_choice_a, scaling_choice_b);
const auto out_dtype_ = args.result->scalar_type();
TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
@ -1411,8 +1410,7 @@ _scaled_gemm(
args.scale_result_ptr,
args.result_ld,
out_dtype_,
use_fast_accum,
alpha);
use_fast_accum);
return out;
}
}

View File

@ -249,7 +249,7 @@ __global__ void max_pool_forward_nhwc(
}
static constexpr int BLOCK_THREADS = 256;
static const int BLOCK_THREADS = 256;
template <typename scalar_t, typename accscalar_t>
#if defined (USE_ROCM)

View File

@ -36,9 +36,9 @@ namespace at::native {
namespace {
#if defined(USE_ROCM)
static constexpr int BLOCKDIMY = 16;
static const int BLOCKDIMY = 16;
#else
static constexpr int BLOCKDIMY = 32;
static const int BLOCKDIMY = 32;
#endif
template

View File

@ -82,7 +82,7 @@ __host__ __device__ scalar_t lanczos_sum_expg_scaled(scalar_t x) {
// lanczos approximation
using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
constexpr accscalar_t lanczos_sum_expg_scaled_num[13] = {
static const accscalar_t lanczos_sum_expg_scaled_num[13] = {
0.006061842346248906525783753964555936883222,
0.5098416655656676188125178644804694509993,
19.51992788247617482847860966235652136208,
@ -97,7 +97,7 @@ __host__ __device__ scalar_t lanczos_sum_expg_scaled(scalar_t x) {
103794043.1163445451906271053616070238554,
56906521.91347156388090791033559122686859
};
constexpr accscalar_t lanczos_sum_expg_scaled_denom[13] = {
static const accscalar_t lanczos_sum_expg_scaled_denom[13] = {
1.,
66.,
1925.,
@ -126,10 +126,10 @@ __host__ __device__ scalar_t _igam_helper_fac(scalar_t a, scalar_t x) {
using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
accscalar_t ax, fac, res, num, numfac;
constexpr accscalar_t MAXLOG = std::is_same_v<accscalar_t,double> ?
static const accscalar_t MAXLOG = std::is_same_v<accscalar_t,double> ?
7.09782712893383996843E2 : 88.72283905206835;
constexpr accscalar_t EXP1 = 2.718281828459045;
constexpr accscalar_t lanczos_g = 6.024680040776729583740234375;
static const accscalar_t EXP1 = 2.718281828459045;
static const accscalar_t lanczos_g = 6.024680040776729583740234375;
if (::fabs(a - x) > 0.4 * ::fabs(a)) {
ax = a * ::log(x) - x - ::lgamma(a);
@ -158,9 +158,9 @@ __host__ __device__ scalar_t _igam_helper_series(scalar_t a, scalar_t x) {
// Compute igam using DLMF 8.11.4. [igam1]
using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
constexpr accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
static const accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
1.11022302462515654042E-16 : 5.9604644775390625E-8;
constexpr int MAXITER = 2000;
static const int MAXITER = 2000;
int i;
accscalar_t ans, ax, c, r;
@ -196,8 +196,8 @@ __host__ __device__ scalar_t _igamc_helper_series(scalar_t a, scalar_t x) {
accscalar_t fac = 1;
accscalar_t sum = 0;
accscalar_t term, logx;
constexpr int MAXITER = 2000;
constexpr accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
static const int MAXITER = 2000;
static const accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
1.11022302462515654042E-16 : 5.9604644775390625E-8;
for (n = 1; n < MAXITER; n++) {
@ -219,7 +219,7 @@ __host__ __device__ scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t
// Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1]
using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
constexpr accscalar_t d[25][25] =
static const accscalar_t d[25][25] =
{{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, 8.296711340953086e-7, -1.7665952736826079e-7, 6.7078535434014986e-9, 1.0261809784240308e-8, -4.3820360184533532e-9, 9.1476995822367902e-10, -2.551419399494625e-11, -5.8307721325504251e-11, 2.4361948020667416e-11, -5.0276692801141756e-12, 1.1004392031956135e-13, 3.3717632624009854e-13, -1.3923887224181621e-13, 2.8534893807047443e-14, -5.1391118342425726e-16, -1.9752288294349443e-15},
{-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, -4.13125571381061e-15},
{4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, -1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6, -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, 8.8592218725911273e-15},
@ -248,7 +248,7 @@ __host__ __device__ scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t
int k, n, sgn;
int maxpow = 0;
constexpr accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
static const accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
1.11022302462515654042E-16 : 5.9604644775390625E-8;
accscalar_t lambda = x / a;
accscalar_t sigma = (x - a) / a;
@ -314,12 +314,12 @@ __host__ __device__ scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar
int i;
accscalar_t ans, ax, c, yc, r, t, y, z;
accscalar_t pk, pkm1, pkm2, qk, qkm1, qkm2;
constexpr int MAXITER = 2000;
constexpr accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
static const int MAXITER = 2000;
static const accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
1.11022302462515654042E-16 : 5.9604644775390625E-8;
constexpr accscalar_t BIG = std::is_same_v<accscalar_t,double> ?
static const accscalar_t BIG = std::is_same_v<accscalar_t,double> ?
4.503599627370496e15 : 16777216.;
constexpr accscalar_t BIGINV = std::is_same_v<accscalar_t,double> ?
static const accscalar_t BIGINV = std::is_same_v<accscalar_t,double> ?
2.22044604925031308085e-16 : 5.9604644775390625E-8;
ax = _igam_helper_fac(a, x);
@ -385,10 +385,10 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) {
using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
accscalar_t absxma_a;
constexpr accscalar_t SMALL = 20.0;
constexpr accscalar_t LARGE = 200.0;
constexpr accscalar_t SMALLRATIO = 0.3;
constexpr accscalar_t LARGERATIO = 4.5;
static const accscalar_t SMALL = 20.0;
static const accscalar_t LARGE = 200.0;
static const accscalar_t SMALLRATIO = 0.3;
static const accscalar_t LARGERATIO = 4.5;
if ((x < 0) || (a < 0)) {
// out of defined-region of the function
@ -467,10 +467,10 @@ __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) {
using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
accscalar_t absxma_a;
constexpr accscalar_t SMALL = 20.0;
constexpr accscalar_t LARGE = 200.0;
constexpr accscalar_t SMALLRATIO = 0.3;
constexpr accscalar_t LARGERATIO = 4.5;
static const accscalar_t SMALL = 20.0;
static const accscalar_t LARGE = 200.0;
static const accscalar_t SMALLRATIO = 0.3;
static const accscalar_t LARGERATIO = 4.5;
// boundary values following SciPy
if ((x < 0) || (a < 0)) {

View File

@ -231,7 +231,7 @@ const auto lcm_string = jiterator_stringify(
const auto digamma_string = jiterator_stringify(
template <typename T>
T digamma(T x) {
static constexpr double PI_f64 = 3.14159265358979323846;
static const double PI_f64 = 3.14159265358979323846;
// Short-circuits if x is +/- 0 and returns -/+ ∞ per the C++ standard
if (x == 0) {
@ -3072,9 +3072,9 @@ template <typename scalar_t>
static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) {
// [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma
using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
static constexpr double PI_f64 = 3.14159265358979323846;
constexpr accscalar_t PSI_10 = 2.25175258906672110764;
constexpr accscalar_t A[] = {
static const double PI_f64 = 3.14159265358979323846;
const accscalar_t PSI_10 = 2.25175258906672110764;
const accscalar_t A[] = {
8.33333333333333333333E-2,
-2.10927960927960927961E-2,
7.57575757575757575758E-3,

View File

@ -1097,7 +1097,11 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
// threads with different threadIdx.x are independent and will produce results for different outputs.
// In such case, values in each loaded vector always correspond to different outputs.
if (fastest_moving_stride == sizeof(scalar_t)) {
#ifdef USE_ROCM
if (reduction_on_fastest_striding_dimension && dim0 >= 128 && iter.num_reduce_dims() == 1) {
#else
if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= input_vec_size) {
#endif
// Case 1: "vectorize along input"
// Note that if vt0 < ReduceConfig::vec_size, then this means the register pressure could be high, in such case,
// we should avoid vectorization.
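
A hedged C++ sketch of the predicate changed in this hunk — parameter names here are assumptions, not the actual TensorIterator/ReduceConfig fields: vectorizing along the input only pays off when the reduction covers a single, fastest-moving dimension that is large enough, and (outside the ROCm branch) when the per-thread work vt0 is at least one full input vector, since smaller vt0 raises register pressure.

// Sketch only; assumed names, mirroring the branch shown above.
static bool should_vectorize_along_input(
    bool reduction_on_fastest_dim,
    int64_t dim0,
    int num_reduce_dims,
    int vt0,
    int input_vec_size,
    bool is_rocm) {
  if (is_rocm) {
    // ROCm variant: accepts dim0 == 128 and drops the vt0 requirement.
    return reduction_on_fastest_dim && dim0 >= 128 && num_reduce_dims == 1;
  }
  return reduction_on_fastest_dim && dim0 > 128 && num_reduce_dims == 1 &&
      vt0 >= input_vec_size;
}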

View File

@ -39,14 +39,9 @@ static void std_var_kernel_cuda(TensorIterator& iter, double correction, bool ta
template <typename scalar_t, typename acc_t=scalar_t, typename out_t=scalar_t>
void mean_kernel_impl(TensorIterator& iter) {
// returns acc_t for all non-complex dtypes and returns T for c10::complex<T>
constexpr bool is_16_bits = sizeof(scalar_t) == 2;
using factor_t = typename c10::scalar_value_type<acc_t>::type;
factor_t factor = static_cast<factor_t>(iter.num_output_elements()) / iter.numel();
if constexpr (is_16_bits) {
gpu_reduce_kernel<scalar_t, out_t, /*vt0=*/4, /*input_vec_size=*/8>(iter, MeanOps<scalar_t, acc_t, factor_t, out_t> {factor});
} else {
gpu_reduce_kernel<scalar_t, out_t>(iter, MeanOps<scalar_t, acc_t, factor_t, out_t> {factor});
}
gpu_reduce_kernel<scalar_t, out_t>(iter, MeanOps<scalar_t, acc_t, factor_t, out_t> {factor});
}
static void mean_kernel_cuda(TensorIterator& iter) {

View File

@ -13,19 +13,24 @@ namespace at::native {
template <typename scalar_t, typename acc_t = scalar_t, typename out_t = scalar_t>
struct sum_functor {
void operator()(TensorIterator& iter) {
const auto sum_combine = [] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t {
return a + b;
};
constexpr bool is_16_bits = sizeof(scalar_t) == 2;
if constexpr (is_16_bits) {
#ifdef USE_ROCM
// Half and BFloat16 can be packed in groups of up to 8 elements and
// can use *_DWORDX4 instructions to achieve that.
const bool is_16_bits =
( (std::is_same<at::Half, scalar_t>::value) ||
(std::is_same<at::BFloat16, scalar_t>::value) );
if (is_16_bits) {
gpu_reduce_kernel<scalar_t, out_t, /*vt0=*/4, /*input_vec_size=*/8>(
iter, func_wrapper<out_t>(sum_combine)
);
} else {
gpu_reduce_kernel<scalar_t, out_t>(
iter, func_wrapper<out_t>(sum_combine)
);
iter, func_wrapper<out_t>([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t {
return a + b;
}));
return;
}
#endif
gpu_reduce_kernel<scalar_t, out_t>(
iter, func_wrapper<out_t>([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t {
return a + b;
}));
}
};
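
A hedged sketch of the dispatch idea shared by both variants above (names assumed; the real entry point is gpu_reduce_kernel, whose signature is not reproduced here): 2-byte element types such as Half and BFloat16 are routed to a path with a wider per-thread vector width, so loads can be packed into fewer, wider memory transactions.

#include <cstdint>

// Sketch only: pick a wider input vector width for 2-byte element types.
template <typename scalar_t>
constexpr int pick_input_vec_size(int default_vec_size = 4) {
  // Half and BFloat16 are both 2 bytes wide, so sizeof() is a sufficient proxy.
  return sizeof(scalar_t) == 2 ? 8 : default_vec_size;
}

static_assert(pick_input_vec_size<uint16_t>() == 8, "16-bit types pack by 8");
static_assert(pick_input_vec_size<float>() == 4, "other types keep the default");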

View File

@ -277,7 +277,7 @@ struct BilinearFilterFunctor {
return 0;
}
static constexpr int size = 2;
static const int size = 2;
};
// taken from
@ -301,7 +301,7 @@ struct BicubicFilterFunctor {
return 0;
}
static constexpr int size = 4;
static const int size = 4;
};
template <typename accscalar_t>

View File

@ -141,11 +141,7 @@ WelfordDataLN cuWelfordOnlineSum(
if constexpr (!rms_norm){
U delta = val - curr_sum.mean;
U new_count = curr_sum.count + 1.f;
#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count);
#else
U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster
#endif
return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count};
} else{
return {0.f, curr_sum.sigma2 + val * val, 0};
@ -163,11 +159,7 @@ WelfordDataLN cuWelfordCombine(
U count = dataA.count + dataB.count;
U mean, sigma2;
if (count > decltype(dataB.count){0}) {
#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
auto coef = __builtin_amdgcn_rcpf(count);
#else
auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division
#endif
auto nA = dataA.count * coef;
auto nB = dataB.count * coef;
mean = nA*dataA.mean + nB*dataB.mean;
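
For context on the arithmetic in these two hunks, a minimal standalone sketch (assumed struct and field names, not the WelfordDataLN type above) of the parallel Welford combine; the only difference the removed ROCm branch made was computing coef with a hardware reciprocal instead of an FP division.

// Sketch only: Chan et al. combine of two Welford partial results.
struct WelfordPart {
  float mean;
  float m2;    // running sum of squared deviations
  float count;
};

static WelfordPart welford_combine(WelfordPart a, WelfordPart b) {
  float count = a.count + b.count;
  if (count == 0.f) {
    return {0.f, 0.f, 0.f};
  }
  float coef = 1.f / count;  // the ROCm branch replaced this division with
                             // __builtin_amdgcn_rcpf(count)
  float nA = a.count * coef;
  float nB = b.count * coef;
  float mean = nA * a.mean + nB * b.mean;
  float delta = b.mean - a.mean;
  float m2 = a.m2 + b.m2 + delta * delta * a.count * nB;
  return {mean, m2, count};
}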

View File

@ -416,7 +416,7 @@ static inline bool checksize(const Tensor& mat1, const Tensor& mat2){
// else if dim = 3, mat1's size = (b * m * n), mat2's size = (b * n * k)
// else called from aten::mv, mat1.size = (m * n), mat2.size = (n)
// only m * n * b * k(if exist) are large enough we can get benefit from mkldnn optimized gemm kernel
constexpr int64_t mkldnn_gemm_min_size = 16 * 16 * 16;
static const int64_t mkldnn_gemm_min_size = 16 * 16 * 16;
if (mat1.dim() == 1 && mat2.dim() == 1) {
// aten::dot
return mat1.size(0) > mkldnn_gemm_min_size;

View File

@ -1,16 +1,16 @@
#pragma once
#include <c10/metal/common.h>
template <typename idx_type_t = int64_t, unsigned N = c10::metal::max_ndim>
struct CatSharedParams {
template <unsigned N = c10::metal::max_ndim, typename idx_type_t = int64_t>
struct CatLargeSharedParams {
int32_t ndim;
int32_t cat_dim;
::c10::metal::array<idx_type_t, N> output_strides;
::c10::metal::array<idx_type_t, N> output_sizes;
};
template <typename idx_type_t = int64_t, unsigned N = c10::metal::max_ndim>
struct CatInputParams {
template <unsigned N = c10::metal::max_ndim, typename idx_type_t = int64_t>
struct CatLargeInputParams {
idx_type_t cat_dim_offset;
idx_type_t input_element_offset;
::c10::metal::array<idx_type_t, N> input_strides;

View File

@ -6,25 +6,26 @@
using namespace metal;
using namespace c10::metal;
template <typename I, typename T_in, typename T_out>
kernel void cat(
template <typename T_in, typename T_out>
kernel void cat_large(
constant T_in* input [[buffer(0)]],
device T_out* output [[buffer(1)]],
constant CatSharedParams<I>& shared_params [[buffer(2)]],
constant CatInputParams<I>& input_params [[buffer(3)]],
constant CatLargeSharedParams<>& shared_params [[buffer(2)]],
constant CatLargeInputParams<>& input_params [[buffer(3)]],
uint tid [[thread_position_in_grid]]) {
auto ndim = shared_params.ndim;
auto cat_dim = shared_params.cat_dim;
constant auto& output_strides = shared_params.output_strides;
constant auto& output_sizes = shared_params.output_sizes;
auto cat_dim_offset = input_params.cat_dim_offset;
auto input_element_offset = input_params.input_element_offset;
constant auto& input_strides = input_params.input_strides;
constant auto& input_sizes = input_params.input_sizes;
auto input_element_idx = static_cast<I>(tid) + input_element_offset;
I input_offset = 0;
I output_offset = 0;
auto input_element_idx = static_cast<int64_t>(tid) + input_element_offset;
int64_t input_offset = 0;
int64_t output_offset = 0;
for (auto dim = ndim - 1; dim >= 0; dim--) {
auto dim_size = input_sizes[dim];
@ -41,45 +42,41 @@ kernel void cat(
output[output_offset] = static_cast<T_out>(input[input_offset]);
}
#define REGISTER_CAT_OP(I, T_in, T_out) \
template [[host_name("cat_" #I "_" #T_in "_" #T_out)]] \
kernel void cat<I, T_in, T_out>( \
constant T_in * input [[buffer(0)]], \
device T_out * output [[buffer(1)]], \
constant CatSharedParams<I> & shared_params [[buffer(2)]], \
constant CatInputParams<I> & input_params [[buffer(3)]], \
#define REGISTER_CAT_LARGE_OP(T_in, T_out) \
template [[host_name("cat_large_" #T_in "_" #T_out)]] \
kernel void cat_large<T_in, T_out>( \
constant T_in * input [[buffer(0)]], \
device T_out * output [[buffer(1)]], \
constant CatLargeSharedParams<> & shared_params [[buffer(2)]], \
constant CatLargeInputParams<> & input_params [[buffer(3)]], \
uint tid [[thread_position_in_grid]]);
#define REGISTER_CAT_OP_ALL_INPUT_TYPES(I, T_out) \
REGISTER_CAT_OP(I, float, T_out); \
REGISTER_CAT_OP(I, half, T_out); \
REGISTER_CAT_OP(I, bfloat, T_out); \
REGISTER_CAT_OP(I, int, T_out); \
REGISTER_CAT_OP(I, uint, T_out); \
REGISTER_CAT_OP(I, long, T_out); \
REGISTER_CAT_OP(I, ulong, T_out); \
REGISTER_CAT_OP(I, short, T_out); \
REGISTER_CAT_OP(I, ushort, T_out); \
REGISTER_CAT_OP(I, char, T_out); \
REGISTER_CAT_OP(I, uchar, T_out); \
REGISTER_CAT_OP(I, bool, T_out);
#define REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(T_out) \
REGISTER_CAT_LARGE_OP(float, T_out); \
REGISTER_CAT_LARGE_OP(half, T_out); \
REGISTER_CAT_LARGE_OP(bfloat, T_out); \
REGISTER_CAT_LARGE_OP(int, T_out); \
REGISTER_CAT_LARGE_OP(uint, T_out); \
REGISTER_CAT_LARGE_OP(long, T_out); \
REGISTER_CAT_LARGE_OP(ulong, T_out); \
REGISTER_CAT_LARGE_OP(short, T_out); \
REGISTER_CAT_LARGE_OP(ushort, T_out); \
REGISTER_CAT_LARGE_OP(char, T_out); \
REGISTER_CAT_LARGE_OP(uchar, T_out); \
REGISTER_CAT_LARGE_OP(bool, T_out);
#define REGISTER_CAT_FOR_INDEX_TYPE(I) \
REGISTER_CAT_OP_ALL_INPUT_TYPES(I, float); \
REGISTER_CAT_OP_ALL_INPUT_TYPES(I, half); \
REGISTER_CAT_OP_ALL_INPUT_TYPES(I, bfloat); \
REGISTER_CAT_OP_ALL_INPUT_TYPES(I, int); \
REGISTER_CAT_OP_ALL_INPUT_TYPES(I, uint); \
REGISTER_CAT_OP_ALL_INPUT_TYPES(I, long); \
REGISTER_CAT_OP_ALL_INPUT_TYPES(I, ulong); \
REGISTER_CAT_OP_ALL_INPUT_TYPES(I, short); \
REGISTER_CAT_OP_ALL_INPUT_TYPES(I, ushort); \
REGISTER_CAT_OP_ALL_INPUT_TYPES(I, char); \
REGISTER_CAT_OP_ALL_INPUT_TYPES(I, uchar); \
REGISTER_CAT_OP_ALL_INPUT_TYPES(I, bool); \
\
REGISTER_CAT_OP(I, float2, float2); \
REGISTER_CAT_OP(I, half2, half2);
REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(float);
REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(half);
REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(bfloat);
REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(int);
REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(uint);
REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(long);
REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(ulong);
REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(short);
REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(ushort);
REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(char);
REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(uchar);
REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(bool);
REGISTER_CAT_FOR_INDEX_TYPE(int64_t);
REGISTER_CAT_FOR_INDEX_TYPE(int32_t);
REGISTER_CAT_LARGE_OP(float2, float2);
REGISTER_CAT_LARGE_OP(half2, half2);

View File

@ -3,7 +3,6 @@
#include <ATen/MemoryOverlap.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/Pool.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/OperationUtils.h>
@ -70,40 +69,29 @@ static void check_shape_except_dim(const Tensor& first, const Tensor& second, in
}
}
template <typename T>
std::string get_type_str();
template <>
std::string get_type_str<int64_t>() {
return "int64_t";
}
template <>
std::string get_type_str<int32_t>() {
return "int32_t";
}
// This implementation of cat is used only if one of the inputs or the output is
// too large to use MPSGraph.
// NOTE: `output` is expected to already have the correct size.
template <typename idx_type_t>
static void cat_out_mps_impl(const ITensorListRef& inputs, int64_t dimension, const Tensor& output) {
CatSharedParams<idx_type_t> shared_params;
static void cat_out_large_tensor_mps(const ITensorListRef& inputs, int64_t dimension, const Tensor& output) {
CatLargeSharedParams shared_params;
shared_params.ndim = output.dim();
shared_params.cat_dim = dimension;
for (const auto dim : c10::irange(output.dim())) {
shared_params.output_strides[dim] = safe_downcast<idx_type_t, int64_t>(output.stride(dim));
shared_params.output_sizes[dim] = safe_downcast<idx_type_t, int64_t>(output.size(dim));
shared_params.output_strides[dim] = output.stride(dim);
shared_params.output_sizes[dim] = output.size(dim);
}
idx_type_t cat_dim_offset = 0;
int64_t cat_dim_offset = 0;
size_t input_idx = 0;
MPSStream* stream = getCurrentMPSStream();
// Launch a separate kernels for each input. This will produce some overhead.
// In order to launch only one kernel to process all inputs, we would have to
// copy all the input tensor data into a packed buffer, which would not be
// ideal.
// Launch a separate kernels for each input. This will produce some overhead,
// but that should be relatively minimal since at least one of the inputs is
// very large. In order to launch only one kernel to process all inputs, we
// would have to copy all the input tensor data into a packed buffer, which
// would not be ideal.
for (const Tensor& input : inputs) {
if (input.numel() == 0) {
continue;
@ -116,23 +104,21 @@ static void cat_out_mps_impl(const ITensorListRef& inputs, int64_t dimension, co
for (int64_t numel_remaining = input.numel(); numel_remaining > 0; numel_remaining -= max_num_threads) {
auto num_threads = std::min(max_num_threads, numel_remaining);
CatInputParams<idx_type_t> input_params;
CatLargeInputParams input_params;
input_params.cat_dim_offset = safe_downcast<idx_type_t, int64_t>(cat_dim_offset);
input_params.input_element_offset = safe_downcast<idx_type_t, int64_t>(input.numel() - numel_remaining);
input_params.cat_dim_offset = cat_dim_offset;
input_params.input_element_offset = input.numel() - numel_remaining;
for (const auto dim : c10::irange(input.dim())) {
input_params.input_strides[dim] = safe_downcast<idx_type_t, int64_t>(input.stride(dim));
input_params.input_sizes[dim] = safe_downcast<idx_type_t, int64_t>(input.size(dim));
input_params.input_strides[dim] = input.stride(dim);
input_params.input_sizes[dim] = input.size(dim);
}
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("cat_{}_{}_{}",
get_type_str<idx_type_t>(),
scalarToMetalTypeString(input),
scalarToMetalTypeString(output)));
auto pipeline_state = lib.getPipelineStateForFunc(
fmt::format("cat_large_{}_{}", scalarToMetalTypeString(input), scalarToMetalTypeString(output)));
getMPSProfiler().beginProfileKernel(pipeline_state, "cat", {input});
[computeEncoder setComputePipelineState:pipeline_state];
mtl_setArgs(computeEncoder, input, output, shared_params, input_params);
@ -308,6 +294,13 @@ TORCH_IMPL_FUNC(cat_out_mps)
" and out is on ",
out.device());
// TODO: For better performance by eliminating input tensor gathering and post transpose,
// TODO: it is better to keep the out tensor's memory format.
// TODO: dimension needs to be recomputed as:
// TODO: dim = 0 --> dim = 0; dim = 1 or 2 --> dim = out.dim()- dim; otherwise dim = dim-1
if (needsGather(out)) {
out.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
}
std::vector<int64_t> size(notSkippedTensor.sizes().vec());
// Compute size of the result in the cat dimension
@ -338,9 +331,82 @@ TORCH_IMPL_FUNC(cat_out_mps)
has_large_tensor |= isTooLargeForMPSGraph(out);
if (has_large_tensor) {
return mps::cat_out_mps_impl<int64_t>(materialized_inputs, dimension, out);
} else {
return mps::cat_out_mps_impl<int32_t>(materialized_inputs, dimension, out);
return mps::cat_out_large_tensor_mps(materialized_inputs, dimension, out);
}
struct CachedGraph : public MPSCachedGraph {
CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
std::vector<MPSGraphTensor*> inputTensors_;
MPSGraphTensor* outputTensor_ = nil;
};
@autoreleasepool {
std::string key = "cat_out_mps:" + std::to_string(dimension) + ":" +
(memory_format == MemoryFormat::ChannelsLast ? "NHWC" : "NCHW");
if (!all_same_dtype) {
key += getTensorsStringKey(input_tensors, true, all_same_sizes_and_stride);
} else {
key += ":" + getMPSTypeString(input_tensors[0].scalar_type(), true) + ":" + std::to_string(inputs.size());
}
for (auto idx : skipped_tensor_indices) {
key += "," + std::to_string(idx);
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto len_tensor_array = inputs.size() - skipped_tensor_indices.size();
std::vector<MPSGraphTensor*> castInputTensors(len_tensor_array);
newCachedGraph->inputTensors_.reserve(len_tensor_array);
for (const auto idx : c10::irange(len_tensor_array)) {
const Tensor& tensor = input_tensors[idx];
auto scalar_type = getMPSScalarType(tensor.scalar_type());
if (tensor.scalar_type() == kBool) {
scalar_type = MPSDataTypeInt8;
}
newCachedGraph->inputTensors_[idx] = mpsGraphUnrankedPlaceHolder(mpsGraph, scalar_type);
if (tensor.scalar_type() != out_dtype) {
castInputTensors[idx] = [mpsGraph castTensor:newCachedGraph->inputTensors_[idx]
toType:getMPSDataType(out_dtype)
name:@"castInput"];
} else {
castInputTensors[idx] = newCachedGraph->inputTensors_[idx];
}
}
auto inputTensorsArray = [NSArray arrayWithObjects:castInputTensors.data() count:len_tensor_array];
MPSGraphTensor* outputTensor = [mpsGraph concatTensors:inputTensorsArray
dimension:dimension // Maybe convert this from int64_t -> int32
name:nil];
if (getMPSDataType(out_dtype) == MPSDataTypeBool) {
outputTensor = [mpsGraph castTensor:outputTensor toType:MPSDataTypeBool name:@"outputTensor"];
}
newCachedGraph->outputTensor_ = outputTensor;
});
std::vector<Placeholder> inputPlaceholders;
int i = 0;
int t_idx = 0;
for (const Tensor& tensor : materialized_inputs) {
if (std::find(skipped_tensor_indices.begin(), skipped_tensor_indices.end(), i) == skipped_tensor_indices.end()) {
auto scalar_type = getMPSScalarType(tensor.scalar_type());
if (tensor.scalar_type() == kBool) {
scalar_type = MPSDataTypeInt8;
}
inputPlaceholders.emplace_back(cachedGraph->inputTensors_[t_idx], tensor, nullptr, true, scalar_type);
t_idx++;
}
i++;
}
auto outputDataType = getMPSScalarType(out.scalar_type());
Placeholder outputPlaceholder =
Placeholder(cachedGraph->outputTensor_, out, /*mpsShape=*/nil, /*gatherTensorData=*/false, outputDataType);
NSMutableDictionary* feeds = [[NSMutableDictionary new] autorelease];
for (auto& inputPlaceholder : inputPlaceholders) {
feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData();
}
runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder);
}
}

View File

@ -6531,7 +6531,6 @@
dispatch:
CPU, CUDA: var
MPS: var_mps
MTIA: var_mtia
tags: core
- func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)

View File

@ -3551,7 +3551,7 @@ void dequantize_tensor_per_tensor_affine_cpu(
#if defined(__ARM_NEON__) || defined(__aarch64__)
constexpr static int PARALLEL_THRESHOLD = 1 << 20;
const static int PARALLEL_THRESHOLD = 1 << 20;
// Generic template defaults to naive quantize implementation
template <typename T>

View File

@ -1388,7 +1388,7 @@ namespace at::native {
TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() <= 1,
"onednn int8 linear: act scale/zp size should be 1/<=1");
static std::optional<at::Tensor> other = std::nullopt;
constexpr std::string_view binary_post_op = "none";
static const std::string_view binary_post_op = "none";
int64_t act_zp = act_zero_point.numel() == 1 ? act_zero_point.item().toLong() : 0;
return linear_int8_with_onednn_weight(
act, act_scale.item().toDouble(), act_zp,

View File

@ -16,8 +16,8 @@ namespace {
#ifdef USE_PYTORCH_QNNPACK
constexpr static float qnnpack_softmax_output_scale = 0x1.0p-8f;
constexpr static int qnnpack_softmax_output_zero_point = 0;
const static float qnnpack_softmax_output_scale = 0x1.0p-8f;
const static int qnnpack_softmax_output_zero_point = 0;
bool is_qnnpack_compatible(
const Tensor& qx,

View File

@ -110,9 +110,9 @@ class ApplyLogSumExp {
using ElementCompute = ElementCompute_;
using ElementLSE = ElementLSE_;
static int constexpr kElementsPerAccess = ElementsPerAccess;
static int constexpr kCount = kElementsPerAccess;
static constexpr ScaleType::Kind kScale =
static int const kElementsPerAccess = ElementsPerAccess;
static int const kCount = kElementsPerAccess;
static const ScaleType::Kind kScale =
cutlass::epilogue::thread::ScaleType::NoBetaScaling;
using FragmentOutput = Array<ElementOutput, kCount>;

View File

@ -14,16 +14,16 @@ using namespace at;
namespace {
constexpr auto int_min = std::numeric_limits<int>::min();
constexpr auto int_max = std::numeric_limits<int>::max();
constexpr auto long_min = std::numeric_limits<int64_t>::min();
constexpr auto long_max = std::numeric_limits<int64_t>::max();
constexpr auto float_lowest = std::numeric_limits<float>::lowest();
constexpr auto float_min = std::numeric_limits<float>::min();
constexpr auto float_max = std::numeric_limits<float>::max();
constexpr auto double_lowest = std::numeric_limits<double>::lowest();
constexpr auto double_min = std::numeric_limits<double>::min();
constexpr auto double_max = std::numeric_limits<double>::max();
const auto int_min = std::numeric_limits<int>::min();
const auto int_max = std::numeric_limits<int>::max();
const auto long_min = std::numeric_limits<int64_t>::min();
const auto long_max = std::numeric_limits<int64_t>::max();
const auto float_lowest = std::numeric_limits<float>::lowest();
const auto float_min = std::numeric_limits<float>::min();
const auto float_max = std::numeric_limits<float>::max();
const auto double_lowest = std::numeric_limits<double>::lowest();
const auto double_min = std::numeric_limits<double>::min();
const auto double_max = std::numeric_limits<double>::max();
const std::vector<int> ints {
int_min,

View File

@ -146,9 +146,9 @@ uint64_t XPUGeneratorImpl::seed() {
c10::intrusive_ptr<c10::TensorImpl> XPUGeneratorImpl::get_state() const {
// The RNG state comprises the seed, and an offset used for Philox.
constexpr size_t seed_size = sizeof(uint64_t);
constexpr size_t offset_size = sizeof(uint64_t);
constexpr size_t total_size = seed_size + offset_size;
static const size_t seed_size = sizeof(uint64_t);
static const size_t offset_size = sizeof(uint64_t);
static const size_t total_size = seed_size + offset_size;
// The internal state is returned as a CPU byte tensor.
auto state_tensor = at::detail::empty_cpu(
@ -170,9 +170,9 @@ c10::intrusive_ptr<c10::TensorImpl> XPUGeneratorImpl::get_state() const {
void XPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
at::xpu::assertNotCapturing(
"Please ensure to utilize the XPUGeneratorImpl::set_state_index method during capturing.");
constexpr size_t seed_size = sizeof(uint64_t);
constexpr size_t offset_size = sizeof(uint64_t);
constexpr size_t total_size = seed_size + offset_size;
static const size_t seed_size = sizeof(uint64_t);
static const size_t offset_size = sizeof(uint64_t);
static const size_t total_size = seed_size + offset_size;
at::detail::check_rng_state(new_state);

View File

@ -6,7 +6,7 @@ import os
import subprocess
import sys
import tempfile
from collections.abc import Callable
from typing import Callable
from torch._inductor.utils import fresh_cache

View File

@ -4060,7 +4060,7 @@ def run(runner, args, original_dir=None):
else:
optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython)
experiment = (
speedup_experiment if args.backend != "torchao" else latency_experiment
speedup_experiment if not args.backend == "torchao" else latency_experiment
)
if args.accuracy:
output_filename = f"accuracy_{args.backend}.csv"

View File

@ -1,8 +1,7 @@
import os
from collections import defaultdict
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any, Optional
from typing import Any, Callable, Optional
import matplotlib.pyplot as plt

View File

@ -3,7 +3,6 @@ import sys
from benchmark_base import BenchmarkBase
import torch
from torch._dynamo.utils import CompileTimeInstructionCounter
class Benchmark(BenchmarkBase):
@ -33,11 +32,7 @@ class Benchmark(BenchmarkBase):
def _work(self):
# enable_cpp_symbolic_shape_guards has impact on this benchmark
# Keep using False value for consistency.
with (
torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", False),
torch._export.config.patch(use_new_tracer_experimental=True),
CompileTimeInstructionCounter.record(),
):
with torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", False):
torch.export.export(self.m, (self.input,), strict=True)

View File

@ -38,7 +38,7 @@ update_hint_regression,compile_time_instruction_count,1719000000,0.1
sum_floordiv_regression,compile_time_instruction_count,3686995725,0.1
sum_floordiv_regression,compile_time_instruction_count,966100000,0.1

View File

@ -1,5 +1,4 @@
from collections.abc import Callable
from typing import Any
from typing import Any, Callable
import torch

View File

@ -1,8 +1,7 @@
import time
from argparse import ArgumentParser
from collections import defaultdict
from collections.abc import Callable
from typing import Any, NamedTuple
from typing import Any, Callable, NamedTuple
import torch
from torch.autograd import functional

View File

@ -1,6 +1,5 @@
from collections import defaultdict
from collections.abc import Callable
from typing import Optional, Union
from typing import Callable, Optional, Union
import torch
from torch import nn, Tensor

View File

@ -1,6 +1,5 @@
import dataclasses
from collections.abc import Callable
from typing import Optional
from typing import Callable, Optional
all_experiments: dict[str, Callable] = {}

View File

@ -9,9 +9,8 @@ import logging
import time
from abc import abstractmethod
from collections import defaultdict
from collections.abc import Callable
from dataclasses import asdict, dataclass, field
from typing import Any, Optional
from typing import Any, Callable, Optional
from tabulate import tabulate
from tqdm import tqdm

View File

@ -7,7 +7,6 @@ from pt import ( # noqa: F401
binary_inplace_test,
binary_test,
bmm_test,
boolean_test,
cat_test,
channel_shuffle_test,
chunk_test,

View File

@ -56,9 +56,6 @@ binary_ops_list = op_bench.op_list(
["sub", torch.sub],
["div", torch.div],
["mul", torch.mul],
["asr", torch.bitwise_right_shift],
["lsl", torch.bitwise_left_shift],
["xor", torch.bitwise_xor],
],
)

View File

@ -1,73 +0,0 @@
import operator_benchmark as op_bench
import torch
"""Microbenchmarks for boolean operators. Supports both Caffe2/PyTorch."""
# Configs for PT all operator
all_long_configs = op_bench.cross_product_configs(
M=[8, 128], N=[32, 64], K=[256, 512], device=["cpu", "cuda"], tags=["long"]
)
all_short_configs = op_bench.config_list(
attr_names=["M", "N", "K"],
attrs=[
[1, 1, 1],
[64, 64, 64],
[64, 64, 128],
],
cross_product_configs={
"device": ["cpu", "cuda"],
},
tags=["short"],
)
class AllBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, K, device):
self.inputs = {
"input_one": torch.randint(0, 2, (M, N, K), device=device, dtype=torch.bool)
}
self.set_module_name("all")
def forward(self, input_one):
return torch.all(input_one)
# The generated test names based on all_short_configs will be in the following pattern:
# all_M8_N16_K32_devicecpu
# all_M8_N16_K32_devicecpu_bwdall
# all_M8_N16_K32_devicecpu_bwd1
# all_M8_N16_K32_devicecpu_bwd2
# ...
# Those names can be used to filter tests.
op_bench.generate_pt_test(all_long_configs + all_short_configs, AllBenchmark)
"""Mircobenchmark for any operator."""
class AnyBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, device):
self.inputs = {
"input_one": torch.randint(0, 2, (M, N), device=device, dtype=torch.bool)
}
self.set_module_name("any")
def forward(self, input_one):
return torch.any(input_one)
any_configs = op_bench.cross_product_configs(
M=[8, 256],
N=[256, 16],
device=["cpu", "cuda"],
tags=["any"],
)
op_bench.generate_pt_test(any_configs, AnyBenchmark)
if __name__ == "__main__":
op_bench.benchmark_runner.main()

View File

@ -38,16 +38,12 @@ class ConvTranspose1dBenchmark(op_bench.TorchBenchmarkBase):
op_bench.generate_pt_test(
configs.conv_1d_configs_short + configs.conv_1d_configs_long, Conv1dBenchmark
)
if not torch.backends.mkldnn.is_acl_available():
# convtranpose1d crashes with ACL, see https://github.com/pytorch/pytorch/issues/165654
op_bench.generate_pt_test(
configs.convtranspose_1d_configs_short
+ configs.conv_1d_configs_short
+ configs.conv_1d_configs_long,
ConvTranspose1dBenchmark,
)
op_bench.generate_pt_test(
configs.convtranspose_1d_configs_short
+ configs.conv_1d_configs_short
+ configs.conv_1d_configs_long,
ConvTranspose1dBenchmark,
)
"""

View File

@ -1,8 +1,7 @@
import itertools
from collections.abc import Callable
from dataclasses import asdict, dataclass
from functools import partial
from typing import Union
from typing import Callable, Union
import numpy as np
from tabulate import tabulate

View File

@ -3,11 +3,10 @@ import csv
import itertools
import random
from collections import defaultdict
from collections.abc import Callable
from contextlib import nullcontext
from dataclasses import asdict, dataclass
from functools import partial
from typing import Optional, Union
from typing import Callable, Optional, Union
import numpy as np
from tabulate import tabulate
@ -271,7 +270,7 @@ def run_single_backend_sdpa(
if config.calculate_bwd_time:
# TODO: debug backward pass for njt
if eager_sdpa and config.attn_type != "document_mask":
if eager_sdpa and not config.attn_type == "document_mask":
d_out = torch.randn_like(out_eager.transpose(1, 2)).transpose(1, 2)
backward_eager_time = benchmark_torch_function_in_microseconds(
out_eager.backward, d_out, retain_graph=True

View File

@ -1,8 +1,8 @@
import itertools
from collections import defaultdict
from collections.abc import Callable
from contextlib import nullcontext
from dataclasses import asdict, dataclass
from typing import Callable
from tabulate import tabulate
from tqdm import tqdm

View File

@ -1729,10 +1729,8 @@ def define_buck_targets(
"torch/csrc/jit/backends/backend_debug_info.cpp",
"torch/csrc/jit/backends/backend_interface.cpp",
],
compiler_flags = get_pt_compiler_flags() + select({
"DEFAULT": [],
"ovr_config//os:android": c2_fbandroid_xplat_compiler_flags
}),
compiler_flags = get_pt_compiler_flags(),
fbandroid_compiler_flags = c2_fbandroid_xplat_compiler_flags,
# @lint-ignore BUCKLINT link_whole
link_whole = True,
linker_flags = get_no_as_needed_linker_flag(),
@ -2025,9 +2023,6 @@ def define_buck_targets(
"ovr_config//os:android-x86_64": [
"-mssse3",
],
}) + select({
"DEFAULT": [],
"ovr_config//os:android": c2_fbandroid_xplat_compiler_flags,
}),
exported_preprocessor_flags = get_aten_preprocessor_flags(),
exported_deps = [

View File

@ -120,23 +120,17 @@ inline void initGlobalDevicePoolState() {
TORCH_CHECK(
gDevicePool.devices.size() <= std::numeric_limits<DeviceIndex>::max(),
"Too many XPU devices, DeviceIndex overflowed!");
// Check each device's architecture and issue a warning if it is older than
// the officially supported range (Intel GPUs starting from Arc (Alchemist)
// series).
namespace syclex = sycl::ext::oneapi::experimental;
for (const auto& device : gDevicePool.devices) {
auto architecture = device->get_info<syclex::info::device::architecture>();
if (architecture < syclex::architecture::intel_gpu_acm_g10) {
TORCH_WARN(
"The detected GPU (",
device->get_info<sycl::info::device::name>(),
") is not officially supported by PyTorch XPU. Running workloads on this device may result in unexpected behavior.\n",
"For stable and fully supported execution, please use GPUs based on Intel Arc (Alchemist) series or newer.\n",
"Refer to the hardware prerequisites for more information: ",
"https://github.com/pytorch/pytorch/blob/main/docs/source/notes/get_start_xpu.rst#hardware-prerequisite");
}
}
#if defined(_WIN32) && SYCL_COMPILER_VERSION < 20250000
// The default context feature is disabled by default on Windows for SYCL
// compiler versions earlier than 2025.0.0.
std::vector<sycl::device> deviceList;
for (auto it = gDevicePool.devices.begin(); it != gDevicePool.devices.end();
++it) {
deviceList.push_back(*(*it));
}
gDevicePool.context = std::make_unique<sycl::context>(deviceList);
#else
// The default context is utilized for each Intel GPU device, allowing the
// retrieval of the context from any GPU device.
const auto& platform = gDevicePool.devices[0]->get_platform();
@ -146,6 +140,7 @@ inline void initGlobalDevicePoolState() {
#else
platform.ext_oneapi_get_default_context());
#endif
#endif
}
inline void initDevicePoolCallOnce() {
@ -170,9 +165,9 @@ void initDeviceProperties(DeviceProp* device_prop, DeviceIndex device) {
#define ASSIGN_DEVICE_ASPECT(member) \
device_prop->has_##member = raw_device.has(sycl::aspect::member);
#define ASSIGN_EXP_CL_ASPECT(member) \
device_prop->has_##member = \
raw_device.ext_oneapi_supports_cl_extension("cl_intel_" #member);
#define ASSIGN_EXP_CL_ASPECT(member) \
device_prop->has_##member = raw_device.ext_oneapi_supports_cl_extension( \
"cl_intel_" #member, &cl_version);
#define ASSIGN_EXP_DEVICE_PROP(property) \
device_prop->property = \
@ -187,6 +182,8 @@ void initDeviceProperties(DeviceProp* device_prop, DeviceIndex device) {
AT_FORALL_XPU_DEVICE_ASPECT(ASSIGN_DEVICE_ASPECT);
// TODO: Remove cl_version since it is unnecessary.
sycl::ext::oneapi::experimental::cl_version cl_version;
AT_FORALL_XPU_EXP_CL_ASPECT(ASSIGN_EXP_CL_ASPECT);
#if SYCL_COMPILER_VERSION >= 20250000

View File

@ -1044,17 +1044,6 @@ if(USE_ROCM)
list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling)
endif(CMAKE_BUILD_TYPE MATCHES Debug)
# Get EnVar 'USE_LAYERNORM_FAST_RECIPROCAL' (or default to on).
if(DEFINED ENV{USE_LAYERNORM_FAST_RECIPROCAL})
set(USE_LAYERNORM_FAST_RECIPROCAL $ENV{USE_LAYERNORM_FAST_RECIPROCAL})
else()
set(USE_LAYERNORM_FAST_RECIPROCAL ON)
endif()
if(USE_LAYERNORM_FAST_RECIPROCAL)
add_definitions(-DUSE_LAYERNORM_FAST_RECIPROCAL)
endif()
# needed for compat with newer versions of hip-clang that introduced C++20 mangling rules
list(APPEND HIP_HIPCC_FLAGS -fclang-abi-compat=17)

View File

@ -128,12 +128,11 @@ function(caffe2_print_configuration_summary)
endif()
message(STATUS " USE_ROCM : ${USE_ROCM}")
if(${USE_ROCM})
message(STATUS " ROCM_VERSION : ${ROCM_VERSION}")
message(STATUS " USE_FLASH_ATTENTION : ${USE_FLASH_ATTENTION}")
message(STATUS " USE_MEM_EFF_ATTENTION : ${USE_MEM_EFF_ATTENTION}")
message(STATUS " USE_ROCM_CK_SDPA : ${USE_ROCM_CK_SDPA}")
message(STATUS " USE_ROCM_CK_GEMM : ${USE_ROCM_CK_GEMM}")
message(STATUS " USE_LAYERNORM_FAST_RECIPROCAL : ${USE_LAYERNORM_FAST_RECIPROCAL}")
message(STATUS " ROCM_VERSION : ${ROCM_VERSION}")
message(STATUS " USE_FLASH_ATTENTION : ${USE_FLASH_ATTENTION}")
message(STATUS " USE_MEM_EFF_ATTENTION : ${USE_MEM_EFF_ATTENTION}")
message(STATUS " USE_ROCM_CK_SDPA : ${USE_ROCM_CK_SDPA}")
message(STATUS " USE_ROCM_CK_GEMM : ${USE_ROCM_CK_GEMM}")
endif()
message(STATUS " BUILD_NVFUSER : ${BUILD_NVFUSER}")
message(STATUS " USE_EIGEN_FOR_BLAS : ${CAFFE2_USE_EIGEN_FOR_BLAS}")

View File

@ -3,11 +3,11 @@ from __future__ import annotations
import dis
import inspect
import sys
from typing import Any, Optional, TYPE_CHECKING, Union
from typing import Any, Callable, Optional, TYPE_CHECKING, Union
if TYPE_CHECKING:
from collections.abc import Callable, Sequence
from collections.abc import Sequence
import torch
from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten

View File

@ -5,7 +5,7 @@ Python implementation of function wrapping functionality for functorch.dim.
from __future__ import annotations
import functools
from typing import Any, Optional, TYPE_CHECKING
from typing import Any, Callable, Optional
import torch
from torch.utils._pytree import tree_map
@ -15,10 +15,6 @@ from ._enable_all_layers import EnableAllLayers
from ._tensor_info import TensorInfo
if TYPE_CHECKING:
from collections.abc import Callable
def handle_from_tensor(tensor: torch.Tensor) -> torch.Tensor:
"""Handle tensor conversion for torch function integration."""
return tensor

View File

@ -5,7 +5,6 @@
# LICENSE file in the root directory of this source tree.
import functools
from collections.abc import Callable
from types import (
BuiltinMethodType,
FunctionType,
@ -13,7 +12,7 @@ from types import (
MethodDescriptorType,
WrapperDescriptorType,
)
from typing import Any
from typing import Any, Callable
FUNC_TYPES = (

View File

@ -1,7 +1,7 @@
from __future__ import annotations
import functools
from typing import TYPE_CHECKING, Union
from typing import Callable, TYPE_CHECKING, Union
import torch
from functorch.dim import dims # noqa: F401
@ -16,7 +16,7 @@ from ._parsing import (
if TYPE_CHECKING:
from collections.abc import Callable, Sequence
from collections.abc import Sequence
__all__ = ["rearrange"]

View File

@ -180,7 +180,6 @@ ignore = [
"SIM116", # Disable Use a dictionary instead of consecutive `if` statements
"SIM117",
"SIM118",
"SIM300", # Yoda condition detected
"UP007", # keep-runtime-typing
"UP045", # keep-runtime-typing
"TC006",
@ -196,7 +195,8 @@ select = [
"E",
"EXE",
"F",
"SIM",
"SIM1",
"SIM911",
"W",
# Not included in flake8
"FURB",

View File

@ -22,10 +22,8 @@ project-includes = [
project-excludes = [
# ==== below will be enabled directory by directory ====
# ==== to test Pyrefly on a specific directory, simply comment it out ====
"torch/_inductor/codegen/triton.py",
"torch/_inductor/runtime/triton_helpers.py",
"torch/_inductor/runtime/triton_heuristics.py",
"torch/_inductor/runtime/halide_helpers.py",
"torch/_inductor/runtime",
"torch/_inductor/codegen",
# formatting issues, will turn on after adjusting where suppressions can be
# in import statements
"torch/linalg/__init__.py",

View File

@ -156,10 +156,6 @@
# USE_ROCM_KERNEL_ASSERT=1
# Enable kernel assert in ROCm platform
#
# USE_LAYERNORM_FAST_RECIPROCAL
# If set, enables the use of builtin functions for fast reciprocals (1/x) w.r.t.
# layer normalization. Default: enabled.
#
# USE_ROCM_CK_GEMM=1
# Enable building CK GEMM backend in ROCm platform
#

View File

@ -55,7 +55,7 @@ class TestActivationSparsifier(TestCase):
for key, config in sparsifier_defaults.items():
# all the keys in combined_defaults should be present in sparsifier defaults
assert config == combined_defaults.get(key)
assert config == combined_defaults.get(key, None)
def _check_register_layer(
self, activation_sparsifier, defaults, sparse_config, layer_args_list

View File

@ -3074,7 +3074,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
wrong_dtype_shards, [10, 10], init_rrefs=True
)
tensor_requires_grad = self.rank == 0
tensor_requires_grad = True if self.rank == 0 else False
wrong_requires_grad_shards = [
sharded_tensor.Shard(
torch.randn(
@ -3121,7 +3121,7 @@ class TestShardedTensorFromLocalShards(ShardedTensorTestBase):
wrong_pin_memory_local_shards, [10, 10], init_rrefs=True
)
tensor_pin_memory = self.rank == 0
tensor_pin_memory = True if self.rank == 0 else False
wrong_pin_memory_shards_cross_ranks = [
sharded_tensor.Shard(
torch.randn(5, 5, pin_memory=tensor_pin_memory), local_shard_metadata

View File

@ -152,7 +152,7 @@ class TestStorageBase:
self.rank = 0 if not dist.is_initialized() else dist.get_rank()
def _get_ranks(self, name):
return self.fail_conf.get(name, None)
return self.fail_conf[name] if name in self.fail_conf else None
def _fail_rank(self, name):
ranks = self._get_ranks(name)

View File

@ -155,7 +155,7 @@ class TestFreezingWeights(FSDPTest):
ddp_kwargs = {
"device_ids": [self.rank],
"find_unused_parameters": bool(disable_autograd),
"find_unused_parameters": True if disable_autograd else False,
}
model = self._create_model(

View File

@ -66,7 +66,7 @@ class MockPipelineStage(_PipelineStageBase):
self.num_stages = kwargs.get("num_stages", 1)
self.group_size = kwargs.get("group_size", 1)
self.group_rank = kwargs.get("group_rank", 0)
self.group = kwargs.get("group")
self.group = kwargs.get("group", None)
def _create_grad_recv_info(self, *args, **kwargs):
return None

View File

@ -1023,7 +1023,7 @@ class DTensorMeshTest(DTensorTestBase):
DTensorMeshTestWithLocalTensor = create_local_tensor_test_class(
DTensorMeshTest,
skipped_tests=[
# Test asserts must be rewritten for local tensor
# Submeshes are not supported by local tensor mode
"test_from_local_sub_mesh",
"test_default_value_sub_mesh",
"test_redistribute_sub_mesh",
@ -1066,7 +1066,7 @@ class TestDTensorPlacementTypes(DTensorTestBase):
assert_array_equal(expected_pad_sizes, pad_sizes)
is_tensor_empty = [
not splitted_tensor.numel() > 0
False if splitted_tensor.numel() > 0 else True
for splitted_tensor in splitted_tensor_list
]
expected_is_tensor_empty = [True] * self.world_size
@ -1089,10 +1089,12 @@ class TestDTensorPlacementTypes(DTensorTestBase):
for i, tensor in enumerate(splitted_tensor_list)
]
expected_is_tensor_empty = [
not idx < size for idx, _ in enumerate(range(self.world_size))
False if idx < size else True
for idx, _ in enumerate(range(self.world_size))
]
is_tensor_empty = [
not unpadded_tensor.numel() > 0 for unpadded_tensor in unpadded_list
False if unpadded_tensor.numel() > 0 else True
for unpadded_tensor in unpadded_list
]
assert_array_equal(expected_is_tensor_empty, is_tensor_empty)

View File

@ -1,65 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
# Owner(s): ["oncall: distributed"]
from unittest.mock import patch
import torch
from torch.distributed.tensor import distribute_tensor, DTensor
from torch.distributed.tensor.placement_types import Replicate
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
run_tests,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
with_comms,
)
from torch.testing._internal.inductor_utils import GPU_TYPE
from torch.testing._internal.triton_utils import requires_gpu
class TestDynamic(DTensorTestBase):
@requires_gpu
@with_comms
@parametrize("fake_tensor_cache_enabled", [False, True])
def test_embedding(self, fake_tensor_cache_enabled):
with patch.object(
torch._dynamo.config, "fake_tensor_cache_enabled", fake_tensor_cache_enabled
):
device_mesh = self.build_device_mesh()
placements = (Replicate(),)
num_embeddings = 202048
embedding_dim = 256
weight = distribute_tensor(
torch.rand(
[num_embeddings, embedding_dim],
dtype=torch.float32,
device=GPU_TYPE,
requires_grad=True,
),
device_mesh,
placements, # [Replicate()],
)
def forward(input_batch_inputs_):
to = weight.to(torch.float32)
emb = torch.nn.functional.embedding(input_batch_inputs_, to)
return emb
arg0 = torch.randint(
low=0, high=100, size=(2, 512), dtype=torch.int64, device=GPU_TYPE
)
arg0 = DTensor.from_local(arg0, device_mesh, placements)
compiled_forward = torch.compile(forward, fullgraph=True, dynamic=True)
_out = compiled_forward(arg0)
instantiate_parametrized_tests(TestDynamic)
if __name__ == "__main__":
run_tests()

View File

@ -30,7 +30,6 @@ from torch.distributed.tensor.debug import CommDebugMode
from torch.distributed.tensor.placement_types import _StridedShard, Placement
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.distributed._tensor.common_dtensor import (
create_local_tensor_test_class,
DTensorTestBase,
with_comms,
)
@ -648,7 +647,7 @@ class TestViewOps(DTensorTestBase):
@with_comms
def test_squeeze_(self):
mesh_2d = init_device_mesh(self.device_type, (3, 2), mesh_dim_names=("a", "b"))
self.init_manual_seed_for_rank()
torch.manual_seed(self.rank)
x = torch.randn((1, 4), device=self.device_type)
dist_x = DTensor.from_local(x, mesh_2d, [Partial(), Shard(1)])
self._test_op_on_dtensor(
@ -665,13 +664,5 @@ class TestViewOps(DTensorTestBase):
self.assertEqual(dist_x.placements, [Partial(), Shard(0)])
TestViewOpsWithLocalTensor = create_local_tensor_test_class(
TestViewOps,
skipped_tests=[
# Comparing data pointers is not supported for local tensor
"test_dtensor_view_op_uneven",
],
)
if __name__ == "__main__":
run_tests()

View File

@ -2770,7 +2770,11 @@ class WorkHookTest(MultiProcessTestCase):
# from rank0 to other ranks. However, this is DDP's internal implementation,
# which is subject to change in future versions.
self.assertTrue(num_hook_fired[OpType.BROADCAST] > 0)
ctor_allreduce = num_hook_fired.get(OpType.ALLREDUCE, 0)
ctor_allreduce = (
num_hook_fired[OpType.ALLREDUCE]
if OpType.ALLREDUCE in num_hook_fired
else 0
)
x = torch.zeros(2, 1000).cuda(self.rank)
ddp(x).sum().backward()

View File

@ -7,13 +7,8 @@ from dataclasses import dataclass
import torch
from torch.multiprocessing.reductions import reduce_tensor
from torch.testing._internal.common_cuda import SM100OrLater
from torch.testing._internal.common_distributed import MultiProcContinuousTest
from torch.testing._internal.common_utils import (
requires_cuda_p2p_access,
run_tests,
skip_but_pass_in_sandcastle_if,
)
from torch.testing._internal.common_utils import requires_cuda_p2p_access, run_tests
# So that tests are written in device-agnostic way
@ -64,10 +59,6 @@ class CupyAsTensorTest(MultiProcContinuousTest):
def device(self) -> torch.device:
return torch.device(device_type, self.rank)
@skip_but_pass_in_sandcastle_if(
SM100OrLater,
"Fails if ran in docker environment without privileged access (https://github.com/pytorch/pytorch/issues/165170)",
)
def test_cupy_as_tensor(self) -> None:
"""
Test that torch.as_tensor works for cupy array interface

View File

@ -1664,14 +1664,14 @@ class CuTeLayoutTest(TestCase):
def test_remap_to_tensor(self):
"""Test the remap_to_tensor method for various scenarios."""
# Test 1: Consecutive ranks, full world - should return logical groups directly
original_mesh = torch.tensor([[0, 1], [2, 3]], dtype=torch.int)
original_mesh = torch.tensor([0, 1, 2, 3], dtype=torch.int)
layout1 = _Layout((2, 2), (2, 1)) # row-major 2x2
result1 = layout1.remap_to_tensor(original_mesh)
expected1 = torch.tensor([[[0, 1], [2, 3]]], dtype=torch.int)
self.assertEqual(result1, expected1)
# Test 2: Non-consecutive ranks - should map to actual ranks
original_mesh = torch.tensor([[10, 20], [30, 40]], dtype=torch.int)
original_mesh = torch.tensor([10, 20, 30, 40], dtype=torch.int)
layout2 = _Layout((2, 2), (2, 1))
result2 = layout2.remap_to_tensor(original_mesh)
expected2 = torch.tensor([[[10, 20], [30, 40]]], dtype=torch.int)
@ -1692,7 +1692,7 @@ class CuTeLayoutTest(TestCase):
self.assertEqual(result5, expected5)
# Test 6: Tensor Cute representation of a 2D mesh
original_mesh = torch.tensor([[0, 2], [1, 3]], dtype=torch.int)
original_mesh = torch.tensor([0, 2, 1, 3], dtype=torch.int)
layout6 = _Layout((2, 2), (1, 2)) # column-major style
result6 = layout6.remap_to_tensor(original_mesh)
expected6 = torch.tensor([[[0, 1], [2, 3]]], dtype=torch.int)

View File

@ -12,7 +12,6 @@ import torch.distributed._symmetric_memory as symm_mem
import torch.distributed._symmetric_memory._nvshmem_triton as nvshmem
from torch._inductor.runtime.triton_compat import triton
from torch.distributed._symmetric_memory._nvshmem_triton import requires_nvshmem
from torch.testing._internal.common_cuda import SM100OrLater
from torch.testing._internal.common_distributed import MultiProcContinuousTest
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
@ -265,10 +264,6 @@ def my_reduce_kernel(
nvshmem.reduce(team_handle, dest_tensor, source_tensor, nreduce, operation)
@skip_but_pass_in_sandcastle_if(
SM100OrLater,
"Skipping all NVSHMEM Triton tests due to https://github.com/pytorch/pytorch/issues/162897",
)
@instantiate_parametrized_tests
class NVSHMEMTritonTest(MultiProcContinuousTest):
def _init_device(self) -> None:

View File

@ -52,9 +52,6 @@ from torch.testing._internal.common_utils import (
test_contexts = [nullcontext, _test_mode]
# Set environment variable to disable multicast for all tests in this module
os.environ["TORCH_SYMM_MEM_DISABLE_MULTICAST"] = "1"
# So that tests are written in device-agnostic way
device_type = "cuda"
device_module = torch.get_device_module(device_type)
@ -552,10 +549,6 @@ class AsyncTPTest(MultiProcContinuousTest):
@skipUnless(SM89OrLater, "Requires compute capability >= 8.9")
@parametrize("scatter_dim", [0, 1])
@parametrize("rowwise", [True, False])
@skipIf(
SM100OrLater,
"https://github.com/pytorch/pytorch/issues/162940",
)
def test_fused_scaled_matmul_reduce_scatter(
self, scatter_dim: int, rowwise: bool
) -> None:

View File

@ -2712,20 +2712,19 @@ def forward(self, x):
torch._dynamo.exc.UserError,
".*y.*size.*2.* = 4 is not equal to .*x.*size.*1.* = 3",
):
with torch._export.config.patch(use_new_tracer_experimental=True):
torch.export.export(
bar, (x, y), dynamic_shapes=dynamic_shapes, strict=True
)
torch.export.export(bar, (x, y), dynamic_shapes=dynamic_shapes, strict=True)
y = torch.randn(10, 3, 3)
with torch._export.config.patch(use_new_tracer_experimental=True):
ebar = torch.export.export(
bar, (x, y), dynamic_shapes=dynamic_shapes, strict=True
)
for node in ebar.graph_module.graph.nodes:
if node.op == "placeholder":
shape = node.meta["val"].shape
self.assertEqual(shape[1], shape[2])
ebar = torch.export.export(
bar, (x, y), dynamic_shapes=dynamic_shapes, strict=True
)
self.assertEqual(
[
str(node.meta["val"].shape)
for node in ebar.graph_module.graph.nodes
if node.op == "placeholder"
],
["torch.Size([s17, s27, s27])", "torch.Size([s17, s27, s27])"],
)
@torch._dynamo.config.patch(
capture_dynamic_output_shape_ops=True,

View File

@ -5173,9 +5173,10 @@ class DefaultsTests(torch._dynamo.test_case.TestCase):
res = opt_fn(x)
self.assertEqual(ref, res)
@unittest.expectedFailure
def test_property_class_transmute(self):
class PropertyGetter:
def __call__(self, obj):
def __call__(self):
return True
p = property(PropertyGetter())
@ -5194,31 +5195,6 @@ class DefaultsTests(torch._dynamo.test_case.TestCase):
x = torch.randn(1)
self.assertEqual(opt_mod(x), x + 1)
def test_property_functools_partial(self):
def p_getter(obj, *, delta: int):
# Use instance state + a bound constant
return (getattr(obj, "flag", 0) + delta) > 0
class Mod(torch.nn.Module):
def __init__(self, flag: int):
super().__init__()
self.flag = flag
# fget is a functools.partial object
p = property(functools.partial(p_getter, delta=1))
def forward(self, x):
if self.p: # calls p_getter(self, delta=1)
return x + 1
else:
raise RuntimeError("whoops")
mod = Mod(flag=1)
opt_mod = torch.compile(mod, backend="eager", fullgraph=True)
x = torch.randn(1)
self.assertEqual(opt_mod(x), x + 1)
instantiate_parametrized_tests(FunctionTests)
instantiate_parametrized_tests(DefaultsTests)

View File

@ -82,7 +82,7 @@ def grad(L, desired_results: list[Variable]) -> list[Variable]:
# look up dL_dentries. If a variable is never used to compute the loss,
# we consider its gradient None, see the note below about zeros for more information.
def gather_grad(entries: list[str]):
return [dL_d.get(entry) for entry in entries]
return [dL_d[entry] if entry in dL_d else None for entry in entries]
# propagate the gradient information backward
for entry in reversed(gradient_tape):

View File

@ -286,7 +286,7 @@ class OptionalScaledTensor(torch.Tensor):
def __tensor_unflatten__(inner_tensors, metadata, outer_size, outer_stride):
return OptionalScaledTensor(
inner_tensors["_data"],
inner_tensors.get("_scale", None),
inner_tensors["_scale"] if "_scale" in inner_tensors else None,
constant=metadata["_constant"],
)

View File

@ -510,7 +510,6 @@ class TestDynamoTimed(TestCase):
raw = dataclasses.asdict(compilation_events[0])
del raw["feature_usage"]
del raw["ir_count"]
del raw["inductor_provenance"]
del raw["param_numel"]
del raw["param_bytes"]
del raw["param_count"]
@ -695,7 +694,6 @@ class TestDynamoTimed(TestCase):
raw = dataclasses.asdict(compilation_events[1])
del raw["feature_usage"]
del raw["ir_count"]
del raw["inductor_provenance"]
del raw["guard_latency_us"]
del raw["param_numel"]
del raw["param_bytes"]
@ -913,27 +911,6 @@ class TestDynamoTimed(TestCase):
compilation_events = [arg[0][0] for arg in log_event.call_args_list]
self.assertEqual(compilation_events[0].ir_count, second)
@dynamo_config.patch(
{
"log_compilation_metrics": True,
}
)
@inductor_config.patch(
{"trace.enabled": True, "trace.provenance_tracking_level": 1},
)
def test_inductor_provenance(self):
module = torch.nn.Linear(6, 66)
graph_module = torch.fx.symbolic_trace(module)
compilation_events = []
with mock.patch("torch._dynamo.utils.log_compilation_event") as log_event:
torch.compile(graph_module)(torch.randn(6, 6))
compilation_events = [arg[0][0] for arg in log_event.call_args_list]
self.assertEqual(
compilation_events[0].inductor_provenance,
{'{"extern_kernels.addmm:1": []}'},
)
@dynamo_config.patch({"log_compilation_metrics": True})
@inductor_config.patch({"force_disable_caches": True})
def test_dynamic_shape_feature_use(self):

Some files were not shown because too many files have changed in this diff.