Compare commits


1 Commit

a87ac298a5 [Dynamo] Frozen dataclass attr access test
ghstack-source-id: 5844a30db4c185e80a8902b657a84da03c9933e7
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159513
2025-07-30 14:27:50 -07:00
840 changed files with 18066 additions and 69812 deletions
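For reference, a compare like this can be reproduced locally with plain git; the base ref below is a placeholder, since the page does not name the other side of the compare:

    # <base> is a placeholder: the compare page does not show the base ref.
    git log --oneline <base>..a87ac298a5    # lists the single commit above
    git diff --stat <base>..a87ac298a5      # summarizes the 840-file delta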


@@ -144,6 +144,16 @@ case "$tag" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
CUDA_VERSION=12.6.3
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.12
@@ -154,6 +164,39 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
@@ -176,6 +219,18 @@ case "$tag" in
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.11-clang12)
ANACONDA_PYTHON_VERSION=3.11
CLANG_VERSION=12
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.9-gcc9)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=9
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
if [[ $tag =~ "jammy" ]]; then
ANACONDA_PYTHON_VERSION=3.10
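The hunks above extend a tag-to-configuration dispatch: each Docker image tag selects the environment variables that later build steps consume. A minimal sketch of the pattern, with an illustrative tag and consumer:

    # Sketch of the dispatch pattern above (tag and consumer are illustrative).
    tag="pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9"
    case "$tag" in
      pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
        CUDA_VERSION=12.6.3
        GCC_VERSION=9
        ;;
      *)
        echo "ERROR: unknown tag $tag" >&2
        exit 1
        ;;
    esac
    echo "building image with CUDA ${CUDA_VERSION} and gcc ${GCC_VERSION}"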


@@ -1 +1 @@
f7888497a1eb9e98d4c07537f0d0bcfe180d1363
11ec6354315768a85da41032535e3b7b99c5f706


@@ -68,8 +68,8 @@ function install_nvshmem {
# download, unpack, install
wget -q "${url}"
tar xf "${filename}.tar.gz"
cp -a "libnvshmem/include/"* /usr/local/cuda/include/
cp -a "libnvshmem/lib/"* /usr/local/cuda/lib64/
cp -a "libnvshmem/include/"* /usr/local/include/
cp -a "libnvshmem/lib/"* /usr/local/lib/
# cleanup
cd ..
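The copy destinations above move the NVSHMEM headers and libraries out of the CUDA prefix and into /usr/local, which most toolchains already search by default. Assuming that intent, a consumer needs fewer explicit flags (library name below is illustrative):

    # Before: headers/libs lived under the CUDA prefix, so explicit paths were needed.
    cc app.c -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lnvshmem
    # After: /usr/local/include and /usr/local/lib are on the default search path.
    cc app.c -lnvshmem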


@@ -15,37 +15,11 @@ function install_timm() {
commit=$(get_pinned_commit timm)
pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
}
function install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
python install.py --continue_on_fail
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
# is regressing speedup metric. This needs to be investigated further
pip install transformers==4.38.1
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
chown -R jenkins torchbench
# Clean up
conda_run pip uninstall -y torch torchvision triton
}
# Pango is needed for weasyprint which is needed for doctr
conda_install pango
# Stable packages are ok here, just to satisfy TorchBench check
pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
install_torchbench
install_huggingface
install_timm
# Clean up
conda_run pip uninstall -y torch torchvision torchaudio triton


@@ -221,9 +221,9 @@ pygments==2.15.0
#Pinned versions: 2.12.0
#test that import: the doctests
#pyyaml
#PyYAML
#Description: data serialization format
#Pinned versions: 6.0.2
#Pinned versions:
#test that import:
#requests
@@ -233,7 +233,7 @@ pygments==2.15.0
#rich
#Description: rich text and beautiful formatting in the terminal
#Pinned versions: 14.1.0
#Pinned versions: 10.9.0
#test that import:
scikit-image==0.19.3 ; python_version < "3.10"
@@ -361,6 +361,7 @@ pwlf==2.2.1
#Pinned versions: 2.2.1
#test that import: test_sac_estimator.py
# To build PyTorch itself
pyyaml
pyzstd


@@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
@@ -50,8 +50,8 @@ IPython==8.12.0
#Pinned versions: 8.12.0
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Pinned versions: 0.17.2
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 0.13.2
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
@@ -59,3 +59,4 @@ sphinx-copybutton==0.5.0
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
myst-parser==0.18.1
myst-nb


@@ -98,9 +98,8 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
# (optional) Install non-default Ninja version
ARG NINJA_VERSION
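The guarded RUN above is the usual conditional-install pattern: the build arg is empty by default, so the [ -n "${INDUCTOR_BENCHMARKS}" ] test only passes when a value is supplied at build time. Assuming the Dockerfile declares ARG INDUCTOR_BENCHMARKS, the toggle is:

    # Default build: the ARG stays empty and the guarded install is skipped.
    docker build -t ci-image .
    # Benchmark build: any non-empty value enables the guarded install.
    docker build --build-arg INDUCTOR_BENCHMARKS=yes -t ci-image-benchmarks .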


@@ -98,9 +98,8 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
ARG TRITON
ARG TRITON_CPU


@@ -194,7 +194,7 @@ ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library
ROCBLAS_LIB_DST=lib/rocblas/library
ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $ROCBLAS_OTHER_FILES)
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES)
# hipblaslt library files
HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library


@@ -50,9 +50,6 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
export ATEN_THREADING=NATIVE
fi
# Enable LLVM dependency for TensorExpr testing
export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
if ! which conda; then
# In ROCm CIs, we are doing cross compilation on build machines with
@@ -192,7 +189,6 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
export USE_ASAN=1
export REL_WITH_DEB_INFO=1
export UBSAN_FLAGS="-fno-sanitize-recover=all"
unset USE_LLVM
fi
if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then


@@ -229,6 +229,7 @@ function install_torchrec_and_fbgemm() {
pip_install tabulate # needed for newer fbgemm
pip_install patchelf # needed for rocm fbgemm
pushd /tmp
local wheel_dir=dist/fbgemm_gpu
local found_whl=0
@@ -244,7 +245,7 @@ function install_torchrec_and_fbgemm() {
if [ "${found_whl}" == "0" ]; then
git clone --recursive https://github.com/pytorch/fbgemm
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}" --recurse-submodules
git checkout "${fbgemm_commit}"
python setup.py bdist_wheel \
--build-variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
@@ -263,6 +264,7 @@ function install_torchrec_and_fbgemm() {
done
rm -rf fbgemm
popd
else
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
@@ -281,6 +283,30 @@ function clone_pytorch_xla() {
fi
}
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
if [ "$1" ]; then
python install.py --continue_on_fail models "$@"
else
# Occasionally the installation may fail on one model but it is ok to continue
# to install and test other models
python install.py --continue_on_fail
fi
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
# is regressing speedup metric. This needs to be investigated further
pip install transformers==4.38.1
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
function install_torchao() {
local commit
commit=$(get_pinned_commit torchao)
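The checkout_install_torchbench helper added above is invoked later in this diff from test.sh; with no arguments it installs every model, while positional arguments restrict the install to a named subset:

    # Install all TorchBench models (single-model failures are tolerated).
    checkout_install_torchbench
    # Install only a subset, as the cachebench config further down does.
    checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco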


@@ -157,29 +157,6 @@ test_jit_hooks() {
assert_git_not_dirty
}
# Shellcheck doesn't like it when you pass no arguments to a function
# that can take args. See https://www.shellcheck.net/wiki/SC2120
# shellcheck disable=SC2120
checkout_install_torchbench() {
local commit
commit=$(cat .ci/docker/ci_commit_pins/torchbench.txt)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
if [ "$1" ]; then
python install.py --continue_on_fail models "$@"
else
# Occasionally the installation may fail on one model but it is ok to continue
# to install and test other models
python install.py --continue_on_fail
fi
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
torchbench_setup_macos() {
git clone --recursive https://github.com/pytorch/vision torchvision
git clone --recursive https://github.com/pytorch/audio torchaudio
@@ -202,6 +179,8 @@ torchbench_setup_macos() {
USE_OPENMP=0 python setup.py develop
popd
# Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
# shellcheck disable=SC2119,SC2120
checkout_install_torchbench
}
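The SC2119/SC2120 directives above are needed because shellcheck warns whenever a function that reads $1 is called with no arguments. A minimal reproduction of the warning pair:

    # Without disables, shellcheck flags f for referencing "$1" (SC2120)
    # and the call site for passing no arguments (SC2119).
    f() {
      if [ "$1" ]; then
        echo "got arg: $1"
      else
        echo "no args"
      fi
    }
    f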


@@ -462,7 +462,7 @@ test_inductor_aoti() {
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
}
test_inductor_cpp_wrapper_shard() {
@@ -627,8 +627,6 @@ test_perf_for_dashboard() {
device=cuda_a10g
elif [[ "${TEST_CONFIG}" == *h100* ]]; then
device=cuda_h100
elif [[ "${TEST_CONFIG}" == *b200* ]]; then
device=cuda_b200
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
device=rocm
fi
@@ -803,16 +801,6 @@ test_dynamo_benchmark() {
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
# TODO (huydhn): Just smoke test some sample models
if [[ "${TEST_CONFIG}" == *b200* ]]; then
if [[ "${suite}" == "huggingface" ]]; then
export TORCHBENCH_ONLY_MODELS="DistillGPT2"
elif [[ "${suite}" == "timm_models" ]]; then
export TORCHBENCH_ONLY_MODELS="inception_v3"
elif [[ "${suite}" == "torchbench" ]]; then
export TORCHBENCH_ONLY_MODELS="hf_Bert"
fi
fi
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
@@ -1051,20 +1039,10 @@ test_libtorch_api() {
mkdir -p $TEST_REPORTS_DIR
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
"$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
else
# Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
# On s390x, pytorch is built without llvm.
# Even if it would be built with llvm, llvm currently doesn't support used features on s390x and
# test fails with errors like:
# JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
# unknown file: Failure
# C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
fi
fi
# quantization is not fully supported on s390x yet
@@ -1684,11 +1662,13 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
install_torchaudio
install_torchvision
PYTHONPATH=/torchbench test_cachebench
checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco
PYTHONPATH=$(pwd)/torchbench test_cachebench
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
install_torchaudio
install_torchvision
PYTHONPATH=/torchbench test_verify_cachebench
checkout_install_torchbench nanogpt
PYTHONPATH=$(pwd)/torchbench test_verify_cachebench
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
install_torchaudio
install_torchvision
@@ -1697,22 +1677,28 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
# https://github.com/opencv/opencv-python/issues/885
pip_install opencv-python==4.8.0.74
if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf
checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf
checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \
llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest
checkout_install_torchbench
TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest
else
checkout_install_torchbench
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
fi
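The recurring edit in the hunks above swaps a fixed /torchbench checkout baked into the Docker image for one cloned into the workspace by checkout_install_torchbench; in both cases PYTHONPATH is what lets the benchmark driver import the suite. Assuming the repository's usual package name, the effect is:

    # After checkout_install_torchbench, the clone lives under $(pwd)/torchbench.
    # torchbenchmark as the package name is an assumption about pytorch/benchmark.
    PYTHONPATH=$(pwd)/torchbench python -c "import torchbenchmark"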


@@ -7,7 +7,7 @@ max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
ignore =
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,


@@ -53,9 +53,9 @@ self-hosted-runner:
- linux.rocm.gpu.mi250
- linux.rocm.gpu.2
- linux.rocm.gpu.4
# gfx942 runners
- linux.rocm.gpu.gfx942.2
- linux.rocm.gpu.gfx942.4
# MI300 runners
- linux.rocm.gpu.mi300.2
- linux.rocm.gpu.mi300.4
- rocm-docker
# Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
- macos-m1-stable


@@ -70,7 +70,7 @@ runs:
set -eux
# PyYAML 6.0 doesn't work with MacOS x86 anymore
# This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2
python3 -m pip install requests==2.27.1 pyyaml==6.0.2
python3 -m pip install requests==2.27.1 pyyaml==6.0.1
- name: Parse ref
id: parse-ref


@@ -1 +1 @@
6fbc710b617f79b992ef2ebc7f95e818aa390293
f6dfe1231dcdd221a68416e49ab85c2575cbb824


@@ -1 +1 @@
6a39ba85fe0f2fff9494b5eccea717c93510c230
8f605ee30912541126c0fe46d0c8c413101b600a


@@ -1 +1 @@
b6a5b82b9948b610fa4c304d0d869c82b8f17db1
29ae4c76c026185f417a25e841d2cd5e65f087a3


@@ -488,10 +488,6 @@
- torch/_dynamo/**
- torch/csrc/dynamo/**
- test/dynamo/**
- test/dynamo_expected_failures/**
- test/dynamo_skips/**
- test/inductor_expected_failures/**
- test/inductor_skips/**
approved_by:
- guilhermeleobas
mandatory_checks_name:


@@ -7,9 +7,9 @@
# .ci/docker/requirements-ci.txt
boto3==1.35.42
jinja2==3.1.6
lintrunner==0.12.7
lintrunner==0.10.7
ninja==1.10.0.post1
nvidia-ml-py==11.525.84
pyyaml==6.0.2
pyyaml==6.0
requests==2.32.4
rich==14.1.0
rich==10.9.0


@@ -2,7 +2,7 @@ boto3==1.35.42
cmake==3.27.*
expecttest==0.3.0
fbscribelogger==0.1.7
filelock==3.18.0
filelock==3.6.0
hypothesis==6.56.4
librosa>=0.6.2
mpmath==1.3.0


@@ -193,7 +193,7 @@ LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
"cpu": "libtorch-cxx11-builder:cpu",
}
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
@@ -315,11 +315,6 @@ def generate_wheels_matrix(
# TODO: Enable python 3.13t on cpu-s390x
if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
continue
# TODO: Enable python 3.14 on non linux OSes
if os != "linux" and (
python_version == "3.14" or python_version == "3.14t"
):
continue
if use_split_build and (
arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux"


@@ -2,7 +2,7 @@
set -ex
# Use uv to speed up lintrunner init
python3 -m pip install -U uv==0.8.* setuptools
python3 -m pip install uv==0.1.45 setuptools
CACHE_DIRECTORY="/tmp/.lintbin"
# Try to recover the cached binaries


@@ -1891,9 +1891,7 @@ def validate_revert(
else pr.get_comment_by_id(comment_id)
)
if comment.editor_login is not None:
raise PostCommentError(
"Halting the revert as the revert comment has been edited."
)
raise PostCommentError("Don't want to revert based on edited command")
author_association = comment.author_association
author_login = comment.author_login
allowed_reverters = ["COLLABORATOR", "MEMBER", "OWNER"]


@@ -96,7 +96,7 @@ jobs:
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
if: ${{ matrix.runner != 'B200' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
instructions: |
@@ -109,7 +109,7 @@
no-sudo: true
- name: Setup Python
if: contains(matrix.runner, 'b200')
if: matrix.runner == 'B200'
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
@@ -117,7 +117,7 @@
- name: Setup Linux
uses: ./.github/actions/setup-linux
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && !contains(matrix.runner, 'b200')
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && matrix.runner != 'B200'
- name: configure aws credentials
if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
@@ -128,7 +128,7 @@
aws-region: us-east-1
- name: Login to Amazon ECR
if: ${{ inputs.aws-role-to-assume != '' && contains(matrix.runner, 'b200') }}
if: ${{ inputs.aws-role-to-assume != '' && matrix.runner == 'B200' }}
id: login-ecr
continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
@@ -166,17 +166,17 @@
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && matrix.runner != 'B200' }}
- name: Setup GPU_FLAG for docker run
id: setup-gpu-flag
run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || contains(matrix.runner, 'b200')) }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || matrix.runner == 'B200') }}
- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
id: setup-sscache-port-flag
run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && !contains(matrix.runner, 'b200') }}
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && matrix.runner != 'B200' }}
- name: Lock NVIDIA A100 40GB Frequency
run: |
@@ -277,8 +277,8 @@ jobs:
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
# Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
SCCACHE_BUCKET: ${{ matrix.runner != 'B200' && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ matrix.runner != 'B200' && 'us-east-1' || '' }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
@@ -403,7 +403,7 @@ jobs:
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Authenticate with AWS
if: ${{ contains(matrix.runner, 'b200') }}
if: ${{ matrix.runner == 'B200' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
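The workflow hunks above replace an exact runner-name comparison (matrix.runner != 'B200') with a substring test (!contains(matrix.runner, 'b200')), so composite labels such as linux.dgx.b200 now match as well. The same widening, expressed as a shell check for comparison:

    runner="linux.dgx.b200"
    # Old behaviour: exact match only (GitHub expressions also ignore case).
    [[ "${runner,,}" == "b200" ]] && echo "exact match"       # prints nothing
    # New behaviour: substring match, so composite labels qualify too.
    [[ "${runner,,}" == *b200* ]] && echo "substring match"   # prints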


@@ -269,8 +269,8 @@ jobs:
# copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
- name: Change permissions (only needed for kubernetes runners for now)
if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'gfx942') || contains(matrix.runner, 'mi355')) }}
- name: Change permissions (only needed for MI300 and MI355 kubernetes runners for now)
if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'mi300') || contains(matrix.runner, 'mi355')) }}
run: |
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"


@@ -34,8 +34,7 @@ jobs:
contents: read
pull-requests: write
name: Check labels
# Disabling the job until https://github.com/pytorch/pytorch/issues/159825 is resolved
if: github.repository_owner == 'pytorch' && false
if: github.repository_owner == 'pytorch'
runs-on: linux.24_04.4x
steps:
- name: Checkout PyTorch


@@ -7,8 +7,7 @@ on:
jobs:
ghstack-mergeability-check:
# Disabling the job until https://github.com/pytorch/pytorch/issues/159825 is resolved
if: github.repository_owner == 'pytorch' && false
if: github.repository_owner == 'pytorch'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -57,7 +56,7 @@ jobs:
cache: pip
architecture: x64
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
shell: bash
- name: Verify mergeability


@@ -26,7 +26,7 @@ jobs:
cache: pip
# Not the direct dependencies but the script uses trymerge
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
- name: Setup committer id
run: |


@@ -51,12 +51,17 @@ jobs:
docker-image-name: [
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.9-clang12,
pytorch-linux-jammy-py3.11-clang12,
pytorch-linux-jammy-py3.12-clang12,
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-noble-rocm-n-py3,
@@ -71,8 +76,7 @@ jobs:
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
# Executorch pin needs update
# pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu
]
include:

File diff suppressed because it is too large


@@ -1,154 +0,0 @@
name: inductor-perf-b200
on:
schedule:
- cron: 0 7 * * 1-6
- cron: 0 7 * * 0
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
inputs:
training:
description: Run training (on by default)?
required: false
type: boolean
default: true
inference:
description: Run inference (on by default)?
required: false
type: boolean
default: true
default:
description: Run inductor_default?
required: false
type: boolean
default: false
dynamic:
description: Run inductor_dynamic_shapes?
required: false
type: boolean
default: false
cppwrapper:
description: Run inductor_cpp_wrapper?
required: false
type: boolean
default: false
cudagraphs:
description: Run inductor_cudagraphs?
required: false
type: boolean
default: true
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
type: boolean
default: false
aotinductor:
description: Run aot_inductor for inference?
required: false
type: boolean
default: false
maxautotune:
description: Run inductor_max_autotune?
required: false
type: boolean
default: false
benchmark_configs:
description: The list of configs used the benchmark
required: false
type: string
default: inductor_huggingface_perf_cuda_b200,inductor_timm_perf_cuda_b200,inductor_torchbench_perf_cuda_b200
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
build:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100
# or newer GPUs, so it doesn't benefit much from existing compiler cache
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
{ config: "inductor_timm_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
{ config: "inductor_torchbench_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
test-periodically:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 1440
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit


@@ -88,23 +88,23 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit


@@ -81,21 +81,21 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit


@@ -47,8 +47,8 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit


@@ -75,11 +75,10 @@ jobs:
repo-owner: pytorch
branch: main
pin-folder: .github/ci_commit_pins
# executorch jobs are disabled since it needs some manual work for the hash update
# - repo-name: executorch
# repo-owner: pytorch
# branch: main
# pin-folder: .ci/docker/ci_commit_pins
- repo-name: executorch
repo-owner: pytorch
branch: main
pin-folder: .ci/docker/ci_commit_pins
- repo-name: triton
repo-owner: triton-lang
branch: main


@@ -59,9 +59,9 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
]}
secrets: inherit


@@ -51,6 +51,37 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_4-py3_10-gcc11-sm89-build:
name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11
cuda-arch-list: 8.9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_4-py3_10-gcc11-sm89-test:
name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_4-py3_10-gcc11-sm89-build
- target-determination
with:
build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
docker-image: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_4-py3_10-gcc11-build:
name: linux-jammy-cuda12.4-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml


@@ -292,14 +292,13 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: 8.9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
secrets: inherit
@@ -403,8 +402,38 @@ jobs:
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm89-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: 8.9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm89-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm89-build
- target-determination
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.test-matrix }}
secrets: inherit
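The added sm89 jobs pin cuda-arch-list: 8.9, the Ada Lovelace compute capability (matching the NVIDIA L4 GPUs that g6 instances carry, an assumption based on the instance family). In a local build the equivalent knob is the TORCH_CUDA_ARCH_LIST environment variable:

    # Restrict the CUDA build to compute capability 8.9 (Ada), mirroring
    # the workflow's cuda-arch-list: 8.9 setting.
    TORCH_CUDA_ARCH_LIST="8.9" python setup.py develop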
linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type


@@ -26,7 +26,7 @@ jobs:
architecture: x64
check-latest: false
cache: pip
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
- name: Setup committer id
run: |


@@ -48,12 +48,12 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit


@@ -3,7 +3,7 @@ name: rocm-mi355
on:
workflow_dispatch:
schedule:
- cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT
- cron: 30 9 * * * # about 2:30am PDT
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}


@@ -10,10 +10,6 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-default-label-prefix:
if: github.repository_owner == 'pytorch'


@@ -205,7 +205,7 @@ jobs:
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
test-matrix: |
{ include: [
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },


@@ -28,7 +28,7 @@ jobs:
check-latest: false
cache: pip
architecture: x64
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
- name: Setup committer id
run: |


@@ -25,7 +25,7 @@ jobs:
architecture: x64
check-latest: false
cache: pip
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
- name: Setup committer id
run: |


@@ -23,7 +23,7 @@ jobs:
with:
repository: pytorch/pytorch
stable-branch: viable/strict
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\", \"linux-aarch64\"]'
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]'
secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}


@@ -39,16 +39,16 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'flake8==7.3.0',
'flake8-bugbear==24.12.12',
'flake8-comprehensions==3.16.0',
'flake8==6.1.0',
'flake8-bugbear==23.3.23',
'flake8-comprehensions==3.15.0',
'flake8-executable==2.1.3',
'flake8-logging-format==2024.24.12',
'flake8-pyi==25.5.0',
'flake8-simplify==0.22.0',
'flake8-logging-format==0.9.0',
'flake8-pyi==23.3.1',
'flake8-simplify==0.19.3',
'mccabe==0.7.0',
'pycodestyle==2.14.0',
'pyflakes==3.4.0',
'pycodestyle==2.11.1',
'pyflakes==3.1.0',
'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"',
]
@@ -158,16 +158,16 @@ init_command = [
'mypy==1.16.0',
'sympy==1.13.3',
'types-requests==2.27.25',
'types-pyyaml==6.0.2',
'types-pyyaml==6.0.1',
'types-tabulate==0.8.8',
'types-protobuf==5.29.1.20250403',
'types-setuptools==79.0.0.20250422',
'types-jinja2==2.11.9',
'types-colorama==0.4.6',
'filelock==3.18.0',
'filelock==3.13.1',
'junitparser==2.1.1',
'rich==14.1.0',
'pyyaml==6.0.2',
'rich==10.9.0',
'pyyaml==6.0.1',
'optree==0.13.0',
'dataclasses-json==0.6.7',
'pandas==2.2.3',
@@ -1111,7 +1111,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'pyyaml==6.0.2',
'PyYAML==6.0.1',
]
[[linter]]
@@ -1133,7 +1133,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'pyyaml==6.0.2',
'PyYAML==6.0.1',
]
[[linter]]


@@ -1,18 +1 @@
- This is the only AGENTS.md, there are no recursive AGENTS.md
- When you are working on a bug, first create a standalone file that
reproduces the bug and verify it fails in the expected way. Use this to
test if your changes work. Once the change is passing, find an appropriate
test file to add the test to and make sure to follow local conventions on
the test file.
- If you are running the real test suite, DO NOT run the entire test suite.
Instead run only a single test case, e.g., 'python test/test_torch.py TestTorch.test_dir'
- Do NOT run setup.py, you do not have a working build environment
- Do NOT run pre-commit, it is not setup
- To run lint, run 'lintrunner -a' (which will autoapply changes). lintrunner
ONLY accepts this flag, do not try to run on individual files.
- Do NOT attempt to install dependencies, you do not have Internet access
- When you are ready to make a PR, do exactly these steps:
- git stash -u
- git reset --hard $(cat /tmp/orig_work.txt) # NB: reset to the LOCAL branch, do NOT fetch
- git stash pop
- Resolve conflicts if necessary


@@ -679,7 +679,6 @@ cc_library(
[
"torch/*.h",
"torch/csrc/**/*.h",
"torch/nativert/**/*.h",
"torch/csrc/distributed/c10d/**/*.hpp",
"torch/lib/libshm/*.h",
],


@@ -564,7 +564,7 @@ if(MSVC)
set(CMAKE_NINJA_CMCLDEPS_RC OFF)
if(MSVC_Z7_OVERRIDE)
# CMake set debug flags to use /Z7
set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$<CONFIG:Debug,RelWithDebInfo>:Embedded>")
set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded)
endif()
foreach(
flag_var
@@ -872,14 +872,6 @@ cmake_dependent_option(
"USE_CUDA OR USE_ROCM;NOT MSVC"
OFF)
cmake_dependent_option(
USE_FBGEMM_GENAI
"Whether to build FBGEMM GenAI quantized GEMM kernels.\
Will be disabled if not supported by the platform"
OFF
"USE_CUDA OR USE_ROCM"
OFF)
# CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
# Eff Attention won't
cmake_dependent_option(
@@ -913,10 +905,6 @@ if(USE_FBGEMM)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM")
endif()
if(USE_FBGEMM_GENAI)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM_GENAI")
endif()
if(USE_PYTORCH_QNNPACK)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
endif()


@@ -14,6 +14,7 @@
/torch/csrc/autograd/ @albanD @soulitzer
/torch/autograd/ @albanD @soulitzer
/tools/autograd/ @albanD @soulitzer
/torch/header_only_apis.txt @janeyx99
/torch/nn/ @albanD @jbschlosser @mikaylagawarecki
/torch/optim/ @albanD @janeyx99
/test/test_public_bindings.py @albanD
@@ -50,12 +51,12 @@ nn/qat/ @jerryzh168
/torch/csrc/distributed/c10d/Ops.* @kwen2501
# ONNX Export
/torch/_dynamo/backends/onnxrt.py @titaiwangms @xadupre @justinchuby
/torch/csrc/jit/passes/onnx.h @titaiwangms @xadupre
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @xadupre
/torch/csrc/jit/passes/onnx/ @titaiwangms @xadupre
/torch/onnx/ @titaiwangms @xadupre @justinchuby
/test/onnx/ @titaiwangms @xadupre @justinchuby
/torch/_dynamo/backends/onnxrt.py @wschin
/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1
/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1
/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin
/test/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin
# CI
/.ci @pytorch/pytorch-dev-infra
@@ -195,8 +196,3 @@ torch/backends/cudnn/ @eqy @syed-ahmed
/torch/utils/_cxx_pytree.py @XuehaiPan
/torch/utils/pytree/ @XuehaiPan
/torch/_dynamo/polyfills/pytree.py @XuehaiPan
# Relating to libtorch ABI
/torch/csrc/stable/ @janeyx99 @mikaylagawarecki
/torch/headeronly/ @janeyx99
/torch/header_only_apis.txt @janeyx99


@@ -276,7 +276,7 @@ conda install pkg-config libuv
pip install mkl-static mkl-include
# Add these packages if torch.distributed is needed.
# Distributed package support on Windows is a prototype feature and is subject to changes.
conda install -c conda-forge libuv
conda install -c conda-forge libuv=1.39
```
#### Install PyTorch


@@ -247,50 +247,6 @@ if(USE_MEM_EFF_ATTENTION)
list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu})
endif()
IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
set(USE_FBGEMM_GENAI off)
endif()
# FBGEMM GenAI
IF(USE_FBGEMM_GENAI)
set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/)
set(FBGEMM_GENAI_DIR ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)
if(USE_ROCM)
# Only include the kernels we want to build to avoid increasing binary size.
file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
target_include_directories(fbgemm_genai PUBLIC
# FBGEMM version of Composable Kernel is used due to some customizations
${FBGEMM_THIRD_PARTY}/composable_kernel/include
${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
${FBGEMM_GENAI_DIR}/include/
${FBGEMM_GENAI_DIR}/common/include/
)
endif()
endif()
# XNNPACK
file(GLOB native_xnnpack "native/xnnpack/*.cpp")
@@ -439,7 +395,6 @@ if(USE_ROCM)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/example/ck_tile/01_fmha)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include)
_pytorch_rocm_generate_ck_conf()
@@ -704,17 +659,21 @@ if(USE_MPS)
if(CAN_COMPILE_METAL)
foreach(SHADER ${native_mps_metal})
cmake_path(GET SHADER STEM TGT_STEM)
string(CONCAT TGT_BASIC ${TGT_STEM} "_31.air")
string(CONCAT TGT_BASIC ${TGT_STEM} "_30.air")
string(CONCAT TGT_BFLOAT ${TGT_STEM} "_31.air")
list(APPEND AIR_BASIC ${TGT_BASIC})
metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.1")
list(APPEND AIR_BFLOAT ${TGT_BFLOAT})
metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.0")
metal_to_air(${SHADER} ${TGT_BFLOAT} "-std=metal3.1")
endforeach()
air_to_metallib(kernels_basic.metallib ${AIR_BASIC})
air_to_metallib(kernels_bfloat.metallib ${AIR_BFLOAT})
add_custom_command(
COMMAND echo "// $$(date)" > metallib_dummy.cpp
DEPENDS kernels_basic.metallib
DEPENDS kernels_basic.metallib kernels_bfloat.metallib
OUTPUT metallib_dummy.cpp
COMMENT "Updating metallibs timestamp")
add_custom_target(metallibs DEPENDS kernels_basic.metallib metallib_dummy.cpp)
add_custom_target(metallibs DEPENDS kernels_basic.metallib kernels_bfloat.metallib metallib_dummy.cpp)
else()
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps")
foreach(SHADER ${native_mps_metal})

View File

@ -1,6 +1,5 @@
#pragma once
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/DeviceType.h>
#include <c10/macros/Macros.h>
@ -73,27 +72,6 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
// original device index that was active before the change.
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);
TORCH_API inline void emptyCache() {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->emptyCache();
}
TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
return at::getDeviceAllocator(device_type)->getDeviceStats(device_index);
}
TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index);
}
TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
}
} // namespace at::accelerator
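The inline wrappers in this hunk route generic allocator maintenance through whichever accelerator backend is active. A hedged usage sketch (the `<ATen/DeviceAccelerator.h>` include is an assumption about where these wrappers live):

```cpp
#include <ATen/DeviceAccelerator.h>  // assumed location of the wrappers above

// Each wrapper resolves the active accelerator (CUDA, XPU, ...) and
// forwards to that backend's caching device allocator.
void trim_allocator(c10::DeviceIndex idx) {
  at::accelerator::emptyCache();                      // drop cached blocks
  auto stats = at::accelerator::getDeviceStats(idx);  // allocation counters
  (void)stats;
  at::accelerator::resetPeakStats(idx);               // clear peak watermarks
}
```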
namespace at {

View File

@ -1 +1,55 @@
#include <torch/headeronly/cpu/vec/intrinsics.h>
#pragma once
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC or clang-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#elif defined(__clang__) && (defined(__ARM_NEON__) || defined(__aarch64__))
/* Clang-compatible compiler, targeting arm neon */
#include <arm_neon.h>
#if defined(__ARM_FEATURE_SVE)
/* CLANG-compatible compiler, targeting ARM with SVE */
#include <arm_sve.h>
#endif
#elif defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#if _MSC_VER <= 1900
#define _mm256_extract_epi64(X, Y) \
(_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2))
#define _mm256_extract_epi32(X, Y) \
(_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4))
#define _mm256_extract_epi16(X, Y) \
(_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8))
#define _mm256_extract_epi8(X, Y) \
(_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16))
#endif
#elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__))
/* GCC-compatible compiler, targeting ARM with NEON */
#include <arm_neon.h>
#if defined(__ARM_FEATURE_SVE)
/* GCC-compatible compiler, targeting ARM with SVE */
#include <arm_sve.h>
#endif
#if defined(MISSING_ARM_VLD1)
#include <ATen/cpu/vec/vec256/missing_vld1_neon.h>
#elif defined(MISSING_ARM_VST1)
#include <ATen/cpu/vec/vec256/missing_vst1_neon.h>
#endif
#elif defined(__GNUC__) && defined(__IWMMXT__)
/* GCC-compatible compiler, targeting ARM with WMMX */
#include <mmintrin.h>
#elif defined(__s390x__)
// targets Z/architecture
// we will include vecintrin later
#elif (defined(__GNUC__) || defined(__xlC__)) && \
(defined(__VEC__) || defined(__ALTIVEC__))
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
#include <altivec.h>
/* We need to undef those tokens defined by <altivec.h> to avoid conflicts
with the C++ types. => Can still use __bool/__vector */
#undef bool
#undef vector
#undef pixel
#elif defined(__GNUC__) && defined(__SPE__)
/* GCC-compatible compiler, targeting PowerPC with SPE */
#include <spe.h>
#endif

View File

@ -1 +1,396 @@
#include <torch/headeronly/cpu/vec/vec256/missing_vld1_neon.h>
/* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7. */
__extension__ extern __inline uint8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u8_x2(const uint8_t* __a) {
uint8x8x2_t ret;
asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s8_x2(const int8_t* __a) {
int8x8x2_t ret;
asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u16_x2(const uint16_t* __a) {
uint16x4x2_t ret;
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s16_x2(const int16_t* __a) {
int16x4x2_t ret;
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u32_x2(const uint32_t* __a) {
uint32x2x2_t ret;
asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s32_x2(const int32_t* __a) {
int32x2x2_t ret;
asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u64_x2(const uint64_t* __a) {
uint64x1x2_t ret;
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s64_x2(const int64_t* __a) {
int64x1x2_t ret;
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f16_x2(const float16_t* __a) {
float16x4x2_t ret;
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f32_x2(const float32_t* __a) {
float32x2x2_t ret;
asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f64_x2(const float64_t* __a) {
float64x1x2_t ret;
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p8_x2(const poly8_t* __a) {
poly8x8x2_t ret;
asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p16_x2(const poly16_t* __a) {
poly16x4x2_t ret;
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p64_x2(const poly64_t* __a) {
poly64x1x2_t ret;
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint8x16x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u8_x2(const uint8_t* __a) {
uint8x16x2_t ret;
asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int8x16x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s8_x2(const int8_t* __a) {
int8x16x2_t ret;
asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u16_x2(const uint16_t* __a) {
uint16x8x2_t ret;
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s16_x2(const int16_t* __a) {
int16x8x2_t ret;
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u32_x2(const uint32_t* __a) {
uint32x4x2_t ret;
asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s32_x2(const int32_t* __a) {
int32x4x2_t ret;
asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u64_x2(const uint64_t* __a) {
uint64x2x2_t ret;
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s64_x2(const int64_t* __a) {
int64x2x2_t ret;
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f16_x2(const float16_t* __a) {
float16x8x2_t ret;
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f32_x2(const float32_t* __a) {
float32x4x2_t ret;
asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f64_x2(const float64_t* __a) {
float64x2x2_t ret;
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly8x16x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p8_x2(const poly8_t* __a) {
poly8x16x2_t ret;
asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p16_x2(const poly16_t* __a) {
poly16x8x2_t ret;
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p64_x2(const poly64_t* __a) {
poly64x2x2_t ret;
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
/* vst1x2 */
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s64_x2(int64_t* __a, int64x1x2_t val) {
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u64_x2(uint64_t* __a, uint64x1x2_t val) {
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f64_x2(float64_t* __a, float64x1x2_t val) {
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s8_x2(int8_t* __a, int8x8x2_t val) {
asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p8_x2(poly8_t* __a, poly8x8x2_t val) {
asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s16_x2(int16_t* __a, int16x4x2_t val) {
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p16_x2(poly16_t* __a, poly16x4x2_t val) {
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s32_x2(int32_t* __a, int32x2x2_t val) {
asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u8_x2(uint8_t* __a, uint8x8x2_t val) {
asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u16_x2(uint16_t* __a, uint16x4x2_t val) {
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u32_x2(uint32_t* __a, uint32x2x2_t val) {
asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f16_x2(float16_t* __a, float16x4x2_t val) {
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f32_x2(float32_t* __a, float32x2x2_t val) {
asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p64_x2(poly64_t* __a, poly64x1x2_t val) {
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s8_x2(int8_t* __a, int8x16x2_t val) {
asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p8_x2(poly8_t* __a, poly8x16x2_t val) {
asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s16_x2(int16_t* __a, int16x8x2_t val) {
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p16_x2(poly16_t* __a, poly16x8x2_t val) {
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s32_x2(int32_t* __a, int32x4x2_t val) {
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s64_x2(int64_t* __a, int64x2x2_t val) {
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u8_x2(uint8_t* __a, uint8x16x2_t val) {
asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u16_x2(uint16_t* __a, uint16x8x2_t val) {
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u32_x2(uint32_t* __a, uint32x4x2_t val) {
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u64_x2(uint64_t* __a, uint64x2x2_t val) {
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f16_x2(float16_t* __a, float16x8x2_t val) {
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2(float32_t* __a, float32x4x2_t val) {
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f64_x2(float64_t* __a, float64x2x2_t val) {
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p64_x2(poly64_t* __a, poly64x2x2_t val) {
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}
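These shims give gcc-7 the paired-register load/store forms via inline asm. A small hypothetical caller, assuming an AArch64 target where the standard NEON intrinsics such as `vmulq_n_f32` are available:

```cpp
#include <arm_neon.h>

// Scale eight floats using the paired load/store shims defined above.
void scale8(const float* in, float* out, float s) {
  float32x4x2_t v = vld1q_f32_x2(in);   // two q-registers in one load
  v.val[0] = vmulq_n_f32(v.val[0], s);  // multiply each lane by s
  v.val[1] = vmulq_n_f32(v.val[1], s);
  vst1q_f32_x2(out, v);                 // paired store of both registers
}
```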

View File

@ -1 +1,7 @@
#include <torch/headeronly/cpu/vec/vec256/missing_vst1_neon.h>
/* Workaround for missing vst1q_f32_x2 in gcc-8. */
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2(float32_t* __a, float32x4x2_t val) {
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}

View File

@ -3,12 +3,50 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <c10/util/Exception.h>
#include <torch/headeronly/cpu/vec/vec_half.h>
namespace at::vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {
#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
!defined(__APPLE__)
static inline uint16_t float2half_scalar(float val) {
#if defined(CPU_CAPABILITY_AVX2)
#if defined(_MSC_VER)
__m256 v = _mm256_set1_ps(val);
__m128i o =
_mm256_cvtps_ph(v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
return static_cast<std::uint16_t>(_mm_cvtsi128_si32(o));
#else
return _cvtss_sh(val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#endif
#elif defined(CPU_CAPABILITY_AVX512)
__m512 v = _mm512_set1_ps(val);
__m256i o =
_mm512_cvtps_ph(v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
return static_cast<std::uint16_t>(
_mm_cvtsi128_si32(_mm256_castsi256_si128(o)));
#endif
}
static inline float half2float_scalar(uint16_t val) {
#if defined(CPU_CAPABILITY_AVX2)
#if defined(_MSC_VER)
__m128i v = _mm_cvtsi32_si128(val);
__m256 o = _mm256_cvtph_ps(v);
return _mm256_cvtss_f32(o);
#else
return _cvtsh_ss(val);
#endif
#elif defined(CPU_CAPABILITY_AVX512)
__m256i v =
_mm256_setr_epi16(val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 o = _mm512_cvtph_ps(v);
return _mm512_cvtss_f32(o);
#endif
}
#endif
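The two helpers above convert a single value through fp16 using CPU intrinsics. A hypothetical round-trip check, assuming an AVX2/AVX512 (non-Apple) build where this block is compiled in:

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>

inline void check_fp16_roundtrip(float x) {
  uint16_t h = float2half_scalar(x);  // float -> IEEE fp16 bit pattern
  float y = half2float_scalar(h);     // fp16 bit pattern -> float
  // fp16 keeps ~11 mantissa bits, so allow ~5e-4 relative error
  // (plus a small absolute slack near zero).
  assert(std::fabs(x - y) <= 5e-4f * std::fabs(x) + 1e-4f);
}
```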
// Transpose a [2, 32] matrix to [32, 2]
// Note: the output leading dimension should be 2,
// that is, the output must be contiguous
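A minimal scalar sketch of the transpose that comment describes (the actual kernel is vectorized; this only pins down the indexing and the contiguous, leading-dimension-2 output layout):

```cpp
// Transpose src[2][32] into dst[32][2]; dst is contiguous with ld = 2.
void transpose_2x32(const float* src, float* dst) {
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 32; ++j) {
      dst[j * 2 + i] = src[i * 32 + j];
    }
  }
}
```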

View File

@ -2,6 +2,7 @@
#include <ATen/cuda/CUDAGraph.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/Functions.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>
#include <cstddef>

View File

@ -2,7 +2,6 @@
#include <ATen/Tensor.h>
#include <c10/core/Device.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/flat_hash_map.h>

View File

@ -162,7 +162,7 @@ struct CUDACachingHostAllocatorImpl
}
bool pinned_use_background_threads() override {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
pinned_use_background_threads();
}

View File

@ -24,29 +24,6 @@ static void _assert_match(const O& original, const C& compared, const std::strin
}
}
template<>
void _assert_match<c10::Device, std::optional<c10::Device>>(
const c10::Device& original,
const std::optional<c10::Device>& compared,
const std::string& name) {
if (compared) {
const c10::Device& expected = compared.value();
if (original.type() != expected.type()) {
std::stringstream msg;
msg << "Tensor " << name << " mismatch! Expected: " << expected << ", Got: " << original;
throw std::runtime_error(msg.str());
}
// If the expected device doesn't have an index (e.g., just "cuda"),
// or if both devices have the same index, consider them equal
if (expected.has_index() && original.has_index() && expected.index() != original.index()) {
std::stringstream msg;
msg << "Tensor " << name << " mismatch! Expected: " << expected << ", Got: " << original;
throw std::runtime_error(msg.str());
}
}
}
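For reference, the matching rule implemented by the specialization above: an index-less expected device matches any index of the same device type, while two explicit indices must agree. A semantics sketch (`_assert_match` is file-local, so these are illustrative calls, not reusable API):

```cpp
void demo_device_match() {
  c10::Device got(c10::DeviceType::CUDA, 1);
  _assert_match(got, std::optional<c10::Device>(c10::Device("cuda")), "t");    // ok: expected has no index
  _assert_match(got, std::optional<c10::Device>(c10::Device("cuda:1")), "t");  // ok: indices agree
  _assert_match(got, std::optional<c10::Device>(c10::Device("cuda:0")), "t");  // throws: index mismatch
}
```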
void _assert_tensor_metadata_meta_symint(at::Tensor const& tensor, at::OptionalSymIntArrayRef sizes, at::OptionalSymIntArrayRef strides, std::optional<c10::ScalarType> dtype, std::optional<c10::Device> device, std::optional<c10::Layout> layout) {
_assert_match(tensor.sym_sizes(), sizes, "sizes");
_assert_match(tensor.sym_strides(), strides, "strides");

View File

@ -367,27 +367,27 @@ void int8pack_mm_kernel_(
auto* C_data = C.data_ptr<T>();
const auto* S_data = scales.const_data_ptr<T>();
int64_t M = A.size(0);
int64_t N = B.size(0);
int64_t K = A.size(1);
int64_t lda = A.stride(0);
constexpr int64_t BLOCK_M = 4;
constexpr int64_t BLOCK_N = 4;
int M = A.size(0);
int N = B.size(0);
int K = A.size(1);
int lda = A.stride(0);
constexpr int BLOCK_M = 4;
constexpr int BLOCK_N = 4;
const int64_t MB = (M + BLOCK_M - 1) / BLOCK_M;
const int64_t NB = (N + BLOCK_N - 1) / BLOCK_N;
const int MB = (M + BLOCK_M - 1) / BLOCK_M;
const int NB = (N + BLOCK_N - 1) / BLOCK_N;
at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) {
int64_t mb{0}, nb{0};
at::parallel_for(0, MB * NB, 0, [&](int begin, int end) {
int mb{0}, nb{0};
data_index_init(begin, mb, MB, nb, NB);
for (const auto i : c10::irange(begin, end)) {
(void)i;
int64_t mb_start = mb * BLOCK_M;
int64_t mb_size = std::min(BLOCK_M, M - mb_start);
int64_t nb_start = nb * BLOCK_N;
int64_t nb_size = std::min(BLOCK_N, N - nb_start);
int mb_start = mb * BLOCK_M;
int mb_size = std::min(BLOCK_M, M - mb_start);
int nb_start = nb * BLOCK_N;
int nb_size = std::min(BLOCK_N, N - nb_start);
const auto* A_ptr = A_data + mb_start * lda;
const auto* B_ptr = B_data + nb_start * K;
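The hunk above toggles this loop between `int` and `int64_t` indexing; the underlying tiling scheme is unchanged. A standalone sketch of that scheme (hypothetical helper, not repo code):

```cpp
#include <algorithm>
#include <cstdint>

// MB*NB tiles are flattened into one parallel range; each flat index i
// maps back to a (mb, nb) tile, clamped at the matrix edges.
void visit_tiles(int64_t M, int64_t N) {
  constexpr int64_t BLOCK_M = 4, BLOCK_N = 4;
  const int64_t MB = (M + BLOCK_M - 1) / BLOCK_M;  // ceil-div over rows
  const int64_t NB = (N + BLOCK_N - 1) / BLOCK_N;  // ceil-div over cols
  for (int64_t i = 0; i < MB * NB; ++i) {
    const int64_t mb = i / NB, nb = i % NB;
    const int64_t mb_size = std::min(BLOCK_M, M - mb * BLOCK_M);  // edge clamp
    const int64_t nb_size = std::min(BLOCK_N, N - nb * BLOCK_N);
    (void)mb_size; (void)nb_size;  // ... compute the mb_size x nb_size tile ...
  }
}
```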

View File

@ -526,7 +526,7 @@ namespace {
// we are dealing with packed tensor here. max index is the same as numel.
// TODO: to really support input tensor large enough to go beyond int32,
// TODO: to really support input tensor large enought to go beyond int32,
// we will need to restrict our shared memory usage and adjust the launch
// config;
AT_ASSERT(input_.numel() < std::numeric_limits<int32_t>::max());
@ -681,7 +681,7 @@ namespace {
const dim3 grid(grid_x, grid_y, grid_z);
// we are dealing with packed tensor here. max index is the same as numel.
// TODO: to really support input tensor large enough to go beyond int32,
// TODO: to really support input tensor large enought to go beyond int32,
// we will need to restrict our shared memory usage and adjust the launch
// config;
AT_ASSERT(input.numel() < std::numeric_limits<int32_t>::max());

View File

@ -21,10 +21,6 @@
#include <ATen/native/cuda/GroupMM.h>
#include <ATen/ceil_div.h>
#ifdef USE_FBGEMM_GENAI
#include <fbgemm_gpu/torch_ops.h>
#endif
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
@ -1220,7 +1216,7 @@ std::pair<ScalingType, ScalingType> get_joint_scaling(
// - `scale_a`: a tensor with the inverse scale of `mat1`, whose shape/strides/dtype depend on the scaling scheme
// - `scale_b`: a tensor with the inverse scale of `mat2`, whose shape/strides/dtype depend on the scaling scheme
// - `scale_result`: a scalar tensor with the scale of the output, only utilized if the output is a float8 type
// - `use_fast_accum`: if true, enables fast float8 accumulation. Backends may ignore this option if not applicable.
// - `use_fast_accum`: if true, enables fast float8 accumulation
// - `out`: a reference to the output tensor
Tensor&
@ -1529,7 +1525,6 @@ namespace {
const auto out_dtype_ = out_dtype.value_or(kBFloat16);
TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm");
#ifndef USE_ROCM
// For TMA transfers, strides of output tensor have to be either
// 1, or aligned to 16 bytes.
const auto last_dim = out_size.size() - 1;
@ -1541,10 +1536,9 @@ namespace {
} else {
out_stride = {out_size[1] * size_padded, size_padded, 1};
}
return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_));
#else
return at::empty(out_size, mat_a.options().dtype(out_dtype_));
#endif
auto out = at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_));
return out;
}
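A hypothetical helper mirroring the stride computation above: the last output dimension is rounded up so every row starts on a 16-byte boundary, which is what TMA requires (for bf16, 16 bytes is 8 elements):

```cpp
#include <cstdint>

// Round n elements up so each row begins on a 16-byte boundary.
int64_t pad_last_dim(int64_t n, int64_t elem_size /* e.g. 2 for bf16 */) {
  const int64_t align = 16 / elem_size;  // elements per 16 bytes
  return (n + align - 1) / align * align;
}
// pad_last_dim(13, 2) == 16: a [G, M, 13] bf16 output gets strides
// {M * 16, 16, 1}, keeping each row TMA-aligned.
```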
bool check_valid_strides_and_return_transposed(const Tensor& mat) {
@ -1625,18 +1619,18 @@ const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum) {
bool allowed_device = _scaled_mm_allowed_device();
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+");
#ifndef USE_ROCM
bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true);
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0");
TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type());
TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fn, "Expected mat_b to be Float8_e4m3 matrix got ", mat_b.scalar_type());
TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed");
TORCH_CHECK(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed");
TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d");
TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d");
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (!a_is_2d || !b_is_2d) {
TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}
TORCH_CHECK(
mat_a.size(-1) % 16 == 0,
"Expected trailing dimension of mat_a to be divisible by 16 ",
@ -1670,10 +1664,6 @@ bool use_fast_accum) {
Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype);
#ifndef USE_ROCM
TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type());
TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fn, "Expected mat_b to be Float8_e4m3 matrix got ", mat_b.scalar_type());
at::cuda::detail::f8f8bf16_grouped_mm(
mat_a,
mat_b,
@ -1684,23 +1674,12 @@ bool use_fast_accum) {
use_fast_accum,
out);
return out;
#else
#ifdef USE_FBGEMM_GENAI
TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_a.scalar_type());
TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_b to be Float8_e4m3fnuz matrix got ", mat_b.scalar_type());
fbgemm_gpu::f8f8bf16_rowwise_grouped_mm(
mat_a,
// FBGEMM expects B matrix shape to be (.., N, K)
mat_b.transpose(-2, -1),
scale_a,
scale_b,
offs,
out);
return out;
#else
TORCH_CHECK(false, "grouped gemm is not supported without USE_FBGEMM_GENAI on ROCM")
#endif
TORCH_CHECK(false, "grouped gemm is not supported on ROCM")
#endif
}
@ -1719,9 +1698,6 @@ std::optional<c10::ScalarType> out_dtype) {
TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d");
const bool a_is_2d = mat_a.dim() == 2;
const bool b_is_2d = mat_b.dim() == 2;
if (!a_is_2d || !b_is_2d) {
TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}
// check that the strides are valid, the fn will throw an error if not
check_valid_strides_and_return_transposed(mat_a);

View File

@ -223,7 +223,7 @@ inline CuFFTDataLayout as_cufft_embed(IntArrayRef strides, IntArrayRef sizes, bo
class CuFFTConfig {
public:
// Only move semantics is enough for this class. Although we already use
// Only move semantics is enought for this class. Although we already use
// unique_ptr for the plan, still remove copy constructor and assignment op so
// we don't accidentally copy and take perf hit.
CuFFTConfig(const CuFFTConfig&) = delete;

View File

@ -38,19 +38,17 @@ static inline std::string _cudaGetErrorEnum(cufftResult error)
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
case CUFFT_INCOMPLETE_PARAMETER_LIST:
return "CUFFT_INCOMPLETE_PARAMETER_LIST";
case CUFFT_INVALID_DEVICE:
return "CUFFT_INVALID_DEVICE";
case CUFFT_PARSE_ERROR:
return "CUFFT_PARSE_ERROR";
case CUFFT_NO_WORKSPACE:
return "CUFFT_NO_WORKSPACE";
case CUFFT_NOT_IMPLEMENTED:
return "CUFFT_NOT_IMPLEMENTED";
#if CUDA_VERSION <= 12090
case CUFFT_INCOMPLETE_PARAMETER_LIST:
return "CUFFT_INCOMPLETE_PARAMETER_LIST";
case CUFFT_PARSE_ERROR:
return "CUFFT_PARSE_ERROR";
#endif
#if !defined(USE_ROCM) && CUDA_VERSION <= 12090
#if !defined(USE_ROCM)
case CUFFT_LICENSE_ERROR:
return "CUFFT_LICENSE_ERROR";
#endif

View File

@ -241,8 +241,6 @@ void bf16bf16_grouped_gemm_impl_sm90_sm100(
Strides tensor_StrideA = make_strides(mat_a.strides());
Strides tensor_StrideB = make_strides(mat_b.strides());
Strides tensor_StrideOutput = make_strides(out.strides());
Strides tensor_ShapeA = make_strides(mat_a.sizes());
Strides tensor_ShapeB = make_strides(mat_b.sizes());
at::cuda::detail::prepare_grouped_gemm_data<<<1, group_count, 0, stream>>>(
reinterpret_cast<DtypeA*>(mat_a.data_ptr()),
@ -266,8 +264,6 @@ void bf16bf16_grouped_gemm_impl_sm90_sm100(
tensor_StrideA,
tensor_StrideB,
tensor_StrideOutput,
tensor_ShapeA,
tensor_ShapeB,
0,
0,
a_row_major,

View File

@ -38,20 +38,18 @@ __global__ void prepare_grouped_gemm_data(
Strides tensor_StrideA,
Strides tensor_StrideB,
Strides tensor_StrideOutput,
Strides tensor_ShapeA,
Strides tensor_ShapeB,
int64_t a_scale_stride,
int64_t b_scale_stride,
bool a_row_major = true,
bool b_row_major = false) {
int32_t tid = threadIdx.x;
int32_t delta = 0;
int32_t offset = 0;
if (offs != nullptr) {
int32_t start = tid == 0 ? 0 : offs[tid - 1];
offset = offs[tid];
delta = offset - start;
CUDA_KERNEL_ASSERT(delta >=0 && "expected gemm dimension to be greater or equal 0\n");
delta = offs[tid] - start;
if (K < 0) {
CUDA_KERNEL_ASSERT(delta >=0 && "expected ofsets to be greater or equal 0\n");
}
// TMA transfers require global memory tensor addresses to be
// aligned to 16 bytes.
@ -86,7 +84,6 @@ __global__ void prepare_grouped_gemm_data(
int64_t lda, ldb, ldoutput;
if (M < 0) {
// A and output is 2d
CUDA_KERNEL_ASSERT(offset <= tensor_ShapeA[0] && "expected offset to be less than tensor size\n");
M = delta;
lda = a_row_major ? tensor_StrideA[0] : tensor_StrideA[1];
ldb = b_row_major ? tensor_StrideB[1] : tensor_StrideB[2];
@ -99,7 +96,6 @@ __global__ void prepare_grouped_gemm_data(
output_ptrs[tid] = tid == 0 ? output : output + offs[tid - 1] * ldoutput;
B_ptrs[tid] = B + tid * tensor_StrideB[0];
} else if (N < 0) {
CUDA_KERNEL_ASSERT(offset <= tensor_ShapeB[1] && "expected offset to be less than tensor size\n");
N = delta;
lda = a_row_major ? tensor_StrideA[1] : tensor_StrideA[2];
ldb = b_row_major ? tensor_StrideB[0] : tensor_StrideB[1]; // B is transposed
@ -112,7 +108,6 @@ __global__ void prepare_grouped_gemm_data(
inputB_scale_ptrs[tid] = tid == 0 ? scale_B : scale_B + offs[tid - 1];
}
} else if (K < 0) {
CUDA_KERNEL_ASSERT(offset <= tensor_ShapeA[1] && offset <= tensor_ShapeB[0] && "expected offset to be less than tensor size\n");
// A, B is 2d, output is 3d
K = delta;
lda = a_row_major ? tensor_StrideA[0] : tensor_StrideA[1];

View File

@ -644,12 +644,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_
Tensor grad = at::full_like(log_probs, neginf, LEGACY_CONTIGUOUS_MEMORY_FORMAT); // initialization for log(sum (alpha beta))
// As above, there may be better configurations to use.
constexpr int max_threads_ = std::is_same_v<scalar_t, float> ? 1024 : 896; // we need 72 or so 32 bit registers for double
int max_threads = max_threads_;
// Blackwell launch bounds
if (at::cuda::getCurrentDeviceProperties()->major >= 10) {
max_threads = 512;
}
constexpr int max_threads = std::is_same_v<scalar_t, float> ? 1024 : 896; // we need 72 or so 32 bit registers for double
int threads_target = max_threads;
while (threads_target / 2 >= 2*max_target_length+1) {
threads_target /= 2;
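A standalone sketch of the halving loop above: it picks the smallest power-of-two thread count that still covers the `2*max_target_length+1` work items per block (hypothetical helper, extracted for illustration):

```cpp
int pick_threads_target(int max_threads, int max_target_length) {
  int threads_target = max_threads;
  while (threads_target / 2 >= 2 * max_target_length + 1) {
    threads_target /= 2;
  }
  return threads_target;
}
// pick_threads_target(1024, 10) == 32: with 2*10+1 = 21 items,
// halving stops once 32/2 = 16 < 21.
```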

View File

@ -9,7 +9,6 @@
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wmissing-field-initializers")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
// Determine if the architecture supports rowwise scaled mm
// Currently failing on windows with:
@ -45,7 +44,6 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
#include <ATen/native/cuda/cutlass_common.cuh>
C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()

View File

@ -10,7 +10,6 @@
// Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
// Determine if the architecture supports rowwise scaled mm
// Currently failing on windows with:
@ -45,7 +44,6 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
#include <cutlass/gemm/kernel/gemm_universal.hpp>
#include <cutlass/util/packed_stride.hpp>
C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()
@ -298,9 +296,6 @@ void f8f8bf16_grouped_gemm_impl_sm90(
Strides tensor_StrideA = make_strides(mat_a.strides());
Strides tensor_StrideB = make_strides(mat_b.strides());
Strides tensor_StrideOutput = make_strides(out.strides());
Strides tensor_ShapeA = make_strides(mat_a.sizes());
Strides tensor_ShapeB = make_strides(mat_b.sizes());
// scale stride will be used inside the kernel only if needed,
// so for 1d scales the "1" assigned here won't be used
int64_t a_scale_stride = scale_a.stride(0);
@ -328,8 +323,6 @@ void f8f8bf16_grouped_gemm_impl_sm90(
tensor_StrideA,
tensor_StrideB,
tensor_StrideOutput,
tensor_ShapeA,
tensor_ShapeB,
a_scale_stride,
b_scale_stride);

View File

@ -1,74 +0,0 @@
#include <ATen/ATen.h>
#include <ATen/core/Tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
namespace at::native {
__global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const float* scale, float* out, int B, int K, int N) {
// one thread per output element: [B, N]
int b = blockIdx.y * blockDim.y + threadIdx.y;
int n = blockIdx.x * blockDim.x + threadIdx.x;
if (b >= B || n >= N) return;
float acc = 0.0f;
for (int k = 0; k < K; ++k) {
acc += x[b * K + k] * static_cast<float>(w[n * K + k]);
}
out[b * N + n] = acc * scale[n];
}
void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8, const Tensor& scale, Tensor& out) {
const int B = x.size(0);
const int K = x.size(1);
const int N = w_int8.size(0);
const dim3 block(16, 16);
const dim3 grid((N + block.x - 1) / block.x, (B + block.y - 1) / block.y);
auto stream = at::cuda::getCurrentCUDAStream();
weight_int8pack_mm_kernel<<<grid, block, 0, stream>>>(
x.data_ptr<float>(),
w_int8.data_ptr<int8_t>(),
scale.data_ptr<float>(),
out.data_ptr<float>(),
B, K, N);
}
// Main GPU entry point
at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int8, const at::Tensor& scale) {
// --- Check inputs ---
TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor");
TORCH_CHECK(w_int8.is_cuda(), "w must be a CUDA tensor");
TORCH_CHECK(scale.is_cuda(), "scale must be a CUDA tensor");
TORCH_CHECK(x.dim() == 2, "x must be 2D");
TORCH_CHECK(w_int8.dim() == 2, "w must be 2D");
TORCH_CHECK(scale.dim() == 1, "scale must be 1D");
TORCH_CHECK(x.size(1) == w_int8.size(1), "K dimension mismatch: x.size(1) != w.size(1)");
TORCH_CHECK(w_int8.size(0) == scale.size(0), "Output dim mismatch: w.size(0) != scale.size(0)");
// --- Determine shapes ---
auto B = x.size(0); // batch size
auto N = w_int8.size(0); // output dim
// Ensure inputs are in the correct types for the kernel
auto x_f32 = x.to(at::kFloat);
auto w_int8_contiguous = w_int8.contiguous();
auto scale_f32 = scale.to(at::kFloat);
// --- Allocate output ---
auto out = at::empty({B, N}, x.options().dtype(at::kFloat));
// --- Launch kernel ---
launch_weight_int8pack_mm_cuda_kernel(x_f32, w_int8_contiguous, scale_f32, out);
return out;
}
} // namespace at::native
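A hedged usage sketch for the kernel above (shapes follow the `TORCH_CHECK`s in `_weight_int8pack_mm_cuda`; values are arbitrary and this is illustrative, not a test from the repo):

```cpp
at::Tensor demo_int8_mm() {
  auto x = at::randn({8, 64}, at::device(at::kCUDA).dtype(at::kFloat));  // [B, K]
  auto w = at::randint(-128, 128, {32, 64},
                       at::device(at::kCUDA).dtype(at::kChar));          // [N, K] int8
  auto s = at::rand({32}, at::device(at::kCUDA).dtype(at::kFloat));     // [N] per-row scale
  return at::native::_weight_int8pack_mm_cuda(x, w, s);                 // [B, N] == (x @ w.T) * s
}
```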

View File

@ -45,7 +45,7 @@ namespace at::cuda::jit {
// Copied from aten/src/ATen/cuda/llvm_basic.cpp, then modified as above.
// If not compiling for ROCm, return the original get_traits_string().
std::string get_traits_string_but_hiprtc_safe() {
#if defined(USE_ROCM) && HIP_VERSION_MAJOR < 7
#if defined(USE_ROCM) && ROCM_VERSION < 70000
return R"ESCAPE(
namespace std {

View File

@ -28,22 +28,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
TORCH_CHECK(false, "cudnn_batch_norm: ATen not compiled with cuDNN support");
}
std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
const Tensor& input,
const Tensor& weight,
const std::optional<Tensor>& bias,
const std::optional<Tensor>& running_mean,
const std::optional<Tensor>& running_var,
bool training,
double exponential_average_factor,
double epsilon,
Tensor& out,
Tensor& save_mean,
Tensor& save_var,
Tensor& reserve) {
AT_ERROR("cudnn_batch_norm_out: ATen not compiled with cuDNN support");
}
std::tuple<Tensor, Tensor, Tensor> cudnn_batch_norm_backward(
const Tensor& input,
const Tensor& grad_output,
@ -136,12 +120,7 @@ size_t _get_cudnn_batch_norm_reserve_space_size(
return reserve_size;
}
// Param `reserve` is a placeholder, just passing an empty tensor.
// usage:
// auto reserve = torch::empty({0}, torch::device(torch::kCUDA));
// at::native::cudnn_batch_norm_out(..., epsilon, output, save_mean, save_var,
// reserve);
std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
const Tensor& input_t,
const Tensor& weight_t,
const std::optional<Tensor>& bias_t_opt,
@ -149,11 +128,7 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
const std::optional<Tensor>& running_var_t_opt,
bool training,
double exponential_average_factor,
double epsilon,
Tensor& output_t,
Tensor& save_mean,
Tensor& save_var,
Tensor& reserve) {
double epsilon) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> bias_t_maybe_owned =
at::borrow_from_optional_tensor(bias_t_opt);
@ -193,6 +168,9 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
cudnnBatchNormMode_t mode = getCudnnBatchNormMode(
training, input->suggest_memory_format(), input->dim());
auto output_t =
at::empty_like(*input, input->options(), input->suggest_memory_format());
TensorArg output{output_t, "output", 0};
auto handle = getCudnnHandle();
@ -204,8 +182,15 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
Constant one(dataType, 1);
Constant zero(dataType, 0);
Tensor save_mean, save_var;
Tensor reserve;
if (training) {
int64_t num_features = input_t.size(1);
save_mean = at::empty({num_features}, weight_t.options());
save_var = at::empty({num_features}, weight_t.options());
auto op = CUDNN_BATCHNORM_OPS_BN;
size_t workspace_size;
AT_CUDNN_CHECK(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
@ -253,6 +238,9 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
reserve_size));
} else {
reserve = at::empty({0}, input->options().dtype(kByte));
// This keeps a consistent output with native_batch_norm
save_mean = at::empty({0}, weight_t.options());
save_var = at::empty({0}, weight_t.options());
AT_CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
handle,
mode,
@ -273,48 +261,10 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
// save_mean and save_var can be undefined
// If this causes problems, we can initialize them to empty tensors
// of the correct type
return std::tuple<Tensor&, Tensor&, Tensor&, Tensor&>{
return std::tuple<Tensor, Tensor, Tensor, Tensor>{
output_t, save_mean, save_var, reserve};
}
std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
const Tensor& input_t,
const Tensor& weight_t,
const std::optional<Tensor>& bias_t_opt,
const std::optional<Tensor>& running_mean_t_opt,
const std::optional<Tensor>& running_var_t_opt,
bool training,
double exponential_average_factor,
double epsilon) {
auto output_t = at::empty_like(
input_t, input_t.options(), input_t.suggest_memory_format());
Tensor save_mean, save_var, reserve;
if (training) {
int64_t num_features = input_t.size(1);
save_mean = at::empty({num_features}, weight_t.options());
save_var = at::empty({num_features}, weight_t.options());
} else {
// This keeps a consistent output with native_batch_norm
save_mean = at::empty({0}, weight_t.options());
save_var = at::empty({0}, weight_t.options());
}
return cudnn_batch_norm_out(
input_t,
weight_t,
bias_t_opt,
running_mean_t_opt,
running_var_t_opt,
training,
exponential_average_factor,
epsilon,
output_t,
save_mean,
save_var,
reserve);
}
// NB: CuDNN only implements the backward algorithm for batchnorm
// in training mode (evaluation mode batchnorm has a different algorithm),
// which is why this doesn't accept a 'training' parameter.

View File

@ -342,8 +342,8 @@ Tensor rms_norm_symint(
if (weight_opt.has_value() && weight_opt.value().defined() && weight_opt.value().dtype() != input.dtype()) {
TORCH_WARN_ONCE(
"Mismatch dtype between input and weight: input dtype = ", input.dtype(),
", weight dtype = ", weight_opt.value().dtype(), ", Cannot dispatch to fused implementation."
"Mismatch dtype between input and module: input dtype = ", input.dtype(),
", module dtype = ", weight_opt.value().dtype(), ", Can not dispatch to fused implementation"
);
return std::get<0>(rms_norm_composite(input, IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps));
}

View File

@ -1,6 +1,7 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/Config.h>
#include <ATen/Context.h>
#include <ATen/Dispatch.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/mkldnn/Matmul.h>
@ -427,74 +428,56 @@ static inline bool checksize(const Tensor& mat1, const Tensor& mat2){
}
}
bool use_mkldnn_bf16_matmul(
template <typename T>
bool use_mkldnn_typed_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
bool dtype_check = false;
if constexpr (std::is_same_v<T, c10::BFloat16>) {
#if defined(__aarch64__)
if (mkldnn_bf16_device_check_arm()) {
// onednn fastmath mode can leverage bf16 HW even for fp32 input (e.g. on
// Arm Neoverse V1), so don't restrict mkldnn_matmul to bf16 inputs only;
// allow it for float as well
return (
use_mkldnn_bf16_matmul() &&
(mat1.scalar_type() == mat2.scalar_type()) &&
(!result.defined() || (mat1.scalar_type() == result.scalar_type())) &&
((mat1.scalar_type() == kFloat) || (mat1.scalar_type() == kBFloat16)) &&
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
} else
if (mkldnn_bf16_device_check_arm()) {
// onednn fastmath mode can leverage bf16 HW even for fp32 input (e.g. on
// Arm Neoverse V1), so don't restrict mkldnn_matmul to bf16 inputs only;
// allow it for float as well
dtype_check = use_mkldnn_bf16_matmul() &&
((mat1.scalar_type() == kFloat) || (mat1.scalar_type() == kBFloat16));
}
#else
dtype_check = dtype_check && use_mkldnn_bf16_matmul() &&
(mat1.scalar_type() == kBFloat16);
#endif
{
return (
use_mkldnn_bf16_matmul() && mat1.scalar_type() == kBFloat16 &&
mat2.scalar_type() == kBFloat16 &&
(!result.defined() || result.scalar_type() == kBFloat16) &&
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
} else if constexpr (std::is_same_v<T, c10::Half>) {
dtype_check = dtype_check && use_mkldnn_fp16_matmul() &&
(mat1.scalar_type() == kHalf);
} else if constexpr (std::is_same_v<T, float>) {
dtype_check = dtype_check &&
(use_mkldnn_bf32_matmul() || use_mkldnn_tf32_matmul()) &&
(mat1.scalar_type() == kFloat);
}
}
bool use_mkldnn_fp16_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
return (
use_mkldnn_fp16_matmul() && mat1.scalar_type() == kHalf &&
mat2.scalar_type() == kHalf &&
(!result.defined() || result.scalar_type() == kHalf) &&
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
}
bool use_mkldnn_bf32_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
return (
use_mkldnn_bf32_matmul() && mat1.scalar_type() == kFloat &&
mat2.scalar_type() == kFloat &&
(!result.defined() || result.scalar_type() == kFloat) &&
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
}
bool use_mkldnn_tf32_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
return (
use_mkldnn_tf32_matmul() && mat1.scalar_type() == kFloat &&
mat2.scalar_type() == kFloat &&
(!result.defined() || result.scalar_type() == kFloat) &&
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
if (!dtype_check) {
return false;
}
bool size_check =
mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2);
dtype_check = (mat1.scalar_type() == mat2.scalar_type()) &&
(!result.defined() || result.scalar_type() == mat1.scalar_type());
return dtype_check && size_check;
}
bool use_mkldnn_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
return (
use_mkldnn_bf16_matmul(mat1, mat2, result) ||
use_mkldnn_fp16_matmul(mat1, mat2, result) ||
use_mkldnn_bf32_matmul(mat1, mat2, result) ||
use_mkldnn_tf32_matmul(mat1, mat2, result));
auto mat1_type = mat1.scalar_type();
if (mat1_type != kBFloat16 && mat1_type != kHalf && mat1_type != kFloat) {
return false;
}
AT_DISPATCH_FLOATING_TYPES_AND2(
kBFloat16, kHalf, mat1.scalar_type(), "use_mkldnn_matmul", [&] {
return use_mkldnn_typed_matmul<scalar_t>(mat1, mat2, result);
});
return false;
}
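For reference, the `AT_DISPATCH_*` macros do not propagate the lambda's return value, so a dispatch of this shape usually routes the result through a captured variable. A hedged sketch of that pattern (not the PR's code; `use_mkldnn_typed_matmul` is the file-local template above):

```cpp
bool use_mkldnn_matmul_sketch(
    const at::Tensor& mat1, const at::Tensor& mat2, const at::Tensor& result) {
  bool ok = false;
  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::kBFloat16, at::kHalf, mat1.scalar_type(), "use_mkldnn_matmul", [&] {
        ok = use_mkldnn_typed_matmul<scalar_t>(mat1, mat2, result);  // capture result
      });
  return ok;
}
```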
static void _mkldnn_matmul_i8i8i32_with_primitive(

View File

@ -469,94 +469,4 @@ Tensor _weight_int4pack_mm_xpu(
return C;
}
Tensor& _int_mm_out_xpu(
const Tensor& self,
const Tensor& mat2,
Tensor& result) {
TORCH_CHECK(
self.dim() == 2,
"Expected self to be of dimension 2 but got ",
self.dim());
TORCH_CHECK(
mat2.dim() == 2,
"Expected mat2 to be of dimension 2 but got ",
mat2.dim());
TORCH_CHECK(
self.size(1) == mat2.size(0),
"self.size(1) needs to match mat2.size(0) but got ",
self.size(1),
" and ",
mat2.size(0));
TORCH_CHECK(
self.dtype() == at::kChar,
"Expected self dtype to be of type int8 but got ",
self.dtype());
TORCH_CHECK(
mat2.dtype() == at::kChar,
"Expected mat2 dtype to be of type int8 but got ",
mat2.dtype());
TORCH_CHECK(
result.dtype() == at::kInt,
"Expected result dtype to be of type kInt but got ",
result.dtype());
TORCH_CHECK(
result.size(0) == self.size(0),
"Expected result.size(0) to be ",
self.size(0),
" but got ",
result.size(0));
TORCH_CHECK(
result.size(1) == mat2.size(1),
"Expected result.size(1) to be ",
mat2.size(1),
" but got ",
result.size(1));
TORCH_CHECK(
result.dim() == 2,
"Expected result to be of dimension 2 but got ",
result.dim());
TORCH_CHECK(result.is_contiguous(), "Expected result to be contiguous.");
if (result.numel() == 0 || self.size(1) == 0) {
return result.zero_();
}
Tensor bias = at::Tensor();
Tensor mat2_scales = at::ones({1}, mat2.options().dtype(at::kFloat));
Tensor mat2_zero_points = at::Tensor();
auto post_op_args = torch::List<std::optional<at::Scalar>>();
at::native::onednn::quantized_matmul(
self.contiguous(),
1.0,
0,
mat2.contiguous(),
mat2_scales,
mat2_zero_points,
bias,
result,
1.0,
0,
result.scalar_type(),
/*other*/ std::nullopt,
/*other scale*/ 1.0,
/*other zp*/ 0,
/*binary post op*/ "none",
/*binary alpha*/ 1.0,
/*post_op_name*/ "none",
post_op_args,
/*post_op_algorithm*/ "none",
/*m2_trans*/ true);
return result;
}
Tensor _int_mm_xpu(const Tensor& self, const Tensor& mat2) {
Tensor result =
at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt));
return _int_mm_out_xpu(self, mat2, result);
}
} // namespace at::native

View File

@ -953,7 +953,8 @@ class BundledShaderLibary : public MetalShaderLibrary {
if (C10_UNLIKELY(!library)) {
auto device = MPSDevice::getInstance()->device();
NSError* error = nil;
library = [device newLibraryWithData:getSectionData("metal_basic") error:&error];
auto section_name = is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS) ? "metal_bfloat" : "metal_basic";
library = [device newLibraryWithData:getSectionData(section_name) error:&error];
TORCH_CHECK(library, "Failed to create metal library, error: ", [[error description] UTF8String]);
}
return library;

View File

@ -33,15 +33,21 @@ struct shrink_backward_functor {
REGISTER_UNARY_ALPHA_OP(hardshrink, float, float, float);
REGISTER_UNARY_ALPHA_OP(hardshrink, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_ALPHA_OP(hardshrink, bfloat, bfloat, bfloat);
#endif
REGISTER_UNARY_ALPHA_OP(softshrink, float, float, float);
REGISTER_UNARY_ALPHA_OP(softshrink, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_ALPHA_OP(softshrink, bfloat, bfloat, bfloat);
#endif
REGISTER_BINARY_ALPHA_OP(shrink_backward, float, float, float);
REGISTER_BINARY_ALPHA_OP(shrink_backward, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_ALPHA_OP(shrink_backward, bfloat, bfloat, bfloat);
#endif
struct hardsigmoid_functor {
template <typename T>
@ -61,11 +67,15 @@ struct hardsigmoid_backward_functor {
REGISTER_UNARY_OP(hardsigmoid, float, float);
REGISTER_UNARY_OP(hardsigmoid, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_OP(hardsigmoid, bfloat, bfloat);
#endif
REGISTER_BINARY_OP(hardsigmoid_backward, float, float);
REGISTER_BINARY_OP(hardsigmoid_backward, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_OP(hardsigmoid_backward, bfloat, bfloat);
#endif
struct hardswish_functor {
template <typename T>
@ -93,11 +103,15 @@ struct hardswish_backward_functor {
REGISTER_UNARY_OP(hardswish, float, float);
REGISTER_UNARY_OP(hardswish, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_OP(hardswish, bfloat, bfloat);
#endif
REGISTER_BINARY_OP(hardswish_backward, float, float);
REGISTER_BINARY_OP(hardswish_backward, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_OP(hardswish_backward, bfloat, bfloat);
#endif
struct leaky_relu_functor {
template <typename T>
@ -121,8 +135,12 @@ struct leaky_relu_backward_functor {
REGISTER_UNARY_ALPHA_OP(leaky_relu, float, float, float);
REGISTER_UNARY_ALPHA_OP(leaky_relu, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_ALPHA_OP(leaky_relu, bfloat, bfloat, bfloat);
#endif
REGISTER_BINARY_ALPHA_OP(leaky_relu_backward, float, float, float);
REGISTER_BINARY_ALPHA_OP(leaky_relu_backward, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_ALPHA_OP(leaky_relu_backward, bfloat, bfloat, bfloat);
#endif

View File

@ -113,12 +113,18 @@ kernel void ampUpdateScale(
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE(float);
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE(bfloat);
#endif
INSTANTIATE_AMP_UPDATE_SCALE(float);
INSTANTIATE_AMP_UPDATE_SCALE(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_AMP_UPDATE_SCALE(bfloat);
#endif
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE_SINGLE(float);
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE_SINGLE(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE_SINGLE(bfloat);
#endif

View File

@ -590,7 +590,9 @@ kernel void attention(
INSTANTIATE_SDPA_VECTOR_HEADS(float);
INSTANTIATE_SDPA_VECTOR_HEADS(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_SDPA_VECTOR_HEADS(bfloat);
#endif
#define INSTANTIATE_ATTN(DTYPE, bq, bk, bd, wm, wn) \
template [[host_name("attention_" #DTYPE "_bq" #bq "_bk" #bk "_bd" #bd \
@ -619,4 +621,6 @@ INSTANTIATE_SDPA_VECTOR_HEADS(bfloat);
INSTANTIATE_ATTN_SHAPES_HELPER(float);
INSTANTIATE_ATTN_SHAPES_HELPER(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_ATTN_SHAPES_HELPER(bfloat);
#endif

View File

@ -209,9 +209,38 @@ struct hermite_polynomial_he_functor {
};
struct nextafter_functor {
#if __METAL_VERSION__ < 310
template <typename U>
struct bit_type {};
template <>
struct bit_type<float> {
using type = int;
};
template <>
struct bit_type<half> {
using type = short;
};
#endif
template <typename T>
inline T operator()(const T a, const T b) {
#if __METAL_VERSION__ >= 310
return static_cast<T>(::metal::nextafter(a, b));
#else
using U = typename bit_type<T>::type;
if (a == b) {
return a;
}
if (::metal::isunordered(a, b)) {
return NAN;
}
if (a == 0) {
constexpr auto eps = as_type<T>(static_cast<U>(1));
return b > 0 ? eps : -eps;
}
auto bits = as_type<U>(a);
(a > 0) ^ (a > b) ? bits++ : bits--;
return as_type<T>(bits);
#endif
}
};
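The pre-Metal-3.1 fallback above is the classic "step the bit pattern one ULP toward the target" trick. A host-side C++ sketch of the same idea for `float` (using `memcpy` in place of Metal's `as_type` reinterpretation):

```cpp
#include <cmath>
#include <cstdint>
#include <cstring>

float nextafter_bits(float a, float b) {
  if (a == b) return a;
  if (std::isnan(a) || std::isnan(b)) return NAN;
  if (a == 0.0f) {                        // smallest subnormal toward b
    uint32_t eps_bits = 1;
    float eps; std::memcpy(&eps, &eps_bits, 4);
    return b > 0 ? eps : -eps;
  }
  uint32_t bits; std::memcpy(&bits, &a, 4);
  // In sign-magnitude float representation, incrementing the bit pattern
  // grows the magnitude; XOR picks the direction that moves toward b.
  ((a > 0) ^ (a > b)) ? ++bits : --bits;
  float r; std::memcpy(&r, &bits, 4);
  return r;
}
```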
@ -315,6 +344,13 @@ struct fmod_functor {
}
};
// Some helper defines
#if __METAL_VERSION__ >= 310
#define _METAL_310_PLUS(x) x
#else
#define _METAL_310_PLUS(x)
#endif
#define REGISTER_INTEGER_BINARY_OP(NAME) \
REGISTER_BINARY_OP(NAME, long, long); \
REGISTER_BINARY_OP(NAME, int, int); \
@ -334,12 +370,12 @@ struct fmod_functor {
#define REGISTER_FLOAT_BINARY_OP(NAME) \
REGISTER_BINARY_OP(NAME, float, float); \
REGISTER_BINARY_OP(NAME, half, half); \
REGISTER_BINARY_OP(NAME, bfloat, bfloat)
_METAL_310_PLUS(REGISTER_BINARY_OP(NAME, bfloat, bfloat))
#define REGISTER_OPMATH_FLOAT_BINARY_OP(NAME) \
REGISTER_OPMATH_BINARY_OP(NAME, float, float); \
REGISTER_OPMATH_BINARY_OP(NAME, half, half); \
REGISTER_OPMATH_BINARY_OP(NAME, bfloat, bfloat)
_METAL_310_PLUS(REGISTER_OPMATH_BINARY_OP(NAME, bfloat, bfloat))
REGISTER_FLOAT_BINARY_OP(copysign);
REGISTER_INT2FLOAT_BINARY_OP(copysign);
@ -411,9 +447,11 @@ REGISTER_BINARY_ALPHA_OP(lerp_alpha, uchar, uchar, uchar);
REGISTER_BINARY_ALPHA_OP(lerp_alpha, char, char, char);
REGISTER_BINARY_ALPHA_OP(lerp_alpha, bool, bool, bool);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_ALPHA_OP(add_alpha, bfloat, bfloat, bfloat);
REGISTER_BINARY_ALPHA_OP(sub_alpha, bfloat, bfloat, bfloat);
REGISTER_BINARY_ALPHA_OP(lerp_alpha, bfloat, bfloat, bfloat);
#endif
// Complex binary functions
REGISTER_BINARY_OP(polar, float, float2);

View File

@ -180,8 +180,10 @@ REGISTER_SEARCHSORTED_OP(float, int);
REGISTER_SEARCHSORTED_OP(float, long);
REGISTER_SEARCHSORTED_OP(half, int);
REGISTER_SEARCHSORTED_OP(half, long);
#if __METAL_VERSION__ >= 310
REGISTER_SEARCHSORTED_OP(bfloat, int);
REGISTER_SEARCHSORTED_OP(bfloat, long);
#endif
REGISTER_SEARCHSORTED_OP(char, int);
REGISTER_SEARCHSORTED_OP(char, long);
REGISTER_SEARCHSORTED_OP(uchar, int);

View File

@ -96,4 +96,6 @@ kernel void col2im_kernel(
INSTANTIATE_COL2IM(bool);
INSTANTIATE_COL2IM(float);
INSTANTIATE_COL2IM(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_COL2IM(bfloat);
#endif

View File

@ -20,7 +20,9 @@ REGISTER_CROSS_FUNC(short);
REGISTER_CROSS_FUNC(char);
REGISTER_CROSS_FUNC(uchar);
REGISTER_CROSS_FUNC(bool);
#if __METAL_VERSION__ >= 310
REGISTER_CROSS_FUNC(bfloat);
#endif
template <typename T, typename U>
kernel void cross(
@ -66,4 +68,6 @@ REGISTER_CROSS_OP(short);
REGISTER_CROSS_OP(char);
REGISTER_CROSS_OP(uchar);
REGISTER_CROSS_OP(bool);
#if __METAL_VERSION__ >= 310
REGISTER_CROSS_OP(bfloat);
#endif

View File

@ -1,9 +1,11 @@
#include <metal_stdlib>
using metal::max;
#if __METAL_VERSION__ >= 310
bfloat max(bfloat a, bfloat b) {
return a > b ? a : b;
}
#endif
#define kmaxThreadGroups 32
#define kmaxTensors 32
@ -304,9 +306,11 @@ REGISTER_ADAM_OPS_QUART(float, float);
REGISTER_ADAM_OPS_QUART(float, half);
REGISTER_ADAM_OPS_QUART(half, float);
REGISTER_ADAM_OPS_QUART(half, half);
#if __METAL_VERSION__ >= 310
REGISTER_ADAM_OPS_QUART(float, bfloat);
REGISTER_ADAM_OPS_QUART(bfloat, bfloat);
REGISTER_ADAM_OPS_QUART(bfloat, float);
#endif
template <typename T>
inline void sgd_momentum_math(
@ -456,5 +460,7 @@ REGISTER_FUSED_SGD_OP(float);
REGISTER_FUSED_SGD_OP(half);
REGISTER_FUSED_SGD_MOMENTUM_OP(float);
REGISTER_FUSED_SGD_MOMENTUM_OP(half);
#if __METAL_VERSION__ >= 310
REGISTER_FUSED_SGD_OP(bfloat);
REGISTER_FUSED_SGD_MOMENTUM_OP(bfloat);
#endif

View File

@ -106,7 +106,9 @@ kernel void polygamma(
constant int64_t& order [[buffer(2)]], \
uint id [[thread_position_in_grid]]);
#if __METAL_VERSION__ >= 310
INSTANTIATE_GAMMA_KERNELS(bfloat, bfloat);
#endif
INSTANTIATE_GAMMA_KERNELS(half, half);
INSTANTIATE_GAMMA_KERNELS(float, float);
INSTANTIATE_GAMMA_KERNELS(bool, float);

View File

@ -76,4 +76,6 @@ INSTANTIATE_IM2COL(float);
INSTANTIATE_IM2COL(float2);
INSTANTIATE_IM2COL(half);
INSTANTIATE_IM2COL(half2);
#if __METAL_VERSION__ >= 310
INSTANTIATE_IM2COL(bfloat);
#endif

View File

@ -240,7 +240,9 @@ REGISTER_INDEX_OP(put_accumulate, short, short);
REGISTER_INDEX_OP(put_accumulate, char, char);
REGISTER_INDEX_OP(put_accumulate, uchar, uchar);
REGISTER_INDEX_OP(put_accumulate, bool, bool);
#if __METAL_VERSION__ >= 310
REGISTER_INDEX_OP(put_accumulate, bfloat, bfloat);
#endif
template <typename StridesT, typename DataT>
kernel void kernel_index_offsets(
@ -475,8 +477,10 @@ INSTANTIATE_INDEX_COPY(char, long);
INSTANTIATE_INDEX_COPY(uchar, int);
INSTANTIATE_INDEX_COPY(uchar, long);
#if __METAL_VERSION__ >= 310
INSTANTIATE_INDEX_COPY(bfloat, int);
INSTANTIATE_INDEX_COPY(bfloat, long);
#endif
INSTANTIATE_INDEX_COPY(float2, int);
INSTANTIATE_INDEX_COPY(float2, long);
INSTANTIATE_INDEX_COPY(half2, int);

View File

@ -288,6 +288,7 @@ kernel void layer_norm_looped(
#define instantiate_layer_norm(DTYPE) \
instantiate_layer_norm_single_row(DTYPE) instantiate_layer_norm_looped(DTYPE)
instantiate_layer_norm(float);
instantiate_layer_norm(half);
instantiate_layer_norm(bfloat);
instantiate_layer_norm(float) instantiate_layer_norm(half)
#if __METAL_VERSION__ >= 310
instantiate_layer_norm(bfloat)
#endif

View File

@ -635,7 +635,9 @@ kernel void applyPivots(
INSTANTIATE_NAIVE_MM(float);
INSTANTIATE_NAIVE_MM(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_NAIVE_MM(bfloat);
#endif
// Integral MM
INSTANTIATE_NAIVE_MM(short);

View File

@ -22,22 +22,6 @@ struct PoolingParams {
bool return_indices;
};
template <unsigned N = 5, typename idx_type_t = int32_t>
struct AvgPoolingParams {
int32_t dims;
int32_t pooling_dims;
::c10::metal::array<idx_type_t, N> input_sizes;
::c10::metal::array<idx_type_t, N> input_strides;
::c10::metal::array<idx_type_t, N> output_sizes;
::c10::metal::array<idx_type_t, N> output_strides;
::c10::metal::array<idx_type_t, N - 2> kernel_size;
::c10::metal::array<idx_type_t, N - 2> stride;
::c10::metal::array<idx_type_t, N - 2> padding;
bool count_include_pad;
bool has_divisor_override;
int32_t divisor_override;
};
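// Layout note (an inference, not stated in the source): the N == 5
// default matches the usual (N, C, D, H, W) layout, i.e. two leading
// dims plus up to three pooling dims, which is why kernel_size, stride,
// and padding are sized N - 2 == 3.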
template <unsigned N = 5, typename idx_type_t = int32_t>
struct PoolingBackwardParams {
int32_t dims;
@ -48,14 +32,3 @@ struct PoolingBackwardParams {
::c10::metal::array<idx_type_t, N> grad_output_strides;
::c10::metal::array<idx_type_t, N> indices_strides;
};
template <unsigned N = 5, typename idx_type_t = int32_t>
struct MaxUnpoolingParams {
int32_t dims;
int32_t pooling_dims;
::c10::metal::array<idx_type_t, N> input_sizes;
::c10::metal::array<idx_type_t, N> input_strides;
::c10::metal::array<idx_type_t, N> output_sizes;
::c10::metal::array<idx_type_t, N> output_strides;
::c10::metal::array<idx_type_t, N> indices_strides;
};

View File

@ -168,16 +168,6 @@ PoolOffsets find_pool_offsets(
leading_dims,
return_indices,
tid);
case 3:
return find_pool_offsets_dim_specific<3>(
output_sizes,
output_strides,
indices_strides,
input_strides,
pooling_dim_indices,
leading_dims,
return_indices,
tid);
}
return PoolOffsets();
}
@ -302,359 +292,36 @@ kernel void max_pool_backward(
pooling_dims);
}
template <typename T>
void max_unpool_impl(
device T* output,
T input_element,
int32_t input_index,
constant int32_t* output_sizes,
constant int32_t* output_strides,
int32_t pooling_dims) {
int32_t size_prod = 1;
int32_t pool_offset = 0;
for (auto dim = pooling_dims - 1; dim >= 0; dim--) {
auto next_size_prod = output_sizes[dim] * size_prod;
pool_offset +=
output_strides[dim] * ((input_index % next_size_prod) / size_prod);
size_prod *= output_sizes[dim];
}
output[pool_offset] = input_element;
}
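// A worked sketch of the flat-index decode above, assuming
// pooling_dims == 2 and output_sizes == {4, 5} with input_index == 13:
//   dim 1: next_size_prod = 5;  pool_offset += strides[1] * ((13 % 5) / 1)  = strides[1] * 3
//   dim 0: next_size_prod = 20; pool_offset += strides[0] * ((13 % 20) / 5) = strides[0] * 2
// i.e. flat index 13 lands at coordinate (2, 3), consistent with
// 2 * 5 + 3 == 13 in row-major order.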
// Each kernel invocation handles one element of the pooled input,
// scattering it into the unpooled output.
template <typename T>
kernel void max_unpool(
device T* output [[buffer(0)]],
constant T* input [[buffer(1)]],
constant int64_t* indices [[buffer(2)]],
constant MaxUnpoolingParams<5>& params [[buffer(3)]],
uint tid [[thread_position_in_grid]]) {
auto pooling_dims = params.pooling_dims;
auto dims = params.dims;
auto input_sizes = params.input_sizes.data();
auto input_strides = params.input_strides.data();
auto output_sizes = params.output_sizes.data();
auto output_strides = params.output_strides.data();
auto indices_strides = params.indices_strides.data();
auto leading_dims = dims - pooling_dims;
// NOTE: Since we're doing unpooling, the variable names "input" and "output"
// are reversed compared to the pooling operations. So in `find_pool_offsets`,
// we need to map "input" -> "output" and "output" -> "input".
PoolOffsets offsets = find_pool_offsets(
/*output_sizes=*/input_sizes,
/*output_strides=*/input_strides,
indices_strides,
/*input_strides=*/output_strides,
/*pooling_dim_indices=*/nullptr,
dims,
leading_dims,
/*return_indices=*/true,
tid);
max_unpool_impl<T>(
output + offsets.input_leading,
input[offsets.output],
indices[offsets.indices],
output_sizes + leading_dims,
output_strides + leading_dims,
pooling_dims);
}
template <typename T>
struct AvgPoolIterBounds {
T start;
T end;
T count;
};
template <int32_t dim>
AvgPoolIterBounds<int32_t> get_avg_pool_input_iter_bounds(
constant int32_t* input_sizes,
thread int32_t (&pooling_dim_indices)[3],
constant int32_t* kernel_size,
constant int32_t* stride,
constant int32_t* padding,
bool count_include_pad) {
auto start = stride[dim] * pooling_dim_indices[dim] - padding[dim];
auto end = start + kernel_size[dim];
auto end_corrected = min(start + kernel_size[dim], input_sizes[dim]);
auto start_corrected = (start < 0) ? 0 : start;
auto count = count_include_pad
? (min(end, input_sizes[dim] + padding[dim]) - start)
: (end_corrected - start_corrected);
return {start_corrected, end_corrected, count};
}
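// Worked sketch of the bounds math above, assuming input_sizes[dim] == 10,
// kernel_size[dim] == 3, stride[dim] == 2, padding[dim] == 1, and
// pooling_dim_indices[dim] == 0:
//   start = 2 * 0 - 1 = -1, end = -1 + 3 = 2
//   start_corrected = 0, end_corrected = min(2, 10) = 2
//   count = count_include_pad ? min(2, 10 + 1) - (-1) = 3  // pads counted
//                             : 2 - 0 = 2                  // pads excluded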
// Iterates through all the input elements that this kernel needs to
// average over. Specialized for 3 pooling dimensions.
template <typename T>
void avg_pool_3d_input_iter(
constant T* input,
device T* output,
constant int32_t* input_sizes,
constant int32_t* input_strides,
thread int32_t (&pooling_dim_indices)[3],
constant int32_t* kernel_size,
constant int32_t* stride,
constant int32_t* padding,
bool count_include_pad,
bool has_divisor_override,
int32_t divisor_override) {
auto bounds0 = get_avg_pool_input_iter_bounds<0>(
input_sizes,
pooling_dim_indices,
kernel_size,
stride,
padding,
count_include_pad);
auto bounds1 = get_avg_pool_input_iter_bounds<1>(
input_sizes,
pooling_dim_indices,
kernel_size,
stride,
padding,
count_include_pad);
auto bounds2 = get_avg_pool_input_iter_bounds<2>(
input_sizes,
pooling_dim_indices,
kernel_size,
stride,
padding,
count_include_pad);
T value_sum = 0;
auto divisor = has_divisor_override
? divisor_override
: (bounds0.count) * (bounds1.count) * (bounds2.count);
for (auto i0 = bounds0.start; i0 < bounds0.end; i0++) {
auto offset0 = input_strides[0] * i0;
for (auto i1 = bounds1.start; i1 < bounds1.end; i1++) {
auto offset1 = input_strides[1] * i1;
for (auto i2 = bounds2.start; i2 < bounds2.end; i2++) {
auto offset2 = input_strides[2] * i2;
auto input_value = input[offset0 + offset1 + offset2];
value_sum += input_value;
}
}
}
*output = value_sum / static_cast<T>(divisor);
}
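// Divisor note: with no override, the mean uses the per-window element
// count (bounds0.count * bounds1.count * bounds2.count), which already
// reflects count_include_pad; a divisor_override replaces that count
// wholesale, matching avg_pool's divisor_override argument.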
template <typename T>
void avg_pool_backward_3d_input_iter(
device AtomicType_t<T>* grad_input,
constant T* grad_output,
constant int32_t* grad_input_sizes,
constant int32_t* grad_input_strides,
int32_t grad_input_leading_offset,
thread int32_t (&pooling_dim_indices)[3],
constant int32_t* kernel_size,
constant int32_t* stride,
constant int32_t* padding,
bool count_include_pad,
bool has_divisor_override,
int32_t divisor_override) {
auto bounds0 = get_avg_pool_input_iter_bounds<0>(
grad_input_sizes,
pooling_dim_indices,
kernel_size,
stride,
padding,
count_include_pad);
auto bounds1 = get_avg_pool_input_iter_bounds<1>(
grad_input_sizes,
pooling_dim_indices,
kernel_size,
stride,
padding,
count_include_pad);
auto bounds2 = get_avg_pool_input_iter_bounds<2>(
grad_input_sizes,
pooling_dim_indices,
kernel_size,
stride,
padding,
count_include_pad);
auto divisor = has_divisor_override
? divisor_override
: (bounds0.count) * (bounds1.count) * (bounds2.count);
auto grad_val = *grad_output / static_cast<T>(divisor);
for (auto i0 = bounds0.start; i0 < bounds0.end; i0++) {
auto offset0 = grad_input_strides[0] * i0;
for (auto i1 = bounds1.start; i1 < bounds1.end; i1++) {
auto offset1 = grad_input_strides[1] * i1;
for (auto i2 = bounds2.start; i2 < bounds2.end; i2++) {
auto offset2 = grad_input_strides[2] * i2;
auto pool_offset = offset0 + offset1 + offset2;
AtomicType<T>::atomic_add(
grad_input, grad_input_leading_offset + pool_offset, grad_val);
}
}
}
}
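// Atomicity note: when stride < kernel_size the pooling windows overlap,
// so several grad_output elements scatter into the same grad_input slot;
// AtomicType<T>::atomic_add keeps those concurrent updates correct.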
// Kernel computes one element of the output per kernel call.
template <typename T>
kernel void avg_pool(
constant T* input [[buffer(0)]],
device T* output [[buffer(1)]],
constant AvgPoolingParams<5>& params [[buffer(2)]],
uint tid [[thread_position_in_grid]]) {
auto pooling_dims = params.pooling_dims;
auto dims = params.dims;
auto input_sizes = params.input_sizes.data();
auto input_strides = params.input_strides.data();
auto output_sizes = params.output_sizes.data();
auto output_strides = params.output_strides.data();
auto kernel_size = params.kernel_size.data();
auto stride = params.stride.data();
auto padding = params.padding.data();
auto leading_dims = dims - pooling_dims;
// This buffer keeps track of the pooling dimension indices of this thread's
// element of the output. We need to fill it with the proper values below.
int32_t pooling_dim_indices[3];
PoolOffsets offsets = find_pool_offsets(
output_sizes,
output_strides,
/*indices_strides=*/nullptr,
input_strides,
pooling_dim_indices,
dims,
leading_dims,
/*return_indices=*/false,
tid);
output += offsets.output;
input += offsets.input_leading;
input_sizes += leading_dims;
input_strides += leading_dims;
avg_pool_3d_input_iter<T>(
input,
output,
input_sizes,
input_strides,
pooling_dim_indices,
kernel_size,
stride,
padding,
params.count_include_pad,
params.has_divisor_override,
params.divisor_override);
}
template <typename T>
kernel void avg_pool_backward(
device AtomicType_t<T>* grad_input [[buffer(0)]],
constant T* grad_output [[buffer(1)]],
constant AvgPoolingParams<5>& params [[buffer(2)]],
uint tid [[thread_position_in_grid]]) {
auto pooling_dims = params.pooling_dims;
auto dims = params.dims;
auto grad_input_sizes = params.input_sizes.data();
auto grad_input_strides = params.input_strides.data();
auto grad_output_sizes = params.output_sizes.data();
auto grad_output_strides = params.output_strides.data();
auto kernel_size = params.kernel_size.data();
auto stride = params.stride.data();
auto padding = params.padding.data();
auto leading_dims = dims - pooling_dims;
// This buffer keeps track of the pooling dimension indices of this thread's
// element of the grad output. We need to fill it with the proper values below.
int32_t pooling_dim_indices[3];
PoolOffsets offsets = find_pool_offsets(
grad_output_sizes,
grad_output_strides,
/*indices_strides=*/nullptr,
grad_input_strides,
pooling_dim_indices,
dims,
leading_dims,
/*return_indices=*/false,
tid);
grad_output += offsets.output;
grad_input_sizes += leading_dims;
grad_input_strides += leading_dims;
avg_pool_backward_3d_input_iter<T>(
grad_input,
grad_output,
grad_input_sizes,
grad_input_strides,
offsets.input_leading,
pooling_dim_indices,
kernel_size,
stride,
padding,
params.count_include_pad,
params.has_divisor_override,
params.divisor_override);
}
#define REGISTER_POOL_OP(DTYPE) \
template [[host_name("max_pool_" #DTYPE)]] kernel void max_pool<DTYPE>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * output [[buffer(1)]], \
device int64_t* indices [[buffer(2)]], \
constant PoolingParams<5>& params [[buffer(3)]], \
uint tid [[thread_position_in_grid]]); \
\
template [[host_name("max_unpool_" #DTYPE)]] kernel void max_unpool<DTYPE>( \
device DTYPE * output [[buffer(0)]], \
constant DTYPE * input [[buffer(1)]], \
constant int64_t* indices [[buffer(2)]], \
constant MaxUnpoolingParams<5>& params [[buffer(3)]], \
uint tid [[thread_position_in_grid]]); \
\
template [[host_name("avg_pool_" #DTYPE)]] kernel void avg_pool<DTYPE>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * output [[buffer(1)]], \
constant AvgPoolingParams<5> & params [[buffer(2)]], \
#define REGISTER_MAX_POOL_OP(DTYPE) \
template [[host_name("max_pool_" #DTYPE)]] kernel void max_pool<DTYPE>( \
constant DTYPE * input [[buffer(0)]], \
device DTYPE * output [[buffer(1)]], \
device int64_t* indices [[buffer(2)]], \
constant PoolingParams<5>& params [[buffer(3)]], \
uint tid [[thread_position_in_grid]]);
#define REGISTER_POOL_BACKWARD_OP(DTYPE) \
#define REGISTER_MAX_POOL_BACKWARD_OP(DTYPE) \
template [[host_name("max_pool_backward_" #DTYPE)]] \
kernel void max_pool_backward<DTYPE>( \
device AtomicType_t<DTYPE> * grad_input [[buffer(0)]], \
constant DTYPE * grad_output_ [[buffer(1)]], \
constant int64_t* grad_indices_ [[buffer(2)]], \
constant PoolingBackwardParams<5>& params [[buffer(3)]], \
uint tid [[thread_position_in_grid]]); \
\
template [[host_name("avg_pool_backward_" #DTYPE)]] \
kernel void avg_pool_backward<DTYPE>( \
device AtomicType_t<DTYPE> * grad_input [[buffer(0)]], \
constant DTYPE * grad_output [[buffer(1)]], \
constant AvgPoolingParams<5> & params [[buffer(2)]], \
uint tid [[thread_position_in_grid]]);
REGISTER_POOL_OP(float);
REGISTER_POOL_OP(half);
REGISTER_POOL_OP(bfloat);
REGISTER_POOL_OP(int);
REGISTER_POOL_OP(long);
REGISTER_POOL_OP(short);
REGISTER_POOL_OP(char);
REGISTER_POOL_OP(uchar);
REGISTER_POOL_OP(bool);
REGISTER_MAX_POOL_OP(float);
REGISTER_MAX_POOL_OP(half);
REGISTER_MAX_POOL_OP(int);
REGISTER_MAX_POOL_OP(long);
REGISTER_MAX_POOL_OP(short);
REGISTER_MAX_POOL_OP(char);
REGISTER_MAX_POOL_OP(uchar);
REGISTER_MAX_POOL_OP(bool);
REGISTER_POOL_BACKWARD_OP(float);
REGISTER_POOL_BACKWARD_OP(half);
REGISTER_POOL_BACKWARD_OP(bfloat);
REGISTER_MAX_POOL_BACKWARD_OP(float);
REGISTER_MAX_POOL_BACKWARD_OP(half);
#if __METAL_VERSION__ >= 310
REGISTER_MAX_POOL_OP(bfloat);
REGISTER_MAX_POOL_BACKWARD_OP(bfloat);
#endif

View File

@ -197,10 +197,12 @@ INSTANTIATE_INT4MV(float, 128);
INSTANTIATE_INT4MV(half, 128);
INSTANTIATE_INT4MV(float, 256);
INSTANTIATE_INT4MV(half, 256);
#if __METAL_VERSION__ >= 310
INSTANTIATE_INT4MV(bfloat, 32);
INSTANTIATE_INT4MV(bfloat, 64);
INSTANTIATE_INT4MV(bfloat, 128);
INSTANTIATE_INT4MV(bfloat, 256);
#endif
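// A hedged reading of the second INSTANTIATE_INT4MV parameter above: it
// appears to be the per-group quantization block size (32/64/128/256
// weights sharing one scale/zero pair), so each dtype gets one kernel
// per supported group size.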
// ------------------------------ int8 MM For M >= 12 ------------------------------------
/**
@ -232,10 +234,12 @@ template <> struct BlockType<half> {
using simdgroup_type8x8 = simdgroup_half8x8;
using type4 = half4;
};
#if __METAL_VERSION__ >= 310
template <> struct BlockType<bfloat> {
using simdgroup_type8x8 = simdgroup_bfloat8x8;
using type4 = bfloat4;
};
#endif
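// Sketch of the pattern above: BlockType<T> maps a scalar dtype to its
// simdgroup 8x8 tile type and 4-wide vector so the mm kernels can be
// written once and specialized per dtype; the bfloat specialization can
// only exist behind the Metal 3.1 guard because bfloat itself does.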
template<typename T>
float2 get_scale_zero_q8(constant T * scalesAndZeros, uint2 index) {
@ -486,7 +490,9 @@ kernel void kernel_mul_mm<DTYPE, WDTYPE, DEQUANT_FUNC>( \
INSTANTIATE_MM(float, char, get_scale_zero_q8);
INSTANTIATE_MM(half, char, get_scale_zero_q8);
#if __METAL_VERSION__ >= 310
INSTANTIATE_MM(bfloat, char, get_scale_zero_q8);
#endif
// ------------------------------ int8 MM For M < 12 ------------------------------------
/* Matrix vector multiplication, used for small M size for matrix multiplication as well.
@ -640,4 +646,6 @@ kernel void kernel_mul_mv<DTYPE>(
INSTANTIATE_MV(float);
INSTANTIATE_MV(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_MV(bfloat);
#endif

View File

@ -192,4 +192,6 @@ template <typename T>
instantiate_rms(float)
instantiate_rms(half)
#if __METAL_VERSION__ >= 310
instantiate_rms(bfloat)
#endif // clang-format on

View File

@ -23,4 +23,6 @@ kernel void renorm(
REGISTER_RENORM_OP(float);
REGISTER_RENORM_OP(half);
#if __METAL_VERSION__ >= 310
REGISTER_RENORM_OP(bfloat);
#endif

Some files were not shown because too many files have changed in this diff.