Compare commits

..

12 Commits

Author SHA1 Message Date
3b9b4065af Leave ROCm alone for now
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-11 21:20:56 -07:00
e1f586a43e Install the correct torchao version
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-11 19:45:43 -07:00
18dc2e03ac Merge branch 'main' into install-torchao-0.13.0
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-10 23:01:07 -07:00
d7c3d8a551 Merge branch 'main' into install-torchao-0.13.0
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-10 15:14:32 -07:00
78b4d254aa Ready to land
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-09 11:42:27 -07:00
8d5240d846 Fix lint
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-08 19:29:29 -07:00
135db45c9c Use more memory to build 0.13.0 torchao
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-08 19:12:37 -07:00
8139b6b1b1 Test torchao build
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-08 02:43:16 -07:00
24c95d83e6 Bump torchao pinned commit
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-07 22:11:18 -07:00
21a34fa017 Merge branch 'main' into install-torchao-0.13.0 2025-09-07 22:06:33 -07:00
636d3aa00f Tiny comment update
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-06 23:13:43 -07:00
174f2faa8c Put torchao (0.13.0) back to benchmark workflow
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-09-04 17:26:03 -07:00
495 changed files with 5159 additions and 13555 deletions

View File

@ -31,8 +31,7 @@ pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
# USE_PRIORITIZED_TEXT_FOR_LD enables the linker script optimization, see https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
@ -46,6 +45,5 @@ else
export USE_NVIDIA_PYPI_LIBS=1
fi
# USE_PRIORITIZED_TEXT_FOR_LD enables the linker script optimization, see https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi

View File

@ -13,49 +13,6 @@ def list_dir(path: str) -> list[str]:
return check_output(["ls", "-1", path]).decode().split("\n")
def build_ArmComputeLibrary() -> None:
    """
    Build Arm Compute Library (ACL) from source and install it under /acl.

    The source tree is the directory named by the ACL_SOURCE_DIR environment
    variable (default: "ComputeLibrary", relative to the current directory).
    When that directory is missing or empty, ACL v25.02 is shallow-cloned
    into it before building.  Any existing /acl directory is deleted first,
    so the install location only ever contains the freshly built tree.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` or ``scons`` exits
            with a non-zero status.
    """
    print("Building Arm Compute Library")
    acl_build_flags = [
        "debug=0",
        "neon=1",
        "opencl=0",
        "os=linux",
        "openmp=1",
        "cppthreads=0",
        "arch=armv8a",
        "multi_isa=1",
        "fixed_format_kernels=1",
        "build=native",
    ]
    acl_install_dir = "/acl"
    acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")
    if os.path.isdir(acl_install_dir):
        shutil.rmtree(acl_install_dir)
    if not os.path.isdir(acl_checkout_dir) or not os.listdir(acl_checkout_dir):
        check_call(
            [
                "git",
                "clone",
                "https://github.com/ARM-software/ComputeLibrary.git",
                "-b",
                "v25.02",
                "--depth",
                "1",
                "--shallow-submodules",
                # Clone into the directory that is built below.  Without an
                # explicit target git always creates ./ComputeLibrary, which
                # breaks the build whenever ACL_SOURCE_DIR names another path.
                acl_checkout_dir,
            ]
        )

    check_call(
        ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,
        cwd=acl_checkout_dir,
    )
    # Copy the sources, headers and build output into the install prefix.
    for d in ["arm_compute", "include", "utils", "support", "src", "build"]:
        shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")
def replace_tag(filename) -> None:
with open(filename) as f:
lines = f.readlines()
@ -317,7 +274,7 @@ if __name__ == "__main__":
).decode()
print("Building PyTorch wheel")
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
build_vars = ""
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "
@ -356,19 +313,13 @@ if __name__ == "__main__":
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
if enable_mkldnn:
build_ArmComputeLibrary()
print("build pytorch with mkldnn+acl backend")
build_vars += (
"USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
"ACL_ROOT_DIR=/acl "
"LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH "
"ACL_INCLUDE_DIR=/acl/build "
"ACL_LIBRARY=/acl/build "
)
build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
build_vars += "ACL_ROOT_DIR=/acl "
if enable_cuda:
build_vars += "BLAS=NVPL "
else:
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS "
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
else:
print("build pytorch without mkldnn backend")

View File

@ -299,40 +299,6 @@ def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
)
def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None:
    """Clone, build and install OpenBLAS v0.3.28 through ``host.run_cmd``.

    Args:
        host: object whose ``run_cmd`` executes the shell commands.
        git_clone_flags: extra text appended verbatim to the ``git clone``
            command line.
    """
    print("Building OpenBLAS")
    clone_cmd = (
        f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.28 {git_clone_flags}"
    )
    host.run_cmd(clone_cmd)

    make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8"
    # One shell invocation: build, install with sudo, then drop the checkout.
    steps = [
        "pushd OpenBLAS",
        f"make {make_flags} -j8",
        f"sudo make {make_flags} install",
        "popd",
        "rm -rf OpenBLAS",
    ]
    host.run_cmd(" && ".join(steps))
def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None:
    """Clone Arm Compute Library v25.02 and build it with scons via ``host.run_cmd``.

    Args:
        host: object whose ``run_cmd`` executes the shell commands.
        git_clone_flags: extra text appended verbatim to the ``git clone``
            command line.
    """
    print("Building Arm Compute Library")
    scons_options = (
        "debug=0",
        "neon=1",
        "opencl=0",
        "os=linux",
        "openmp=1",
        "cppthreads=0",
        "arch=armv8a",
        "multi_isa=1",
        "fixed_format_kernels=1",
        "build=native",
    )
    acl_build_flags = " ".join(scons_options)
    repo_url = "https://github.com/ARM-software/ComputeLibrary.git"
    host.run_cmd(f"git clone {repo_url} -b v25.02 {git_clone_flags}")
    # The build runs inside the fresh checkout with eight parallel jobs.
    host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")
def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
host.run_cmd("pip3 install auditwheel")
host.run_cmd(
@ -700,7 +666,6 @@ def start_build(
configure_system(
host, compiler=compiler, use_conda=use_conda, python_version=python_version
)
build_OpenBLAS(host, git_clone_flags)
if host.using_docker():
print("Move libgfortant.a into a standard location")
@ -723,6 +688,8 @@ def start_build(
f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
)
host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")
print("Building PyTorch wheel")
build_opts = ""
if pytorch_build_number is not None:
@ -743,15 +710,18 @@ def start_build(
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
if enable_mkldnn:
build_ArmComputeLibrary(host, git_clone_flags)
host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
print("build pytorch with mkldnn+acl backend")
build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
build_vars += " BLAS=OpenBLAS"
build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
build_vars += " ACL_ROOT_DIR=/acl"
host.run_cmd(
f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}"
f"cd $HOME/pytorch && {build_vars} python3 setup.py bdist_wheel{build_opts}"
)
print("Repair the wheel")
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
ld_library_path = "$HOME/acl/build:$HOME/pytorch/build/lib"
ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
host.run_cmd(
f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
)
@ -907,7 +877,7 @@ def terminate_instances(instance_type: str) -> None:
def parse_arguments():
from argparse import ArgumentParser
parser = ArgumentParser("Builid and test AARCH64 wheels using EC2")
parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
parser.add_argument("--key-name", type=str)
parser.add_argument("--debug", action="store_true")
parser.add_argument("--build-only", action="store_true")

View File

@ -214,7 +214,8 @@ case "$tag" in
TRITON=yes
;;
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.10
# TODO (huydhn): Upgrade this to Python >= 3.10
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
KATEX=yes

View File

@ -1 +1 @@
5ae38bdb0dc066c5823e34dc9797afb9de42c866
fccfc522864cf8bc172abe0cd58ae5581e2d44b9

27
.ci/docker/common/install_acl.sh Normal file → Executable file
View File

@ -1,16 +1,27 @@
set -euo pipefail
#!/bin/bash
# Script used only in CD pipeline
readonly version=v25.02
readonly src_host=https://github.com/ARM-software
readonly src_repo=ComputeLibrary
set -eux
ACL_VERSION=${ACL_VERSION:-"v25.02"}
ACL_INSTALL_DIR="/acl"
# Clone ACL
[[ ! -d ${src_repo} ]] && git clone ${src_host}/${src_repo}.git
cd ${src_repo}
git checkout $version
git clone https://github.com/ARM-software/ComputeLibrary.git -b "${ACL_VERSION}" --depth 1 --shallow-submodules
ACL_CHECKOUT_DIR="ComputeLibrary"
# Build with scons
pushd $ACL_CHECKOUT_DIR
scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \
os=linux arch=armv8a build=native multi_isa=1 \
fixed_format_kernels=1 openmp=1 cppthreads=0
popd
# Install ACL
sudo mkdir -p ${ACL_INSTALL_DIR}
for d in arm_compute include utils support src build
do
sudo cp -r ${ACL_CHECKOUT_DIR}/${d} ${ACL_INSTALL_DIR}/${d}
done
rm -rf $ACL_CHECKOUT_DIR

12
.ci/docker/common/install_openblas.sh Normal file → Executable file
View File

@ -3,8 +3,10 @@
set -ex
cd /
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules
OPENBLAS_VERSION=${OPENBLAS_VERSION:-"v0.3.30"}
# Clone OpenBLAS
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" --depth 1 --shallow-submodules
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
OPENBLAS_BUILD_FLAGS="
@ -17,5 +19,7 @@ CFLAGS=-O3
BUILD_BFLOAT16=1
"
make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}
make -j8 ${OPENBLAS_BUILD_FLAGS} -C $OPENBLAS_CHECKOUT_DIR
sudo make install -C $OPENBLAS_CHECKOUT_DIR
rm -rf $OPENBLAS_CHECKOUT_DIR

View File

@ -62,6 +62,13 @@ ARG OPENBLAS_VERSION
ADD ./common/install_openblas.sh install_openblas.sh
RUN bash ./install_openblas.sh && rm install_openblas.sh
# Install Arm Compute Library
FROM base as arm_compute
# use python3.9 to install scons
RUN python3.9 -m pip install scons==4.7.0
RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin
COPY ./common/install_acl.sh install_acl.sh
RUN bash ./install_acl.sh && rm install_acl.sh
FROM base as final
# remove unnecessary python versions
@ -70,4 +77,5 @@ RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/
ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
COPY --from=arm_compute /acl /acl
ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:/acl/build/:$LD_LIBRARY_PATH

View File

@ -28,6 +28,7 @@ fi
MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}
DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}
OPENBLAS_VERSION=${OPENBLAS_VERSION:-}
ACL_VERSION=${ACL_VERSION:-}
case ${image} in
manylinux2_28-builder:cpu)
@ -41,7 +42,6 @@ case ${image} in
GPU_IMAGE=arm64v8/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"
MANY_LINUX_VERSION="2_28_aarch64"
OPENBLAS_VERSION="v0.3.30"
;;
manylinuxcxx11-abi-builder:cpu-cxx11-abi)
TARGET=final
@ -121,7 +121,8 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
DOCKER_BUILDKIT=1 docker build \
${DOCKER_GPU_BUILD_ARG} \
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \
--build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \
--build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION:-}" \
--build-arg "ACL_VERSION=${ACL_VERSION:-}" \
--target "${TARGET}" \
-t "${tmp_tag}" \
$@ \

View File

@ -7,4 +7,4 @@ set -ex
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh

View File

@ -66,11 +66,6 @@ class VllmBuildParameters:
"DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
)
# the cleaning script to remove torch dependencies from pip
cleaning_script: Path = env_path_field(
"cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
)
# OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")
@ -165,7 +160,6 @@ class VllmBuildRunner(BaseRunner):
logger.info("Running vllm build with inputs: %s", inputs)
vllm_commit = clone_vllm()
self.cp_torch_cleaning_script(inputs)
self.cp_dockerfile_if_exist(inputs)
# cp torch wheels from root direct to vllm workspace if exist
self.cp_torch_whls_if_exist(inputs)
@ -211,11 +205,6 @@ class VllmBuildRunner(BaseRunner):
copy(inputs.torch_whls_path, tmp_dir)
return tmp_dir
def cp_torch_cleaning_script(self, inputs: VllmBuildParameters):
    """Copy the configured cleaning script into the work directory as use_existing_torch.py."""
    destination = Path(f"./{self.work_directory}/use_existing_torch.py")
    copy(get_path(inputs.cleaning_script, resolve=True), destination)
def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters):
if not inputs.use_local_dockerfile:
logger.info("using vllm default dockerfile.torch_nightly for build")

View File

@ -11,7 +11,7 @@ from typing import Any
from cli.lib.common.cli_helper import BaseRunner
from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env
from cli.lib.common.path_helper import copy, get_path, remove_dir
from cli.lib.common.path_helper import copy, remove_dir
from cli.lib.common.pip_helper import (
pip_install_first_match,
pip_install_packages,
@ -43,10 +43,6 @@ class VllmTestParameters:
torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")
cleaning_script: Path = env_path_field(
"cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
)
def __post_init__(self):
if not self.torch_whls_path.exists():
raise ValueError("missing torch_whls_path")
@ -96,13 +92,11 @@ class VllmTestRunner(BaseRunner):
self._set_envs(params)
clone_vllm(dst=self.work_directory)
self.cp_torch_cleaning_script(params)
with working_directory(self.work_directory):
remove_dir(Path("vllm"))
self._install_wheels(params)
self._install_dependencies()
# verify the torches are not overridden by test dependencies
check_versions()
def run(self):
@ -131,11 +125,6 @@ class VllmTestRunner(BaseRunner):
# double check the torches are not overridden by other packages
check_versions()
def cp_torch_cleaning_script(self, params: VllmTestParameters):
    """Copy the configured cleaning script into the work directory as use_existing_torch.py."""
    destination = Path(f"./{self.work_directory}/use_existing_torch.py")
    copy(get_path(params.cleaning_script, resolve=True), destination)
def _install_wheels(self, params: VllmTestParameters):
logger.info("Running vllm test with inputs: %s", params)
if not pkg_exists("torch"):

View File

@ -89,7 +89,7 @@ fi
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
export USE_MKLDNN=1
export USE_MKLDNN_ACL=1
export ACL_ROOT_DIR=/ComputeLibrary
export ACL_ROOT_DIR=/acl
fi
if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then

View File

@ -258,19 +258,11 @@ function install_torchrec_and_fbgemm() {
git clone --recursive https://github.com/pytorch/fbgemm
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}" --recurse-submodules
# until the fbgemm_commit includes the tbb patch
patch <<'EOF'
--- a/FbgemmGpu.cmake
+++ b/FbgemmGpu.cmake
@@ -184,5 +184,6 @@ gpu_cpp_library(
fbgemm_gpu_tbe_cache
fbgemm_gpu_tbe_optimizers
fbgemm_gpu_tbe_utils
+ tbb
DESTINATION
fbgemm_gpu)
EOF
python setup.py bdist_wheel --build-variant=rocm
python setup.py bdist_wheel \
--build-variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
popd
# Save the wheel before cleaning up

View File

@ -35,10 +35,11 @@ fi
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
# Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then

View File

@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd
python -mpip install -r requirements.txt
# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1
python -mpip install --no-input -r requirements.txt
setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to
@ -181,9 +177,6 @@ checkout_install_torchbench() {
popd
pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt
# https://github.com/pytorch/pytorch/issues/160689 to remove torchao because
# its current version 0.12.0 doesn't work with transformers 4.54.0
pip uninstall -y torchao
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze

View File

@ -386,8 +386,8 @@ def smoke_test_compile(device: str = "cpu") -> None:
def smoke_test_nvshmem() -> None:
if not torch.cuda.is_available() or target_os == "windows":
print("Windows platform or CUDA is not available, skipping NVSHMEM test")
if not torch.cuda.is_available():
print("CUDA is not available, skipping NVSHMEM test")
return
# Check if NVSHMEM is compiled in current build
@ -396,9 +396,7 @@ def smoke_test_nvshmem() -> None:
except ImportError:
# Not built with NVSHMEM support.
# torch is not compiled with NVSHMEM prior to 2.9
from torch.torch_version import TorchVersion
if TorchVersion(torch.__version__) < (2, 9):
if torch.__version__ < "2.9":
return
else:
# After 2.9: NVSHMEM is expected to be compiled in current build

View File

@ -778,11 +778,6 @@ test_single_dynamo_benchmark() {
}
test_inductor_micro_benchmark() {
# torchao requires cuda 8.0 or above for bfloat16 support
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;8.6"
fi
install_torchao
TEST_REPORTS_DIR=$(pwd)/test/test-reports
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
test_inductor_set_cpu_affinity
@ -1614,26 +1609,6 @@ test_operator_benchmark() {
--expected "expected_ci_operator_benchmark_eager_float32_cpu.csv"
}
# Run the matmul, mm, addmm and bmm operator benchmarks, each once with
# --use-compile and once without, writing one dashboard JSON file per run
# into test/test-reports under the current directory.
# NOTE: none of the variables below are declared `local`, so TEST_REPORTS_DIR,
# TEST_DIR and OP_BENCHMARK_TESTS are overwritten in the calling shell.
test_operator_microbenchmark() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
# Remember the starting directory; later `cd` calls are relative to it.
TEST_DIR=$(pwd)
# Swap the installed torch packages for torch==2.8.0.
# pip_uninstall / pip_install are helpers defined outside this function.
pip_uninstall torch torchvision torchaudio
pip_install torch==2.8.0 torchvision torchaudio ninja --force-reinstall
# Install the package found in the benchmark suite's pt_extension directory.
cd benchmarks/operator_benchmark/pt_extension
python -m pip install .
cd "${TEST_DIR}"/benchmarks/operator_benchmark
for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
# $TASKSET is expanded unquoted as a command prefix; presumably a CPU-affinity
# wrapper set elsewhere (empty means no prefix) -- TODO confirm.
# First pass: --use-compile, output suffixed with _compile.json.
$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
--output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
--benchmark-name "PyTorch operator microbenchmark" --use-compile
# Second pass: same benchmark without --use-compile.
$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
--output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \
--benchmark-name "PyTorch operator microbenchmark"
done
}
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
(cd test && python -c "import torch; print(torch.__config__.show())")
@ -1684,39 +1659,50 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
elif [[ "${TEST_CONFIG}" == *all* ]]; then
TEST_MODE="all"
fi
test_operator_benchmark cpu ${TEST_MODE}
fi
elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then
test_operator_microbenchmark
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
install_torchao
fi
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
test_inductor_halide
elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
test_inductor_triton_cpu
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
install_torchao
test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
install_torchvision
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
install_torchao
fi
id=$((SHARD_NUMBER-1))
test_dynamo_benchmark huggingface "$id"
elif [[ "${TEST_CONFIG}" == *timm* ]]; then
install_torchvision
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
install_torchao
fi
id=$((SHARD_NUMBER-1))
test_dynamo_benchmark timm_models "$id"
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
install_torchaudio
install_torchvision
install_torchao
PYTHONPATH=/torchbench test_cachebench
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
install_torchaudio
install_torchvision
install_torchao
PYTHONPATH=/torchbench test_verify_cachebench
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
install_torchaudio
install_torchvision
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
install_torchao
fi
id=$((SHARD_NUMBER-1))
# https://github.com/opencv/opencv-python/issues/885
pip_install opencv-python==4.8.0.74
@ -1736,13 +1722,24 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
install_torchao
fi
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
fi
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
install_torchvision
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
install_torchao
fi
test_inductor_shard "${SHARD_NUMBER}"
if [[ "${SHARD_NUMBER}" == 1 ]]; then
if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then
test_inductor_distributed
fi
fi
elif [[ "${TEST_CONFIG}" == *einops* ]]; then
test_einops
elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then

View File

@ -85,7 +85,7 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
# Create an isolated directory to store this builds pytorch checkout and conda
# installation
if [[ -z "$MAC_PACKAGE_WORK_DIR" ]]; then
MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_${DESIRED_PYTHON}_$(date +%H%M%S)"
MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_conda_${DESIRED_PYTHON}_$(date +%H%M%S)"
fi
mkdir -p "$MAC_PACKAGE_WORK_DIR" || true
if [[ -n ${GITHUB_ACTIONS} ]]; then
@ -96,11 +96,11 @@ fi
whl_tmp_dir="${MAC_PACKAGE_WORK_DIR}/dist"
mkdir -p "$whl_tmp_dir"
mac_version='macosx-11_0-arm64'
mac_version='macosx_11_0_arm64'
libtorch_arch='arm64'
# Create a consistent wheel package name to rename the wheel to
wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version//[-,]/_}.whl"
wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version}.whl"
###########################################################
@ -125,6 +125,7 @@ popd
export TH_BINARY_BUILD=1
export INSTALL_TEST=0 # dont install test binaries into site-packages
export MACOSX_DEPLOYMENT_TARGET=11.0
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
@ -132,19 +133,25 @@ RENAME_WHEEL=true
case $desired_python in
3.14t)
echo "Using 3.14 deps"
mac_version='macosx-11.0-arm64'
NUMPY_PINNED_VERSION="==2.1.0"
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
RENAME_WHEEL=false
;;
3.14)
echo "Using 3.14t deps"
mac_version='macosx-11.0-arm64'
NUMPY_PINNED_VERSION="==2.1.0"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
RENAME_WHEEL=false
;;
3.13t)
echo "Using 3.13 deps"
NUMPY_PINNED_VERSION="==2.1.0"
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
RENAME_WHEEL=false
;;
3.13)
@ -169,16 +176,20 @@ case $desired_python in
;;
esac
# Install into a fresh env
tmp_env_name="wheel_py$python_nodot"
conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS}
source activate "$tmp_env_name"
PINNED_PACKAGES=(
"numpy${NUMPY_PINNED_VERSION}"
)
python -mvenv ~/${desired_python}-build
source ~/${desired_python}-build/bin/activate
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt"
pip install requests ninja typing-extensions
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is built as part of the tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, libuv is needed; it is built as part of the tensorpipe submodule
export USE_DISTRIBUTED=1
export USE_MKLDNN=OFF
@ -188,7 +199,7 @@ export BUILD_TEST=OFF
pushd "$pytorch_rootdir"
echo "Calling setup.py bdist_wheel at $(date)"
_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name "${mac_version//[-.]/_}"
python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version}
echo "Finished setup.py bdist_wheel at $(date)"

View File

@ -73,7 +73,7 @@ exclude =
./docs/src,
./functorch/docs,
./functorch/examples,
./functorch/docs/source/tutorials,
./functorch/notebooks,
./scripts,
./test/generated_type_hints_smoketest.py,
./third_party,

View File

@ -21,7 +21,6 @@ self-hosted-runner:
- linux.arm64.2xlarge.ephemeral
- linux.arm64.m7g.4xlarge
- linux.arm64.m7g.4xlarge.ephemeral
- linux.arm64.r7g.12xlarge.memory
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu

View File

@ -1 +1 @@
87ff22e49ed0e92576c4935ccb8c143daac4a3cd
fa5142928ee157aa65137c4ecff2fe9b1a9e0648

View File

@ -1 +1 @@
51c87b6ead6b7e098ada95d6a7609ee873b854cf
f32431e593d0e9db86c502d3872dd67ee40a005f

View File

@ -1 +1 @@
5bcc153d7bf69ef34bc5788a33f60f1792cf2861
cc99baf14dacc2497d0c5ed84e076ef2c37f6a4d

View File

@ -1 +1 @@
c77852e117bdf056c8e9a087e51d6f65cf6ba53d
6c5478ff7c3d50dd1e3047d72ec5909bea474073

View File

@ -82,10 +82,16 @@ RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim; \
else \
dnf install -y git curl wget sudo; \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if it is not already present
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version >/dev/null 2>&1; then \
@ -214,16 +220,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..."; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \
else \
SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \
fi; \
curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
@ -284,7 +285,7 @@ RUN if command -v apt-get >/dev/null; then \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
else \
dnf install -y git curl wget sudo; \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version
@ -297,6 +298,12 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if it is not already present
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \

View File

@ -1,17 +0,0 @@
import glob
# Remove every line that mentions "torch" (case-insensitively) from the files
# matched by requirements/*.txt and from pyproject.toml, both resolved
# relative to the current working directory.
# NOTE(review): leading indentation is not visible in this rendering, so the
# exact nesting of the statements below could not be confirmed here.
requires_files = glob.glob("requirements/*.txt")
requires_files += ["pyproject.toml"]
for file in requires_files:
print(f">>> cleaning {file}")
# Read the whole file first so it can be rewritten in place afterwards.
with open(file) as f:
lines = f.readlines()
# Plain substring test on the lower-cased text, so any name containing
# "torch" (e.g. "torchvision") matches as well.
if "torch" in "".join(lines).lower():
# Only this header is printed; the dropped lines are not echoed.
print("removed:")
# Rewrite the file, keeping just the lines that do not contain "torch".
with open(file, "w") as f:
for line in lines:
if "torch" not in line.lower():
f.write(line)
print(f"<<< done cleaning {file}")
print()

View File

@ -15,7 +15,7 @@ optree==0.13.0
packaging==23.1
parameterized==0.8.1
pillow==10.3.0
protobuf==5.29.5
protobuf==5.29.4
psutil==5.9.8
pygments==2.15.0
pytest-cpp==2.3.0
@ -26,7 +26,7 @@ pytest-xdist==3.3.1
pytest==7.3.2
pyyaml==6.0.2
scipy==1.12.0
setuptools==78.1.1
setuptools==72.1.0
sympy==1.13.3
tlparse==0.4.0
tensorboard==2.13.0

View File

@ -39,9 +39,7 @@ def main() -> None:
pull_request_label_names = [label.name for label in pull_request_labels]
issue_label_names = [label.name for label in issue_labels]
labels_to_add = [
label
for label in issue_label_names
if label not in pull_request_label_names and label != "actionable"
label for label in issue_label_names if label not in pull_request_label_names
]
if not labels_to_add:
print("The pull request already has the same labels.")

View File

@ -84,9 +84,6 @@ repackage_wheel() {
rm -rf $package
}
# Require to re-package the wheel
${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1
pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
repackage_wheel $package

View File

@ -22,16 +22,6 @@ name: !{{ build_environment }}
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
{%- endmacro %}
{%- macro setup_python(py_ver) -%}
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
{%- endmacro %}
on:
# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321
push:
@ -71,13 +61,23 @@ jobs:
{%- endif %}
steps:
!{{ set_runner_specific_vars() }}
!{{ setup_python(config.get("python_version", "3.10")) }}
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -94,6 +94,8 @@ jobs:
{%- if config["package_type"] == "wheel" %}
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -104,9 +106,33 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
python -mvenv test_venv
source test_venv/bin/activate
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086

View File

@ -273,8 +273,6 @@ jobs:
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
EXTRA_FLAGS: ${{ matrix.extra_flags || '' }}
OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}

View File

@ -62,11 +62,6 @@ on:
required: false
type: number
default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -81,9 +76,10 @@ jobs:
strategy:
matrix: ${{ fromJSON(inputs.test-matrix) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
runs-on: ${{ matrix.runner }}
steps:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
@ -135,9 +131,6 @@ jobs:
- name: Start monitoring script
id: monitor-script
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
@ -145,6 +138,9 @@ jobs:
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
run: |
python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
@ -182,12 +178,6 @@ jobs:
run: |
echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
- name: Preserve github env variables for use in docker
shell: bash
run: |
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test
id: test
env:
@ -203,22 +193,20 @@ jobs:
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
run: |
set -x
@ -248,7 +236,6 @@ jobs:
-e GITHUB_RUN_ATTEMPT \
-e JOB_ID \
-e JOB_NAME \
-e BASE_SHA \
-e BRANCH \
-e SHA1 \
-e AWS_DEFAULT_REGION \
@ -266,12 +253,10 @@ jobs:
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e TESTS_TO_INCLUDE \
-e HUGGING_FACE_HUB_TOKEN \
-e DASHBOARD_TAG \
--env-file="${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--ulimit core=0 \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="8g" \

View File

@ -12,9 +12,6 @@ on:
paths:
- .github/workflows/build-vllm-wheel.yml
- .github/ci_commit_pins/vllm.txt
schedule:
# every day at 13:30 UTC (9:30AM EDT, 6:30AM PDT)
- cron: 30 13 * * *
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -27,33 +24,21 @@ jobs:
fail-fast: false
matrix:
python-version: [ '3.12' ]
# TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
# TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554
device: [ 'cu128', 'cu129' ]
runner: [ 'linux.12xlarge.memory' ]
include:
- platform: manylinux_2_28_x86_64
device: cu128
- device: cu128
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_x86_64
device: cu129
- device: cu129
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu128
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
runner: linux.arm64.r7g.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu129
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
runner: linux.arm64.r7g.12xlarge.memory
name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
name: "Build ${{ matrix.device }} vLLM wheel"
runs-on: ${{ matrix.runner }}
timeout-minutes: 480
env:
PY_VERS: ${{ matrix.python-version }}
MANYLINUX_IMAGE: ${{ matrix.manylinux-image }}
PLATFORM: ${{ matrix.platform }}
PLATFORM: 'manylinux_2_28_x86_64'
BUILD_DEVICE: ${{ matrix.device }}
steps:
- name: Setup SSH (Click me for login details)
@ -151,7 +136,7 @@ jobs:
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: vllm-wheel-${{ matrix.device }}-${{ matrix.platform }}-${{ matrix.python-version }}
name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }}
if-no-files-found: error
path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl
@ -161,17 +146,15 @@ jobs:
# Copied from build-triton-wheel workflow (mostly)
upload-wheel:
name: "Upload ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
name: "Upload ${{ matrix.device }} vLLM wheel"
needs:
- build-wheel
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129' ]
env:
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}
permissions:
id-token: write
@ -183,7 +166,7 @@ jobs:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Configure AWS credentials(PyTorch account) for main
if: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels
@ -207,15 +190,15 @@ jobs:
run: |
set -eux
mkdir -p "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-"${PLATFORM}"-*/* "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/"
- name: Set DRY_RUN
if: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
shell: bash
run: |

View File

@ -60,13 +60,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -81,9 +81,13 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"

View File

@ -56,13 +56,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -77,9 +77,13 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -95,6 +99,8 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -105,9 +111,33 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
python -mvenv test_venv
source test_venv/bin/activate
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
@ -166,13 +196,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.11.4"
freethreaded: false
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -187,9 +217,13 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -205,6 +239,8 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -215,9 +251,33 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
python -mvenv test_venv
source test_venv/bin/activate
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
@ -276,13 +336,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.12.4"
freethreaded: false
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -297,9 +357,13 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -315,6 +379,8 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -325,9 +391,33 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
python -mvenv test_venv
source test_venv/bin/activate
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
@ -386,13 +476,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: false
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -407,9 +497,13 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -425,6 +519,8 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -435,9 +531,33 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
python -mvenv test_venv
source test_venv/bin/activate
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
@ -496,13 +616,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: true
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -517,9 +637,13 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -535,6 +659,8 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -545,9 +671,33 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
python -mvenv test_venv
source test_venv/bin/activate
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
@ -606,13 +756,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0-rc.2"
freethreaded: false
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -627,9 +777,13 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -645,6 +799,8 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -655,9 +811,33 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
python -mvenv test_venv
source test_venv/bin/activate
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
@ -716,13 +896,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0-rc.2"
freethreaded: true
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -737,9 +917,13 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -755,6 +939,8 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@ -765,9 +951,33 @@ jobs:
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
python -mvenv test_venv
source test_venv/bin/activate
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086

View File

@ -35,6 +35,8 @@ jobs:
needs:
- get-default-label-prefix
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
@ -43,6 +45,7 @@ jobs:
{ include: [
{ config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.aws.a100", owners: ["oncall:pt2"] },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
test:

View File

@ -37,7 +37,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
@ -56,7 +56,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: nightly-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }}
timeout-minutes: 720

View File

@ -137,7 +137,6 @@ jobs:
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720
# monitor is enabled in perf tests (disable-monitor is false)
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
@ -154,7 +153,6 @@ jobs:
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 1440
# monitor is enabled in perf tests (disable-monitor is false)
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
@ -173,7 +171,6 @@ jobs:
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720
# monitor is enabled in perf tests (disable-monitor is false)
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4

View File

@ -43,11 +43,6 @@ on:
required: false
type: boolean
default: false
freezing:
description: Run freezing?
required: false
type: boolean
default: true
benchmark_configs:
description: The list of configs used the benchmark
required: false
@ -80,7 +75,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -106,8 +101,8 @@ jobs:
needs: inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720
@ -121,9 +116,10 @@ jobs:
name: inductor-test
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }}
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720

View File

@ -80,7 +80,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -107,7 +107,7 @@ jobs:
needs: inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
@ -124,7 +124,7 @@ jobs:
needs: inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}

View File

@ -36,10 +36,12 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0;8.6'
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -62,7 +64,7 @@ jobs:
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -128,6 +130,8 @@ jobs:
needs:
- get-default-label-prefix
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
@ -154,7 +158,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
@ -200,7 +204,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-cpu-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }}
secrets: inherit

View File

@ -33,6 +33,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
@ -45,6 +47,7 @@ jobs:
{ config: "inductor_cpp_wrapper", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_cpp_wrapper", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-test:
@ -110,7 +113,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
@ -127,7 +130,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
secrets: inherit

View File

@ -49,6 +49,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
@ -79,7 +81,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
@ -101,7 +103,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
secrets: inherit

View File

@ -54,7 +54,7 @@ jobs:
- get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.docs-build.outputs.docker-image }}
push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }}
run-doxygen: true

View File

@ -14,10 +14,6 @@ on:
schedule:
# Run at 07:00 UTC every Sunday
- cron: 0 7 * * 0
pull_request:
paths:
- benchmarks/operator_benchmark/**
- .github/workflows/operator_benchmark.yml
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -33,7 +29,7 @@ jobs:
name: opbenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -46,7 +42,7 @@ jobs:
name: opbenchmark-on-demand-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -59,7 +55,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: opbenchmark-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
secrets: inherit

View File

@ -1,46 +0,0 @@
# Workflow: build PyTorch once, then run the "operator_microbenchmark_test"
# config on every runner listed in the build job's test matrix.
name: operator_microbenchmark
# Triggers: pushes of ciflow/op-benchmark/* tags, manual dispatch, and a daily cron.
on:
push:
tags:
- ciflow/op-benchmark/*
workflow_dispatch:
schedule:
# Run at 06:00 UTC every day
- cron: 0 6 * * *
# The group key combines the workflow name, the PR number (or ref name), the
# commit SHA on branch refs, and whether the event is workflow_dispatch/schedule.
# With cancel-in-progress, a newer run in the same group cancels the running one.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
# Build job: delegates to the reusable _linux-build.yml workflow.
opmicrobenchmark-build:
# Only run when the repository owner is 'pytorch'.
if: github.repository_owner == 'pytorch'
name: opmicrobenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
# Two CUDA architectures are built; the matrix below targets an h100 and an
# a100 runner (NOTE(review): presumably 9.0 -> H100 and 8.0 -> A100 - confirm).
cuda-arch-list: '8.0 9.0'
test-matrix: |
{ include: [
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" },
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
secrets: inherit
# Test job: runs after the build job and consumes its docker-image and
# test-matrix outputs through the reusable _linux-test.yml workflow.
opmicrobenchmark-test:
name: opmicrobenchmark-test
uses: ./.github/workflows/_linux-test.yml
needs: opmicrobenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
secrets: inherit

View File

@ -1,46 +0,0 @@
# Workflow: build PyTorch for CUDA arch 10.0, then run the
# "operator_microbenchmark_test" config on the linux.dgx.b200 runner.
name: operator_microbenchmark_b200
# Triggers: pushes of ciflow/op-benchmark/* tags, manual dispatch, and a daily cron.
on:
push:
tags:
- ciflow/op-benchmark/*
workflow_dispatch:
schedule:
# Run at 06:00 UTC every day
- cron: 0 6 * * *
# The group key combines the workflow name, the PR number (or ref name), the
# commit SHA on branch refs, and whether the event is workflow_dispatch/schedule.
# With cancel-in-progress, a newer run in the same group cancels the running one.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
# Build job: delegates to the reusable _linux-build.yml workflow.
opmicrobenchmark-build:
# Only run when the repository owner is 'pytorch'.
if: github.repository_owner == 'pytorch'
name: opmicrobenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
# NOTE(review): the environment name still ends in "sm80" although the arch
# list below is 10.0 - confirm the name is only a label here.
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
secrets: inherit
# Test job: runs after the build job and consumes its docker-image and
# test-matrix outputs through the reusable _linux-test.yml workflow.
opmicrobenchmark-test:
name: opmicrobenchmark-test
uses: ./.github/workflows/_linux-test.yml
needs: opmicrobenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
# This job passes an explicit AWS role (named as S3 + ECR read-only) to the test workflow.
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit

View File

@ -70,4 +70,5 @@ jobs:
build-environment: linux-noble-rocm-py3.12-mi300
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
tests-to-include: "inductor/test_ck_backend"
secrets: inherit

View File

@ -239,13 +239,16 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
verify-cachebench-cpu-test:
@ -255,7 +258,7 @@ jobs:
- verify-cachebench-cpu-build
- target-determination
with:
build-environment: linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
secrets: inherit

5
.gitignore vendored
View File

@ -259,6 +259,9 @@ gen
.pytest_cache
aten/build/*
# Linker scripts for prioritized text optimization
cmake/linker_script.ld
# Bram
plsdontbreak
@ -389,5 +392,3 @@ android/pytorch_android_torchvision/.cxx
# Claude Code local configuration
CLAUDE.local.md
/test_*.py
/debug_*.py

View File

@ -13,7 +13,7 @@ exclude_patterns = [
'**/fb/**',
'functorch/docs/**',
'functorch/examples/**',
'functorch/docs/source/tutorials/**',
'functorch/notebooks/**',
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'scripts/**',
@ -1568,6 +1568,7 @@ include_patterns = [
exclude_patterns = [
'caffe2/**',
'functorch/docs/**',
'functorch/notebooks/**',
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'test/dynamo/cpython/**',

View File

@ -22,6 +22,7 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",
@ -810,7 +811,7 @@ cc_library(
name = "torch_python",
srcs = libtorch_python_core_sources
+ if_cuda(libtorch_python_cuda_sources)
+ libtorch_python_distributed_sources
+ if_cuda(libtorch_python_distributed_sources)
+ GENERATED_AUTOGRAD_PYTHON,
hdrs = glob([
"torch/csrc/generic/*.cpp",

View File

@ -181,9 +181,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF
@ -263,11 +262,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@ -380,6 +379,13 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF)
# Prioritized text linker option (USE_PRIORITIZED_TEXT_FOR_LD).
# Default value: ON for AArch64 + Linux, OFF everywhere else.
# Availability: the option is only user-settable when the target CPU is x86
# (CPU_INTEL), AArch64 or ppc64le (CPU_POWER); on any other CPU
# cmake_dependent_option forces it to OFF.
set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
if(LINUX AND CPU_AARCH64)
set(USE_PRIORITIZED_TEXT_DEFAULT ON)
endif()
cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
"${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)
option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance
@ -432,10 +438,11 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
)
else()
@ -657,6 +664,11 @@ endif(MSVC)
string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
# Set linker max-page-size to 64KiB on AArch64 Linux
if(LINUX AND CPU_AARCH64)
add_link_options_if_supported("-z,max-page-size=0x10000")
endif()
# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
# applicable to mobile are disabled by this variable. Setting
# `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@ -874,7 +886,7 @@ cmake_dependent_option(
"Whether to build the flash_attention kernel for scaled dot product attention.\
Will be disabled if not supported by the platform"
ON
"(USE_CUDA AND NOT MSVC) OR USE_ROCM"
"USE_CUDA OR USE_ROCM;NOT MSVC"
OFF)
cmake_dependent_option(
@ -909,7 +921,7 @@ cmake_dependent_option(
# USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake
#
if(USE_ROCM)
if(USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)
if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
include(cmake/External/aotriton.cmake)
endif()
endif()
@ -1421,3 +1433,57 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif()
# Prioritized-text link optimization.
# When enabled: compile C/C++ with per-function / per-data sections, generate a
# linker script from cmake/prioritized_text.txt, and link the selected PyTorch
# library targets with that script. When disabled on AArch64 Linux: warn.
if(USE_PRIORITIZED_TEXT_FOR_LD)
# Applies to C and C++ sources only (other languages are left untouched).
add_compile_options(
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
)
# NOTE: the generated script is written into the source tree (cmake/), not
# into the build directory.
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
# Regenerate the script whenever the generator or its input list changes.
add_custom_command(
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
COMMENT "Generating prioritized text linker files"
VERBATIM
)
add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
# Collect the targets to link with the script: torch_python for Python builds,
# plus the core libraries and the enabled accelerator backends unless this is
# a libtorch-less build.
if(BUILD_PYTHON)
set(LINKER_OPT_TARGETS torch_python)
endif()
if(NOT BUILD_LIBTORCHLESS)
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
if(USE_CUDA)
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
endif()
if(USE_XPU)
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
endif()
if(USE_ROCM)
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
endif()
endif()
# For every requested target that exists: build the script first, pass it to
# the linker with -T, and relink the target when the script changes.
# Targets that were requested but not defined only produce a warning.
# NOTE(review): target_link_options_if_supported is defined elsewhere;
# presumably it adds the flag only when the linker accepts it - confirm.
foreach(tgt IN LISTS LINKER_OPT_TARGETS)
if(TARGET ${tgt})
add_dependencies("${tgt}" generate_linker_script)
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
else()
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
endif()
endforeach()
else()
# Option is off: nudge AArch64 Linux builders to turn it on.
if(LINUX AND CPU_AARCH64)
message(WARNING [[
It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()

View File

@ -16,8 +16,6 @@ However, if you believe you have found a security vulnerability in PyTorch, we e
Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new
All reports submitted through the security advisories mechanism will **either be made public or dismissed by the team within 90 days of the submission**. If an advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create a [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework.
Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported:
https://www.facebook.com/whitehat

View File

@ -180,7 +180,7 @@ void Context::setUserEnabledNNPACK(bool e) {
}
bool Context::allowTF32CuDNN(const std::string& op) const {
if (op.empty()){
if (op.size() == 0){
bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
TORCH_CHECK(
@ -281,6 +281,9 @@ bool Context::userEnabledOverrideableSDP() const {
static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
#ifdef USE_ROCM
static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32";
#endif
bool Context::checkCuBLASConfigDeterministic() {
// If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
@ -340,6 +343,12 @@ void Context::setImmediateMiopen(bool b) {
}
bool Context::allowTF32CuBLAS() const {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
if (allow_tf32 != true) {
return false;
}
#endif
bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
TORCH_CHECK(
@ -353,6 +362,14 @@ bool Context::allowTF32CuBLAS() const {
}
// Enables or disables TF32 for cuBLAS matmuls.
//
// On ROCm builds the request is honoured only when the HIPBLASLT_ALLOW_TF32
// environment check compares equal to true; otherwise an INFO message is
// logged (through C10_LOG_FIRST_N with a count of 10) and the current
// settings are left untouched.
//
// When applied, both the legacy `float32_matmul_precision` field and the
// ("cuda", "matmul") entry of the float32 precision table are updated so the
// two stay in agreement: HIGH / "tf32" for `b == true`, HIGHEST / "ieee"
// otherwise.
void Context::setAllowTF32CuBLAS(bool b) {
#ifdef USE_ROCM
  if (c10::utils::check_env(hipblaslt_allow_tf32) != true) {
    C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. "
                              << "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it.";
    return;
  }
#endif
  if (b) {
    float32_matmul_precision = at::Float32MatmulPrecision::HIGH;
    setFloat32Precision("cuda", "matmul", "tf32");
  } else {
    float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST;
    setFloat32Precision("cuda", "matmul", "ieee");
  }
}
@ -426,7 +443,7 @@ void Context::setFloat32Precision(const std::string& backend, const std::string&
std::string msg;
auto iterp = _fp32_precisions.find(backend);
TORCH_CHECK(iterp != _fp32_precisions.end());
for (const auto& p : iterp->second) {
for (auto p : iterp->second) {
msg += p;
msg += " ";
}

View File

@ -102,7 +102,7 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
// SparseTensorImpl has no storage, so we cannot query its nbytes.
// (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse)
// Same for XLA
if (base.unsafeGetTensorImpl()->has_storage() && data_ptr().device().type() != c10::DeviceType::XLA) {
if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) {
original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes();
} else {
original_storage_size_ = -1;

View File

@ -457,9 +457,24 @@ void gemm(
return;
}
#endif
// for the fallback path, first compute gemm with beta = 0,
// and then add c in full precision.
int64_t c_size = n * m;
std::vector<float> float_c(c_size, 0.f);
gemm_no_downcast_stub(
at::kCPU, at::kBFloat16,
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
for (const auto j : c10::irange(n)) {
for (const auto i : c10::irange(m)) {
auto offset = j * ldc + i;
// beta == 0 won't propagate NaN from C
if (beta == 0.f) {
c[offset] = float_c[j * m + i];
} else {
c[offset] = beta * c[offset] + float_c[j * m + i];
}
}
}
}
void gemm(
@ -478,9 +493,24 @@ void gemm(
return;
}
#endif
// for the fallback path, first compute gemm with beta = 0,
// and then add c in full precision.
int64_t c_size = n * m;
std::vector<float> float_c(c_size, 0.f);
gemm_no_downcast_stub(
at::kCPU, at::kHalf,
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
for (const auto j : c10::irange(n)) {
for (const auto i : c10::irange(m)) {
auto offset = j * ldc + i;
// beta == 0 won't propagate NaN from C
if (beta == 0.f) {
c[offset] = float_c[j * m + i];
} else {
c[offset] = beta * c[offset] + float_c[j * m + i];
}
}
}
}
void gemm(

View File

@ -67,13 +67,13 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)(
int64_t inputH = input_.size(heightDim);
int64_t inputW = input_.size(widthDim);
TORCH_CHECK((poolSizeT <= inputT) && (outputT + poolSizeT - 1 < inputT),
TORCH_CHECK(outputT + poolSizeT - 1 < inputT,
"fractional_max_pool3d_out(): pool time ", poolSizeT,
" too large relative to input time ", inputT);
TORCH_CHECK((poolSizeW <= inputW) && (outputW + poolSizeW - 1 < inputW),
TORCH_CHECK(outputW + poolSizeW - 1 < inputW,
"fractional_max_pool3d_out(): pool width ", poolSizeW,
" too large relative to input width ", inputW);
TORCH_CHECK((poolSizeH <= inputH) && (outputH + poolSizeH - 1 < inputH),
TORCH_CHECK(outputH + poolSizeH - 1 < inputH,
"fractional_max_pool3d_out(): pool height ", poolSizeH,
" too large relative to input height ", inputH);

View File

@ -1360,8 +1360,7 @@ Tensor outer(const Tensor& self, const Tensor& vec2) {
#endif
#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED()
// Used by default on x86 platforms and on AArch64+ACL
#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
static inline int64_t get_mkldnn_matmul_min_dim() {
static auto value = [&] {
const int64_t default_min_dim = [&] {
@ -1396,6 +1395,8 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) {
return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size;
}
#endif
static void addmm_impl_cpu_(
Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) {
TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2);
@ -1771,8 +1772,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) ||
(strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1]));
};
#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED()
// Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL
#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]);
if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) {
try {
@ -1784,6 +1785,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
}
}
#endif
if (contraction_size * res_rows * res_cols < 400) {
if (is_bmm_out) {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, batch1.scalar_type(), "bmm", [&] {

View File

@ -624,9 +624,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
if (backend == BatchNormBackend::Miopen) {
return std::tuple_cat(
at::miopen_batch_norm(
input.contiguous(input.suggest_memory_format()),
weight.contiguous(),
bias.contiguous(),
input.contiguous(), weight.contiguous(), bias.contiguous(),
running_mean.defined() ? running_mean.contiguous() : running_mean,
running_var.defined() ? running_var.contiguous() : running_var,
training, momentum, eps),

View File

@ -2174,7 +2174,7 @@ static void _scatter_via_index_put(
if (self.dim() == 1 || broadcast_index) {
Tensor squeezed = index;
if (broadcast_index && index.dim() > 1) {
for (int64_t d = index.dim() - 1; d >= 0; --d) {
for (const auto d : c10::irange(index.dim())) {
if (d == dim) {
continue;
}

View File

@ -52,7 +52,6 @@ void apply_triu_tril_single(
int64_t self_col_stride,
bool upper) {
constexpr int64_t zero = 0;
k = std::clamp(k, -n, m); // Clamp k to [-n, m] to prevent i + k arithmetic overflow, especially if k approaches INT64_MAX/INT64_MIN.
if (upper) {
parallel_for(0, n, 0, [&](int64_t start, int64_t end) {

View File

@ -36,7 +36,7 @@ void hardsigmoid_kernel(TensorIteratorBase& iter) {
[zero, one_sixth, three, six] GPU_LAMBDA(
scalar_t self_val) -> scalar_t {
opmath_t x = static_cast<opmath_t>(self_val);
return std::min<opmath_t>(std::max<opmath_t>(x + three, zero), six) * one_sixth;
return std::min(std::max(x + three, zero), six) * one_sixth;
});
});
}

View File

@ -317,17 +317,6 @@ void nonzero_static_cuda_out_impl(
out_temp =
Tensor(at::detail::empty_cuda({self.dim(), size}, out.options())).t();
}
// If input has zero elements, avoid kernel grid calculations (which can
// produce zero divisors) and just fill the output with fill_value.
if (self.numel() == 0) {
if (need_to_copy) {
out_temp.fill_(fill_value);
out.copy_(out_temp);
} else {
out.fill_(fill_value);
}
return;
}
int64_t* out_data_ptr = need_to_copy ? out_temp.mutable_data_ptr<int64_t>()
: out.mutable_data_ptr<int64_t>();

View File

@ -416,7 +416,6 @@ struct ReduceOp {
if (config.should_block_y_reduce()) {
value = block_y_reduce<output_vec_size>(value, shared_memory);
}
__syncthreads();
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}

View File

@ -17,11 +17,12 @@ __global__ static void compute_cuda_kernel(
index_t* result_ptr,
int64_t size,
int64_t result_size) {
CUDA_KERNEL_ASSERT_PRINTF(
result_size == cumsum_ptr[size - 1],
if (C10_UNLIKELY((result_size != cumsum_ptr[size - 1]))) {
printf("%s:%d:%s: block: [%d,%d,%d], thread: [%d,%d,%d] "
"Invalid input! In `repeat_interleave`, the `output_size` argument (%ld) must be the same as the sum of the elements in the `repeats` tensor (%ld).\n",
result_size,
cumsum_ptr[size - 1]);
__FILE__, __LINE__, __func__,blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, result_size, cumsum_ptr[size - 1 ]);
CUDA_KERNEL_ASSERT(result_size == cumsum_ptr[size - 1])
}
int64_t idx = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x;
int64_t stride = (blockDim.x * gridDim.x) / C10_WARP_SIZE;

View File

@ -7,7 +7,6 @@
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_like.h>
#include <ATen/ops/miopen_batch_norm_native.h>
#include <ATen/ops/miopen_batch_norm_backward_native.h>
#endif
@ -103,7 +102,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm(
mode = miopenBNSpatial;
}
auto output_t = at::empty_like(input_t, input_t.options(), input_t.suggest_memory_format());
auto output_t = at::empty(input->sizes(), input->options());
TensorArg output{ output_t, "output", 0 };
auto handle = getMiopenHandle();
@ -171,15 +170,20 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
const std::optional<Tensor>& save_var_t_opt,
double epsilon) {
// See [Note: hacky wrapper removal for optional tensor]
const Tensor& save_mean_t = save_mean_t_opt.value_or(Tensor());
const Tensor& save_var_t = save_var_t_opt.value_or(Tensor());
const Tensor& running_mean =
running_mean_opt.value_or(Tensor());
const Tensor& running_var =
running_var_opt.value_or(Tensor());
const Tensor& save_mean_t =
save_mean_t_opt.value_or(Tensor());
const Tensor& save_var_t =
save_var_t_opt.value_or(Tensor());
auto grad_output_contig =
grad_output_t.contiguous(input_t.suggest_memory_format());
TensorArg input{input_t, "input", 1},
grad_output{grad_output_contig, "grad_output", 2},
weight{weight_t, "weight", 3}, save_mean{save_mean_t, "save_mean", 4},
save_var{save_var_t, "save_var", 5};
TensorArg input{ input_t, "input", 1 },
grad_output{ grad_output_t, "grad_output", 2 },
weight{ weight_t, "weight", 3 },
save_mean{ save_mean_t, "save_mean", 4 },
save_var{ save_var_t, "save_var", 5 };
CheckedFrom c = "miopen_batch_norm_backward";
checkAllDefined(c, {input, grad_output, weight, save_mean, save_var});
@ -191,11 +195,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
}
checkAllSameType(c, {input, grad_output});
checkAllSameType(c, {weight, save_mean, save_var});
// TODO: is weight required to be contiguous?
checkAllContiguous(c, {save_mean, save_var});
// TODO: TensorArg check should start handle memory format
TORCH_CHECK(input->is_contiguous(input->suggest_memory_format()));
TORCH_CHECK(grad_output->is_contiguous(input->suggest_memory_format()));
checkAllContiguous(c, {input, grad_output, save_mean, save_var});
checkDimRange(c, input, 2, 6 /* exclusive */);
checkSameSize(c, input, grad_output);
auto num_features = input->size(1);
@ -210,7 +210,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
mode = miopenBNSpatial;
}
auto grad_input_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format());
auto grad_input_t = at::empty(input->sizes(), input->options());
auto grad_weight_t = at::empty(weight->sizes(), weight->options());
auto grad_bias_t = at::empty(weight->sizes(), weight->options());

View File

@ -1770,12 +1770,10 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> miopen_depthwise_convolution_back
// fusions
// ---------------------------------------------------------------------
void raw_miopen_convolution_add_relu_out(
void raw_miopen_convolution_relu_out(
const Tensor& output,
const Tensor& input,
const Tensor& weight,
const Tensor& z,
float alpha,
const Tensor& bias,
IntArrayRef stride,
IntArrayRef padding,
@ -1783,20 +1781,68 @@ void raw_miopen_convolution_add_relu_out(
int64_t groups,
bool benchmark,
bool deterministic) {
raw_miopen_convolution_forward_out(
output,
auto dataType = getMiopenDataType(input);
miopenConvolutionMode_t c_mode = miopenConvolution;
ConvolutionArgs args{ input, output, weight };
args.handle = getMiopenHandle();
at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight);
setConvolutionParams(
&args.params,
args.handle,
input,
weight,
padding,
stride,
dilation,
groups,
deterministic,
memory_format);
args.idesc.set(input, memory_format);
args.wdesc.set(weight, memory_format, 0);
args.odesc.set(output, memory_format);
args.cdesc.set(
dataType,
c_mode,
input.dim() - 2,
args.params.padding,
args.params.stride,
args.params.dilation,
args.params.groups,
benchmark,
deterministic);
at::Tensor alpha_mul_z_add_bias =
at::native::reshape_bias(input.dim(), bias).add(z, alpha);
output.add_(alpha_mul_z_add_bias);
output.relu_();
TensorDescriptor bdesc;
bdesc.set(bias.expand({1, bias.size(0)}), output.dim());
// Create the fusion plan
miopenFusionPlanDescriptor_t fusePlanDesc;
miopenFusionOpDescriptor_t convoOp;
miopenFusionOpDescriptor_t biasOp;
miopenFusionOpDescriptor_t activOp;
MIOPEN_CHECK(miopenCreateFusionPlan(&fusePlanDesc, miopenVerticalFusion, args.idesc.desc()));
MIOPEN_CHECK(miopenCreateOpConvForward(fusePlanDesc, &convoOp, args.cdesc.desc(), args.wdesc.desc()));
MIOPEN_CHECK(miopenCreateOpBiasForward(fusePlanDesc, &biasOp, bdesc.desc()));
MIOPEN_CHECK(miopenCreateOpActivationForward(fusePlanDesc, &activOp, miopenActivationRELU));
// compile fusion plan
MIOPEN_CHECK(miopenCompileFusionPlan(args.handle, fusePlanDesc));
// Set the Args
float alpha = static_cast<float>(1);
float beta = static_cast<float>(0);
float activ_alpha = static_cast<float>(0);
float activ_beta = static_cast<float>(0);
float activ_gamma = static_cast<float>(0);
miopenOperatorArgs_t fusionArgs;
MIOPEN_CHECK(miopenCreateOperatorArgs(&fusionArgs));
MIOPEN_CHECK(miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weight.const_data_ptr()));
MIOPEN_CHECK(miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.const_data_ptr()));
MIOPEN_CHECK(miopenSetOpArgsActivForward(fusionArgs, activOp, &alpha, &beta, activ_alpha, activ_beta, activ_gamma));
miopenExecuteFusionPlan(args.handle, fusePlanDesc, args.idesc.desc(), input.const_data_ptr(), args.odesc.desc(), output.data_ptr(), fusionArgs);
// Cleanup
miopenDestroyFusionPlan(fusePlanDesc);
}
static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat memory_format) {
@ -1809,107 +1855,171 @@ static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat m
Tensor miopen_convolution_add_relu(
const Tensor& input_t,
const Tensor& weight_t,
const Tensor& z_t,
const Tensor& z,
const std::optional<Scalar>& alpha,
const std::optional<Tensor>& bias_t,
const std::optional<Tensor>& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups) {
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
const Tensor input = input_t.contiguous(memory_format);
const Tensor weight = weight_t.contiguous(memory_format);
Tensor z = z_t;
if (z.suggest_memory_format() != memory_format) {
z = z.to(memory_format);
}
z = z.contiguous(memory_format);
// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input.sizes(), weight.sizes(), padding, stride, dilation),
input.options().memory_format(memory_format));
if (output_t.numel() == 0) {
return output_t;
}
// MIOpen does not support fusion of add, the alpha2 * z step of the below cuDNN function:
// y = act ( alpha1 * conv(x) + alpha2 * z + bias )
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
auto& ctx = at::globalContext();
bool benchmark = ctx.benchmarkCuDNN();
auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
auto _bias = bias_t.has_value()
? bias_t.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());
raw_miopen_convolution_add_relu_out(
output_t,
TensorArg input { input_t, "input", 1 },
weight { weight_t, "weight", 2 };
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input_t.options().memory_format(memory_format));
if (output_t.numel() == 0){
return output_t;
}
// Avoid ambiguity of "output" when this is being used as backwards
TensorArg output{output_t, "result", 0};
miopen_convolution_forward_out(
output,
"miopen_convolution_add_relu",
input,
weight,
z,
_alpha,
_bias,
stride,
padding,
stride,
dilation,
groups,
benchmark,
true); // deterministic
false // deterministic
);
return output_t;
auto contig_output_t = self_or_new_memory_format(output_t, memory_format);
if (!output_t.is_same(contig_output_t)) {
contig_output_t.copy_(output_t);
}
auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{contig_output_t.size(1)},
optTypeMetaToScalarType(contig_output_t.options().dtype_opt()),
contig_output_t.options().layout_opt(),
contig_output_t.options().device_opt(),
contig_output_t.options().pinned_memory_opt());
at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input_t.dim(), _bias).add(z, _alpha);
contig_output_t.add_(alpha_mul_z_add_bias);
contig_output_t.relu_();
return contig_output_t;
}
Tensor miopen_convolution_relu(
const Tensor& input_t,
const Tensor& weight_t,
const std::optional<Tensor>& bias_t,
const std::optional<Tensor>& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups) {
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
const Tensor input = input_t.contiguous(memory_format);
const Tensor weight = weight_t.contiguous(memory_format);
// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input.sizes(), weight.sizes(), padding, stride, dilation),
input.options().memory_format(memory_format));
if (output_t.numel() == 0) {
return output_t;
}
auto& ctx = at::globalContext();
bool benchmark = ctx.benchmarkCuDNN();
auto _bias = bias_t.has_value()
? bias_t.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());
raw_miopen_convolution_add_relu_out(
output_t,
input,
weight,
output_t, // use output_t as z to satisfy MIOpen API
0, // alpha
_bias,
stride,
padding,
dilation,
groups,
benchmark, // benchmark
true); // deterministic
// MIOpen currently only supports MemoryFormat::Contiguous and fp32 and 2d
if (input_t.suggest_memory_format() == at::MemoryFormat::Contiguous
&& input_t.scalar_type() == at::kFloat
&& input_t.ndimension() == 4) {
return output_t;
// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input_t.options().memory_format(input_t.suggest_memory_format()));
if (output_t.numel() == 0) {
return output_t;
}
auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());
raw_miopen_convolution_relu_out(
output_t,
input_t,
weight_t,
_bias,
stride,
padding,
dilation,
groups,
benchmark, // benchmark
false // deterministic
);
return output_t;
}
else {
// fallback
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
TensorArg input { input_t, "input", 1 },
weight { weight_t, "weight", 2 };
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input->options().memory_format(memory_format));
if (output_t.numel() == 0){
return output_t;
}
// Avoid ambiguity of "output" when this is being used as backwards
TensorArg output{output_t, "result", 0};
miopen_convolution_forward_out(
output,
"miopen_convolution_relu",
input,
weight,
padding,
stride,
dilation,
groups,
benchmark,
false // deterministic
);
auto contig_output_t = self_or_new_memory_format(output_t, memory_format);
if (!output_t.is_same(contig_output_t)) {
contig_output_t.copy_(output_t);
}
auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{contig_output_t.size(1)},
optTypeMetaToScalarType(contig_output_t.options().dtype_opt()),
contig_output_t.options().layout_opt(),
contig_output_t.options().device_opt(),
contig_output_t.options().pinned_memory_opt());
at::Tensor reshaped_bias = at::native::reshape_bias(input_t.dim(), _bias);
contig_output_t.add_(reshaped_bias);
contig_output_t.relu_();
return contig_output_t;
}
}
REGISTER_CUDA_DISPATCH(miopen_convolution_backward_stub, &miopen_convolution_backward)

View File

@ -0,0 +1,48 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
// Fallback declarations. This whole region is compiled only when __MAC_14_0 is
// not defined and either MAC_OS_X_VERSION_14_0 is not defined or the minimum
// required OS version is below it.
#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
// Local definition of the FFT scaling-mode enum used by the descriptor below.
// NOTE(review): values presumably mirror the SDK's own enum - confirm.
typedef NS_ENUM(NSUInteger, MPSGraphFFTScalingMode) {
MPSGraphFFTScalingModeNone = 0L,
MPSGraphFFTScalingModeSize = 1L,
MPSGraphFFTScalingModeUnitary = 2L,
};
// Stand-in class exposing the FFT descriptor properties (inverse, scalingMode,
// roundToOddHermitean) and a `descriptor` factory that may return nil.
@interface FakeMPSGraphFFTDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) BOOL inverse;
@property(readwrite, nonatomic) MPSGraphFFTScalingMode scalingMode;
@property(readwrite, nonatomic) BOOL roundToOddHermitean;
+ (nullable instancetype)descriptor;
@end
// From here on, the name MPSGraphFFTDescriptor refers to the stand-in class.
@compatibility_alias MPSGraphFFTDescriptor FakeMPSGraphFFTDescriptor;
// Category that declares the complex-number and FFT selectors on MPSGraph
// (conjugate, real part, FFT, real-to-Hermitean FFT, Hermitean-to-real FFT).
// Only declarations are given here; no implementation is provided in this header.
@interface MPSGraph (SonomaOps)
- (MPSGraphTensor* _Nonnull)conjugateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)realPartOfTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)fastFourierTransformWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)realToHermiteanFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)HermiteanToRealFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
@end
// define BFloat16 enums for MacOS13
// (built by OR-ing the alternate-encoding bit into the Float16 data type)
#define MPSDataTypeBFloat16 ((MPSDataType)(MPSDataTypeAlternateEncodingBit | MPSDataTypeFloat16))
// define Metal version
// (encoded as major 3 in the upper 16 bits plus minor 1)
#define MTLLanguageVersion3_1 ((MTLLanguageVersion)((3 << 16) + 1))
#endif

View File

@ -0,0 +1,196 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
// TODO: Remove me when moved to MacOS 13
// Compiled only when __MAC_13_2 is not defined and either MAC_OS_X_VERSION_13_2
// is not defined or the minimum required OS version is below it.
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))
// Stand-in class carrying the 3D-convolution descriptor fields: per-axis stride
// and dilation, six-sided padding, padding style, data/weights layouts, groups.
@interface FakeMPSGraphConvolution3DOpDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) NSUInteger strideInX;
@property(readwrite, nonatomic) NSUInteger strideInY;
@property(readwrite, nonatomic) NSUInteger strideInZ;
@property(readwrite, nonatomic) NSUInteger dilationRateInX;
@property(readwrite, nonatomic) NSUInteger dilationRateInY;
@property(readwrite, nonatomic) NSUInteger dilationRateInZ;
@property(readwrite, nonatomic) NSUInteger paddingLeft;
@property(readwrite, nonatomic) NSUInteger paddingRight;
@property(readwrite, nonatomic) NSUInteger paddingTop;
@property(readwrite, nonatomic) NSUInteger paddingBottom;
@property(readwrite, nonatomic) NSUInteger paddingFront;
@property(readwrite, nonatomic) NSUInteger paddingBack;
@property(readwrite, nonatomic) MPSGraphPaddingStyle paddingStyle;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout dataLayout;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout weightsLayout;
@property(readwrite, nonatomic) NSUInteger groups;
@end
// From here on, the name MPSGraphConvolution3DOpDescriptor refers to the stand-in class.
@compatibility_alias MPSGraphConvolution3DOpDescriptor FakeMPSGraphConvolution3DOpDescriptor;
#endif
// Category that declares additional MPSGraph selectors: 3D convolution,
// cumulative sum, sort/argsort, inverse, resize (nearest/bilinear, forward and
// gradient), grid sampling and truncate. Only declarations are given here.
// NOTE(review): presumably these let callers reference the selectors when the
// build SDK does not declare them - confirm against the build configuration.
@interface MPSGraph (VenturaOps)
// The enum and complex data-type macros below are compiled only when
// __MAC_13_0 is not defined and either MAC_OS_X_VERSION_13_0 is not defined or
// the minimum required OS version is below it.
#if !defined(__MAC_13_0) && (!defined(MAC_OS_X_VERSION_13_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_0))
typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode) {
MPSGraphResizeNearestRoundingModeRoundPreferCeil = 0L,
MPSGraphResizeNearestRoundingModeRoundPreferFloor = 1L,
MPSGraphResizeNearestRoundingModeCeil = 2L,
MPSGraphResizeNearestRoundingModeFloor = 3L,
MPSGraphResizeNearestRoundingModeRoundToEven = 4L,
MPSGraphResizeNearestRoundingModeRoundToOdd = 5L,
};
// Define complex enums for MacOS 12
#define MPSDataTypeComplexBit 0x01000000
#define MPSDataTypeComplexFloat32 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 64))
#define MPSDataTypeComplexFloat16 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 32))
#endif
// --- 3D convolution: forward, data gradient, weights gradient ---
- (MPSGraphTensor* _Nonnull)convolution3DWithSourceTensor:(MPSGraphTensor* _Nonnull)source
weightsTensor:(MPSGraphTensor* _Nonnull)weights
descriptor:(MPSGraphConvolution3DOpDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)
convolution3DDataGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
weightsTensor:(MPSGraphTensor* _Nonnull)weights
outputShape:(MPSShape* _Nonnull)outputShape
forwardConvolutionDescriptor:
(MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)
convolution3DWeightsGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
sourceTensor:(MPSGraphTensor* _Nonnull)source
outputShape:(MPSShape* _Nonnull)outputShape
forwardConvolutionDescriptor:
(MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
name:(NSString* _Nullable)name;
// --- Cumulative sum along an integer axis ---
- (MPSGraphTensor* _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
// --- Sort overloads: axis as NSInteger or as a tensor, with or without `descending` ---
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
name:(NSString* _Nullable)name;
// --- Argsort overloads: same four parameter shapes as the sort overloads ---
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
name:(NSString* _Nullable)name;
// --- Inverse of a tensor ---
- (MPSGraphTensor* _Nonnull)inverseOfTensor:(MPSGraphTensor* _Nonnull)inputTensor name:(NSString* _Nullable)name;
// --- Resize (forward): nearest and bilinear, each with a centerResult/alignCorners
// variant and a scaleOffsetTensor variant ---
- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
// --- Resize (gradient): counterparts taking the incoming gradient and the forward input ---
- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
// --- Grid sampling: one overload selects by samplingMode, the other by nearestRoundingMode ---
- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
layout:(MPSGraphTensorNamedDataLayout)layout
normalizeCoordinates:(BOOL)normalizeCoordinates
relativeCoordinates:(BOOL)relativeCoordinates
alignCorners:(BOOL)alignCorners
paddingMode:(MPSGraphPaddingMode)paddingMode
samplingMode:(MPSGraphResizeMode)samplingMode
constantValue:(double)constantValue
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
layout:(MPSGraphTensorNamedDataLayout)layout
normalizeCoordinates:(BOOL)normalizeCoordinates
relativeCoordinates:(BOOL)relativeCoordinates
alignCorners:(BOOL)alignCorners
paddingMode:(MPSGraphPaddingMode)paddingMode
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
constantValue:(double)constantValue
name:(NSString* _Nullable)name;
// --- Truncate ---
- (MPSGraphTensor* _Nonnull)truncateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
@end

View File

@ -9,6 +9,8 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>
#include <fmt/ranges.h>
@ -568,7 +570,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
MPSShape* mpsStrides = getMPSShape(_tensor.strides());
check_mps_shape(mpsShape);
auto storage_numel = src.storage().nbytes() / src.element_size() - src.storage_offset();
auto storage_numel = src.storage().nbytes() / src.element_size();
TORCH_CHECK(storage_numel <= std::numeric_limits<int32_t>::max(),
"MPSGaph does not support tensor dims larger than INT_MAX");
MPSNDArrayDescriptor* srcTensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:dataType

View File

@ -8,6 +8,8 @@
#include <ATen/native/TensorIterator.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/operations/BinaryKernel.h>
// For MTLLanguageVersion3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <fmt/format.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,12 +1,23 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/ConvUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_mps_convolution_native.h>
#include <ATen/ops/_mps_convolution_transpose_native.h>
#include <ATen/ops/mps_convolution_backward_native.h>
#include <ATen/ops/mps_convolution_transpose_backward_native.h>
#include <fmt/format.h>
// Implementation of the stand-in descriptor class; compiled under the same
// pre-13.2 condition that guards its @interface in MPSGraphVenturaOps.h.
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))
@implementation FakeMPSGraphConvolution3DOpDescriptor
// NSCopying requirement: returns the receiver itself; no new object is created.
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
return self;
}
@end
#endif
namespace at::native {
@ -39,9 +50,11 @@ static void fill_conv3d_desc(MPSGraphConvolution3DOpDescriptor* descriptor_,
descriptor_.paddingFront = paddingDepth;
descriptor_.paddingBack = paddingDepth;
descriptor_.dataLayout = MPSGraphTensorNamedDataLayoutNCDHW;
// PyTorch always uses NCDHW memory layout for 3D tensors
descriptor_.dataLayout = (MPSGraphTensorNamedDataLayout)7L; // MPSGraphTensorNamedDataLayoutNCDHW;
descriptor_.weightsLayout = MPSGraphTensorNamedDataLayoutOIDHW;
// PyTorch always uses OIDHW memory layout for 3D weights
descriptor_.weightsLayout = (MPSGraphTensorNamedDataLayout)9L; // MPSGraphTensorNamedDataLayoutOIDHW;
descriptor_.groups = groups; // not yet tested in Xcode/C++
}
@ -173,6 +186,18 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
if (bias_defined)
bias_shape = bias_opt.value().sizes();
std::string mem_format_key;
switch (memory_format) {
case at::MemoryFormat::Contiguous:
mem_format_key = "Contiguous";
break;
case at::MemoryFormat::ChannelsLast:
mem_format_key = "ChannelsLast";
break;
default:
assert(0 && "Check should have been done earlier\n");
}
std::string bias_shape_key;
if (bias_defined) {
bias_shape_key = std::to_string(bias_shape[0]);
@ -180,16 +205,20 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
bias_shape_key = "nobias";
}
std::string key = fmt::format("mps_{}convolution:{}:{}:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
mps::getTensorsStringKey({input_t, weight_t}),
bias_defined,
bias_shape_key);
std::string key;
if (is3DConv) {
key = "mps_3d_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
} else {
key = "mps_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
}
MPSShape* inputShape = mps::getMPSShape(input_t, memory_format);
MPSShape* outputShape = mps::getMPSShape(output_t, memory_format);
@ -371,15 +400,33 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
@autoreleasepool {
MPSStream* stream = getCurrentMPSStream();
std::string mem_format_key;
switch (memory_format) {
case at::MemoryFormat::Contiguous:
mem_format_key = "Contiguous";
break;
case at::MemoryFormat::ChannelsLast:
mem_format_key = "ChannelsLast";
break;
default:
assert(0 && "Check should have been done earlier\n");
}
MPSShape* mps_input_shape = getMPSShape(input_size);
std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
getTensorsStringKey({grad_output_t, weight_t}));
// Cache key for the backward-input graph. Numeric fields are emitted in the
// order stride, dilation, padding, groups, each followed by a ':' separator so
// that adjacent values cannot run together (e.g. stride[2]=1, dilation[0]=12
// must not produce the same text as stride[2]=11, dilation[0]=2).
std::string key;
if (is3DConv) {
  key = "mps_3d_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
      std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
      std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
      std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
      getTensorsStringKey({grad_output_t, weight_t});
} else {
  key = "mps_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
      std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
      std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
      getTensorsStringKey({grad_output_t, weight_t});
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t);
auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
@ -504,13 +551,19 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
MPSStream* stream = getCurrentMPSStream();
MPSShape* mps_weight_shape = getMPSShape(weight_size);
std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
getTensorsStringKey({grad_output_t, input_t, grad_weight_t}));
std::string key;
if (is3DConv) {
key = "mps_3d_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" +
getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
} else {
key = "mps_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" +
getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSShape* inputShape = getMPSShape(input_t);
bool isDepthwiseConv =

View File

@ -2,6 +2,7 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_copy_from_and_resize_native.h>
#include <ATen/ops/_copy_from_native.h>

View File

@ -5,6 +5,8 @@
#include <ATen/native/DistributionTemplates.h>
#include <ATen/native/Distributions.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,4 +1,6 @@
#include <ATen/native/SpectralOpsUtils.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
@ -10,6 +12,20 @@
#include <ATen/ops/_fft_r2c_native.h>
#endif
// Implementation of the stand-in FFT descriptor class; compiled under the same
// pre-14.0 condition that guards its @interface in MPSGraphSonomaOps.h.
#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
@implementation FakeMPSGraphFFTDescriptor
// Factory: looks up the class named "MPSGraphFFTDescriptor" at runtime and
// forwards `descriptor` to it, casting the result to the stand-in type.
// NOTE(review): if the lookup yields nil the message send returns nil, which
// matches the nullable return type - callers should presumably check for it.
+ (nullable instancetype)descriptor {
// Redispatch the constructor to the actual implementation
id desc = NSClassFromString(@"MPSGraphFFTDescriptor");
return (FakeMPSGraphFFTDescriptor*)[desc descriptor];
}
// NSCopying requirement: returns the receiver itself; no new object is created.
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
return self;
}
@end
#endif
namespace at::native {
namespace {
MPSGraphFFTScalingMode normalization_to_ScalingMode(int64_t normalization) {

View File

@ -2,6 +2,7 @@
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/GridSamplerUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/GridSampler.h>

View File

@ -17,6 +17,7 @@
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TensorAdvancedIndexing.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <c10/util/SmallVector.h>
#include <c10/util/irange.h>
#include <fmt/format.h>

View File

@ -6,7 +6,9 @@
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion3_1
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -4,6 +4,7 @@
#include <ATen/TensorUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <c10/util/irange.h>
@ -616,7 +617,6 @@ static Tensor median_common_mps(const Tensor& input_t, bool nanmedian) {
// we allocate 1 here due to MacOS13 bug for gather MPSGraph op, look below for the error
Tensor output_t = at::empty({1}, input_t.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
if (output_t.numel() == 0 || num_in_elements == 0) {
output_t.fill_(std::numeric_limits<float>::quiet_NaN());
return output_t;
}

View File

@ -4,6 +4,7 @@
#include <ATen/WrapDimUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -5,6 +5,7 @@
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -2,6 +2,8 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UnaryOps.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,6 +1,7 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/Resize.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,6 +1,7 @@
// Copyright © 2023 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UpSample.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>

View File

@ -4,6 +4,8 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>

View File

@ -1414,7 +1414,7 @@
- func: cat(Tensor[] tensors, int dim=0) -> Tensor
structured_delegate: cat.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: cat_sparse
SparseCPU, SparseCUDA: cat_sparse
QuantizedCPU: cat_quantized_cpu
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: cat_nested
tags: core
@ -1798,7 +1798,7 @@
device_guard: False
dispatch:
MkldnnCPU: copy_mkldnn_
SparseCPU, SparseCUDA, SparseMPS: copy_sparse_wrapper_
SparseCPU, SparseCUDA: copy_sparse_wrapper_
CompositeExplicitAutograd: copy_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: copy_sparse_compressed_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: copy_nested_
@ -2160,7 +2160,7 @@
variants: function, method
structured_delegate: div.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: div_sparse
SparseCPU, SparseCUDA: div_sparse
ZeroTensor: div_zerotensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Tensor
tags: [core, pointwise]
@ -2170,7 +2170,7 @@
variants: method
structured_delegate: div.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: div_sparse_
SparseCPU, SparseCUDA: div_sparse_
tags: pointwise
- func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@ -2179,7 +2179,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: div_out
SparseCPU, SparseCUDA, SparseMPS: div_out_sparse_zerodim
SparseCPU, SparseCUDA: div_out_sparse_zerodim
tags: pointwise
- func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
@ -2187,7 +2187,7 @@
variants: function, method
structured_delegate: div.out_mode
dispatch:
SparseCPU, SparseCUDA, SparseMPS: div_sparse
SparseCPU, SparseCUDA: div_sparse
tags: [core, pointwise]
- func: div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)
@ -2195,7 +2195,7 @@
variants: method
structured_delegate: div.out_mode
dispatch:
SparseCPU, SparseCUDA, SparseMPS: div_sparse_
SparseCPU, SparseCUDA: div_sparse_
tags: pointwise
- func: div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
@ -2204,7 +2204,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: div_out_mode
SparseCPU, SparseCUDA, SparseMPS: div_out_sparse_zerodim
SparseCPU, SparseCUDA: div_out_sparse_zerodim
tags: pointwise
# For C++ only, until we have conversion from C++ numbers to Tensor
@ -2517,7 +2517,7 @@
dispatch:
CompositeExplicitAutograd: empty_like
QuantizedCPU, QuantizedCUDA: empty_like_quantized
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: empty_like_sparse_coo
SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: empty_like_nested
autogen: empty_like.out
@ -2768,20 +2768,20 @@
variants: function, method
dispatch:
CPU, CUDA, MPS, MTIA: floor_divide
SparseCPU, SparseCUDA, SparseMPS: floor_divide_sparse
SparseCPU, SparseCUDA: floor_divide_sparse
- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: method
dispatch:
CPU, CUDA, MPS: floor_divide_
SparseCPU, SparseCUDA, SparseMPS: floor_divide_sparse_
SparseCPU, SparseCUDA: floor_divide_sparse_
- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS: floor_divide_out
SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim
SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim
- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
device_check: NoCheck # TensorIterator
@ -4273,7 +4273,7 @@
structured_delegate: mul.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: mul_sparse
SparseCPU, SparseCUDA: mul_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr
MkldnnCPU: mkldnn_mul
ZeroTensor: mul_zerotensor
@ -4285,7 +4285,7 @@
structured_delegate: mul.out
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: mul_sparse_
SparseCPU, SparseCUDA: mul_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr_
MkldnnCPU: mkldnn_mul_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Tensor
@ -4299,7 +4299,6 @@
CPU, CUDA, MPS, MTIA: mul_out
SparseCPU: mul_out_sparse_cpu
SparseCUDA: mul_out_sparse_cuda
SparseMPS: mul_out_sparse_mps
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_out_sparse_csr
MkldnnCPU: mkldnn_mul_out
tags: pointwise
@ -5849,7 +5848,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: sum
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: sum_coo
SparseCPU, SparseCUDA, SparseMeta: sum_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
autogen: sum.out
@ -5860,7 +5859,7 @@
variants: function, method
dispatch:
NestedTensorCPU: NestedTensor_sum_dim_CPU
SparseCPU, SparseCUDA, SparseMPS: sum_sparse_coo
SparseCPU, SparseCUDA: sum_sparse_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_sparse_compressed
tags: core
@ -6492,7 +6491,7 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: unsqueeze
SparseCPU, SparseCUDA, SparseMPS: unsqueeze_sparse
SparseCPU, SparseCUDA: unsqueeze_sparse
QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: unsqueeze_nested
tags: core
@ -6976,7 +6975,7 @@
CPU, CUDA: sub_out
MPS: sub_out_mps
MTIA: sub_out_mtia
SparseCPU, SparseCUDA, SparseMPS: sub_out_sparse
SparseCPU, SparseCUDA: sub_out_sparse
tags: pointwise
- func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
@ -6984,7 +6983,7 @@
variants: function, method
structured_delegate: sub.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sub_sparse
SparseCPU, SparseCUDA: sub_sparse
ZeroTensor: sub_zerotensor
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sub_Tensor
tags: [core, pointwise]
@ -6994,7 +6993,7 @@
variants: method
structured_delegate: sub.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sub_sparse_
SparseCPU, SparseCUDA: sub_sparse_
tags: pointwise
# For C++ only, until we have conversion from C++ numbers to Tensor
@ -10259,7 +10258,7 @@
structured_delegate: any.all_out
variants: method, function
dispatch:
SparseCPU, SparseCUDA, SparseMPS: any_sparse
SparseCPU, SparseCUDA: any_sparse
tags: core
- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -10343,7 +10342,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA: pow_Tensor_Scalar_out
SparseCPU, SparseCUDA, SparseMPS: pow_out_sparse_scalar
SparseCPU, SparseCUDA: pow_out_sparse_scalar
MPS: pow_tensor_scalar_out_mps
tags: pointwise
@ -10352,7 +10351,7 @@
structured_delegate: pow.Tensor_Scalar_out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: pow_sparse_scalar
SparseCPU, SparseCUDA: pow_sparse_scalar
tags: [core, pointwise]
- func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)

View File

@ -2,7 +2,6 @@
#include <ATen/core/Tensor.h>
#include <ATen/Config.h>
#include <ATen/Dispatch.h>
#include <ATen/AccumulateType.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/native/sparse/ParamUtils.h>
#include <ATen/native/SparseTensorUtils.h>
@ -296,7 +295,6 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
to exp functions as well as reuse of softmax implementation for
log_softmax.
*/
using accscalar_t = at::acc_type<scalar_t, false>;
auto sparse_dim = input.sparse_dim();
auto indices = input._indices().contiguous();
auto values = input._values().contiguous();
@ -342,14 +340,14 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
continue;
/* Prepare scratch space */
std::vector<accscalar_t> mx_row(nvalues, -std::numeric_limits<accscalar_t>::infinity());
std::vector<accscalar_t> exp_sums_row(nvalues, 0);
std::vector<scalar_t> mx_row(nvalues, -std::numeric_limits<scalar_t>::infinity());
std::vector<scalar_t> exp_sums_row(nvalues, 0);
/* Compute mx */
for (int64_t i : pool_indices) {
auto values_row = values_accessor[i];
for (const auto j : c10::irange(nvalues)) {
mx_row[j] = std::max(mx_row[j], accscalar_t(values_row[j]));
mx_row[j] = std::max(mx_row[j], values_row[j]);
}
}

View File

@ -10,7 +10,6 @@
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
#include <ATen/ops/cat.h>
#include <ATen/ops/add_native.h>
#include <ATen/ops/mul_native.h>
#include <ATen/ops/empty_native.h>
#include <ATen/ops/zeros_native.h>
#include <ATen/ops/result_type.h>
@ -21,265 +20,10 @@
namespace at::native {
using namespace at::sparse;
using namespace mps;
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/Mul_metallib.h>
#endif
Tensor& add_out_dense_sparse_mps(Tensor& out, const Tensor& dense, const SparseTensor& sparse, const Scalar& alpha);
// Element-wise product of a dense tensor and a sparse COO tensor on MPS.
// The result is written into `out` and reuses the indices of `sparse`, so
// only the stored values of `sparse` are multiplied.
//
// Args:
//   dense:  strided operand; must be on MPS unless it is scalar-like
//           (0-dim or exactly one element).
//   sparse: sparse COO operand on MPS.
//   out:    sparse result on MPS; resized to match `sparse`.
// Returns:
//   `out`, with its coalesced flag copied from `sparse`.
// Raises (via TORCH_CHECK):
//   if `sparse` is not sparse / not on MPS, `out` is not on MPS, `dense` is
//   neither on MPS nor scalar-like, the promoted dtype cannot be cast to
//   out's dtype, or (non-scalar case) the sizes differ.
static SparseTensor& mul_out_dense_sparse_mps(
    const Tensor& dense,
    const Tensor& sparse,
    SparseTensor& out) {
  TORCH_CHECK(sparse.is_sparse(), "mul: expected 'sparse' to be sparse COO");
  TORCH_CHECK(sparse.is_mps(), "mul: expected 'sparse' to be MPS, got ", sparse.device());
  TORCH_CHECK(out.is_mps(), "mul: expected 'out' to be MPS, got ", out.device());

  const bool scalar_like = (dense.dim() == 0) || (dense.numel() == 1);
  TORCH_CHECK(dense.is_mps() || scalar_like,
              "mul: expected 'dense' to be MPS or scalar-like, got ", dense.device());

  const int64_t nnz = sparse._nnz();
  out.resize_as_(sparse);

  auto commonDtype = at::result_type(dense, sparse);
  TORCH_CHECK(canCast(commonDtype, out.scalar_type()),
              "Can't convert result type ", commonDtype, " to output ", out.scalar_type());

  auto indices = sparse._indices().contiguous();
  auto values = sparse._values().to(commonDtype).contiguous();

  // No stored entries: the result is an empty sparse tensor of the same shape.
  if (nnz == 0) {
    auto empty_vals = values.narrow(0, 0, 0);
    alias_into_sparse(out,
                      indices.narrow(1, 0, 0),
                      (out.scalar_type() == commonDtype) ? empty_vals
                                                         : empty_vals.to(out.scalar_type()));
    out._coalesced_(sparse.is_coalesced());
    return out;
  }

  // Scalar-like dense operand: scale the stored values directly, no kernel needed.
  if (scalar_like) {
    auto scalar = dense;
    if (dense.numel() == 1 && dense.dim() > 0) {
      scalar = dense.view({});
    }
    // Moves the scalar to the dtype/device of the values (it may not be on MPS).
    scalar = scalar.to(values.options());
    auto out_vals = values.mul(scalar);
    if (out.scalar_type() != commonDtype) {
      out_vals = out_vals.to(out.scalar_type());
    }
    alias_into_sparse(out, indices, out_vals);
    out._coalesced_(sparse.is_coalesced());
    return out;
  }

  TORCH_CHECK(dense.sizes().equals(sparse.sizes()),
              "mul(dense, sparse): sizes must match exactly (no broadcasting): ",
              dense.sizes(), " vs ", sparse.sizes());

  const int64_t ndim_i = sparse.sparse_dim();
  const int64_t ndim = dense.dim();
  TORCH_CHECK(
      ndim_i <= ndim,
      "mul(dense, sparse): sparse_dim=", ndim_i, " exceeds dense.dim()=", ndim);

  // Prepare shapes: collapse the sparse dims into rows and the remaining
  // (dense) dims into columns so the kernel can address `dense` as a 2-D view.
  int64_t view_rows = 1, view_cols = 1;
  for (int64_t i = 0; i < ndim_i; ++i) view_rows *= sparse.size(i);
  for (int64_t i = ndim_i; i < ndim; ++i) view_cols *= sparse.size(i);

  auto dense_mps = dense.to(commonDtype).contiguous().reshape({view_rows, view_cols});
  auto out_vals = at::empty_like(values, values.options());

  // NOTE(review): these casts assume nnz and view_cols fit in 32 bits.
  const uint32_t u_view_cols = static_cast<uint32_t>(view_cols);
  const uint32_t u_nnz = static_cast<uint32_t>(nnz);
  const uint32_t u_ndim_i = static_cast<uint32_t>(ndim_i);

  // With zero columns there is nothing to compute, and the dispatch below
  // would otherwise request a zero-width grid and threadgroup.
  if (view_cols > 0) {
    auto stream = getCurrentMPSStream();
    dispatch_sync_with_rethrow(stream->queue(), ^() {
      @autoreleasepool {
        auto pso = lib.getPipelineStateForFunc("dense_sparse_mul_kernel_" + mps::scalarToMetalTypeString(values));
        auto computeEncoder = stream->commandEncoder();
        [computeEncoder setComputePipelineState:pso];

        // One thread per (column, stored entry): x indexes columns, z indexes entries.
        const uint32_t gridWidth = u_view_cols;
        const uint32_t gridDepth = u_nnz;
        MTLSize gridSize = MTLSizeMake(gridWidth, 1, gridDepth);

        const uint32_t tew = pso.threadExecutionWidth;
        uint32_t tgWidth = std::min(gridWidth, tew);
        MTLSize threadgroupSize = MTLSizeMake(tgWidth, 1, 1);

        mtl_setArgs(
            computeEncoder,
            dense_mps,
            values,
            out_vals,
            indices,
            sparse.sizes(),
            std::array<uint32_t, 3>{u_nnz, u_ndim_i, u_view_cols}
        );

        [computeEncoder dispatchThreads:gridSize threadsPerThreadgroup:threadgroupSize];
      }
    });
  }

  Tensor final_vals = out_vals;
  if (out.scalar_type() != commonDtype) {
    final_vals = final_vals.to(out.scalar_type());
  }

  alias_into_sparse(out, indices, final_vals);
  out._coalesced_(sparse.is_coalesced());
  return out;
}
// Element-wise multiplication on MPS where at least one operand is sparse COO.
//
// If only one operand is sparse, the work is delegated to
// mul_out_dense_sparse_mps. Otherwise both operands are coalesced, their
// flattened index keys are intersected on the GPU, and the values of the
// matching entries are multiplied.
//
// Args:
//   t_:   first operand ("self").
//   src_: second operand ("other").
//   r_:   sparse output on MPS; resized to match the (coalesced) first operand.
// Returns:
//   `r_`.
// Raises (via TORCH_CHECK):
//   if `r_` (or, for sparse*sparse, either operand) is not on MPS, the sparse
//   dims or sizes differ, or the promoted dtype cannot be cast to r_'s dtype.
SparseTensor& mul_out_sparse_mps(const Tensor& t_, const Tensor& src_, SparseTensor& r_) {
  TORCH_CHECK(r_.is_mps(), "mul: expected 'out' to be MPS, but got ", r_.device());

  // Dense x sparse fallback (keep dense first)
  if (!t_.is_sparse() || !src_.is_sparse()) {
    const Tensor& dense = t_.is_sparse() ? src_ : t_;
    const Tensor& sparse = t_.is_sparse() ? t_ : src_;
    return mul_out_dense_sparse_mps(dense, sparse, r_);
  }

  TORCH_CHECK(t_.is_mps(), "mul: expected 'self' to be MPS, but got ", t_.device());
  TORCH_CHECK(src_.is_mps(), "mul: expected 'other' to be MPS, but got ", src_.device());
  TORCH_CHECK(t_.sparse_dim() == src_.sparse_dim(),
              "mul(sparse, sparse): must have same sparse_dim, got ",
              t_.sparse_dim(), " vs ", src_.sparse_dim());
  TORCH_CHECK(t_.sizes().equals(src_.sizes()),
              "mul(sparse, sparse): sizes must match exactly (no broadcasting).");

  // Coalesce and early-exit on structurally empty operands
  auto lhs = t_.coalesce();
  auto rhs = src_.coalesce();
  const int64_t lhs_nnz = lhs._nnz();
  const int64_t rhs_nnz = rhs._nnz();
  if (!lhs_nnz || !rhs_nnz) {
    r_.resize_as_(lhs);
    return r_.zero_();
  }

  // dtype checks and promotion
  auto commonDtype = at::result_type(lhs, rhs);
  TORCH_CHECK(canCast(commonDtype, r_.scalar_type()),
              "Can't convert result type ", commonDtype, " to output ", r_.scalar_type());

  const int64_t ndim_i = lhs.sparse_dim();

  // ndim_i == 0, at most one structural entry
  if (ndim_i == 0) {
    r_.resize_as_(lhs);
    // Both operands are non-empty here (the empty case returned above), so
    // exactly one entry is taken from each side.
    auto out_indices = lhs._indices().narrow(1, 0, 1);
    Tensor lhs_vals = lhs._values().to(commonDtype).narrow(0, 0, 1);
    Tensor rhs_vals = rhs._values().to(commonDtype).narrow(0, 0, 1);
    Tensor out_values = lhs_vals.mul(rhs_vals);
    if (r_.scalar_type() != commonDtype) {
      out_values = out_values.to(r_.scalar_type());
    }
    alias_into_sparse(r_, out_indices, out_values);
    r_._coalesced_(true);
    return r_;
  }

  // General path, intersect keys, then gather + multiply on GPU
  const auto device = r_.device();
  auto stream = getCurrentMPSStream();

  auto lhs_indices = lhs._indices();
  auto rhs_indices = rhs._indices();
  auto lhs_values = lhs._values().to(commonDtype);
  auto rhs_values = rhs._values().to(commonDtype);

  // Flatten sparse indices to keys
  auto lhs_keys = flatten_indices(lhs_indices, lhs.sizes());
  auto rhs_keys = flatten_indices(rhs_indices, rhs.sizes());

  // Intersect sorted keys (search the shorter in the longer)
  const bool A_is_lhs = (lhs_nnz <= rhs_nnz);
  const int64_t lenA = A_is_lhs ? lhs_nnz : rhs_nnz;
  const int64_t lenB = A_is_lhs ? rhs_nnz : lhs_nnz;
  auto A_keys = A_is_lhs ? lhs_keys : rhs_keys;
  auto B_keys = A_is_lhs ? rhs_keys : lhs_keys;

  // Each key of A yields at most one match, so lenA slots are sufficient.
  auto outA_idx = at::empty({lenA}, at::device(device).dtype(kLong));
  auto outB_idx = at::empty({lenA}, at::device(device).dtype(kLong));
  auto counter = at::zeros({1}, at::device(device).dtype(kInt));

  dispatch_sync_with_rethrow(stream->queue(), ^() {
    @autoreleasepool {
      auto pso = lib.getPipelineStateForFunc("intersect_binary_search");
      auto enc = stream->commandEncoder();
      [enc setComputePipelineState:pso];
      mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter,
                  static_cast<uint32_t>(lenB), A_is_lhs);
      mtl_dispatch1DJob(enc, pso, static_cast<uint32_t>(lenA));
    }
  });

  const uint32_t M = counter.item<int32_t>(); // number of structural matches

  r_.resize_as_(lhs);

  auto out_indices = at::empty({ndim_i, static_cast<int64_t>(M)}, at::device(device).dtype(at::kLong));
  auto lhs_match = outA_idx.narrow(0, 0, M);
  auto rhs_match = outB_idx.narrow(0, 0, M);
  auto out_val_sizes = lhs_values.sizes().vec();
  out_val_sizes[0] = static_cast<int64_t>(M);
  auto out_values = at::empty(out_val_sizes, lhs_values.options());

  // Number of value elements per stored entry.
  const uint32_t cols = static_cast<uint32_t>(
      lhs_values.numel() / std::max<int64_t>(1, lhs_nnz));

  // Nothing to gather when the index sets do not intersect; skipping the
  // launch avoids dispatching a grid with zero depth.
  if (M > 0) {
    dispatch_sync_with_rethrow(stream->queue(), ^() {
      @autoreleasepool {
        auto pso = lib.getPipelineStateForFunc(
            "fused_gather_mul_kernel_" + mps::scalarToMetalTypeString(lhs_values));
        auto enc = stream->commandEncoder();
        [enc setComputePipelineState:pso];

        const uint32_t tew = pso.threadExecutionWidth;
        uint32_t tgW = std::min(cols, tew);
        MTLSize grid = MTLSizeMake(cols, 1, M);
        MTLSize tgs = MTLSizeMake(tgW, 1, 1);

        mtl_setArgs(enc,
                    lhs_values, rhs_values,
                    lhs_match, rhs_match,
                    lhs_indices, out_indices,
                    out_values,
                    std::array<uint32_t, 2>{static_cast<uint32_t>(ndim_i), static_cast<uint32_t>(lhs_nnz)},
                    std::array<uint32_t, 2>{M, cols});
        [enc dispatchThreads:grid threadsPerThreadgroup:tgs];
      }
    });
  }

  if (r_.scalar_type() != commonDtype) {
    out_values = out_values.to(r_.scalar_type());
  }

  alias_into_sparse(r_, out_indices, out_values);
  // NOTE(review): the match order is determined by the atomic counter in
  // intersect_binary_search, so out_indices may not be in sorted order --
  // confirm that marking the result as coalesced is intended.
  r_._coalesced_(true);
  return r_;
}
static Tensor& add_out_dense_sparse_mps(
Tensor& add_out_dense_sparse_mps(
Tensor& out,
const Tensor& dense,
const SparseTensor& sparse,

View File

@ -1,150 +0,0 @@
#include <metal_stdlib>
#include <c10/metal/indexing.h>
using namespace metal;
// Multiplies each stored value of a sparse COO tensor by the matching element
// of a dense tensor. One thread handles one (column, stored entry) pair:
// gid.x selects the column, gid.z selects the stored entry.
//
// sparse_params packs (nnz, number of sparse dims, columns per entry).
template <typename T>
kernel void dense_sparse_mul_kernel(
    device const T* dense [[buffer(0)]],
    device const T* values [[buffer(1)]],
    device T* out_values [[buffer(2)]],
    device const long* indices [[buffer(3)]],
    device const long* sizes [[buffer(4)]],
    constant uint3& sparse_params [[buffer(5)]],
    uint3 gid [[thread_position_in_grid]])
{
  const uint column = gid.x;
  const uint entry = gid.z;

  const uint entry_count = sparse_params.x;
  const uint sparse_dims = sparse_params.y;
  const uint row_width = sparse_params.z;

  // Fold the entry's multi-dimensional sparse index into a single row number
  // of the dense operand. indices is read as [dim][entry].
  long row = 0;
  for (uint dim = 0; dim < sparse_dims; ++dim) {
    const long coord = indices[(ulong)dim * (ulong)entry_count + (ulong)entry];
    row = row * sizes[dim] + coord;
  }

  const ulong dense_offset = (ulong)row * (ulong)row_width + (ulong)column;
  const ulong value_offset = (ulong)entry * (ulong)row_width + (ulong)column;

  // The product is formed in float and converted back to T on store.
  const float lhs = static_cast<float>(values[value_offset]);
  const float rhs = static_cast<float>(dense[dense_offset]);
  out_values[value_offset] = static_cast<T>(lhs * rhs);
}
// Finds the keys of A that also occur in B. Each thread takes one key of A
// and binary-searches it in B (B is searched with a lower-bound search, so it
// is expected to be sorted ascending). Every hit reserves an output slot via
// an atomic counter and records the pair of positions.
//
//   keysA / keysB : flattened index keys of the two operands
//   outA_idx      : position in the LHS operand for each match
//   outB_idx      : position in the RHS operand for each match
//   counter       : running number of matches (slot allocator)
//   lenB          : number of keys in B
//   A_is_lhs      : true if A is the LHS operand; otherwise the two positions
//                   are swapped so outA_idx always refers to the LHS
//
// Slots are handed out in the order threads reach the atomic add, so the
// order of the recorded matches is not fixed.
kernel void intersect_binary_search(
    device const long* keysA [[buffer(0)]],
    device const long* keysB [[buffer(1)]],
    device long* outA_idx [[buffer(2)]],
    device long* outB_idx [[buffer(3)]],
    device atomic_uint* counter [[buffer(4)]],
    constant uint& lenB [[buffer(5)]],
    constant bool& A_is_lhs [[buffer(6)]],
    uint3 tid_in_grid [[thread_position_in_grid]])
{
  uint gid = tid_in_grid.x;
  long key = keysA[gid];

  // lower_bound in B
  uint lo = 0;
  uint hi = lenB;
  while (lo < hi) {
    // Midpoint computed as an offset from lo: (lo + hi) could wrap around in
    // 32-bit unsigned arithmetic when lenB is large.
    uint mid = lo + ((hi - lo) >> 1);
    long v = keysB[mid];
    if (v < key) lo = mid + 1;
    else hi = mid;
  }

  if (lo < lenB && keysB[lo] == key) {
    uint pos = atomic_fetch_add_explicit(counter, 1u, memory_order_relaxed);
    if (A_is_lhs) {
      outA_idx[pos] = (long)gid;
      outB_idx[pos] = (long)lo;
    } else {
      outA_idx[pos] = (long)lo;
      outB_idx[pos] = (long)gid;
    }
  }
}
// For every matched pair of entries, multiplies the corresponding value rows
// of the two operands and copies the index column of the LHS entry to the
// output. gid.x selects the column within a value row, gid.z selects the match.
//
// dims_input  = (number of sparse dims, number of LHS entries)
// dims_output = (number of matches, columns per value row)
template <typename T>
kernel void fused_gather_mul_kernel(
    device const T* lhs_vals [[buffer(0)]],
    device const T* rhs_vals [[buffer(1)]],
    device const long* lhs_sel [[buffer(2)]],
    device const long* rhs_sel [[buffer(3)]],
    device const long* lhs_indices [[buffer(4)]],
    device long* out_indices [[buffer(5)]],
    device T* out_vals [[buffer(6)]],
    constant uint2& dims_input [[buffer(7)]],
    constant uint2& dims_output [[buffer(8)]],
    uint3 gid [[thread_position_in_grid]])
{
  const uint column = gid.x;
  const uint match = gid.z;

  const uint sparse_dims = dims_input.x;
  const uint lhs_count = dims_input.y;
  const uint match_count = dims_output.x;
  const uint row_width = dims_output.y;

  // Positions of the matched entries in each operand.
  const long lhs_row = lhs_sel[match];
  const long rhs_row = rhs_sel[match];

  if (column < row_width) {
    const ulong lhs_offset = (ulong)lhs_row * (ulong)row_width + (ulong)column;
    const ulong rhs_offset = (ulong)rhs_row * (ulong)row_width + (ulong)column;
    const ulong out_offset = (ulong)match * (ulong)row_width + (ulong)column;

    // The product is formed in float and converted back to T on store.
    const float lhs = (float)lhs_vals[lhs_offset];
    const float rhs = (float)rhs_vals[rhs_offset];
    out_vals[out_offset] = (T)(lhs * rhs);
  }

  // Only the first thread of each match copies the index column.
  if (column != 0) {
    return;
  }

  // Gather from lhs: indices are read as [dim][entry] and written as [dim][match].
  for (uint dim = 0; dim < sparse_dims; ++dim) {
    const ulong src = (ulong)dim * (ulong)lhs_count + (ulong)lhs_row;
    const ulong dst = (ulong)dim * (ulong)match_count + (ulong)match;
    out_indices[dst] = lhs_indices[src];
  }
}
// Explicitly instantiates dense_sparse_mul_kernel for DTYPE and exports it
// under the host-visible name "dense_sparse_mul_kernel_<DTYPE>".
#define INSTANTIATE_DENSE_SPARSE_MUL(DTYPE) \
template [[host_name("dense_sparse_mul_kernel_" #DTYPE)]] kernel void \
dense_sparse_mul_kernel<DTYPE>( \
device const DTYPE* dense [[buffer(0)]], \
device const DTYPE* values [[buffer(1)]], \
device DTYPE* out_values [[buffer(2)]], \
device const long* indices [[buffer(3)]], \
device const long* sizes [[buffer(4)]], \
constant uint3& sparse_params [[buffer(5)]], \
uint3 gid [[thread_position_in_grid]]);
// Value types for which the dense * sparse kernel is instantiated.
INSTANTIATE_DENSE_SPARSE_MUL(float);
INSTANTIATE_DENSE_SPARSE_MUL(half);
INSTANTIATE_DENSE_SPARSE_MUL(bfloat);
// Explicitly instantiates fused_gather_mul_kernel for DTYPE and exports it
// under the host-visible name "fused_gather_mul_kernel_<DTYPE>".
#define INSTANTIATE_FUSED_GATHER_MUL(DTYPE) \
template [[host_name("fused_gather_mul_kernel_" #DTYPE)]] kernel void \
fused_gather_mul_kernel<DTYPE>( \
device const DTYPE* lhs_vals [[buffer(0)]], \
device const DTYPE* rhs_vals [[buffer(1)]], \
device const long* lhs_sel [[buffer(2)]], \
device const long* rhs_sel [[buffer(3)]], \
device const long* lhs_indices [[buffer(4)]], \
device long* out_indices [[buffer(5)]], \
device DTYPE* out_vals [[buffer(6)]], \
constant uint2& dims_input [[buffer(7)]], \
constant uint2& dims_output [[buffer(8)]], \
uint3 gid [[thread_position_in_grid]]);
// Value types for which the sparse * sparse gather/multiply kernel is instantiated.
INSTANTIATE_FUSED_GATHER_MUL(float);
INSTANTIATE_FUSED_GATHER_MUL(half);
INSTANTIATE_FUSED_GATHER_MUL(bfloat);

View File

@ -95,72 +95,6 @@
#endif
#endif
#if defined(USE_ROCM) && (defined(USE_FLASH_ATTENTION) || defined(USE_MEM_EFF_ATTENTION))
namespace pytorch_flash {

// Forward flash-attention entry point that picks a backend at run time.
//
// When built with USE_ROCM_CK_SDPA and the global context reports Ck as the
// preferred ROCm flash-attention backend, the call goes to mha_fwd_ck;
// in every other case it goes to mha_fwd_aot.
//
// Notes grounded in this function:
//   * `softcap` is accepted but not forwarded to either backend.
//   * `alibi_slopes_` is forwarded only to mha_fwd_aot.
//   * For the CK path, empty window sizes are forwarded as -1.
std::tuple<
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor>
mha_fwd(
    const at::Tensor& q, // batch_size x seqlen_q x num_heads x head_size
    const at::Tensor& k, // batch_size x seqlen_k x num_heads_k x head_size
    const at::Tensor& v, // batch_size x seqlen_k x num_heads_k x head_size
    std::optional<at::Tensor>&
        out_, // batch_size x seqlen_q x num_heads x head_size
    std::optional<at::Tensor>&
        alibi_slopes_, // num_heads or batch_size x num_heads
    const float p_dropout,
    const float softmax_scale,
    bool is_causal,
    std::optional<int64_t> window_size_left,
    std::optional<int64_t> window_size_right,
    const float softcap,
    const bool return_softmax,
    std::optional<at::Generator> gen_) {
#if defined(USE_ROCM_CK_SDPA)
  const bool prefer_ck =
      at::globalContext().getROCmFAPreferredBackend() == at::ROCmFABackend::Ck;
  if (prefer_ck) {
    // The CK entry point takes plain ints for the window bounds.
    const int ck_window_left = window_size_left.value_or(-1);
    const int ck_window_right = window_size_right.value_or(-1);
    // Not used in flash attention; passed as an empty placeholder.
    std::optional<at::Tensor> no_attn_bias = std::nullopt;
    return mha_fwd_ck(
        q,
        k,
        v,
        out_,
        p_dropout,
        softmax_scale,
        is_causal,
        ck_window_left,
        ck_window_right,
        return_softmax,
        gen_,
        no_attn_bias);
  }
#endif
  return mha_fwd_aot(
      q,
      k,
      v,
      out_,
      alibi_slopes_,
      p_dropout,
      softmax_scale,
      is_causal,
      window_size_left,
      window_size_right,
      return_softmax,
      gen_);
}

} // namespace pytorch_flash
#endif
namespace at {
namespace cuda::philox {

View File

@ -270,7 +270,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> mha_varle
#endif
TORCH_API
std::tuple<
inline std::tuple<
at::Tensor,
at::Tensor,
at::Tensor,
@ -294,7 +294,42 @@ mha_fwd(
std::optional<int64_t> window_size_right,
const float softcap,
const bool return_softmax,
std::optional<at::Generator> gen_);
std::optional<at::Generator> gen_) {
#if defined(USE_ROCM_CK_SDPA)
if (at::globalContext().getROCmFAPreferredBackend() ==
at::ROCmFABackend::Ck) {
const int non_null_window_left = window_size_left.value_or(-1);
const int non_null_window_right = window_size_right.value_or(-1);
std::optional<at::Tensor> dummy_attn_bias = std::nullopt;
return mha_fwd_ck(
q,
k,
v,
out_,
p_dropout,
softmax_scale,
is_causal,
non_null_window_left,
non_null_window_right,
return_softmax,
gen_,
dummy_attn_bias); // Not used in flash attention
}
#endif
return mha_fwd_aot(
q,
k,
v,
out_,
alibi_slopes_,
p_dropout,
softmax_scale,
is_causal,
window_size_left,
window_size_right,
return_softmax,
gen_);
}
inline std::tuple<
at::Tensor,

View File

@ -72,12 +72,6 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"timm_vovnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)

View File

@ -55,12 +55,6 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
"timm_nfnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)

View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

1 name accuracy graph_breaks
171
172
173

View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

1 name accuracy graph_breaks
171
172
173

View File

@ -167,23 +167,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_accuracy,0
openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0

1 name accuracy graph_breaks
167
168
169

View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0

1 name accuracy graph_breaks
171
172
173

View File

@ -98,11 +98,11 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,5
doctr_reco_predictor,pass,1
doctr_reco_predictor,pass,4

1 name accuracy graph_breaks
98
99
100
101
102
103
104
105
106
107
108

Some files were not shown because too many files have changed in this diff Show More