mirror of https://github.com/pytorch/pytorch.git
synced 2025-10-24 23:54:56 +08:00

Compare commits (1 commit): codex-test ... mlazos/dc-

| Author | SHA1 | Date |
|---|---|---|
| | a87ac298a5 | |
@@ -144,6 +144,16 @@ case "$tag" in
     TRITON=yes
     INDUCTOR_BENCHMARKS=yes
     ;;
+  pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
+    CUDA_VERSION=12.6.3
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    TRITON=yes
+    ;;
   pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
     CUDA_VERSION=12.8.1
     ANACONDA_PYTHON_VERSION=3.12
@@ -154,6 +164,39 @@ case "$tag" in
     UCC_COMMIT=${_UCC_COMMIT}
     TRITON=yes
     ;;
+  pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.6
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
+  pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.6
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=9
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
+  pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.6
+    ANACONDA_PYTHON_VERSION=3.13
+    GCC_VERSION=9
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
   pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
     CUDA_VERSION=12.8.1
     ANACONDA_PYTHON_VERSION=3.10
@@ -176,6 +219,18 @@ case "$tag" in
     VISION=yes
     TRITON=yes
     ;;
+  pytorch-linux-jammy-py3.11-clang12)
+    ANACONDA_PYTHON_VERSION=3.11
+    CLANG_VERSION=12
+    VISION=yes
+    TRITON=yes
+    ;;
+  pytorch-linux-jammy-py3.9-gcc9)
+    ANACONDA_PYTHON_VERSION=3.9
+    GCC_VERSION=9
+    VISION=yes
+    TRITON=yes
+    ;;
   pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
     if [[ $tag =~ "jammy" ]]; then
       ANACONDA_PYTHON_VERSION=3.10
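These hunks extend the CI image build script's tag dispatch: each `case "$tag"` arm sets the build knobs (CUDA version, Python version, compiler, Triton, benchmark deps) for one Docker image. A minimal sketch of the pattern, with a hypothetical `build_image` consumer standing in for the rest of the script:

    #!/usr/bin/env bash
    # Sketch of the tag -> build-knob dispatch; build_image is hypothetical.
    tag="pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks"

    case "$tag" in
      pytorch-linux-jammy-cuda12.6-*-inductor-benchmarks)
        CUDA_VERSION=12.6
        TRITON=yes
        INDUCTOR_BENCHMARKS=yes
        ;;
      *)
        echo "unknown tag: $tag" >&2
        exit 1
        ;;
    esac

    # Each knob typically becomes a --build-arg for docker build.
    build_image \
      --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
      --build-arg "TRITON=${TRITON:-}" \
      --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS:-}"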
@@ -1 +1 @@
-f7888497a1eb9e98d4c07537f0d0bcfe180d1363
+11ec6354315768a85da41032535e3b7b99c5f706
@@ -68,8 +68,8 @@ function install_nvshmem {
   # download, unpack, install
   wget -q "${url}"
   tar xf "${filename}.tar.gz"
-  cp -a "libnvshmem/include/"* /usr/local/cuda/include/
-  cp -a "libnvshmem/lib/"* /usr/local/cuda/lib64/
+  cp -a "libnvshmem/include/"* /usr/local/include/
+  cp -a "libnvshmem/lib/"* /usr/local/lib/

   # cleanup
   cd ..
@@ -15,37 +15,11 @@ function install_timm() {
   commit=$(get_pinned_commit timm)

   pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
 }

-function install_torchbench() {
-  local commit
-  commit=$(get_pinned_commit torchbench)
-  git clone https://github.com/pytorch/benchmark torchbench
-  pushd torchbench
-  git checkout "$commit"
-
-  python install.py --continue_on_fail
-
-  # TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
-  # is regressing speedup metric. This needs to be investigated further
-  pip install transformers==4.38.1
-
-  echo "Print all dependencies after TorchBench is installed"
-  python -mpip freeze
-  popd
-
-  chown -R jenkins torchbench
-  # Clean up
-  conda_run pip uninstall -y torch torchvision triton
-}
-
 # Pango is needed for weasyprint which is needed for doctr
 conda_install pango

 # Stable packages are ok here, just to satisfy TorchBench check
-pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

-install_torchbench
 install_huggingface
 install_timm

 # Clean up
 conda_run pip uninstall -y torch torchvision torchaudio triton
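For reference, `get_pinned_commit` and `pip_install` come from the CI's shared shell helpers; a minimal sketch of what they amount to (pin-file location taken from the macos-test.sh hunk below, retry behavior illustrative):

    # Assumed helper shapes, not verbatim from this diff.
    get_pinned_commit() {
      # Each dependency is pinned to the exact SHA stored in a one-line file.
      cat ".ci/docker/ci_commit_pins/${1}.txt"
    }

    pip_install() {
      # Retry once to smooth over transient network failures.
      python3 -m pip install --progress-bar off "$@" ||
        python3 -m pip install --progress-bar off "$@"
    }

    commit=$(get_pinned_commit timm)
    pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"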
@@ -221,9 +221,9 @@ pygments==2.15.0
 #Pinned versions: 2.12.0
 #test that import: the doctests

-#pyyaml
+#PyYAML
 #Description: data serialization format
-#Pinned versions: 6.0.2
+#Pinned versions:
 #test that import:

 #requests
@@ -233,7 +233,7 @@ pygments==2.15.0

 #rich
 #Description: rich text and beautiful formatting in the terminal
-#Pinned versions: 14.1.0
+#Pinned versions: 10.9.0
 #test that import:

 scikit-image==0.19.3 ; python_version < "3.10"
@@ -361,6 +361,7 @@ pwlf==2.2.1
 #Pinned versions: 2.2.1
 #test that import: test_sac_estimator.py


 # To build PyTorch itself
+pyyaml
 pyzstd
@@ -1,7 +1,7 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2

 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
@@ -50,8 +50,8 @@ IPython==8.12.0
 #Pinned versions: 8.12.0

 myst-nb==0.17.2
-#Description: This is used to generate PyTorch functorch and torch.compile docs.
-#Pinned versions: 0.17.2
+#Description: This is used to generate PyTorch functorch docs
+#Pinned versions: 0.13.2

 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
 python-etcd==0.4.5
@@ -59,3 +59,4 @@ sphinx-copybutton==0.5.0
 sphinx-design==0.4.0
 sphinxcontrib-mermaid==1.0.0
 myst-parser==0.18.1
+myst-nb
@@ -98,9 +98,8 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface.txt huggingface.txt
 COPY ci_commit_pins/timm.txt timm.txt
-COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
@@ -98,9 +98,8 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface.txt huggingface.txt
 COPY ci_commit_pins/timm.txt timm.txt
-COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

 ARG TRITON
 ARG TRITON_CPU
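Both Dockerfile hunks follow the same optional-dependency pattern: COPY the scripts and pin files in, run them only when the build arg is set, then delete them so they do not linger in the image. The RUN guard itself is ordinary shell; a standalone sketch:

    # [ -n STR ] is true iff STR is non-empty, so the deps install only
    # when the INDUCTOR_BENCHMARKS build arg was passed to docker build.
    INDUCTOR_BENCHMARKS=yes
    if [ -n "${INDUCTOR_BENCHMARKS}" ]; then
      bash ./install_inductor_benchmark_deps.sh
    fi
    # Every file that was COPY'd in must also be rm'd, which is why dropping
    # torchbench.txt from the COPY list also drops it from the rm list.
    rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt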
@@ -194,7 +194,7 @@ ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library
 ROCBLAS_LIB_DST=lib/rocblas/library
 ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
 ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
-ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $ROCBLAS_OTHER_FILES)
+ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES)

 # hipblaslt library files
 HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library
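The two variants differ only in the second array element: `$ROCBLAS_OTHER_FILES`, defined two lines above, versus `$OTHER_FILES`, which is not defined in this hunk. Assuming no `set -u`, an unset name expands to nothing, so the non-gfx files silently drop out of the copy list:

    # Quick bash check of the difference (names from the hunk):
    ROCBLAS_ARCH_SPECIFIC_FILES="a.dat b.dat"
    ROCBLAS_OTHER_FILES="common.dat"
    ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES)
    echo "${#ROCBLAS_LIB_FILES[@]}"   # prints 2; with ROCBLAS_OTHER_FILES it would print 3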
@@ -50,9 +50,6 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
   export ATEN_THREADING=NATIVE
 fi

-# Enable LLVM dependency for TensorExpr testing
-export USE_LLVM=/opt/llvm
-export LLVM_DIR=/opt/llvm/lib/cmake/llvm

 if ! which conda; then
   # In ROCm CIs, we are doing cross compilation on build machines with
@@ -192,7 +189,6 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
   export USE_ASAN=1
   export REL_WITH_DEB_INFO=1
   export UBSAN_FLAGS="-fno-sanitize-recover=all"
-  unset USE_LLVM
 fi

 if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then
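USE_LLVM and LLVM_DIR point PyTorch's CMake at an LLVM toolchain for the TensorExpr tests, and the ASan branch then opts back out. Condensed from the two hunks:

    # LLVM is enabled globally for TensorExpr testing...
    export USE_LLVM=/opt/llvm
    export LLVM_DIR=/opt/llvm/lib/cmake/llvm

    # ...and disabled again for clang/ASan builds.
    if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
      export USE_ASAN=1
      unset USE_LLVM
    fi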
@@ -229,6 +229,7 @@ function install_torchrec_and_fbgemm() {

   pip_install tabulate  # needed for newer fbgemm
+  pip_install patchelf  # needed for rocm fbgemm
   pushd /tmp

   local wheel_dir=dist/fbgemm_gpu
   local found_whl=0
@@ -244,7 +245,7 @@ function install_torchrec_and_fbgemm() {
   if [ "${found_whl}" == "0" ]; then
     git clone --recursive https://github.com/pytorch/fbgemm
     pushd fbgemm/fbgemm_gpu
-    git checkout "${fbgemm_commit}" --recurse-submodules
+    git checkout "${fbgemm_commit}"
     python setup.py bdist_wheel \
       --build-variant=rocm \
       -DHIP_ROOT_DIR="${ROCM_PATH}" \
@@ -263,6 +264,7 @@ function install_torchrec_and_fbgemm() {
   done

   rm -rf fbgemm
+  popd
 else
   pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
   pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
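The surrounding function uses a cache-or-build pattern: look for a previously built fbgemm_gpu wheel and only clone and compile when none exists. A condensed sketch of that control flow (wheel directory as in the diff, loop body illustrative):

    install_fbgemm_cached() {
      local wheel_dir=dist/fbgemm_gpu
      local found_whl=0
      # Probe the cache directory for any prebuilt wheel.
      for file in "${wheel_dir}"/*.whl; do
        [ -f "${file}" ] && found_whl=1 && break
      done
      if [ "${found_whl}" == "0" ]; then
        # Cache miss: build the wheel from source.
        git clone --recursive https://github.com/pytorch/fbgemm
        pushd fbgemm/fbgemm_gpu && python setup.py bdist_wheel && popd
      fi
      pip install "${wheel_dir}"/*.whl
    }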
@@ -281,6 +283,30 @@ function clone_pytorch_xla() {
   fi
 }

+function checkout_install_torchbench() {
+  local commit
+  commit=$(get_pinned_commit torchbench)
+  git clone https://github.com/pytorch/benchmark torchbench
+  pushd torchbench
+  git checkout "$commit"
+
+  if [ "$1" ]; then
+    python install.py --continue_on_fail models "$@"
+  else
+    # Occasionally the installation may fail on one model but it is ok to continue
+    # to install and test other models
+    python install.py --continue_on_fail
+  fi
+
+  # TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
+  # is regressing speedup metric. This needs to be investigated further
+  pip install transformers==4.38.1
+
+  echo "Print all dependencies after TorchBench is installed"
+  python -mpip freeze
+  popd
+}
+
 function install_torchao() {
   local commit
   commit=$(get_pinned_commit torchao)
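The `[ "$1" ]` test switches between installing a caller-supplied model list and installing every model; the test.sh hunks later in this diff call it both ways:

    # Install only the models a particular smoke test needs...
    checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
    # ...or install everything, continuing past per-model failures:
    checkout_install_torchbench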
@@ -157,29 +157,6 @@ test_jit_hooks() {
   assert_git_not_dirty
 }

-# Shellcheck doesn't like it when you pass no arguments to a function
-# that can take args. See https://www.shellcheck.net/wiki/SC2120
-# shellcheck disable=SC2120
-checkout_install_torchbench() {
-  local commit
-  commit=$(cat .ci/docker/ci_commit_pins/torchbench.txt)
-  git clone https://github.com/pytorch/benchmark torchbench
-  pushd torchbench
-  git checkout "$commit"
-
-  if [ "$1" ]; then
-    python install.py --continue_on_fail models "$@"
-  else
-    # Occasionally the installation may fail on one model but it is ok to continue
-    # to install and test other models
-    python install.py --continue_on_fail
-  fi
-
-  echo "Print all dependencies after TorchBench is installed"
-  python -mpip freeze
-  popd
-}
-
 torchbench_setup_macos() {
   git clone --recursive https://github.com/pytorch/vision torchvision
   git clone --recursive https://github.com/pytorch/audio torchaudio
@@ -202,6 +179,8 @@ torchbench_setup_macos() {
   USE_OPENMP=0 python setup.py develop
   popd

+  # Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
+  # shellcheck disable=SC2119,SC2120
   checkout_install_torchbench
 }
@@ -462,7 +462,7 @@ test_inductor_aoti() {
   # rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
   /usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"

-  /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
+  /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
 }

 test_inductor_cpp_wrapper_shard() {
@@ -627,8 +627,6 @@ test_perf_for_dashboard() {
     device=cuda_a10g
   elif [[ "${TEST_CONFIG}" == *h100* ]]; then
     device=cuda_h100
-  elif [[ "${TEST_CONFIG}" == *b200* ]]; then
-    device=cuda_b200
   elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
     device=rocm
   fi
@@ -803,16 +801,6 @@ test_dynamo_benchmark() {
   if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
     test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
   elif [[ "${TEST_CONFIG}" == *perf* ]]; then
-    # TODO (huydhn): Just smoke test some sample models
-    if [[ "${TEST_CONFIG}" == *b200* ]]; then
-      if [[ "${suite}" == "huggingface" ]]; then
-        export TORCHBENCH_ONLY_MODELS="DistillGPT2"
-      elif [[ "${suite}" == "timm_models" ]]; then
-        export TORCHBENCH_ONLY_MODELS="inception_v3"
-      elif [[ "${suite}" == "torchbench" ]]; then
-        export TORCHBENCH_ONLY_MODELS="hf_Bert"
-      fi
-    fi
     test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
   else
     if [[ "${TEST_CONFIG}" == *cpu* ]]; then
@@ -1051,20 +1039,10 @@ test_libtorch_api() {
     mkdir -p $TEST_REPORTS_DIR

     OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
+    "$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
   else
     # Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
     OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
-
-    # On s390x, pytorch is built without llvm.
-    # Even if it would be built with llvm, llvm currently doesn't support used features on s390x and
-    # test fails with errors like:
-    # JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
-    # unknown file: Failure
-    # C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
-    if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
-      python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
-    fi
   fi

   # quantization is not fully supported on s390x yet
@@ -1684,11 +1662,13 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
 elif [[ "${TEST_CONFIG}" == cachebench ]]; then
   install_torchaudio
   install_torchvision
-  PYTHONPATH=/torchbench test_cachebench
+  checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco
+  PYTHONPATH=$(pwd)/torchbench test_cachebench
 elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
   install_torchaudio
   install_torchvision
-  PYTHONPATH=/torchbench test_verify_cachebench
+  checkout_install_torchbench nanogpt
+  PYTHONPATH=$(pwd)/torchbench test_verify_cachebench
 elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
   install_torchaudio
   install_torchvision
@@ -1697,22 +1677,28 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
   # https://github.com/opencv/opencv-python/issues/885
   pip_install opencv-python==4.8.0.74
   if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
-    PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf
+    checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
+    PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
   elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
-    PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf
+    checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \
+      llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
+      functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0
+    PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
   elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
-    TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest
+    checkout_install_torchbench
+    TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest
   else
+    checkout_install_torchbench
     # Do this after checkout_install_torchbench to ensure we clobber any
     # nightlies that torchbench may pull in
     if [[ "${TEST_CONFIG}" != *cpu* ]]; then
       install_torchrec_and_fbgemm
     fi
-    PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
+    PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
   fi
 elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
   install_torchvision
-  PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
+  PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
   if [[ "$SHARD_NUMBER" -eq "1" ]]; then
     test_inductor_aoti
   fi
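The recurring substitution in these test.sh hunks swaps a TorchBench tree baked into the Docker image at /torchbench for one cloned into the working directory at test time, so PYTHONPATH has to follow. Side by side:

    # TorchBench baked into the image (one side of the diff):
    PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"

    # TorchBench cloned during the test job (the other side):
    checkout_install_torchbench
    PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"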
.flake8 (2 changes)
@@ -7,7 +7,7 @@ max-line-length = 120
 # C408 ignored because we like the dict keyword argument syntax
 # E501 is not flexible enough, we're using B950 instead
 ignore =
-    E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
+    E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,
     # shebang has extra meaning in fbcode lints, so I think it's not worth trying
     # to line this up with executable bit
     EXE001,
.github/actionlint.yaml (6 changes, vendored)
@@ -53,9 +53,9 @@ self-hosted-runner:
   - linux.rocm.gpu.mi250
   - linux.rocm.gpu.2
   - linux.rocm.gpu.4
-  # gfx942 runners
-  - linux.rocm.gpu.gfx942.2
-  - linux.rocm.gpu.gfx942.4
+  # MI300 runners
+  - linux.rocm.gpu.mi300.2
+  - linux.rocm.gpu.mi300.4
   - rocm-docker
   # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
   - macos-m1-stable
@@ -70,7 +70,7 @@ runs:
       set -eux
       # PyYAML 6.0 doesn't work with MacOS x86 anymore
       # This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2
-      python3 -m pip install requests==2.27.1 pyyaml==6.0.2
+      python3 -m pip install requests==2.27.1 pyyaml==6.0.1

   - name: Parse ref
     id: parse-ref
.github/ci_commit_pins/audio.txt (2 changes, vendored)
@@ -1 +1 @@
-6fbc710b617f79b992ef2ebc7f95e818aa390293
+f6dfe1231dcdd221a68416e49ab85c2575cbb824

.github/ci_commit_pins/vllm.txt (2 changes, vendored)
@@ -1 +1 @@
-6a39ba85fe0f2fff9494b5eccea717c93510c230
+8f605ee30912541126c0fe46d0c8c413101b600a

.github/ci_commit_pins/xla.txt (2 changes, vendored)
@@ -1 +1 @@
-b6a5b82b9948b610fa4c304d0d869c82b8f17db1
+29ae4c76c026185f417a25e841d2cd5e65f087a3
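Each of these files is a one-line pin of a sibling repo to an exact commit; CI consumes them by cloning the repo and checking out that SHA. Illustratively (clone URL assumed from the repo name):

    commit=$(cat .github/ci_commit_pins/xla.txt)
    git clone https://github.com/pytorch/xla.git
    git -C xla checkout "$commit"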
.github/merge_rules.yaml (4 changes, vendored)
@@ -488,10 +488,6 @@
   - torch/_dynamo/**
   - torch/csrc/dynamo/**
   - test/dynamo/**
-  - test/dynamo_expected_failures/**
-  - test/dynamo_skips/**
-  - test/inductor_expected_failures/**
-  - test/inductor_skips/**
   approved_by:
   - guilhermeleobas
   mandatory_checks_name:
.github/requirements-gha-cache.txt (6 changes, vendored)
@@ -7,9 +7,9 @@
 # .ci/docker/requirements-ci.txt
 boto3==1.35.42
 jinja2==3.1.6
-lintrunner==0.12.7
+lintrunner==0.10.7
 ninja==1.10.0.post1
 nvidia-ml-py==11.525.84
-pyyaml==6.0.2
+pyyaml==6.0
 requests==2.32.4
-rich==14.1.0
+rich==10.9.0
@@ -2,7 +2,7 @@ boto3==1.35.42
 cmake==3.27.*
 expecttest==0.3.0
 fbscribelogger==0.1.7
-filelock==3.18.0
+filelock==3.6.0
 hypothesis==6.56.4
 librosa>=0.6.2
 mpmath==1.3.0
@@ -193,7 +193,7 @@ LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
     "cpu": "libtorch-cxx11-builder:cpu",
 }

-FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
+FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]


 def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
@@ -315,11 +315,6 @@ def generate_wheels_matrix(
         # TODO: Enable python 3.13t on cpu-s390x
         if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
             continue
-        # TODO: Enable python 3.14 on non linux OSes
-        if os != "linux" and (
-            python_version == "3.14" or python_version == "3.14t"
-        ):
-            continue

         if use_split_build and (
             arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux"
.github/scripts/lintrunner.sh (2 changes, vendored)
@@ -2,7 +2,7 @@
 set -ex

 # Use uv to speed up lintrunner init
-python3 -m pip install -U uv==0.8.* setuptools
+python3 -m pip install uv==0.1.45 setuptools

 CACHE_DIRECTORY="/tmp/.lintbin"
 # Try to recover the cached binaries
.github/scripts/trymerge.py (4 changes, vendored)
@@ -1891,9 +1891,7 @@ def validate_revert(
         else pr.get_comment_by_id(comment_id)
     )
     if comment.editor_login is not None:
-        raise PostCommentError(
-            "Halting the revert as the revert comment has been edited."
-        )
+        raise PostCommentError("Don't want to revert based on edited command")
     author_association = comment.author_association
     author_login = comment.author_login
     allowed_reverters = ["COLLABORATOR", "MEMBER", "OWNER"]
.github/workflows/_linux-test.yml (20 changes, vendored)
@@ -96,7 +96,7 @@ jobs:
     steps:
       - name: Setup SSH (Click me for login details)
         uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
+        if: ${{ matrix.runner != 'B200' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
           instructions: |
@@ -109,7 +109,7 @@ jobs:
           no-sudo: true

       - name: Setup Python
-        if: contains(matrix.runner, 'b200')
+        if: matrix.runner == 'B200'
         uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
         with:
           python-version: '3.12'
@@ -117,7 +117,7 @@ jobs:

       - name: Setup Linux
         uses: ./.github/actions/setup-linux
-        if: inputs.build-environment != 'linux-s390x-binary-manywheel' && !contains(matrix.runner, 'b200')
+        if: inputs.build-environment != 'linux-s390x-binary-manywheel' && matrix.runner != 'B200'

       - name: configure aws credentials
         if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
@@ -128,7 +128,7 @@ jobs:
           aws-region: us-east-1

       - name: Login to Amazon ECR
-        if: ${{ inputs.aws-role-to-assume != '' && contains(matrix.runner, 'b200') }}
+        if: ${{ inputs.aws-role-to-assume != '' && matrix.runner == 'B200' }}
         id: login-ecr
         continue-on-error: true
         uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
@@ -166,17 +166,17 @@ jobs:
         uses: pytorch/test-infra/.github/actions/setup-nvidia@main
         with:
           driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
-        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
+        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && matrix.runner != 'B200' }}

       - name: Setup GPU_FLAG for docker run
         id: setup-gpu-flag
         run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
-        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || contains(matrix.runner, 'b200')) }}
+        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || matrix.runner == 'B200') }}

       - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
         id: setup-sscache-port-flag
         run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
-        if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && !contains(matrix.runner, 'b200') }}
+        if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && matrix.runner != 'B200' }}

       - name: Lock NVIDIA A100 40GB Frequency
         run: |
@@ -277,8 +277,8 @@ jobs:
           NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
           TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
           # Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
-          SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
-          SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
+          SCCACHE_BUCKET: ${{ matrix.runner != 'B200' && 'ossci-compiler-cache-circleci-v2' || '' }}
+          SCCACHE_REGION: ${{ matrix.runner != 'B200' && 'us-east-1' || '' }}
           SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
           DOCKER_IMAGE: ${{ inputs.docker-image }}
           XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
@@ -403,7 +403,7 @@ jobs:
           job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

       - name: Authenticate with AWS
-        if: ${{ contains(matrix.runner, 'b200') }}
+        if: ${{ matrix.runner == 'B200' }}
         uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
         with:
           role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
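One side of this file keys every B200-specific step off exact runner equality, the other off a substring test; `contains()` also matches prefixed labels such as linux.dgx.b200, and GitHub Actions expression comparisons are case-insensitive, so 'b200' still matches 'B200'. A bash analogue of the two predicates, for intuition only (bash patterns, unlike Actions expressions, are case-sensitive, hence the explicit lowercasing):

    runner="linux.dgx.b200"
    if [[ "${runner,,}" == *b200* ]]; then echo "contains: match"; fi   # substring test
    if [[ "${runner}" == "B200" ]]; then echo "equality: match"; fi     # exact label only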
.github/workflows/_rocm-test.yml (4 changes, vendored)
@@ -269,8 +269,8 @@ jobs:
         # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
         docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"

-    - name: Change permissions (only needed for kubernetes runners for now)
-      if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'gfx942') || contains(matrix.runner, 'mi355')) }}
+    - name: Change permissions (only needed for MI300 and MI355 kubernetes runners for now)
+      if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'mi300') || contains(matrix.runner, 'mi355')) }}
       run: |
         docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"
.github/workflows/check-labels.yml (3 changes, vendored)
@@ -34,8 +34,7 @@ jobs:
     permissions:
       contents: read
       pull-requests: write
     name: Check labels
-    # Disabling the job until https://github.com/pytorch/pytorch/issues/159825 is resolved
-    if: github.repository_owner == 'pytorch' && false
+    if: github.repository_owner == 'pytorch'
     runs-on: linux.24_04.4x
     steps:
       - name: Checkout PyTorch

@@ -7,8 +7,7 @@ on:

 jobs:
   ghstack-mergeability-check:
-    # Disabling the job until https://github.com/pytorch/pytorch/issues/159825 is resolved
-    if: github.repository_owner == 'pytorch' && false
+    if: github.repository_owner == 'pytorch'
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -57,7 +56,7 @@ jobs:
       cache: pip
       architecture: x64

-    - run: pip install pyyaml==6.0.2
+    - run: pip install pyyaml==6.0
       shell: bash

     - name: Verify mergeability
.github/workflows/cherry-pick.yml (2 changes, vendored)
@@ -26,7 +26,7 @@ jobs:
       cache: pip

     # Not the direct dependencies but the script uses trymerge
-    - run: pip install pyyaml==6.0.2
+    - run: pip install pyyaml==6.0

     - name: Setup committer id
       run: |
.github/workflows/docker-builds.yml (8 changes, vendored)
@@ -51,12 +51,17 @@ jobs:
         docker-image-name: [
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
+          pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
+          pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
+          pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
+          pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
+          pytorch-linux-jammy-py3.9-clang12,
           pytorch-linux-jammy-py3.11-clang12,
           pytorch-linux-jammy-py3.12-clang12,
           pytorch-linux-jammy-py3.13-clang12,
           pytorch-linux-jammy-rocm-n-py3,
           pytorch-linux-noble-rocm-n-py3,
@@ -71,8 +76,7 @@ jobs:
           pytorch-linux-jammy-py3-clang12-onnx,
           pytorch-linux-jammy-linter,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
-          # Executorch pin needs update
-          # pytorch-linux-jammy-py3-clang12-executorch,
+          pytorch-linux-jammy-py3-clang12-executorch,
           pytorch-linux-jammy-py3.12-triton-cpu
         ]
       include:
.github/workflows/generated-linux-binary-manywheel-nightly.yml (1226 changes, generated, vendored)
File diff suppressed because it is too large
.github/workflows/inductor-perf-test-b200.yml (154 changes, vendored)
@@ -1,154 +0,0 @@
name: inductor-perf-b200

on:
  schedule:
    - cron: 0 7 * * 1-6
    - cron: 0 7 * * 0
  # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
  # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
  workflow_dispatch:
    inputs:
      training:
        description: Run training (on by default)?
        required: false
        type: boolean
        default: true
      inference:
        description: Run inference (on by default)?
        required: false
        type: boolean
        default: true
      default:
        description: Run inductor_default?
        required: false
        type: boolean
        default: false
      dynamic:
        description: Run inductor_dynamic_shapes?
        required: false
        type: boolean
        default: false
      cppwrapper:
        description: Run inductor_cpp_wrapper?
        required: false
        type: boolean
        default: false
      cudagraphs:
        description: Run inductor_cudagraphs?
        required: false
        type: boolean
        default: true
      freezing_cudagraphs:
        description: Run inductor_cudagraphs with freezing for inference?
        required: false
        type: boolean
        default: false
      aotinductor:
        description: Run aot_inductor for inference?
        required: false
        type: boolean
        default: false
      maxautotune:
        description: Run inductor_max_autotune?
        required: false
        type: boolean
        default: false
      benchmark_configs:
        description: The list of configs used the benchmark
        required: false
        type: string
        default: inductor_huggingface_perf_cuda_b200,inductor_timm_perf_cuda_b200,inductor_torchbench_perf_cuda_b200

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

permissions:
  id-token: write
  contents: read

jobs:
  get-label-type:
    name: get-label-type
    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}
      opt_out_experiments: lf

  build:
    name: cuda12.8-py3.10-gcc9-sm100
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      # Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100
      # or newer GPUs, so it doesn't benefit much from existing compiler cache
      # from trunk. Also use a memory-intensive runner here because memory is
      # usually the bottleneck
      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '10.0'
      test-matrix: |
        { include: [
          { config: "inductor_huggingface_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
          { config: "inductor_timm_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
          { config: "inductor_torchbench_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
        ]}
      selected-test-configs: ${{ inputs.benchmark_configs }}
      build-additional-packages: "vision audio fbgemm torchao"
    secrets: inherit

  test-periodically:
    name: cuda12.8-py3.10-gcc9-sm100
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    if: github.event.schedule == '0 7 * * 1-6'
    with:
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
      timeout-minutes: 720
      disable-monitor: false
      monitor-log-interval: 15
      monitor-data-collect-interval: 4
    secrets: inherit

  test-weekly:
    name: cuda12.8-py3.10-gcc9-sm100
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    if: github.event.schedule == '0 7 * * 0'
    with:
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
      timeout-minutes: 1440
      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
      disable-monitor: false
      monitor-log-interval: 15
      monitor-data-collect-interval: 4
    secrets: inherit

  test:
    name: cuda12.8-py3.10-gcc9-sm100
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    with:
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
      timeout-minutes: 720
      disable-monitor: false
      monitor-log-interval: 15
      monitor-data-collect-interval: 4
    secrets: inherit
@@ -88,23 +88,23 @@ jobs:
     docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
     test-matrix: |
       { include: [
-        { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
       ]}
     secrets: inherit
.github/workflows/inductor-periodic.yml (30 changes, vendored)
@@ -81,21 +81,21 @@ jobs:
     sync-tag: rocm-build
     test-matrix: |
       { include: [
-        { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
       ]}
     secrets: inherit
.github/workflows/inductor-rocm-mi300.yml (4 changes, vendored)
@@ -47,8 +47,8 @@ jobs:
     docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
     test-matrix: |
       { include: [
-        { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
       ]}
     secrets: inherit
.github/workflows/nightly.yml (9 changes, vendored)
@@ -75,11 +75,10 @@ jobs:
           repo-owner: pytorch
           branch: main
           pin-folder: .github/ci_commit_pins
-        # executorch jobs are disabled since it needs some manual work for the hash update
-        # - repo-name: executorch
-        #   repo-owner: pytorch
-        #   branch: main
-        #   pin-folder: .ci/docker/ci_commit_pins
+        - repo-name: executorch
+          repo-owner: pytorch
+          branch: main
+          pin-folder: .ci/docker/ci_commit_pins
         - repo-name: triton
           repo-owner: triton-lang
           branch: main
.github/workflows/periodic-rocm-mi300.yml (6 changes, vendored)
@@ -59,9 +59,9 @@ jobs:
     docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
     test-matrix: |
       { include: [
-        { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
-        { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
-        { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
+        { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
+        { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
+        { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
       ]}
     secrets: inherit
.github/workflows/periodic.yml (31 changes, vendored)
@@ -51,6 +51,37 @@ jobs:
       curr_branch: ${{ github.head_ref || github.ref_name }}
       curr_ref_type: ${{ github.ref_type }}

+  linux-jammy-cuda12_4-py3_10-gcc11-sm89-build:
+    name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11
+      cuda-arch-list: 8.9
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda12_4-py3_10-gcc11-sm89-test:
+    name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda12_4-py3_10-gcc11-sm89-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
+      docker-image: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.test-matrix }}
+    secrets: inherit
+
   linux-jammy-cuda12_4-py3_10-gcc11-build:
     name: linux-jammy-cuda12.4-py3.10-gcc11
     uses: ./.github/workflows/_linux-build.yml
.github/workflows/pull.yml (43 changes, vendored)
@@ -292,14 +292,13 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build-environment: linux-jammy-cuda12.8-py3.10-gcc11
       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      cuda-arch-list: 8.9
       test-matrix: |
         { include: [
-          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
        ]}
     secrets: inherit
@@ -403,8 +402,38 @@ jobs:
        ]}
     secrets: inherit

+  linux-jammy-cuda12_8-py3_10-gcc11-sm89-build:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-sm89
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      cuda-arch-list: 8.9
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda12_8-py3_10-gcc11-sm89-test:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-sm89
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda12_8-py3_10-gcc11-sm89-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.test-matrix }}
+    secrets: inherit
+
   linux-jammy-py3-clang12-executorch-build:
-    if: false # Docker build needs pin update
     name: linux-jammy-py3-clang12-executorch
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
.github/workflows/revert.yml (2 changes, vendored)
@@ -26,7 +26,7 @@ jobs:
       architecture: x64
       check-latest: false
       cache: pip
-    - run: pip install pyyaml==6.0.2
+    - run: pip install pyyaml==6.0

     - name: Setup committer id
       run: |
.github/workflows/rocm-mi300.yml (12 changes, vendored)
@@ -48,12 +48,12 @@ jobs:
     sync-tag: rocm-build
     test-matrix: |
       { include: [
-        { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
-        { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
+        { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
+        { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
       ]}
     secrets: inherit
2 .github/workflows/rocm-mi355.yml vendored
@ -3,7 +3,7 @@ name: rocm-mi355
on:
  workflow_dispatch:
  schedule:
    - cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT
    - cron: 30 9 * * * # about 2:30am PDT

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
4 .github/workflows/torchbench.yml vendored
@ -10,10 +10,6 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

permissions:
  id-token: write
  contents: read

jobs:
  get-default-label-prefix:
    if: github.repository_owner == 'pytorch'
2 .github/workflows/trunk.yml vendored
@ -205,7 +205,7 @@ jobs:
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.9-gcc11
      docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
      docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11
      test-matrix: |
        { include: [
          { config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
2 .github/workflows/trymerge.yml vendored
@ -28,7 +28,7 @@ jobs:
          check-latest: false
          cache: pip
          architecture: x64
      - run: pip install pyyaml==6.0.2
      - run: pip install pyyaml==6.0

      - name: Setup committer id
        run: |
2 .github/workflows/tryrebase.yml vendored
@ -25,7 +25,7 @@ jobs:
          architecture: x64
          check-latest: false
          cache: pip
      - run: pip install pyyaml==6.0.2
      - run: pip install pyyaml==6.0

      - name: Setup committer id
        run: |
2 .github/workflows/update-viablestrict.yml vendored
@ -23,7 +23,7 @@ jobs:
        with:
          repository: pytorch/pytorch
          stable-branch: viable/strict
          requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\", \"linux-aarch64\"]'
          requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]'
          secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
          clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
          clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}
@ -39,16 +39,16 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
    'flake8==7.3.0',
    'flake8-bugbear==24.12.12',
    'flake8-comprehensions==3.16.0',
    'flake8==6.1.0',
    'flake8-bugbear==23.3.23',
    'flake8-comprehensions==3.15.0',
    'flake8-executable==2.1.3',
    'flake8-logging-format==2024.24.12',
    'flake8-pyi==25.5.0',
    'flake8-simplify==0.22.0',
    'flake8-logging-format==0.9.0',
    'flake8-pyi==23.3.1',
    'flake8-simplify==0.19.3',
    'mccabe==0.7.0',
    'pycodestyle==2.14.0',
    'pyflakes==3.4.0',
    'pycodestyle==2.11.1',
    'pyflakes==3.1.0',
    'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"',
]
@ -158,16 +158,16 @@ init_command = [
    'mypy==1.16.0',
    'sympy==1.13.3',
    'types-requests==2.27.25',
    'types-pyyaml==6.0.2',
    'types-pyyaml==6.0.1',
    'types-tabulate==0.8.8',
    'types-protobuf==5.29.1.20250403',
    'types-setuptools==79.0.0.20250422',
    'types-jinja2==2.11.9',
    'types-colorama==0.4.6',
    'filelock==3.18.0',
    'filelock==3.13.1',
    'junitparser==2.1.1',
    'rich==14.1.0',
    'pyyaml==6.0.2',
    'rich==10.9.0',
    'pyyaml==6.0.1',
    'optree==0.13.0',
    'dataclasses-json==0.6.7',
    'pandas==2.2.3',
@ -1111,7 +1111,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
    'pyyaml==6.0.2',
    'PyYAML==6.0.1',
]

[[linter]]

@ -1133,7 +1133,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
    'pyyaml==6.0.2',
    'PyYAML==6.0.1',
]

[[linter]]
17 AGENTS.md
@ -1,18 +1 @@
- This is the only AGENTS.md, there are no recursive AGENTS.md
- When you are working on a bug, first create a standalone file that
  reproduces the bug and verify it fails in the expected way. Use this to
  test if your changes work. Once the change is passing, find an appropriate
  test file to add the test to and make sure to follow local conventions on
  the test file.
- If you are running the real test suite, DO NOT run the entire test suite.
  Instead run only a single test case, e.g., 'python test/test_torch.py TestTorch.test_dir'
- Do NOT run setup.py, you do not have a working build environment
- Do NOT run pre-commit, it is not setup
- To run lint, run 'lintrunner -a' (which will autoapply changes). lintrunner
  ONLY accepts this flag, do not try to run on individual files.
- Do NOT attempt to install dependencies, you do not have Internet access
- When you are ready to make a PR, do exactly these steps:
  - git stash -u
  - git reset --hard $(cat /tmp/orig_work.txt) # NB: reset to the LOCAL branch, do NOT fetch
  - git stash pop
  - Resolve conflicts if necessary
@ -679,7 +679,6 @@ cc_library(
        [
            "torch/*.h",
            "torch/csrc/**/*.h",
            "torch/nativert/**/*.h",
            "torch/csrc/distributed/c10d/**/*.hpp",
            "torch/lib/libshm/*.h",
        ],
@ -564,7 +564,7 @@ if(MSVC)
  set(CMAKE_NINJA_CMCLDEPS_RC OFF)
  if(MSVC_Z7_OVERRIDE)
    # CMake set debug flags to use /Z7
    set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$<CONFIG:Debug,RelWithDebInfo>:Embedded>")
    set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded)
  endif()
  foreach(
      flag_var
@ -872,14 +872,6 @@ cmake_dependent_option(
  "USE_CUDA OR USE_ROCM;NOT MSVC"
  OFF)

cmake_dependent_option(
  USE_FBGEMM_GENAI
  "Whether to build FBGEMM GenAI quantized GEMM kernels.\
Will be disabled if not supported by the platform"
  OFF
  "USE_CUDA OR USE_ROCM"
  OFF)

# CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
# Eff Attention won't
cmake_dependent_option(

@ -913,10 +905,6 @@ if(USE_FBGEMM)
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM")
endif()

if(USE_FBGEMM_GENAI)
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM_GENAI")
endif()

if(USE_PYTORCH_QNNPACK)
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
endif()
18 CODEOWNERS
@ -14,6 +14,7 @@
/torch/csrc/autograd/ @albanD @soulitzer
/torch/autograd/ @albanD @soulitzer
/tools/autograd/ @albanD @soulitzer
/torch/header_only_apis.txt @janeyx99
/torch/nn/ @albanD @jbschlosser @mikaylagawarecki
/torch/optim/ @albanD @janeyx99
/test/test_public_bindings.py @albanD

@ -50,12 +51,12 @@ nn/qat/ @jerryzh168
/torch/csrc/distributed/c10d/Ops.* @kwen2501

# ONNX Export
/torch/_dynamo/backends/onnxrt.py @titaiwangms @xadupre @justinchuby
/torch/csrc/jit/passes/onnx.h @titaiwangms @xadupre
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @xadupre
/torch/csrc/jit/passes/onnx/ @titaiwangms @xadupre
/torch/onnx/ @titaiwangms @xadupre @justinchuby
/test/onnx/ @titaiwangms @xadupre @justinchuby
/torch/_dynamo/backends/onnxrt.py @wschin
/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1
/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1
/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin
/test/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin

# CI
/.ci @pytorch/pytorch-dev-infra

@ -195,8 +196,3 @@ torch/backends/cudnn/ @eqy @syed-ahmed
/torch/utils/_cxx_pytree.py @XuehaiPan
/torch/utils/pytree/ @XuehaiPan
/torch/_dynamo/polyfills/pytree.py @XuehaiPan

# Relating to libtorch ABI
/torch/csrc/stable/ @janeyx99 @mikaylagawarecki
/torch/headeronly/ @janeyx99
/torch/header_only_apis.txt @janeyx99
@ -276,7 +276,7 @@ conda install pkg-config libuv
pip install mkl-static mkl-include
# Add these packages if torch.distributed is needed.
# Distributed package support on Windows is a prototype feature and is subject to changes.
conda install -c conda-forge libuv
conda install -c conda-forge libuv=1.39
```

#### Install PyTorch
@ -247,50 +247,6 @@ if(USE_MEM_EFF_ATTENTION)
  list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu})
endif()

IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
  message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
  set(USE_FBGEMM_GENAI off)
endif()

# FBGEMM GenAI
IF(USE_FBGEMM_GENAI)
  set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/)
  set(FBGEMM_GENAI_DIR ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)

  if(USE_ROCM)
    # Only include the kernels we want to build to avoid increasing binary size.
    file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
      "${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
      "${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
    set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)

    # Add additional HIPCC compiler flags for performance
    set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
      -mllvm
      -amdgpu-coerce-illegal-types=1
      -mllvm
      -enable-post-misched=0
      -mllvm
      -greedy-reverse-local-assignment=1
      -fhip-new-launch-api)

    hip_add_library(
      fbgemm_genai STATIC
      ${fbgemm_genai_native_rocm_hip}
      HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
    set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)

    target_include_directories(fbgemm_genai PUBLIC
      # FBGEMM version of Composable Kernel is used due to some customizations
      ${FBGEMM_THIRD_PARTY}/composable_kernel/include
      ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
      ${FBGEMM_GENAI_DIR}/include/
      ${FBGEMM_GENAI_DIR}/common/include/
    )
  endif()
endif()

# XNNPACK
file(GLOB native_xnnpack "native/xnnpack/*.cpp")

@ -439,7 +395,6 @@ if(USE_ROCM)
  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/example/ck_tile/01_fmha)
  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include)
  _pytorch_rocm_generate_ck_conf()
@ -704,17 +659,21 @@ if(USE_MPS)
  if(CAN_COMPILE_METAL)
    foreach(SHADER ${native_mps_metal})
      cmake_path(GET SHADER STEM TGT_STEM)
      string(CONCAT TGT_BASIC ${TGT_STEM} "_31.air")
      string(CONCAT TGT_BASIC ${TGT_STEM} "_30.air")
      string(CONCAT TGT_BFLOAT ${TGT_STEM} "_31.air")
      list(APPEND AIR_BASIC ${TGT_BASIC})
      metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.1")
      list(APPEND AIR_BFLOAT ${TGT_BFLOAT})
      metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.0")
      metal_to_air(${SHADER} ${TGT_BFLOAT} "-std=metal3.1")
    endforeach()
    air_to_metallib(kernels_basic.metallib ${AIR_BASIC})
    air_to_metallib(kernels_bfloat.metallib ${AIR_BFLOAT})
    add_custom_command(
      COMMAND echo "// $$(date)" > metallib_dummy.cpp
      DEPENDS kernels_basic.metallib
      DEPENDS kernels_basic.metallib kernels_bfloat.metallib
      OUTPUT metallib_dummy.cpp
      COMMENT "Updating metallibs timestamp")
    add_custom_target(metallibs DEPENDS kernels_basic.metallib metallib_dummy.cpp)
    add_custom_target(metallibs DEPENDS kernels_basic.metallib kernels_bfloat.metallib metallib_dummy.cpp)
  else()
    file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps")
    foreach(SHADER ${native_mps_metal})
@ -1,6 +1,5 @@
#pragma once

#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/DeviceType.h>
#include <c10/macros/Macros.h>

@ -73,27 +72,6 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
// original device index that was active before the change.
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);

TORCH_API inline void emptyCache() {
  const auto device_type = getAccelerator(true).value();
  at::getDeviceAllocator(device_type)->emptyCache();
}

TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
    c10::DeviceIndex device_index) {
  const auto device_type = getAccelerator(true).value();
  return at::getDeviceAllocator(device_type)->getDeviceStats(device_index);
}

TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
  const auto device_type = getAccelerator(true).value();
  at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index);
}

TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
  const auto device_type = getAccelerator(true).value();
  at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
}

} // namespace at::accelerator

namespace at {
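Illustrative usage sketch for the device-generic allocator helpers in the hunk above (not part of the diff; the include path and the availability of at::accelerator are assumed from this header, not confirmed by the source):

    // Sketch only: assumes a PyTorch build where the at::accelerator helpers
    // shown above exist and an accelerator (e.g. CUDA) is present at runtime.
    #include <ATen/DeviceAccelerator.h>  // assumed include for these helpers
    #include <iostream>

    int main() {
      at::accelerator::emptyCache();  // release cached blocks to the device
      auto stats = at::accelerator::getDeviceStats(/*device_index=*/0);
      std::cout << "current allocated bytes: "
                << stats.allocated_bytes[0].current << '\n';
      at::accelerator::resetAccumulatedStats(0);  // zero cumulative counters
      at::accelerator::resetPeakStats(0);         // zero peak watermarks
      return 0;
    }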
@ -1 +1,55 @@
#include <torch/headeronly/cpu/vec/intrinsics.h>
#pragma once
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC or clang-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#elif defined(__clang__) && (defined(__ARM_NEON__) || defined(__aarch64__))
/* Clang-compatible compiler, targeting arm neon */
#include <arm_neon.h>
#if defined(__ARM_FEATURE_SVE)
/* CLANG-compatible compiler, targeting ARM with SVE */
#include <arm_sve.h>
#endif
#elif defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#if _MSC_VER <= 1900
#define _mm256_extract_epi64(X, Y) \
  (_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2))
#define _mm256_extract_epi32(X, Y) \
  (_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4))
#define _mm256_extract_epi16(X, Y) \
  (_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8))
#define _mm256_extract_epi8(X, Y) \
  (_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16))
#endif
#elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__))
/* GCC-compatible compiler, targeting ARM with NEON */
#include <arm_neon.h>
#if defined(__ARM_FEATURE_SVE)
/* GCC-compatible compiler, targeting ARM with SVE */
#include <arm_sve.h>
#endif
#if defined(MISSING_ARM_VLD1)
#include <ATen/cpu/vec/vec256/missing_vld1_neon.h>
#elif defined(MISSING_ARM_VST1)
#include <ATen/cpu/vec/vec256/missing_vst1_neon.h>
#endif
#elif defined(__GNUC__) && defined(__IWMMXT__)
/* GCC-compatible compiler, targeting ARM with WMMX */
#include <mmintrin.h>
#elif defined(__s390x__)
// targets Z/architecture
// we will include vecintrin later
#elif (defined(__GNUC__) || defined(__xlC__)) && \
    (defined(__VEC__) || defined(__ALTIVEC__))
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
#include <altivec.h>
/* We need to undef those tokens defined by <altivec.h> to avoid conflicts
   with the C++ types. => Can still use __bool/__vector */
#undef bool
#undef vector
#undef pixel
#elif defined(__GNUC__) && defined(__SPE__)
/* GCC-compatible compiler, targeting PowerPC with SPE */
#include <spe.h>
#endif
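A sketch of how an umbrella header like the one above is typically consumed downstream (illustration only; sum4 is a hypothetical function, and only the feature macros mirror the header):

    #include <cstddef>
    // The umbrella header pulls in <x86intrin.h> or <arm_neon.h> depending on
    // the target; callers then branch on the same feature macros.

    float sum4(const float* p) {
    #if defined(__AVX__)
      __m128 v = _mm_loadu_ps(p);        // x86 path via <x86intrin.h>
      __m128 h = _mm_hadd_ps(v, v);
      h = _mm_hadd_ps(h, h);
      return _mm_cvtss_f32(h);
    #elif defined(__aarch64__)
      float32x4_t v = vld1q_f32(p);      // ARM path via <arm_neon.h>
      return vaddvq_f32(v);
    #else
      float s = 0.f;                     // portable fallback
      for (std::size_t i = 0; i < 4; ++i) s += p[i];
      return s;
    #endif
    }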
@ -1 +1,396 @@
#include <torch/headeronly/cpu/vec/vec256/missing_vld1_neon.h>
/* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7. */

__extension__ extern __inline uint8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u8_x2(const uint8_t* __a) {
  uint8x8x2_t ret;
  asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s8_x2(const int8_t* __a) {
  int8x8x2_t ret;
  asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline uint16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u16_x2(const uint16_t* __a) {
  uint16x4x2_t ret;
  asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s16_x2(const int16_t* __a) {
  int16x4x2_t ret;
  asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline uint32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u32_x2(const uint32_t* __a) {
  uint32x2x2_t ret;
  asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s32_x2(const int32_t* __a) {
  int32x2x2_t ret;
  asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline uint64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u64_x2(const uint64_t* __a) {
  uint64x1x2_t ret;
  asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s64_x2(const int64_t* __a) {
  int64x1x2_t ret;
  __builtin_aarch64_simd_oi __o;
  asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline float16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f16_x2(const float16_t* __a) {
  float16x4x2_t ret;
  asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline float32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f32_x2(const float32_t* __a) {
  float32x2x2_t ret;
  asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline float64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f64_x2(const float64_t* __a) {
  float64x1x2_t ret;
  asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline poly8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p8_x2(const poly8_t* __a) {
  poly8x8x2_t ret;
  asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline poly16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p16_x2(const poly16_t* __a) {
  poly16x4x2_t ret;
  asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline poly64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p64_x2(const poly64_t* __a) {
  poly64x1x2_t ret;
  asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline uint8x16x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u8_x2(const uint8_t* __a) {
  uint8x16x2_t ret;
  asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int8x16x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s8_x2(const int8_t* __a) {
  int8x16x2_t ret;
  asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline uint16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u16_x2(const uint16_t* __a) {
  uint16x8x2_t ret;
  asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s16_x2(const int16_t* __a) {
  int16x8x2_t ret;
  asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline uint32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u32_x2(const uint32_t* __a) {
  uint32x4x2_t ret;
  asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s32_x2(const int32_t* __a) {
  int32x4x2_t ret;
  asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline uint64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u64_x2(const uint64_t* __a) {
  uint64x2x2_t ret;
  asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s64_x2(const int64_t* __a) {
  int64x2x2_t ret;
  asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline float16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f16_x2(const float16_t* __a) {
  float16x8x2_t ret;
  asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline float32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f32_x2(const float32_t* __a) {
  float32x4x2_t ret;
  asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline float64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f64_x2(const float64_t* __a) {
  float64x2x2_t ret;
  asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline poly8x16x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p8_x2(const poly8_t* __a) {
  poly8x16x2_t ret;
  asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline poly16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p16_x2(const poly16_t* __a) {
  poly16x8x2_t ret;
  asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline poly64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p64_x2(const poly64_t* __a) {
  poly64x2x2_t ret;
  asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

/* vst1x2 */

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s64_x2(int64_t* __a, int64x1x2_t val) {
  asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u64_x2(uint64_t* __a, uint64x1x2_t val) {
  asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f64_x2(float64_t* __a, float64x1x2_t val) {
  asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s8_x2(int8_t* __a, int8x8x2_t val) {
  asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p8_x2(poly8_t* __a, poly8x8x2_t val) {
  asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s16_x2(int16_t* __a, int16x4x2_t val) {
  asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p16_x2(poly16_t* __a, poly16x4x2_t val) {
  asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s32_x2(int32_t* __a, int32x2x2_t val) {
  asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u8_x2(uint8_t* __a, uint8x8x2_t val) {
  asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u16_x2(uint16_t* __a, uint16x4x2_t val) {
  asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u32_x2(uint32_t* __a, uint32x2x2_t val) {
  asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f16_x2(float16_t* __a, float16x4x2_t val) {
  asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f32_x2(float32_t* __a, float32x2x2_t val) {
  asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p64_x2(poly64_t* __a, poly64x1x2_t val) {
  asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s8_x2(int8_t* __a, int8x16x2_t val) {
  asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p8_x2(poly8_t* __a, poly8x16x2_t val) {
  asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s16_x2(int16_t* __a, int16x8x2_t val) {
  asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p16_x2(poly16_t* __a, poly16x8x2_t val) {
  asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s32_x2(int32_t* __a, int32x4x2_t val) {
  asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s64_x2(int64_t* __a, int64x2x2_t val) {
  asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u8_x2(uint8_t* __a, uint8x16x2_t val) {
  asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u16_x2(uint16_t* __a, uint16x8x2_t val) {
  asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u32_x2(uint32_t* __a, uint32x4x2_t val) {
  asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u64_x2(uint64_t* __a, uint64x2x2_t val) {
  asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f16_x2(float16_t* __a, float16x8x2_t val) {
  asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2(float32_t* __a, float32x4x2_t val) {
  asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f64_x2(float64_t* __a, float64x2x2_t val) {
  asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p64_x2(poly64_t* __a, poly64x2x2_t val) {
  asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}
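For context on what the polyfilled _x2 intrinsics do, a minimal caller (assumes an AArch64 target; not part of the diff):

    #include <arm_neon.h>

    // Copies 8 floats with one paired load and one paired store:
    // vld1q_f32_x2 fills two 4-float q-registers from src[0..7].
    void copy8(const float* src, float* dst) {
      float32x4x2_t v = vld1q_f32_x2(src);
      vst1q_f32_x2(dst, v);
    }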
@ -1 +1,7 @@
#include <torch/headeronly/cpu/vec/vec256/missing_vst1_neon.h>
/* Workaround for missing vst1q_f32_x2 in gcc-8. */

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2(float32_t* __a, float32x4x2_t val) {
  asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}
@ -3,12 +3,50 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <c10/util/Exception.h>

#include <torch/headeronly/cpu/vec/vec_half.h>

namespace at::vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {

#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
    !defined(__APPLE__)
static inline uint16_t float2half_scalar(float val) {
#if defined(CPU_CAPABILITY_AVX2)
#if defined(_MSC_VER)
  __m256 v = _mm256_set1_ps(val);
  __m128i o =
      _mm256_cvtps_ph(v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
  return static_cast<std::uint16_t>(_mm_cvtsi128_si32(o));
#else
  return _cvtss_sh(val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#endif
#elif defined(CPU_CAPABILITY_AVX512)
  __m512 v = _mm512_set1_ps(val);
  __m256i o =
      _mm512_cvtps_ph(v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
  return static_cast<std::uint16_t>(
      _mm_cvtsi128_si32(_mm256_castsi256_si128(o)));
#endif
}

static inline float half2float_scalar(uint16_t val) {
#if defined(CPU_CAPABILITY_AVX2)
#if defined(_MSC_VER)
  __m128i v = _mm_cvtsi32_si128(val);
  __m256 o = _mm256_cvtph_ps(v);
  return _mm256_cvtss_f32(o);
#else
  return _cvtsh_ss(val);
#endif
#elif defined(CPU_CAPABILITY_AVX512)
  __m256i v =
      _mm256_setr_epi16(val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  __m512 o = _mm512_cvtph_ps(v);
  return _mm512_cvtss_f32(o);
#endif
}

#endif

// Transpose a [2, 32] matrix to [32, 2]
// Note: the output leading dimension should be 2,
// that is, the output must be contiguous
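A standalone round-trip of the scalar float<->half conversion implemented above (assumes an x86-64 compiler with F16C enabled, e.g. g++ -mf16c; this is a sketch, not the ATen API):

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      float x = 3.14159f;
      // float -> IEEE fp16 bits, round-to-nearest-even, then back to float.
      uint16_t h = _cvtss_sh(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
      float y = _cvtsh_ss(h);
      std::printf("%f -> 0x%04x -> %f\n", x, h, y);  // y is x rounded to fp16
      return 0;
    }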
@ -2,6 +2,7 @@
#include <ATen/cuda/CUDAGraph.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/Functions.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>

#include <cstddef>
@ -2,7 +2,6 @@

#include <ATen/Tensor.h>
#include <c10/core/Device.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/flat_hash_map.h>
@ -162,7 +162,7 @@ struct CUDACachingHostAllocatorImpl
  }

  bool pinned_use_background_threads() override {
    return c10::CachingAllocator::AcceleratorAllocatorConfig::
    return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
        pinned_use_background_threads();
  }
@ -24,29 +24,6 @@ static void _assert_match(const O& original, const C& compared, const std::strin
  }
}

template<>
void _assert_match<c10::Device, std::optional<c10::Device>>(
    const c10::Device& original,
    const std::optional<c10::Device>& compared,
    const std::string& name) {
  if (compared) {
    const c10::Device& expected = compared.value();
    if (original.type() != expected.type()) {
      std::stringstream msg;
      msg << "Tensor " << name << " mismatch! Expected: " << expected << ", Got: " << original;
      throw std::runtime_error(msg.str());
    }

    // If the expected device doesn't have an index (e.g., just "cuda"),
    // or if both devices have the same index, consider them equal
    if (expected.has_index() && original.has_index() && expected.index() != original.index()) {
      std::stringstream msg;
      msg << "Tensor " << name << " mismatch! Expected: " << expected << ", Got: " << original;
      throw std::runtime_error(msg.str());
    }
  }
}

void _assert_tensor_metadata_meta_symint(at::Tensor const& tensor, at::OptionalSymIntArrayRef sizes, at::OptionalSymIntArrayRef strides, std::optional<c10::ScalarType> dtype, std::optional<c10::Device> device, std::optional<c10::Layout> layout) {
  _assert_match(tensor.sym_sizes(), sizes, "sizes");
  _assert_match(tensor.sym_strides(), strides, "strides");
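The matching rule the specialization above encodes, distilled into a standalone predicate (devices_match is a hypothetical helper written for illustration, not a PyTorch API):

    #include <c10/core/Device.h>

    bool devices_match(const c10::Device& original, const c10::Device& expected) {
      if (original.type() != expected.type()) {
        return false;  // e.g. cpu vs cuda
      }
      // Only a mismatch when both sides carry an index and they differ, so an
      // index-less "cuda" matches "cuda:0" but "cuda:1" does not match "cuda:0".
      return !(expected.has_index() && original.has_index() &&
               expected.index() != original.index());
    }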
@ -367,27 +367,27 @@ void int8pack_mm_kernel_(
  auto* C_data = C.data_ptr<T>();
  const auto* S_data = scales.const_data_ptr<T>();

  int64_t M = A.size(0);
  int64_t N = B.size(0);
  int64_t K = A.size(1);
  int64_t lda = A.stride(0);
  constexpr int64_t BLOCK_M = 4;
  constexpr int64_t BLOCK_N = 4;
  int M = A.size(0);
  int N = B.size(0);
  int K = A.size(1);
  int lda = A.stride(0);
  constexpr int BLOCK_M = 4;
  constexpr int BLOCK_N = 4;

  const int64_t MB = (M + BLOCK_M - 1) / BLOCK_M;
  const int64_t NB = (N + BLOCK_N - 1) / BLOCK_N;
  const int MB = (M + BLOCK_M - 1) / BLOCK_M;
  const int NB = (N + BLOCK_N - 1) / BLOCK_N;

  at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) {
    int64_t mb{0}, nb{0};
  at::parallel_for(0, MB * NB, 0, [&](int begin, int end) {
    int mb{0}, nb{0};
    data_index_init(begin, mb, MB, nb, NB);

    for (const auto i : c10::irange(begin, end)) {
      (void)i;

      int64_t mb_start = mb * BLOCK_M;
      int64_t mb_size = std::min(BLOCK_M, M - mb_start);
      int64_t nb_start = nb * BLOCK_N;
      int64_t nb_size = std::min(BLOCK_N, N - nb_start);
      int mb_start = mb * BLOCK_M;
      int mb_size = std::min(BLOCK_M, M - mb_start);
      int nb_start = nb * BLOCK_N;
      int nb_size = std::min(BLOCK_N, N - nb_start);

      const auto* A_ptr = A_data + mb_start * lda;
      const auto* B_ptr = B_data + nb_start * K;
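Why the hunk above widens the index math from int to int64_t, shown with made-up sizes (sketch, not project code):

    #include <cstdint>
    #include <cstdio>

    int main() {
      int64_t mb_start = 70000;                        // row-block start
      int64_t lda = 40000;                             // leading dimension
      int64_t full = mb_start * lda;                   // 2'800'000'000 elements
      int32_t truncated = static_cast<int32_t>(full);  // wraps past INT32_MAX
      std::printf("64-bit offset: %lld, truncated to int32: %d\n",
                  static_cast<long long>(full), truncated);
      return 0;
    }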
@ -526,7 +526,7 @@ namespace {

  // we are dealing with packed tensor here. max index is the same as numel.
  // TODO: to really support input tensor large enough to go beyond int32,
  // TODO: to really support input tensor large enought to go beyond int32,
  // we will need to restrict out shared memory usage and adjust the launch
  // config;
  AT_ASSERT(input_.numel() < std::numeric_limits<int32_t>::max());

@ -681,7 +681,7 @@ namespace {
  const dim3 grid(grid_x, grid_y, grid_z);

  // we are dealing with packed tensor here. max index is the same as numel.
  // TODO: to really support input tensor large enough to go beyond int32,
  // TODO: to really support input tensor large enought to go beyond int32,
  // we will need to restrict out shared memory usage and adjust the launch
  // config;
  AT_ASSERT(input.numel() < std::numeric_limits<int32_t>::max());
@ -21,10 +21,6 @@
#include <ATen/native/cuda/GroupMM.h>
#include <ATen/ceil_div.h>

#ifdef USE_FBGEMM_GENAI
#include <fbgemm_gpu/torch_ops.h>
#endif

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
@ -1220,7 +1216,7 @@ std::pair<ScalingType, ScalingType> get_joint_scaling(
//  - `scale_a`: a tensor with the inverse scale of `mat1`, whose shape/strides/dtype depend on the scaling scheme
//  - `scale_b`: a tensor with the inverse scale of `mat2`, whose shape/strides/dtype depend on the scaling scheme
//  - `scale_result`: a scalar tensor with the scale of the output, only utilized if the output is a float8 type
//  - `use_fast_accum`: if true, enables fast float8 accumulation. Backends may ignore this option if not applicable.
//  - `use_fast_accum`: if true, enables fast float8 accumulation
//  - `out`: a reference to the output tensor

Tensor&
@ -1529,7 +1525,6 @@ namespace {
  const auto out_dtype_ = out_dtype.value_or(kBFloat16);
  TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm");

#ifndef USE_ROCM
  // For TMA transfers, strides of output tensor have to be either
  // 1, or aligned to 16 bytes.
  const auto last_dim = out_size.size() - 1;

@ -1541,10 +1536,9 @@ namespace {
  } else {
    out_stride = {out_size[1] * size_padded, size_padded, 1};
  }
  return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_));
#else
  return at::empty(out_size, mat_a.options().dtype(out_dtype_));
#endif
  auto out = at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_));

  return out;
}

bool check_valid_strides_and_return_transposed(const Tensor& mat) {
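The 16-byte alignment rule referenced above, isolated as a small helper (an assumed sketch of the padding arithmetic, not the actual ATen code):

    #include <cstdint>

    // Round a trailing dimension up so each row starts on a 16-byte boundary,
    // which is what TMA requires of the non-unit output strides.
    int64_t pad_dim_for_tma(int64_t dim, int64_t elem_size_bytes) {
      const int64_t align = 16 / elem_size_bytes;  // elements per 16 bytes
      return ((dim + align - 1) / align) * align;
    }
    // e.g. bf16 (2 bytes): dim 30 pads to 32, giving strides {N * 32, 32, 1}.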
@ -1625,18 +1619,18 @@ const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum) {
  bool allowed_device = _scaled_mm_allowed_device();
  TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+");
#ifndef USE_ROCM
  bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true);
  TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0");

  TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type());
  TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_b.scalar_type());
  TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed");
  TORCH_CHECK(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed");
  TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d");
  TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d");
  const bool a_is_2d = mat_a.dim() == 2;
  const bool b_is_2d = mat_b.dim() == 2;
  if (!a_is_2d || !b_is_2d) {
    TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
  }
  TORCH_CHECK(
      mat_a.size(-1) % 16 == 0,
      "Expected trailing dimension of mat_a to be divisible by 16 ",
@ -1670,10 +1664,6 @@ bool use_fast_accum) {

  Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype);

#ifndef USE_ROCM
  TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type());
  TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_b.scalar_type());

  at::cuda::detail::f8f8bf16_grouped_mm(
      mat_a,
      mat_b,
@ -1684,23 +1674,12 @@ bool use_fast_accum) {
      use_fast_accum,
      out);
  return out;
#else
#ifdef USE_FBGEMM_GENAI
  TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_a.scalar_type());
  TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_b.scalar_type());

  fbgemm_gpu::f8f8bf16_rowwise_grouped_mm(
      mat_a,
      // FBGEMM expects B matrix shape to be (.., N, K)
      mat_b.transpose(-2, -1),
      scale_a,
      scale_b,
      offs,
      out);
  return out;

#else
  TORCH_CHECK(false, "grouped gemm is not supported without USE_FBGEMM_GENAI on ROCM")
#endif
  TORCH_CHECK(false, "grouped gemm is not supported on ROCM")
#endif

}
@ -1719,9 +1698,6 @@ std::optional<c10::ScalarType> out_dtype) {
  TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d");
  const bool a_is_2d = mat_a.dim() == 2;
  const bool b_is_2d = mat_b.dim() == 2;
  if (!a_is_2d || !b_is_2d) {
    TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
  }

  // check that the strides are valid, the fn will throw an error if not
  check_valid_strides_and_return_transposed(mat_a);
@ -223,7 +223,7 @@ inline CuFFTDataLayout as_cufft_embed(IntArrayRef strides, IntArrayRef sizes, bo
class CuFFTConfig {
public:

  // Only move semantics is enough for this class. Although we already use
  // Only move semantics is enought for this class. Although we already use
  // unique_ptr for the plan, still remove copy constructor and assignment op so
  // we don't accidentally copy and take perf hit.
  CuFFTConfig(const CuFFTConfig&) = delete;
@ -38,19 +38,17 @@ static inline std::string _cudaGetErrorEnum(cufftResult error)
      return "CUFFT_INVALID_SIZE";
    case CUFFT_UNALIGNED_DATA:
      return "CUFFT_UNALIGNED_DATA";
    case CUFFT_INCOMPLETE_PARAMETER_LIST:
      return "CUFFT_INCOMPLETE_PARAMETER_LIST";
    case CUFFT_INVALID_DEVICE:
      return "CUFFT_INVALID_DEVICE";
    case CUFFT_PARSE_ERROR:
      return "CUFFT_PARSE_ERROR";
    case CUFFT_NO_WORKSPACE:
      return "CUFFT_NO_WORKSPACE";
    case CUFFT_NOT_IMPLEMENTED:
      return "CUFFT_NOT_IMPLEMENTED";
#if CUDA_VERSION <= 12090
    case CUFFT_INCOMPLETE_PARAMETER_LIST:
      return "CUFFT_INCOMPLETE_PARAMETER_LIST";
    case CUFFT_PARSE_ERROR:
      return "CUFFT_PARSE_ERROR";
#endif
#if !defined(USE_ROCM) && CUDA_VERSION <= 12090
#if !defined(USE_ROCM)
    case CUFFT_LICENSE_ERROR:
      return "CUFFT_LICENSE_ERROR";
#endif
@ -241,8 +241,6 @@ void bf16bf16_grouped_gemm_impl_sm90_sm100(
  Strides tensor_StrideA = make_strides(mat_a.strides());
  Strides tensor_StrideB = make_strides(mat_b.strides());
  Strides tensor_StrideOutput = make_strides(out.strides());
  Strides tensor_ShapeA = make_strides(mat_a.sizes());
  Strides tensor_ShapeB = make_strides(mat_b.sizes());

  at::cuda::detail::prepare_grouped_gemm_data<<<1, group_count, 0, stream>>>(
      reinterpret_cast<DtypeA*>(mat_a.data_ptr()),

@ -266,8 +264,6 @@ void bf16bf16_grouped_gemm_impl_sm90_sm100(
      tensor_StrideA,
      tensor_StrideB,
      tensor_StrideOutput,
      tensor_ShapeA,
      tensor_ShapeB,
      0,
      0,
      a_row_major,
@ -38,20 +38,18 @@ __global__ void prepare_grouped_gemm_data(
    Strides tensor_StrideA,
    Strides tensor_StrideB,
    Strides tensor_StrideOutput,
    Strides tensor_ShapeA,
    Strides tensor_ShapeB,
    int64_t a_scale_stride,
    int64_t b_scale_stride,
    bool a_row_major = true,
    bool b_row_major = false) {
  int32_t tid = threadIdx.x;
  int32_t delta = 0;
  int32_t offset = 0;
  if (offs != nullptr) {
    int32_t start = tid == 0 ? 0 : offs[tid - 1];
    offset = offs[tid];
    delta = offset - start;
    CUDA_KERNEL_ASSERT(delta >=0 && "expected gemm dimension to be greater or equal 0\n");
    delta = offs[tid] - start;
    if (K < 0) {
      CUDA_KERNEL_ASSERT(delta >=0 && "expected ofsets to be greater or equal 0\n");
    }

  // TMA transfers require global memory tensor addresses to be
  // aligned to 16 bytes.

@ -86,7 +84,6 @@ __global__ void prepare_grouped_gemm_data(
  int64_t lda, ldb, ldoutput;
  if (M < 0) {
    // A and output is 2d
    CUDA_KERNEL_ASSERT(offset <= tensor_ShapeA[0] && "expected offset to be less than tensor size\n");
    M = delta;
    lda = a_row_major ? tensor_StrideA[0] : tensor_StrideA[1];
    ldb = b_row_major ? tensor_StrideB[1] : tensor_StrideB[2];

@ -99,7 +96,6 @@ __global__ void prepare_grouped_gemm_data(
    output_ptrs[tid] = tid == 0 ? output : output + offs[tid - 1] * ldoutput;
    B_ptrs[tid] = B + tid * tensor_StrideB[0];
  } else if (N < 0) {
    CUDA_KERNEL_ASSERT(offset <= tensor_ShapeB[1] && "expected offset to be less than tensor size\n");
    N = delta;
    lda = a_row_major ? tensor_StrideA[1] : tensor_StrideA[2];
    ldb = b_row_major ? tensor_StrideB[0] : tensor_StrideB[1]; // B is transposed

@ -112,7 +108,6 @@ __global__ void prepare_grouped_gemm_data(
      inputB_scale_ptrs[tid] = tid == 0 ? scale_B : scale_B + offs[tid - 1];
    }
  } else if (K < 0) {
    CUDA_KERNEL_ASSERT(offset <= tensor_ShapeA[1] && offset <= tensor_ShapeB[0] && "expected offset to be less than tensor size\n");
    // A, B is 2d, output is 3d
    K = delta;
    lda = a_row_major ? tensor_StrideA[0] : tensor_StrideA[1];
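Host-side restatement of the offs convention the kernel consumes: offs holds cumulative end offsets along the grouped dimension, and each group's extent is the delta between consecutive entries (plain C++ illustration, not the device code):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<int32_t> group_sizes(const std::vector<int32_t>& offs) {
      std::vector<int32_t> sizes(offs.size());
      int32_t start = 0;
      for (std::size_t g = 0; g < offs.size(); ++g) {
        sizes[g] = offs[g] - start;  // the kernel asserts this delta is >= 0
        start = offs[g];
      }
      return sizes;
    }
    // offs = {3, 3, 8} -> sizes = {3, 0, 5}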
@ -644,12 +644,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_
  Tensor grad = at::full_like(log_probs, neginf, LEGACY_CONTIGUOUS_MEMORY_FORMAT); // initialization for log(sum (alpha beta))

  // As above, there may be better configurations to use.
  constexpr int max_threads_ = std::is_same_v<scalar_t, float> ? 1024 : 896; // we need 72 or so 32 bit registers for double
  int max_threads = max_threads_;
  // Blackwell launch bounds
  if (at::cuda::getCurrentDeviceProperties()->major >= 10) {
    max_threads = 512;
  }
  constexpr int max_threads = std::is_same_v<scalar_t, float> ? 1024 : 896; // we need 72 or so 32 bit registers for double
  int threads_target = max_threads;
  while (threads_target / 2 >= 2*max_target_length+1) {
    threads_target /= 2;
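The device-dependent cap added in this hunk, isolated as a runtime query (assumes the CUDA runtime API; the thresholds mirror the hunk rather than any general rule):

    #include <cuda_runtime.h>

    int pick_max_threads(bool is_float) {
      int max_threads = is_float ? 1024 : 896;  // doubles need ~72 registers
      cudaDeviceProp prop{};
      cudaGetDeviceProperties(&prop, /*device=*/0);
      if (prop.major >= 10) {
        max_threads = 512;                      // Blackwell launch bounds
      }
      return max_threads;
    }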
@ -9,7 +9,6 @@
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wmissing-field-initializers")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")

// Determine if the architecture supports rowwise scaled mm
// Currently failing on windows with:

@ -45,7 +44,6 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")

#include <ATen/native/cuda/cutlass_common.cuh>

C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()
@ -10,7 +10,6 @@
// Two warninngs in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")

// Determine if the architecture supports rowwise scaled mm
// Currently failing on windows with:

@ -45,7 +44,6 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
#include <cutlass/gemm/kernel/gemm_universal.hpp>
#include <cutlass/util/packed_stride.hpp>

C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()
C10_DIAGNOSTIC_POP()
@ -298,9 +296,6 @@ void f8f8bf16_grouped_gemm_impl_sm90(
  Strides tensor_StrideA = make_strides(mat_a.strides());
  Strides tensor_StrideB = make_strides(mat_b.strides());
  Strides tensor_StrideOutput = make_strides(out.strides());
  Strides tensor_ShapeA = make_strides(mat_a.sizes());
  Strides tensor_ShapeB = make_strides(mat_b.sizes());

  // scale stride will be used inside the kernel only if needed,
  // so for 1d scales the "1" assigned here won't be used
  int64_t a_scale_stride = scale_a.stride(0);

@ -328,8 +323,6 @@ void f8f8bf16_grouped_gemm_impl_sm90(
      tensor_StrideA,
      tensor_StrideB,
      tensor_StrideOutput,
      tensor_ShapeA,
      tensor_ShapeB,
      a_scale_stride,
      b_scale_stride);
@ -1,74 +0,0 @@
#include <ATen/ATen.h>
#include <ATen/core/Tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

namespace at::native {

__global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const float* scale, float* out, int B, int K, int N) {
  // one thread per output element: [B, N]
  int b = blockIdx.y * blockDim.y + threadIdx.y;
  int n = blockIdx.x * blockDim.x + threadIdx.x;

  if (b >= B || n >= N) return;

  float acc = 0.0f;
  for (int k = 0; k < K; ++k) {
    acc += x[b * K + k] * static_cast<float>(w[n * K + k]);
  }

  out[b * N + n] = acc * scale[n];
}

void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8, const Tensor& scale, Tensor& out) {
  const int B = x.size(0);
  const int K = x.size(1);
  const int N = w_int8.size(0);

  const dim3 block(16, 16);
  const dim3 grid((N + block.x - 1) / block.x, (B + block.y - 1) / block.y);

  auto stream = at::cuda::getCurrentCUDAStream();

  weight_int8pack_mm_kernel<<<grid, block, 0, stream>>>(
      x.data_ptr<float>(),
      w_int8.data_ptr<int8_t>(),
      scale.data_ptr<float>(),
      out.data_ptr<float>(),
      B, K, N);
}

// Main GPU entry point
at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int8, const at::Tensor& scale) {
  // --- Check inputs ---
  TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor");
  TORCH_CHECK(w_int8.is_cuda(), "w must be a CUDA tensor");
  TORCH_CHECK(scale.is_cuda(), "scale must be a CUDA tensor");

  TORCH_CHECK(x.dim() == 2, "x must be 2D");
  TORCH_CHECK(w_int8.dim() == 2, "w must be 2D");
  TORCH_CHECK(scale.dim() == 1, "scale must be 1D");

  TORCH_CHECK(x.size(1) == w_int8.size(1), "K dimension mismatch: x.size(1) != w.size(1)");
  TORCH_CHECK(w_int8.size(0) == scale.size(0), "Output dim mismatch: w.size(0) != scale.size(0)");

  // --- Determine shapes ---
  auto B = x.size(0); // batch size
  auto N = w_int8.size(0); // output dim

  // Ensure inputs are in the correct types for the kernel
  auto x_f32 = x.to(at::kFloat);
  auto w_int8_contiguous = w_int8.contiguous();
  auto scale_f32 = scale.to(at::kFloat);

  // --- Allocate output ---
  auto out = at::empty({B, N}, x.options().dtype(at::kFloat));

  // --- Launch kernel ---
  launch_weight_int8pack_mm_cuda_kernel(x_f32, w_int8_contiguous, scale_f32, out);

  return out;
}

} // namespace at::native
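For reference, the computation the deleted kernel performed, as a scalar CPU sketch: out[b][n] = (sum_k x[b][k] * w[n][k]) * scale[n], with w stored row-major as [N, K]. Documentation aid only, not project code:

    #include <cstdint>

    void weight_int8pack_mm_ref(const float* x, const int8_t* w,
                                const float* scale, float* out,
                                int B, int K, int N) {
      for (int b = 0; b < B; ++b) {
        for (int n = 0; n < N; ++n) {
          float acc = 0.0f;
          for (int k = 0; k < K; ++k) {
            acc += x[b * K + k] * static_cast<float>(w[n * K + k]);
          }
          out[b * N + n] = acc * scale[n];
        }
      }
    }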
@ -45,7 +45,7 @@ namespace at::cuda::jit {
// Copied from aten/src/ATen/cuda/llvm_basic.cpp, then modified as above.
// If not compiling for ROCm, return the original get_traits_string().
std::string get_traits_string_but_hiprtc_safe() {
#if defined(USE_ROCM) && HIP_VERSION_MAJOR < 7
#if defined(USE_ROCM) && ROCM_VERSION < 70000
  return R"ESCAPE(
namespace std {
@ -28,22 +28,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
  TORCH_CHECK(false, "cudnn_batch_norm: ATen not compiled with cuDNN support");
}

std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
    const Tensor& input,
    const Tensor& weight,
    const std::optional<Tensor>& bias,
    const std::optional<Tensor>& running_mean,
    const std::optional<Tensor>& running_var,
    bool training,
    double exponential_average_factor,
    double epsilon,
    Tensor& out,
    Tensor& save_mean,
    Tensor& save_var,
    Tensor& reserve) {
  AT_ERROR("cudnn_batch_norm_out: ATen not compiled with cuDNN support");
}

std::tuple<Tensor, Tensor, Tensor> cudnn_batch_norm_backward(
    const Tensor& input,
    const Tensor& grad_output,
@ -136,12 +120,7 @@ size_t _get_cudnn_batch_norm_reserve_space_size(
  return reserve_size;
}

// Param `reserve` is a placeholder, just passing an empty tensor.
// usage:
//   auto reserve = torch::empty({0}, torch::device(torch::kCUDA));
//   at::native::cudnn_batch_norm_out(..., epsilon, output, save_mean, save_var,
//   reserve);
std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
    const Tensor& input_t,
    const Tensor& weight_t,
    const std::optional<Tensor>& bias_t_opt,
@ -149,11 +128,7 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
    const std::optional<Tensor>& running_var_t_opt,
    bool training,
    double exponential_average_factor,
    double epsilon,
    Tensor& output_t,
    Tensor& save_mean,
    Tensor& save_var,
    Tensor& reserve) {
    double epsilon) {
  // See [Note: hacky wrapper removal for optional tensor]
  c10::MaybeOwned<Tensor> bias_t_maybe_owned =
      at::borrow_from_optional_tensor(bias_t_opt);
@ -193,6 +168,9 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
  cudnnBatchNormMode_t mode = getCudnnBatchNormMode(
      training, input->suggest_memory_format(), input->dim());

  auto output_t =
      at::empty_like(*input, input->options(), input->suggest_memory_format());

  TensorArg output{output_t, "output", 0};

  auto handle = getCudnnHandle();
@ -204,8 +182,15 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(

  Constant one(dataType, 1);
  Constant zero(dataType, 0);
  Tensor save_mean, save_var;

  Tensor reserve;

  if (training) {
    int64_t num_features = input_t.size(1);
    save_mean = at::empty({num_features}, weight_t.options());
    save_var = at::empty({num_features}, weight_t.options());

    auto op = CUDNN_BATCHNORM_OPS_BN;
    size_t workspace_size;
    AT_CUDNN_CHECK(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
@ -253,6 +238,9 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
        reserve_size));
  } else {
    reserve = at::empty({0}, input->options().dtype(kByte));
    // This keeps a consistent output with native_batch_norm
    save_mean = at::empty({0}, weight_t.options());
    save_var = at::empty({0}, weight_t.options());
    AT_CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
        handle,
        mode,
@ -273,48 +261,10 @@ std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> cudnn_batch_norm_out(
  // save_mean and save_var can be undefined
  // If this causes problems, we can initialize them to empty tensors
  // of the correct type
  return std::tuple<Tensor&, Tensor&, Tensor&, Tensor&>{
  return std::tuple<Tensor, Tensor, Tensor, Tensor>{
      output_t, save_mean, save_var, reserve};
}

std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
    const Tensor& input_t,
    const Tensor& weight_t,
    const std::optional<Tensor>& bias_t_opt,
    const std::optional<Tensor>& running_mean_t_opt,
    const std::optional<Tensor>& running_var_t_opt,
    bool training,
    double exponential_average_factor,
    double epsilon) {
  auto output_t = at::empty_like(
      input_t, input_t.options(), input_t.suggest_memory_format());
  Tensor save_mean, save_var, reserve;

  if (training) {
    int64_t num_features = input_t.size(1);
    save_mean = at::empty({num_features}, weight_t.options());
    save_var = at::empty({num_features}, weight_t.options());
  } else {
    // This keeps a consistent output with native_batch_norm
    save_mean = at::empty({0}, weight_t.options());
    save_var = at::empty({0}, weight_t.options());
  }

  return cudnn_batch_norm_out(
      input_t,
      weight_t,
      bias_t_opt,
      running_mean_t_opt,
      running_var_t_opt,
      training,
      exponential_average_factor,
      epsilon,
      output_t,
      save_mean,
      save_var,
      reserve);
}

// NB: CuDNN only implements the backward algorithm for batchnorm
// in training mode (evaluation mode batchnorm has a different algorithm),
// which is why this doesn't accept a 'training' parameter.

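The usage note above sketches how the out-variant is meant to be called; an expanded, hedged version (a sketch only, assuming `input`, `weight`, `bias`, `running_mean`, and `running_var` are CUDA tensors already in scope):

// Sketch: caller-allocated outputs for cudnn_batch_norm_out, following the
// usage comment above; `reserve` stays a placeholder empty tensor.
auto output = at::empty_like(input, input.options(), input.suggest_memory_format());
auto save_mean = at::empty({input.size(1)}, weight.options());
auto save_var = at::empty({input.size(1)}, weight.options());
auto reserve = at::empty({0}, input.options().dtype(at::kByte));
at::native::cudnn_batch_norm_out(
    input, weight, bias, running_mean, running_var,
    /*training=*/true, /*exponential_average_factor=*/0.1, /*epsilon=*/1e-5,
    output, save_mean, save_var, reserve);
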
@ -342,8 +342,8 @@ Tensor rms_norm_symint(

  if (weight_opt.has_value() && weight_opt.value().defined() && weight_opt.value().dtype() != input.dtype()) {
    TORCH_WARN_ONCE(
        "Mismatch dtype between input and weight: input dtype = ", input.dtype(),
        ", weight dtype = ", weight_opt.value().dtype(), ", Cannot dispatch to fused implementation."
        "Mismatch dtype between input and module: input dtype = ", input.dtype(),
        ", module dtype = ", weight_opt.value().dtype(), ", Can not dispatch to fused implementation"
    );
    return std::get<0>(rms_norm_composite(input, IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps));
  }

@ -1,6 +1,7 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/Config.h>
#include <ATen/Context.h>
#include <ATen/Dispatch.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/mkldnn/Matmul.h>

@ -427,74 +428,56 @@ static inline bool checksize(const Tensor& mat1, const Tensor& mat2){
  }
}

bool use_mkldnn_bf16_matmul(
template <typename T>
bool use_mkldnn_typed_matmul(
    const Tensor& mat1,
    const Tensor& mat2,
    const Tensor& result) {
  bool dtype_check = false;
  if constexpr (std::is_same_v<T, c10::BFloat16>) {
#if defined(__aarch64__)
  if (mkldnn_bf16_device_check_arm()) {
    // onednn fastmath mode can leverage bf16 HW even for the fp32 input, e.g.
    // Arm Neoverse V1 so, don't restrict the mkldnn_matmul only for bf16
    // inputs, allow it for float as well
    return (
        use_mkldnn_bf16_matmul() &&
        (mat1.scalar_type() == mat2.scalar_type()) &&
        (!result.defined() || (mat1.scalar_type() == result.scalar_type())) &&
        ((mat1.scalar_type() == kFloat) || (mat1.scalar_type() == kBFloat16)) &&
        mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
  } else
    if (mkldnn_bf16_device_check_arm()) {
      // onednn fastmath mode can leverage bf16 HW even for the fp32 input, e.g.
      // Arm Neoverse V1 so, don't restrict the mkldnn_matmul only for bf16
      // inputs, allow it for float as well
      dtype_check = use_mkldnn_bf16_matmul() &&
          ((mat1.scalar_type() == kFloat) || (mat1.scalar_type() == kBFloat16));
    }
#else
    dtype_check = dtype_check && use_mkldnn_bf16_matmul() &&
        (mat1.scalar_type() == kBFloat16);
#endif
  {
    return (
        use_mkldnn_bf16_matmul() && mat1.scalar_type() == kBFloat16 &&
        mat2.scalar_type() == kBFloat16 &&
        (!result.defined() || result.scalar_type() == kBFloat16) &&
        mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
  } else if constexpr (std::is_same_v<T, c10::Half>) {
    dtype_check = dtype_check && use_mkldnn_fp16_matmul() &&
        (mat1.scalar_type() == kHalf);
  } else if constexpr (std::is_same_v<T, float>) {
    dtype_check = dtype_check &&
        (use_mkldnn_bf32_matmul() || use_mkldnn_tf32_matmul()) &&
        (mat1.scalar_type() == kFloat);
  }
}

bool use_mkldnn_fp16_matmul(
    const Tensor& mat1,
    const Tensor& mat2,
    const Tensor& result) {
  return (
      use_mkldnn_fp16_matmul() && mat1.scalar_type() == kHalf &&
      mat2.scalar_type() == kHalf &&
      (!result.defined() || result.scalar_type() == kHalf) &&
      mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
}

bool use_mkldnn_bf32_matmul(
    const Tensor& mat1,
    const Tensor& mat2,
    const Tensor& result) {
  return (
      use_mkldnn_bf32_matmul() && mat1.scalar_type() == kFloat &&
      mat2.scalar_type() == kFloat &&
      (!result.defined() || result.scalar_type() == kFloat) &&
      mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
}

bool use_mkldnn_tf32_matmul(
    const Tensor& mat1,
    const Tensor& mat2,
    const Tensor& result) {
  return (
      use_mkldnn_tf32_matmul() && mat1.scalar_type() == kFloat &&
      mat2.scalar_type() == kFloat &&
      (!result.defined() || result.scalar_type() == kFloat) &&
      mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2));
  if (!dtype_check) {
    return false;
  }
  bool size_check =
      mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2);
  dtype_check = (mat1.scalar_type() == mat2.scalar_type()) &&
      (!result.defined() || result.scalar_type() == mat1.scalar_type());
  return dtype_check && size_check;
}

bool use_mkldnn_matmul(
    const Tensor& mat1,
    const Tensor& mat2,
    const Tensor& result) {
  return (
      use_mkldnn_bf16_matmul(mat1, mat2, result) ||
      use_mkldnn_fp16_matmul(mat1, mat2, result) ||
      use_mkldnn_bf32_matmul(mat1, mat2, result) ||
      use_mkldnn_tf32_matmul(mat1, mat2, result));
  auto mat1_type = mat1.scalar_type();
  if (mat1_type != kBFloat16 && mat1_type != kHalf && mat1_type != kFloat) {
    return false;
  }
  return AT_DISPATCH_FLOATING_TYPES_AND2(
      kBFloat16, kHalf, mat1.scalar_type(), "use_mkldnn_matmul", [&] {
        return use_mkldnn_typed_matmul<scalar_t>(mat1, mat2, result);
      });
}
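The refactor collapses the four per-dtype `use_mkldnn_*_matmul(mat1, mat2, result)` overloads into a single `use_mkldnn_typed_matmul<T>` plus one shared size/dtype-consistency check, with the dispatcher above choosing T from `mat1.scalar_type()`. An illustrative instantiation (a sketch, not from the diff):

// Sketch: what the dispatch above expands to for a half-precision input.
bool ok = use_mkldnn_typed_matmul<c10::Half>(mat1, mat2, result);
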
static void _mkldnn_matmul_i8i8i32_with_primitive(

@ -469,94 +469,4 @@ Tensor _weight_int4pack_mm_xpu(

  return C;
}

Tensor& _int_mm_out_xpu(
    const Tensor& self,
    const Tensor& mat2,
    Tensor& result) {
  TORCH_CHECK(
      self.dim() == 2,
      "Expected self to be of dimension 2 but got ",
      self.dim());
  TORCH_CHECK(
      mat2.dim() == 2,
      "Expected mat2 to be of dimension 2 but got ",
      mat2.dim());
  TORCH_CHECK(
      self.size(1) == mat2.size(0),
      "self.size(1) needs to match mat2.size(0) but got ",
      self.size(1),
      " and ",
      mat2.size(0));

  TORCH_CHECK(
      self.dtype() == at::kChar,
      "Expected self dtype to be of type int8 but got ",
      self.dtype());
  TORCH_CHECK(
      mat2.dtype() == at::kChar,
      "Expected mat2 dtype to be of type int8 but got ",
      mat2.dtype());
  TORCH_CHECK(
      result.dtype() == at::kInt,
      "Expected result dtype to be of type kInt but got ",
      result.dtype());
  TORCH_CHECK(
      result.size(0) == self.size(0),
      "Expected result.size(0) to be ",
      self.size(0),
      " but got ",
      result.size(0));
  TORCH_CHECK(
      result.size(1) == mat2.size(1),
      "Expected result.size(1) to be ",
      mat2.size(1),
      " but got ",
      result.size(1));

  TORCH_CHECK(
      result.dim() == 2,
      "Expected result to be of dimension 2 but got ",
      result.dim());

  TORCH_CHECK(result.is_contiguous(), "Expected result to be contiguous.");

  if (result.numel() == 0 || self.size(1) == 0) {
    return result.zero_();
  }

  Tensor bias = at::Tensor();
  Tensor mat2_scales = at::ones({1}, mat2.options().dtype(at::kFloat));
  Tensor mat2_zero_points = at::Tensor();
  auto post_op_args = torch::List<std::optional<at::Scalar>>();

  at::native::onednn::quantized_matmul(
      self.contiguous(),
      1.0,
      0,
      mat2.contiguous(),
      mat2_scales,
      mat2_zero_points,
      bias,
      result,
      1.0,
      0,
      result.scalar_type(),
      /*other*/ std::nullopt,
      /*other scale*/ 1.0,
      /*other zp*/ 0,
      /*binary post op*/ "none",
      /*binary alpha*/ 1.0,
      /*post_op_name*/ "none",
      post_op_args,
      /*post_op_algorithm*/ "none",
      /*m2_trans*/ true);
  return result;
}

Tensor _int_mm_xpu(const Tensor& self, const Tensor& mat2) {
  Tensor result =
      at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt));
  return _int_mm_out_xpu(self, mat2, result);
}
} // namespace at::native

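A hedged usage sketch for the new XPU path, exercising it through the public `at::_int_mm` op (assumes an XPU build and device; shapes are arbitrary):

// int8 x int8 -> int32 GEMM, routed to onednn::quantized_matmul on XPU.
auto a = at::randint(-128, 128, {64, 32}, at::device(at::kXPU).dtype(at::kChar));
auto b = at::randint(-128, 128, {32, 48}, at::device(at::kXPU).dtype(at::kChar));
auto c = at::_int_mm(a, b);  // [64, 48], dtype kInt
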
@ -953,7 +953,8 @@ class BundledShaderLibary : public MetalShaderLibrary {
    if (C10_UNLIKELY(!library)) {
      auto device = MPSDevice::getInstance()->device();
      NSError* error = nil;
      library = [device newLibraryWithData:getSectionData("metal_basic") error:&error];
      auto section_name = is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS) ? "metal_bfloat" : "metal_basic";
      library = [device newLibraryWithData:getSectionData(section_name) error:&error];
      TORCH_CHECK(library, "Failed to create metal library, error: ", [[error description] UTF8String]);
    }
    return library;

@ -33,15 +33,21 @@ struct shrink_backward_functor {

REGISTER_UNARY_ALPHA_OP(hardshrink, float, float, float);
REGISTER_UNARY_ALPHA_OP(hardshrink, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_ALPHA_OP(hardshrink, bfloat, bfloat, bfloat);
#endif

REGISTER_UNARY_ALPHA_OP(softshrink, float, float, float);
REGISTER_UNARY_ALPHA_OP(softshrink, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_ALPHA_OP(softshrink, bfloat, bfloat, bfloat);
#endif

REGISTER_BINARY_ALPHA_OP(shrink_backward, float, float, float);
REGISTER_BINARY_ALPHA_OP(shrink_backward, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_ALPHA_OP(shrink_backward, bfloat, bfloat, bfloat);
#endif

struct hardsigmoid_functor {
  template <typename T>
@ -61,11 +67,15 @@ struct hardsigmoid_backward_functor {

REGISTER_UNARY_OP(hardsigmoid, float, float);
REGISTER_UNARY_OP(hardsigmoid, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_OP(hardsigmoid, bfloat, bfloat);
#endif

REGISTER_BINARY_OP(hardsigmoid_backward, float, float);
REGISTER_BINARY_OP(hardsigmoid_backward, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_OP(hardsigmoid_backward, bfloat, bfloat);
#endif

struct hardswish_functor {
  template <typename T>
@ -93,11 +103,15 @@ struct hardswish_backward_functor {

REGISTER_UNARY_OP(hardswish, float, float);
REGISTER_UNARY_OP(hardswish, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_OP(hardswish, bfloat, bfloat);
#endif

REGISTER_BINARY_OP(hardswish_backward, float, float);
REGISTER_BINARY_OP(hardswish_backward, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_OP(hardswish_backward, bfloat, bfloat);
#endif

struct leaky_relu_functor {
  template <typename T>
@ -121,8 +135,12 @@ struct leaky_relu_backward_functor {

REGISTER_UNARY_ALPHA_OP(leaky_relu, float, float, float);
REGISTER_UNARY_ALPHA_OP(leaky_relu, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_UNARY_ALPHA_OP(leaky_relu, bfloat, bfloat, bfloat);
#endif

REGISTER_BINARY_ALPHA_OP(leaky_relu_backward, float, float, float);
REGISTER_BINARY_ALPHA_OP(leaky_relu_backward, half, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_ALPHA_OP(leaky_relu_backward, bfloat, bfloat, bfloat);
#endif

@ -113,12 +113,18 @@ kernel void ampUpdateScale(

INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE(float);
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE(bfloat);
#endif

INSTANTIATE_AMP_UPDATE_SCALE(float);
INSTANTIATE_AMP_UPDATE_SCALE(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_AMP_UPDATE_SCALE(bfloat);
#endif

INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE_SINGLE(float);
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE_SINGLE(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_AMP_NONFINITE_CHECK_AND_UNSCALE_SINGLE(bfloat);
#endif

@ -590,7 +590,9 @@ kernel void attention(

INSTANTIATE_SDPA_VECTOR_HEADS(float);
INSTANTIATE_SDPA_VECTOR_HEADS(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_SDPA_VECTOR_HEADS(bfloat);
#endif

#define INSTANTIATE_ATTN(DTYPE, bq, bk, bd, wm, wn) \
  template [[host_name("attention_" #DTYPE "_bq" #bq "_bk" #bk "_bd" #bd \
@ -619,4 +621,6 @@ INSTANTIATE_SDPA_VECTOR_HEADS(bfloat);

INSTANTIATE_ATTN_SHAPES_HELPER(float);
INSTANTIATE_ATTN_SHAPES_HELPER(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_ATTN_SHAPES_HELPER(bfloat);
#endif

@ -209,9 +209,38 @@ struct hermite_polynomial_he_functor {
};

struct nextafter_functor {
#if __METAL_VERSION__ < 310
  template <typename U>
  struct bit_type {};
  template <>
  struct bit_type<float> {
    using type = int;
  };
  template <>
  struct bit_type<half> {
    using type = short;
  };
#endif
  template <typename T>
  inline T operator()(const T a, const T b) {
#if __METAL_VERSION__ >= 310
    return static_cast<T>(::metal::nextafter(a, b));
#else
    using U = typename bit_type<T>::type;
    if (a == b) {
      return a;
    }
    if (::metal::isunordered(a, b)) {
      return NAN;
    }
    if (a == 0) {
      constexpr auto eps = as_type<T>(static_cast<U>(1));
      return b > 0 ? eps : -eps;
    }
    auto bits = as_type<U>(a);
    (a > 0) ^ (a > b) ? bits++ : bits--;
    return as_type<T>(bits);
#endif
  }
};

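The pre-3.1 fallback above steps the float's raw bit pattern by one, which moves the value exactly one ULP toward the target; the direction test `(a > 0) ^ (a > b)` decides between increment and decrement. The same trick in host-side C++20, as a sketch for finite, nonzero, non-NaN inputs:

#include <bit>
#include <cstdint>

// One-ULP step via integer arithmetic on the bit pattern (float case).
float nextafter_sketch(float a, float b) {
  if (a == b) return a;
  auto bits = std::bit_cast<std::int32_t>(a);
  // Moving away from zero grows the magnitude (bits++); toward zero shrinks it.
  ((a > 0) ^ (a > b)) ? ++bits : --bits;
  return std::bit_cast<float>(bits);
}
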
@ -315,6 +344,13 @@ struct fmod_functor {
  }
};

// Some helper defines
#if __METAL_VERSION__ >= 310
#define _METAL_310_PLUS(x) x
#else
#define _METAL_310_PLUS(x)
#endif

#define REGISTER_INTEGER_BINARY_OP(NAME) \
  REGISTER_BINARY_OP(NAME, long, long); \
  REGISTER_BINARY_OP(NAME, int, int); \
@ -334,12 +370,12 @@ struct fmod_functor {
#define REGISTER_FLOAT_BINARY_OP(NAME) \
  REGISTER_BINARY_OP(NAME, float, float); \
  REGISTER_BINARY_OP(NAME, half, half); \
  REGISTER_BINARY_OP(NAME, bfloat, bfloat)
  _METAL_310_PLUS(REGISTER_BINARY_OP(NAME, bfloat, bfloat))

#define REGISTER_OPMATH_FLOAT_BINARY_OP(NAME) \
  REGISTER_OPMATH_BINARY_OP(NAME, float, float); \
  REGISTER_OPMATH_BINARY_OP(NAME, half, half); \
  REGISTER_OPMATH_BINARY_OP(NAME, bfloat, bfloat)
  _METAL_310_PLUS(REGISTER_OPMATH_BINARY_OP(NAME, bfloat, bfloat))

REGISTER_FLOAT_BINARY_OP(copysign);
REGISTER_INT2FLOAT_BINARY_OP(copysign);
@ -411,9 +447,11 @@ REGISTER_BINARY_ALPHA_OP(lerp_alpha, uchar, uchar, uchar);
REGISTER_BINARY_ALPHA_OP(lerp_alpha, char, char, char);
REGISTER_BINARY_ALPHA_OP(lerp_alpha, bool, bool, bool);

#if __METAL_VERSION__ >= 310
REGISTER_BINARY_ALPHA_OP(add_alpha, bfloat, bfloat, bfloat);
REGISTER_BINARY_ALPHA_OP(sub_alpha, bfloat, bfloat, bfloat);
REGISTER_BINARY_ALPHA_OP(lerp_alpha, bfloat, bfloat, bfloat);
#endif

// Complex binary functions
REGISTER_BINARY_OP(polar, float, float2);

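`_METAL_310_PLUS` exists because a preprocessor `#if` cannot appear inside a macro replacement list, so the bfloat instantiation has to collapse into a single expandable token. A hypothetical registration macro (not from the diff) showing the intended shape:

// Illustrative only: conditional bfloat instantiation inside one macro.
#define REGISTER_FLOAT_UNARY_OP(NAME)    \
  REGISTER_UNARY_OP(NAME, float, float); \
  REGISTER_UNARY_OP(NAME, half, half);   \
  _METAL_310_PLUS(REGISTER_UNARY_OP(NAME, bfloat, bfloat))
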
@ -180,8 +180,10 @@ REGISTER_SEARCHSORTED_OP(float, int);
REGISTER_SEARCHSORTED_OP(float, long);
REGISTER_SEARCHSORTED_OP(half, int);
REGISTER_SEARCHSORTED_OP(half, long);
#if __METAL_VERSION__ >= 310
REGISTER_SEARCHSORTED_OP(bfloat, int);
REGISTER_SEARCHSORTED_OP(bfloat, long);
#endif
REGISTER_SEARCHSORTED_OP(char, int);
REGISTER_SEARCHSORTED_OP(char, long);
REGISTER_SEARCHSORTED_OP(uchar, int);

@ -96,4 +96,6 @@ kernel void col2im_kernel(
INSTANTIATE_COL2IM(bool);
INSTANTIATE_COL2IM(float);
INSTANTIATE_COL2IM(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_COL2IM(bfloat);
#endif

@ -20,7 +20,9 @@ REGISTER_CROSS_FUNC(short);
REGISTER_CROSS_FUNC(char);
REGISTER_CROSS_FUNC(uchar);
REGISTER_CROSS_FUNC(bool);
#if __METAL_VERSION__ >= 310
REGISTER_CROSS_FUNC(bfloat);
#endif

template <typename T, typename U>
kernel void cross(
@ -66,4 +68,6 @@ REGISTER_CROSS_OP(short);
REGISTER_CROSS_OP(char);
REGISTER_CROSS_OP(uchar);
REGISTER_CROSS_OP(bool);
#if __METAL_VERSION__ >= 310
REGISTER_CROSS_OP(bfloat);
#endif

@ -1,9 +1,11 @@
#include <metal_stdlib>

using metal::max;
#if __METAL_VERSION__ >= 310
bfloat max(bfloat a, bfloat b) {
  return a > b ? a : b;
}
#endif

#define kmaxThreadGroups 32
#define kmaxTensors 32
@ -304,9 +306,11 @@ REGISTER_ADAM_OPS_QUART(float, float);
REGISTER_ADAM_OPS_QUART(float, half);
REGISTER_ADAM_OPS_QUART(half, float);
REGISTER_ADAM_OPS_QUART(half, half);
#if __METAL_VERSION__ >= 310
REGISTER_ADAM_OPS_QUART(float, bfloat);
REGISTER_ADAM_OPS_QUART(bfloat, bfloat);
REGISTER_ADAM_OPS_QUART(bfloat, float);
#endif

template <typename T>
inline void sgd_momentum_math(
@ -456,5 +460,7 @@ REGISTER_FUSED_SGD_OP(float);
REGISTER_FUSED_SGD_OP(half);
REGISTER_FUSED_SGD_MOMENTUM_OP(float);
REGISTER_FUSED_SGD_MOMENTUM_OP(half);
#if __METAL_VERSION__ >= 310
REGISTER_FUSED_SGD_OP(bfloat);
REGISTER_FUSED_SGD_MOMENTUM_OP(bfloat);
#endif

@ -106,7 +106,9 @@ kernel void polygamma(
    constant int64_t& order [[buffer(2)]], \
    uint id [[thread_position_in_grid]]);

#if __METAL_VERSION__ >= 310
INSTANTIATE_GAMMA_KERNELS(bfloat, bfloat);
#endif
INSTANTIATE_GAMMA_KERNELS(half, half);
INSTANTIATE_GAMMA_KERNELS(float, float);
INSTANTIATE_GAMMA_KERNELS(bool, float);

@ -76,4 +76,6 @@ INSTANTIATE_IM2COL(float);
INSTANTIATE_IM2COL(float2);
INSTANTIATE_IM2COL(half);
INSTANTIATE_IM2COL(half2);
#if __METAL_VERSION__ >= 310
INSTANTIATE_IM2COL(bfloat);
#endif

@ -240,7 +240,9 @@ REGISTER_INDEX_OP(put_accumulate, short, short);
REGISTER_INDEX_OP(put_accumulate, char, char);
REGISTER_INDEX_OP(put_accumulate, uchar, uchar);
REGISTER_INDEX_OP(put_accumulate, bool, bool);
#if __METAL_VERSION__ >= 310
REGISTER_INDEX_OP(put_accumulate, bfloat, bfloat);
#endif

template <typename StridesT, typename DataT>
kernel void kernel_index_offsets(
@ -475,8 +477,10 @@ INSTANTIATE_INDEX_COPY(char, long);
INSTANTIATE_INDEX_COPY(uchar, int);
INSTANTIATE_INDEX_COPY(uchar, long);

#if __METAL_VERSION__ >= 310
INSTANTIATE_INDEX_COPY(bfloat, int);
INSTANTIATE_INDEX_COPY(bfloat, long);
#endif
INSTANTIATE_INDEX_COPY(float2, int);
INSTANTIATE_INDEX_COPY(float2, long);
INSTANTIATE_INDEX_COPY(half2, int);

@ -288,6 +288,7 @@ kernel void layer_norm_looped(
#define instantiate_layer_norm(DTYPE) \
  instantiate_layer_norm_single_row(DTYPE) instantiate_layer_norm_looped(DTYPE)

instantiate_layer_norm(float);
instantiate_layer_norm(half);
instantiate_layer_norm(bfloat);
instantiate_layer_norm(float) instantiate_layer_norm(half)
#if __METAL_VERSION__ >= 310
instantiate_layer_norm(bfloat)
#endif

@ -635,7 +635,9 @@ kernel void applyPivots(

INSTANTIATE_NAIVE_MM(float);
INSTANTIATE_NAIVE_MM(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_NAIVE_MM(bfloat);
#endif

// Integral MM
INSTANTIATE_NAIVE_MM(short);

@ -22,22 +22,6 @@ struct PoolingParams {
  bool return_indices;
};

template <unsigned N = 5, typename idx_type_t = int32_t>
struct AvgPoolingParams {
  int32_t dims;
  int32_t pooling_dims;
  ::c10::metal::array<idx_type_t, N> input_sizes;
  ::c10::metal::array<idx_type_t, N> input_strides;
  ::c10::metal::array<idx_type_t, N> output_sizes;
  ::c10::metal::array<idx_type_t, N> output_strides;
  ::c10::metal::array<idx_type_t, N - 2> kernel_size;
  ::c10::metal::array<idx_type_t, N - 2> stride;
  ::c10::metal::array<idx_type_t, N - 2> padding;
  bool count_include_pad;
  bool has_divisor_override;
  int32_t divisor_override;
};

template <unsigned N = 5, typename idx_type_t = int32_t>
struct PoolingBackwardParams {
  int32_t dims;
@ -48,14 +32,3 @@ struct PoolingBackwardParams {
  ::c10::metal::array<idx_type_t, N> grad_output_strides;
  ::c10::metal::array<idx_type_t, N> indices_strides;
};

template <unsigned N = 5, typename idx_type_t = int32_t>
struct MaxUnpoolingParams {
  int32_t dims;
  int32_t pooling_dims;
  ::c10::metal::array<idx_type_t, N> input_sizes;
  ::c10::metal::array<idx_type_t, N> input_strides;
  ::c10::metal::array<idx_type_t, N> output_sizes;
  ::c10::metal::array<idx_type_t, N> output_strides;
  ::c10::metal::array<idx_type_t, N> indices_strides;
};

@ -168,16 +168,6 @@ PoolOffsets find_pool_offsets(
          leading_dims,
          return_indices,
          tid);
    case 3:
      return find_pool_offsets_dim_specific<3>(
          output_sizes,
          output_strides,
          indices_strides,
          input_strides,
          pooling_dim_indices,
          leading_dims,
          return_indices,
          tid);
  }
  return PoolOffsets();
}
@ -302,359 +292,36 @@ kernel void max_pool_backward(
      pooling_dims);
}

template <typename T>
void max_unpool_impl(
    device T* output,
    T input_element,
    int32_t input_index,
    constant int32_t* output_sizes,
    constant int32_t* output_strides,
    int32_t pooling_dims) {
  int32_t size_prod = 1;
  int32_t pool_offset = 0;

  for (auto dim = pooling_dims - 1; dim >= 0; dim--) {
    auto next_size_prod = output_sizes[dim] * size_prod;
    pool_offset +=
        output_strides[dim] * ((input_index % next_size_prod) / size_prod);
    size_prod *= output_sizes[dim];
  }

  output[pool_offset] = input_element;
}

// Kernel computes one element of the output per kernel call.
template <typename T>
kernel void max_unpool(
    device T* output [[buffer(0)]],
    constant T* input [[buffer(1)]],
    constant int64_t* indices [[buffer(2)]],
    constant MaxUnpoolingParams<5>& params [[buffer(3)]],
    uint tid [[thread_position_in_grid]]) {
  auto pooling_dims = params.pooling_dims;
  auto dims = params.dims;
  auto input_sizes = params.input_sizes.data();
  auto input_strides = params.input_strides.data();
  auto output_sizes = params.output_sizes.data();
  auto output_strides = params.output_strides.data();
  auto indices_strides = params.indices_strides.data();

  auto leading_dims = dims - pooling_dims;

  // NOTE: Since we're doing unpooling, the variable names "input" and "output"
  // are reversed compared to the pooling operations. So in `find_pool_offsets`,
  // we need to map "input" -> "output" and "output" -> "input".
  PoolOffsets offsets = find_pool_offsets(
      /*output_sizes=*/input_sizes,
      /*output_strides=*/input_strides,
      indices_strides,
      /*input_strides=*/output_strides,
      /*pooling_dim_indices=*/nullptr,
      dims,
      leading_dims,
      /*return_indices=*/true,
      tid);

  max_unpool_impl<T>(
      output + offsets.input_leading,
      input[offsets.output],
      indices[offsets.indices],
      output_sizes + leading_dims,
      output_strides + leading_dims,
      pooling_dims);
}

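`max_unpool_impl` turns the flat pooled index stored in `indices` into a strided offset by peeling off one pooling dimension at a time. An equivalent host-side sketch (assuming 3 pooling dims, as in the kernel):

// (i % (size * prod)) / prod == (i / prod) % size: recover each coordinate
// from the flat index, then weight it by that dimension's stride.
int64_t offset_for(int64_t i, const int64_t sizes[3], const int64_t strides[3]) {
  int64_t size_prod = 1, offset = 0;
  for (int dim = 2; dim >= 0; --dim) {
    offset += strides[dim] * ((i / size_prod) % sizes[dim]);
    size_prod *= sizes[dim];
  }
  return offset;
}
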
template <typename T>
struct AvgPoolIterBounds {
  T start;
  T end;
  T count;
};

template <int32_t dim>
AvgPoolIterBounds<int32_t> get_avg_pool_input_iter_bounds(
    constant int32_t* input_sizes,
    thread int32_t (&pooling_dim_indices)[3],
    constant int32_t* kernel_size,
    constant int32_t* stride,
    constant int32_t* padding,
    bool count_include_pad) {
  auto start = stride[dim] * pooling_dim_indices[dim] - padding[dim];
  auto end = start + kernel_size[dim];
  auto end_corrected = min(start + kernel_size[dim], input_sizes[dim]);
  auto start_corrected = (start < 0) ? 0 : start;
  auto count = count_include_pad
      ? (min(end, input_sizes[dim] + padding[dim]) - start)
      : (end_corrected - start_corrected);
  return {start_corrected, end_corrected, count};
}

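A worked instance of these bounds: with kernel_size = 3, stride = 2, padding = 1, an input extent of 5, and pooling index 0, start = 2*0 - 1 = -1 and end = -1 + 3 = 2; clamping gives the iteration range [0, 2). The window therefore counts 2 real elements when count_include_pad = false, or min(2, 5 + 1) - (-1) = 3 positions including the padded one when count_include_pad = true.
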
// Iterates through all the input elements that this kernel needs to
// average over. Specialized for 3 pooling dimensions.
template <typename T>
void avg_pool_3d_input_iter(
    constant T* input,
    device T* output,
    constant int32_t* input_sizes,
    constant int32_t* input_strides,
    thread int32_t (&pooling_dim_indices)[3],
    constant int32_t* kernel_size,
    constant int32_t* stride,
    constant int32_t* padding,
    bool count_include_pad,
    bool has_divisor_override,
    int32_t divisor_override) {
  auto bounds0 = get_avg_pool_input_iter_bounds<0>(
      input_sizes,
      pooling_dim_indices,
      kernel_size,
      stride,
      padding,
      count_include_pad);
  auto bounds1 = get_avg_pool_input_iter_bounds<1>(
      input_sizes,
      pooling_dim_indices,
      kernel_size,
      stride,
      padding,
      count_include_pad);
  auto bounds2 = get_avg_pool_input_iter_bounds<2>(
      input_sizes,
      pooling_dim_indices,
      kernel_size,
      stride,
      padding,
      count_include_pad);

  T value_sum = 0;
  auto divisor = has_divisor_override
      ? divisor_override
      : (bounds0.count) * (bounds1.count) * (bounds2.count);

  for (auto i0 = bounds0.start; i0 < bounds0.end; i0++) {
    auto offset0 = input_strides[0] * i0;

    for (auto i1 = bounds1.start; i1 < bounds1.end; i1++) {
      auto offset1 = input_strides[1] * i1;

      for (auto i2 = bounds2.start; i2 < bounds2.end; i2++) {
        auto offset2 = input_strides[2] * i2;
        auto input_value = input[offset0 + offset1 + offset2];
        value_sum += input_value;
      }
    }
  }
  *output = value_sum / static_cast<T>(divisor);
}

template <typename T>
void avg_pool_backward_3d_input_iter(
    device AtomicType_t<T>* grad_input,
    constant T* grad_output,
    constant int32_t* grad_input_sizes,
    constant int32_t* grad_input_strides,
    int32_t grad_input_leading_offset,
    thread int32_t (&pooling_dim_indices)[3],
    constant int32_t* kernel_size,
    constant int32_t* stride,
    constant int32_t* padding,
    bool count_include_pad,
    bool has_divisor_override,
    int32_t divisor_override) {
  auto bounds0 = get_avg_pool_input_iter_bounds<0>(
      grad_input_sizes,
      pooling_dim_indices,
      kernel_size,
      stride,
      padding,
      count_include_pad);
  auto bounds1 = get_avg_pool_input_iter_bounds<1>(
      grad_input_sizes,
      pooling_dim_indices,
      kernel_size,
      stride,
      padding,
      count_include_pad);
  auto bounds2 = get_avg_pool_input_iter_bounds<2>(
      grad_input_sizes,
      pooling_dim_indices,
      kernel_size,
      stride,
      padding,
      count_include_pad);

  auto divisor = has_divisor_override
      ? divisor_override
      : (bounds0.count) * (bounds1.count) * (bounds2.count);
  auto grad_val = *grad_output / static_cast<T>(divisor);

  for (auto i0 = bounds0.start; i0 < bounds0.end; i0++) {
    auto offset0 = grad_input_strides[0] * i0;

    for (auto i1 = bounds1.start; i1 < bounds1.end; i1++) {
      auto offset1 = grad_input_strides[1] * i1;

      for (auto i2 = bounds2.start; i2 < bounds2.end; i2++) {
        auto offset2 = grad_input_strides[2] * i2;
        auto pool_offset = offset0 + offset1 + offset2;

        AtomicType<T>::atomic_add(
            grad_input, grad_input_leading_offset + pool_offset, grad_val);
      }
    }
  }
}

// Kernel computes one element of the output per kernel call.
template <typename T>
kernel void avg_pool(
    constant T* input [[buffer(0)]],
    device T* output [[buffer(1)]],
    constant AvgPoolingParams<5>& params [[buffer(2)]],
    uint tid [[thread_position_in_grid]]) {
  auto pooling_dims = params.pooling_dims;
  auto dims = params.dims;
  auto input_sizes = params.input_sizes.data();
  auto input_strides = params.input_strides.data();
  auto output_sizes = params.output_sizes.data();
  auto output_strides = params.output_strides.data();
  auto kernel_size = params.kernel_size.data();
  auto stride = params.stride.data();
  auto padding = params.padding.data();
  auto leading_dims = dims - pooling_dims;

  // This buffer keeps track of the pooling dimension indices of this thread's
  // element of the output. We need to fill it with the proper values below.
  int32_t pooling_dim_indices[3];

  PoolOffsets offsets = find_pool_offsets(
      output_sizes,
      output_strides,
      /*indices_strides=*/nullptr,
      input_strides,
      pooling_dim_indices,
      dims,
      leading_dims,
      /*return_indices=*/false,
      tid);

  output += offsets.output;
  input += offsets.input_leading;
  input_sizes += leading_dims;
  input_strides += leading_dims;

  avg_pool_3d_input_iter<T>(
      input,
      output,
      input_sizes,
      input_strides,
      pooling_dim_indices,
      kernel_size,
      stride,
      padding,
      params.count_include_pad,
      params.has_divisor_override,
      params.divisor_override);
}

template <typename T>
kernel void avg_pool_backward(
    device AtomicType_t<T>* grad_input [[buffer(0)]],
    constant T* grad_output [[buffer(1)]],
    constant AvgPoolingParams<5>& params [[buffer(2)]],
    uint tid [[thread_position_in_grid]]) {
  auto pooling_dims = params.pooling_dims;
  auto dims = params.dims;
  auto grad_input_sizes = params.input_sizes.data();
  auto grad_input_strides = params.input_strides.data();
  auto grad_output_sizes = params.output_sizes.data();
  auto grad_output_strides = params.output_strides.data();
  auto kernel_size = params.kernel_size.data();
  auto stride = params.stride.data();
  auto padding = params.padding.data();
  auto leading_dims = dims - pooling_dims;

  // This buffer keeps track of the pooling dimension indices of this thread's
  // element of the output. We need to fill it with the proper values below.
  int32_t pooling_dim_indices[3];

  PoolOffsets offsets = find_pool_offsets(
      grad_output_sizes,
      grad_output_strides,
      /*indices_strides=*/nullptr,
      grad_input_strides,
      pooling_dim_indices,
      dims,
      leading_dims,
      /*return_indices=*/false,
      tid);

  grad_output += offsets.output;
  grad_input_sizes += leading_dims;
  grad_input_strides += leading_dims;

  avg_pool_backward_3d_input_iter<T>(
      grad_input,
      grad_output,
      grad_input_sizes,
      grad_input_strides,
      offsets.input_leading,
      pooling_dim_indices,
      kernel_size,
      stride,
      padding,
      params.count_include_pad,
      params.has_divisor_override,
      params.divisor_override);
}

#define REGISTER_POOL_OP(DTYPE) \
  template [[host_name("max_pool_" #DTYPE)]] kernel void max_pool<DTYPE>( \
      constant DTYPE * input [[buffer(0)]], \
      device DTYPE * output [[buffer(1)]], \
      device int64_t* indices [[buffer(2)]], \
      constant PoolingParams<5>& params [[buffer(3)]], \
      uint tid [[thread_position_in_grid]]); \
  \
  template [[host_name("max_unpool_" #DTYPE)]] kernel void max_unpool<DTYPE>( \
      device DTYPE * output [[buffer(0)]], \
      constant DTYPE * input [[buffer(1)]], \
      constant int64_t* indices [[buffer(2)]], \
      constant MaxUnpoolingParams<5>& params [[buffer(3)]], \
      uint tid [[thread_position_in_grid]]); \
  \
  template [[host_name("avg_pool_" #DTYPE)]] kernel void avg_pool<DTYPE>( \
      constant DTYPE * input [[buffer(0)]], \
      device DTYPE * output [[buffer(1)]], \
      constant AvgPoolingParams<5> & params [[buffer(2)]], \
#define REGISTER_MAX_POOL_OP(DTYPE) \
  template [[host_name("max_pool_" #DTYPE)]] kernel void max_pool<DTYPE>( \
      constant DTYPE * input [[buffer(0)]], \
      device DTYPE * output [[buffer(1)]], \
      device int64_t* indices [[buffer(2)]], \
      constant PoolingParams<5>& params [[buffer(3)]], \
      uint tid [[thread_position_in_grid]]);

#define REGISTER_POOL_BACKWARD_OP(DTYPE) \
#define REGISTER_MAX_POOL_BACKWARD_OP(DTYPE) \
  template [[host_name("max_pool_backward_" #DTYPE)]] \
  kernel void max_pool_backward<DTYPE>( \
      device AtomicType_t<DTYPE> * grad_input [[buffer(0)]], \
      constant DTYPE * grad_output_ [[buffer(1)]], \
      constant int64_t* grad_indices_ [[buffer(2)]], \
      constant PoolingBackwardParams<5>& params [[buffer(3)]], \
      uint tid [[thread_position_in_grid]]); \
  \
  template [[host_name("avg_pool_backward_" #DTYPE)]] \
  kernel void avg_pool_backward<DTYPE>( \
      device AtomicType_t<DTYPE> * grad_input [[buffer(0)]], \
      constant DTYPE * grad_output [[buffer(1)]], \
      constant AvgPoolingParams<5> & params [[buffer(2)]], \
      uint tid [[thread_position_in_grid]]);

REGISTER_POOL_OP(float);
REGISTER_POOL_OP(half);
REGISTER_POOL_OP(bfloat);
REGISTER_POOL_OP(int);
REGISTER_POOL_OP(long);
REGISTER_POOL_OP(short);
REGISTER_POOL_OP(char);
REGISTER_POOL_OP(uchar);
REGISTER_POOL_OP(bool);
REGISTER_MAX_POOL_OP(float);
REGISTER_MAX_POOL_OP(half);
REGISTER_MAX_POOL_OP(int);
REGISTER_MAX_POOL_OP(long);
REGISTER_MAX_POOL_OP(short);
REGISTER_MAX_POOL_OP(char);
REGISTER_MAX_POOL_OP(uchar);
REGISTER_MAX_POOL_OP(bool);

REGISTER_POOL_BACKWARD_OP(float);
REGISTER_POOL_BACKWARD_OP(half);
REGISTER_POOL_BACKWARD_OP(bfloat);
REGISTER_MAX_POOL_BACKWARD_OP(float);
REGISTER_MAX_POOL_BACKWARD_OP(half);

#if __METAL_VERSION__ >= 310
REGISTER_MAX_POOL_OP(bfloat);
REGISTER_MAX_POOL_BACKWARD_OP(bfloat);
#endif

@ -197,10 +197,12 @@ INSTANTIATE_INT4MV(float, 128);
INSTANTIATE_INT4MV(half, 128);
INSTANTIATE_INT4MV(float, 256);
INSTANTIATE_INT4MV(half, 256);
#if __METAL_VERSION__ >= 310
INSTANTIATE_INT4MV(bfloat, 32);
INSTANTIATE_INT4MV(bfloat, 64);
INSTANTIATE_INT4MV(bfloat, 128);
INSTANTIATE_INT4MV(bfloat, 256);
#endif

// ------------------------------ int8 MM For M >= 12 ------------------------------------
/**
@ -232,10 +234,12 @@ template <> struct BlockType<half> {
  using simdgroup_type8x8 = simdgroup_half8x8;
  using type4 = half4;
};
#if __METAL_VERSION__ >= 310
template <> struct BlockType<bfloat> {
  using simdgroup_type8x8 = simdgroup_bfloat8x8;
  using type4 = bfloat4;
};
#endif

template<typename T>
float2 get_scale_zero_q8(constant T * scalesAndZeros, uint2 index) {
@ -486,7 +490,9 @@ kernel void kernel_mul_mm<DTYPE, WDTYPE, DEQUANT_FUNC>( \

INSTANTIATE_MM(float, char, get_scale_zero_q8);
INSTANTIATE_MM(half, char, get_scale_zero_q8);
#if __METAL_VERSION__ >= 310
INSTANTIATE_MM(bfloat, char, get_scale_zero_q8);
#endif
// ------------------------------ int8 MM For M < 12 ------------------------------------
/* Matrix vector multiplication, used for small M size for matrix multiplication as well.

@ -640,4 +646,6 @@ kernel void kernel_mul_mv<DTYPE>(

INSTANTIATE_MV(float);
INSTANTIATE_MV(half);
#if __METAL_VERSION__ >= 310
INSTANTIATE_MV(bfloat);
#endif

@ -192,4 +192,6 @@ template <typename T>

instantiate_rms(float)
instantiate_rms(half)
#if __METAL_VERSION__ >= 310
instantiate_rms(bfloat)
#endif // clang-format on

@ -23,4 +23,6 @@ kernel void renorm(

REGISTER_RENORM_OP(float);
REGISTER_RENORM_OP(half);
#if __METAL_VERSION__ >= 310
REGISTER_RENORM_OP(bfloat);
#endif
Some files were not shown because too many files have changed in this diff.