Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-21 21:49:24 +08:00)

Compare commits (205 commits): ciflow/ind ... copilot/co
Commit SHA1s:
241b702918 83df2e0610 77fe8234bb 6ece527fc5 ce29d0d796 7231118db3 5d4da26ed0 574c9fc950 80d2ca7566 4a22139eea
cb6e4d7d82 202f83dc4e 9fe3b2afbe d0c24b392c b44fb14906 51348c0219 fdd560afd1 e925dfcc6b f1d882212a 24879f0de9
9e94ec76b8 364624e209 7e150467f7 43d78423ac fcbde24c1c 861cdb887b 3154482072 9fccbdd4f0 7dabfb07cb d0add0be43
11e2084308 9726553653 d82527b32a 5d9b024276 5b2afe4c5d b2953f5643 470e2f61c3 e0fe37fa68 d2c82bafb7 98a488c9aa
5b3ea75895 556fc09a9f ce109b3f79 4d833f859b d7e275d4b4 d5db3aee0d 5641de7b6b cbc08c8993 1a54d3333d 4c1c341fa0
5f21cc786a e86942f422 2cd5fd1588 7d0f872cb3 fb06e49ce8 27a98e6ae9 b10f463b1a 431c13cf61 aead9270f5 9bf5b38c14
aba8c43594 37f3ba274a 585b9dbb5e d795fb225a 7df9aca529 d4a713cd9c 5daef30b26 6dedd34c31 a303d6dda9 7669ac9402
86fd4fc23e 99097b6d89 a214371008 7d87d7052e 1a34ff4e04 fe5ccb1a74 85586d7efc e1d71a6b35 d61a9b88cf 99b32a6750
783da8b8e7 ed74dc054d f33c7e1a43 219fb6aafc 515b5ff539 608a6d4a26 03e5dbb26e 7ee45f7503 e6d9d68598 1a5b7eca7b
8573574b32 e6033f6efb 9272437cde f06e669f6c 69b05913fb d73c283c3a eaeaa08e3a d0c32971b4 d7ffa8b8a2 00afa06800
5d0b22008d ab6014a903 f6daffc54d 66b75693ae 21697feff2 12fa4192c5 23fb7e9f4b 5e480b8ecf 19ba506ca3 003dd13073
c2bd41ac9f ca8bd5dbed 26f3803433 48064acf37 e5a9c247bc 36371b8ec7 7e6721fb0a 901bbcba12 febb603230 568d2f3ae7
b54e466fd0 53f9ae0e50 b42fe389b9 66ea76ec44 e787d532b6 b3f6d49b69 bc1f2108d7 f071f17911 fa1539594b dfc8a1c5dd
7f9b745494 83f9baf413 ffc7552e01 78f5a1ec60 2b71b62045 8c4b528403 066f818eea 14af1dc3da 2395d7d7da 0aa7ebaf03
7a97832585 84d141e910 7c6c5d04fe b509fb9b5d 331b7cc054 815d641599 ffe3cb226a 7ae123d72c 7719cb75bf 712f54d453
f58f301313 5c583e2573 0c14f55de6 8e510e1095 59d30d1b75 3915898c22 3044e1a460 b11593c31b 36871622f1 b4fd47179e
4f400ab520 839f6facdb ca65023b90 132ae8e6dd a20afb6100 47524dcc48 9ffba8a2f9 3681312ce0 7778a58e7c e7091a47da
bcfea48ab7 d2e1dbc8f2 89298ada83 c467e59cb0 bbb902c8dd e6f766c7d7 13b621d87c 01738a3fea a2f34bdd7c a63ab0b8cd
102b7885ff 382d04a51e 1ec0755a7e 058782c6ab 2b4ef6b4d6 3f83e8915e d7e3f493d9 08f09d9543 74acf92648 cbf212e9c7
d18e068fd6 3401665110 8c60f4ae08 c4565c3b94 6918f17114 9b6be53326 7fee6bbf34 6adaa328f4 4a7eed527f d2494cbb2b
5eddbb5e47 c9b2a09530 bf5aeb3148 45b8c0f75c c733072874
@@ -20,7 +20,7 @@ ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
 
 # cmake-3.18.4 from pip
 RUN yum install -y python3-pip && \
-python3 -mpip install cmake==3.18.4 && \
+python3 -m pip install cmake==3.18.4 && \
 ln -s /usr/local/bin/cmake /usr/bin/cmake3
 RUN rm -rf /usr/local/cuda-*
 
@@ -113,6 +113,7 @@ case "$tag" in
 UCX_COMMIT=${_UCX_COMMIT}
 UCC_COMMIT=${_UCC_COMMIT}
 TRITON=yes
+INSTALL_MINGW=yes
 ;;
 pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
 CUDA_VERSION=13.0.0
@@ -361,6 +362,7 @@ docker build \
 --build-arg "OPENBLAS=${OPENBLAS:-}" \
 --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
 --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
+--build-arg "INSTALL_MINGW=${INSTALL_MINGW:-}" \
 -f $(dirname ${DOCKERFILE})/Dockerfile \
 -t "$tmp_tag" \
 "$@" \
@@ -25,7 +25,7 @@ function install_torchbench() {
 python install.py --continue_on_fail
 
 echo "Print all dependencies after TorchBench is installed"
-python -mpip freeze
+python -m pip freeze
 popd
 
 chown -R jenkins torchbench
10  .ci/docker/common/install_mingw.sh  (new file)
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+set -ex
+
+# Install MinGW-w64 for Windows cross-compilation
+apt-get update
+apt-get install -y g++-mingw-w64-x86-64-posix
+
+echo "MinGW-w64 installed successfully"
+x86_64-w64-mingw32-g++ --version
@@ -8,8 +8,8 @@ MKLROOT=/opt/intel
 mkdir -p ${MKLROOT}
 pushd /tmp
 
-python3 -mpip install wheel
-python3 -mpip download -d . mkl-static==${MKL_VERSION}
+python3 -m pip install wheel
+python3 -m pip download -d . mkl-static==${MKL_VERSION}
 python3 -m wheel unpack mkl_static-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
 python3 -m wheel unpack mkl_include-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
 mv mkl_static-${MKL_VERSION}/mkl_static-${MKL_VERSION}.data/data/lib ${MKLROOT}
@@ -20,7 +20,7 @@ pip_install \
 
 pip_install coloredlogs packaging
 pip_install onnxruntime==1.23.0
-pip_install onnxscript==0.5.3
+pip_install onnxscript==0.5.4
 
 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
 # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
@@ -11,5 +11,5 @@ ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
 python -m venv /var/lib/jenkins/ci_env
 source /var/lib/jenkins/ci_env/bin/activate
 
-python -mpip install --upgrade pip
-python -mpip install -r /opt/requirements-ci.txt
+python -m pip install --upgrade pip
+python -m pip install -r /opt/requirements-ci.txt
@@ -14,7 +14,7 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op
 
 # cmake-3.18.4 from pip
 RUN yum install -y python3-pip && \
-python3 -mpip install cmake==3.18.4 && \
+python3 -m pip install cmake==3.18.4 && \
 ln -s /usr/local/bin/cmake /usr/bin/cmake3
 
 FROM base as openssl
@@ -135,7 +135,7 @@ RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
 
 # cmake-3.18.4 from pip; force in case cmake3 already exists
 RUN yum install -y python3-pip && \
-python3 -mpip install cmake==3.18.4 && \
+python3 -m pip install cmake==3.18.4 && \
 ln -sf /usr/local/bin/cmake /usr/bin/cmake3
 
 FROM cpu_final as cuda_final
@@ -157,7 +157,7 @@ ENV ROCM_PATH /opt/rocm
 # cmake-3.28.4 from pip to get enable_language(HIP)
 # and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker
 RUN python3 -m pip install --upgrade pip && \
-python3 -mpip install cmake==3.28.4
+python3 -m pip install cmake==3.28.4
 # replace the libdrm in /opt/amdgpu with custom amdgpu.ids lookup path
 ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
 RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
@@ -174,7 +174,7 @@ FROM cpu_final as xpu_final
 ENV XPU_DRIVER_TYPE ROLLING
 # cmake-3.28.4 from pip
 RUN python3 -m pip install --upgrade pip && \
-python3 -mpip install cmake==3.28.4
+python3 -m pip install cmake==3.28.4
 ADD ./common/install_xpu.sh install_xpu.sh
 ENV XPU_VERSION 2025.2
 RUN bash ./install_xpu.sh && rm install_xpu.sh
@@ -113,7 +113,7 @@ RUN dnf install -y \
 RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
 
 # cmake-3.28.0 from pip for onnxruntime
-RUN python3 -mpip install cmake==3.28.0
+RUN python3 -m pip install cmake==3.28.0
 
 ADD ./common/patch_libstdc.sh patch_libstdc.sh
 RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
@@ -103,6 +103,11 @@ COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
 
+ARG INSTALL_MINGW
+COPY ./common/install_mingw.sh install_mingw.sh
+RUN if [ -n "${INSTALL_MINGW}" ]; then bash ./install_mingw.sh; fi
+RUN rm install_mingw.sh
+
 ARG TRITON
 ARG TRITON_CPU
 
@@ -187,19 +187,22 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
 export USE_CUFILE=0
 else
 DEPS_LIST+=(
-"/usr/local/cuda/lib64/libnvToolsExt.so.1"
 "/usr/local/cuda/lib64/libcublas.so.12"
 "/usr/local/cuda/lib64/libcublasLt.so.12"
 "/usr/local/cuda/lib64/libcudart.so.12"
 "/usr/local/cuda/lib64/libnvrtc.so.12"
 "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12")
 DEPS_SONAME+=(
-"libnvToolsExt.so.1"
 "libcublas.so.12"
 "libcublasLt.so.12"
 "libcudart.so.12"
 "libnvrtc.so.12"
 "libcupti.so.12")
+
+if [[ $CUDA_VERSION != 12.9* ]]; then
+DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")
+DEPS_SONAME+=("libnvToolsExt.so.1")
+fi
 fi
 else
 echo "Using nvidia libs from pypi."
@@ -288,7 +288,7 @@ else
 # or building non-XLA tests.
 if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *xla* && "$BUILD_ENVIRONMENT" != *riscv64* ]]; then
 # Install numpy-2.0.2 for builds which are backward compatible with 1.X
-python -mpip install numpy==2.0.2
+python -m pip install numpy==2.0.2
 
 WERROR=1 python setup.py clean
 
@@ -67,13 +67,13 @@ function pip_install_whl() {
 # Loop through each path and install individually
 for path in "${paths[@]}"; do
 echo "Installing $path"
-python3 -mpip install --no-index --no-deps "$path"
+python3 -m pip install --no-index --no-deps "$path"
 done
 else
 # Loop through each argument and install individually
 for path in "${args[@]}"; do
 echo "Installing $path"
-python3 -mpip install --no-index --no-deps "$path"
+python3 -m pip install --no-index --no-deps "$path"
 done
 fi
 }
@@ -182,7 +182,7 @@ checkout_install_torchbench() {
 pip uninstall -y torchao
 
 echo "Print all dependencies after TorchBench is installed"
-python -mpip freeze
+python -m pip freeze
 }
 
 torchbench_setup_macos() {
@@ -211,7 +211,7 @@ torchbench_setup_macos() {
 }
 
 pip_benchmark_deps() {
-python -mpip install --no-input requests cython scikit-learn six
+python -m pip install --no-input requests cython scikit-learn six
 }
 
 
@@ -485,6 +485,22 @@ test_inductor_aoti() {
 /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
 }
 
+test_inductor_aoti_cross_compile_for_windows() {
+
+TEST_REPORTS_DIR=$(pwd)/test/test-reports
+mkdir -p "$TEST_REPORTS_DIR"
+
+# Set WINDOWS_CUDA_HOME environment variable
+WINDOWS_CUDA_HOME="$(pwd)/win-torch-wheel-extracted"
+export WINDOWS_CUDA_HOME
+
+echo "WINDOWS_CUDA_HOME is set to: $WINDOWS_CUDA_HOME"
+echo "Contents:"
+ls -lah "$(pwd)/win-torch-wheel-extracted/lib/x64/" || true
+
+python test/inductor/test_aoti_cross_compile_windows.py -k compile --package-dir "$TEST_REPORTS_DIR" --win-torch-lib-dir "$(pwd)/win-torch-wheel-extracted/torch/lib"
+}
+
 test_inductor_cpp_wrapper_shard() {
 if [[ -z "$NUM_TEST_SHARDS" ]]; then
 echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
@@ -900,7 +916,7 @@ test_inductor_set_cpu_affinity(){
 export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD"
 export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
 
-if [[ "${TEST_CONFIG}" != *aarch64* ]]; then
+if [[ "$(uname -m)" != "aarch64" ]]; then
 # Use Intel OpenMP for x86
 IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
 export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD"
@@ -914,7 +930,7 @@ test_inductor_set_cpu_affinity(){
 cores=$((cpus / thread_per_core))
 
 # Set number of cores to 16 on aarch64 for performance runs
-if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then
+if [[ "$(uname -m)" == "aarch64" && $cores -gt 16 ]]; then
 cores=16
 fi
 export OMP_NUM_THREADS=$cores
@@ -1418,7 +1434,7 @@ EOF
 # shellcheck source=./common-build.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
 python -m build --wheel --no-isolation -C--build-option=--bdist-dir="base_bdist_tmp" --outdir "base_dist"
-python -mpip install base_dist/*.whl
+python -m pip install base_dist/*.whl
 echo "::endgroup::"
 
 pushd test/forward_backward_compatibility
@@ -1615,6 +1631,7 @@ test_operator_benchmark() {
 TEST_REPORTS_DIR=$(pwd)/test/test-reports
 mkdir -p "$TEST_REPORTS_DIR"
 TEST_DIR=$(pwd)
+ARCH=$(uname -m)
 
 test_inductor_set_cpu_affinity
 
@@ -1629,7 +1646,7 @@ test_operator_benchmark() {
 pip_install pandas
 python check_perf_csv.py \
 --actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \
---expected "expected_ci_operator_benchmark_eager_float32_cpu.csv"
+--expected "${ARCH}_expected_ci_operator_benchmark_eager_float32_cpu.csv"
 }
 
 test_operator_microbenchmark() {
@@ -1666,7 +1683,7 @@ if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then
 python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0
 fi
 python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py
-elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then
+elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]]; then
 test_linux_aarch64
 elif [[ "${TEST_CONFIG}" == *backward* ]]; then
 test_forward_backward_compatibility
@@ -1717,6 +1734,8 @@ elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
 test_inductor_triton_cpu
 elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
 test_inductor_micro_benchmark
+elif [[ "${TEST_CONFIG}" == *aoti_cross_compile_for_windows* ]]; then
+test_inductor_aoti_cross_compile_for_windows
 elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
 install_torchvision
 id=$((SHARD_NUMBER-1))
@@ -173,7 +173,7 @@ esac
 PINNED_PACKAGES=(
 "numpy${NUMPY_PINNED_VERSION}"
 )
-python -mvenv ~/${desired_python}-build
+python -m venv ~/${desired_python}-build
 source ~/${desired_python}-build/bin/activate
 retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
 retry brew install libomp
1  .github/ISSUE_TEMPLATE/ci-sev.md  (vendored)
@@ -8,6 +8,7 @@ assignees: ''
 ---
 
 > NOTE: Remember to label this issue with "`ci: sev`"
+> If you want autorevert to be disabled, keep the ci: disable-autorevert label
 
 <!-- Add the `merge blocking` label to this PR to prevent PRs from being merged while this issue is open -->
 
4  .github/ISSUE_TEMPLATE/disable-autorevert.md  (vendored)
@@ -1,7 +1,7 @@
 ---
-name: DISABLE AUTOREVERT
+name: "D❌\U0001F519 ISABLE AUTOREVERT"
 about: Disables autorevert when open
-title: "❌\U0001F519 [DISABLE AUTOREVERT]"
+title: "[DISABLE AUTOREVERT]"
 labels: 'ci: disable-autorevert'
 assignees: ''
 
@@ -65,7 +65,7 @@ runs:
 cd .ci/lumen_cli
 python3 -m pip install -e .
 )
-MAX_JOBS="$(nproc --ignore=6)"
+MAX_JOBS="$(nproc --ignore=10)"
 export MAX_JOBS
 
 # Split the comma-separated list and build each target
2  .github/ci_commit_pins/audio.txt  (vendored)
@@ -1 +1 @@
-8ad2aa5d354d1bf432339113860185d5a5d1abbd
+1b013f5b5a87a1882eb143c26d79d091150d6a37
2  .github/ci_commit_pins/vision.txt  (vendored)
@@ -1 +1 @@
-f5c6c2ec6490455e86f67b2a25c10390d60a27f7
+faffd5cf673615583da6517275e361cb3dbc77e6
4  .github/pytorch-probot.yml  (vendored)
@@ -3,6 +3,7 @@ ciflow_tracking_issue: 64124
 ciflow_push_tags:
 - ciflow/b200
 - ciflow/b200-symm-mem
+- ciflow/b200-distributed
 - ciflow/binaries
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
@@ -15,7 +16,8 @@ ciflow_push_tags:
 - ciflow/inductor-micro-benchmark
 - ciflow/inductor-micro-benchmark-cpu-x86
 - ciflow/inductor-perf-compare
-- ciflow/inductor-perf-test-nightly-rocm
+- ciflow/inductor-perf-test-nightly-rocm-mi300
+- ciflow/inductor-perf-test-nightly-rocm-mi355
 - ciflow/inductor-perf-test-nightly-x86-zen
 - ciflow/inductor-periodic
 - ciflow/inductor-rocm
12  .github/scripts/generate_binary_build_matrix.py  (vendored)
@@ -241,7 +241,11 @@ def generate_libtorch_matrix(
 arches += CUDA_ARCHES
 arches += ROCM_ARCHES
 elif os == "windows":
-arches += CUDA_ARCHES
+# TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up
+# in 2.10
+windows_cuda_arches = CUDA_ARCHES.copy()
+windows_cuda_arches.remove("12.9")
+arches += windows_cuda_arches
 if libtorch_variants is None:
 libtorch_variants = [
 "shared-with-deps",
@@ -305,7 +309,11 @@ def generate_wheels_matrix(
 if os == "linux":
 arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
 elif os == "windows":
-arches += CUDA_ARCHES + XPU_ARCHES
+# TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up
+# in 2.10
+windows_cuda_arches = CUDA_ARCHES.copy()
+windows_cuda_arches.remove("12.9")
+arches += windows_cuda_arches + XPU_ARCHES
 elif os == "linux-aarch64":
 # Separate new if as the CPU type is different and
 # uses different build/test scripts
6  .github/scripts/prepare_vllm_wheels.sh  (vendored)
@@ -24,7 +24,7 @@ change_wheel_version() {
 local t_version=$4
 
 # Extract the wheel
-${PYTHON_EXECUTABLE} -mwheel unpack $wheel
+${PYTHON_EXECUTABLE} -m wheel unpack $wheel
 
 mv "${package}-${f_version}" "${package}-${t_version}"
 # Change the version from f_version to t_version in the dist-info dir
@@ -47,7 +47,7 @@ change_wheel_version() {
 popd
 
 # Repack the wheel
-${PYTHON_EXECUTABLE} -mwheel pack "${package}-${t_version}"
+${PYTHON_EXECUTABLE} -m wheel pack "${package}-${t_version}"
 
 # Clean up
 rm -rf "${package}-${t_version}"
@@ -85,7 +85,7 @@ repackage_wheel() {
 }
 
 # Require to re-package the wheel
-${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1
+${PYTHON_EXECUTABLE} -m pip install wheel==0.45.1
 
 pushd externals/vllm/wheels
 for package in xformers flashinfer-python vllm; do
2  .github/scripts/trymerge.py  (vendored)
@@ -1092,7 +1092,7 @@ class GitHubPR:
 editor = node["editor"]
 return GitHubComment(
 body_text=node["bodyText"],
-created_at=node["createdAt"] if "createdAt" in node else "",
+created_at=node.get("createdAt", ""),
 author_login=node["author"]["login"],
 author_url=node["author"].get("url", None),
 author_association=node["authorAssociation"],
2  .github/workflows/_linux-build.yml  (vendored)
@@ -37,7 +37,7 @@ on:
 runner:
 required: false
 type: string
-default: "linux.2xlarge"
+default: "linux.c7i.2xlarge"
 description: |
 Label of the runner this job should run on.
 test-matrix:
40  .github/workflows/_linux-test.yml  (vendored)
@@ -224,6 +224,46 @@ jobs:
 continue-on-error: true
 uses: ./.github/actions/download-td-artifacts
 
+- name: Download Windows torch wheel for cross-compilation
+if: matrix.win_torch_wheel_artifact != ''
+uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # v4.2.0
+with:
+name: ${{ matrix.win_torch_wheel_artifact }}
+path: win-torch-wheel
+
+- name: Extract Windows wheel and setup CUDA libraries
+if: matrix.win_torch_wheel_artifact != ''
+shell: bash
+run: |
+set -x
+
+# Find the wheel file
+WHEEL_FILE=$(find win-torch-wheel -name "*.whl" -type f | head -n 1)
+if [ -z "$WHEEL_FILE" ]; then
+echo "Error: No wheel file found in win-torch-wheel directory"
+exit 1
+fi
+echo "Found wheel file: $WHEEL_FILE"
+
+# Unzip the wheel file
+unzip -q "$WHEEL_FILE" -d win-torch-wheel-extracted
+echo "Extracted wheel contents"
+
+# Setup CUDA libraries (cuda.lib and cudart.lib) directory
+mkdir -p win-torch-wheel-extracted/lib/x64
+if [ -f "win-torch-wheel/cuda.lib" ]; then
+mv win-torch-wheel/cuda.lib win-torch-wheel-extracted/lib/x64/
+echo "Moved cuda.lib to win-torch-wheel-extracted/lib/x64/"
+fi
+if [ -f "win-torch-wheel/cudart.lib" ]; then
+mv win-torch-wheel/cudart.lib win-torch-wheel-extracted/lib/x64/
+echo "Moved cudart.lib to win-torch-wheel-extracted/lib/x64/"
+fi
+
+# Verify CUDA libraries are present
+echo "CUDA libraries:"
+ls -la win-torch-wheel-extracted/lib/x64/ || echo "No CUDA libraries found"
+
 - name: Parse ref
 id: parse-ref
 run: .github/scripts/parse_ref.py
4  .github/workflows/_mac-test.yml  (vendored)
@@ -211,7 +211,7 @@ jobs:
 $tool --version
 done
 
-python3 -mpip install --no-index --no-deps dist/*.whl
+python3 -m pip install --no-index --no-deps dist/*.whl
 
 set +e
 pushd "${RUNNER_TEMP}"
@@ -222,7 +222,7 @@ jobs:
 popd
 
 if [ "${RC}" -ne 0 ]; then
-python3 -mpip install --ignore-installed -r "${PIP_REQUIREMENTS_FILE}"
+python3 -m pip install --ignore-installed -r "${PIP_REQUIREMENTS_FILE}"
 fi
 set -e
 
25  .github/workflows/_win-build.yml  (vendored)
@@ -168,6 +168,31 @@ jobs:
 run: |
 .ci/pytorch/win-build.sh
 
+# Collect Windows torch libs and CUDA libs for cross-compilation
+- name: Collect Windows CUDA libs for cross-compilation
+if: steps.build.outcome != 'skipped' && inputs.cuda-version != 'cpu'
+shell: bash
+run: |
+set -ex
+
+# Create directory structure if does not exist
+mkdir -p /c/${{ github.run_id }}/build-results
+
+# Copy CUDA libs
+CUDA_PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${{ inputs.cuda-version }}"
+
+if [ -f "${CUDA_PATH}/lib/x64/cuda.lib" ]; then
+cp "${CUDA_PATH}/lib/x64/cuda.lib" /c/${{ github.run_id }}/build-results/
+fi
+
+if [ -f "${CUDA_PATH}/lib/x64/cudart.lib" ]; then
+cp "${CUDA_PATH}/lib/x64/cudart.lib" /c/${{ github.run_id }}/build-results/
+fi
+
+# List collected files
+echo "Collected CUDA libs:"
+ls -lah /c/${{ github.run_id }}/build-results/*.lib
+
 # Upload to github so that people can click and download artifacts
 - name: Upload artifacts to s3
 if: steps.build.outcome != 'skipped'
2  .github/workflows/_win-test.yml  (vendored)
@@ -204,7 +204,7 @@ jobs:
 run: |
 pushd "${PYTORCH_FINAL_PACKAGE_DIR}"
 # shellcheck disable=SC2046,SC2102
-python3 -mpip install $(echo *.whl)[opt-einsum,optree] optree==0.13.0
+python3 -m pip install $(echo *.whl)[opt-einsum,optree] optree==0.13.0
 popd
 
 .ci/pytorch/win-test.sh
62  .github/workflows/b200-distributed.yml  (vendored, new file)
@@ -0,0 +1,62 @@
+name: CI for distributed tests on B200
+
+on:
+pull_request:
+paths:
+- .github/workflows/b200-distributed.yml
+workflow_dispatch:
+push:
+tags:
+- ciflow/b200-distributed/*
+schedule:
+- cron: 46 8 * * * # about 1:46am PDT
+
+concurrency:
+group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+cancel-in-progress: true
+
+permissions:
+id-token: write
+contents: read
+
+jobs:
+
+get-label-type:
+if: github.repository_owner == 'pytorch'
+name: get-label-type
+uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+with:
+triggering_actor: ${{ github.triggering_actor }}
+issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+curr_branch: ${{ github.head_ref || github.ref_name }}
+curr_ref_type: ${{ github.ref_type }}
+
+linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200:
+name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed-b200
+uses: ./.github/workflows/_linux-build.yml
+needs: get-label-type
+with:
+runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+runner: linux.12xlarge.memory
+build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
+docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+cuda-arch-list: '10.0'
+test-matrix: |
+{ include: [
+{ config: "distributed", shard: 1, num_shards: 2, runner: "linux.dgx.b200.8" },
+{ config: "distributed", shard: 2, num_shards: 2, runner: "linux.dgx.b200.8" },
+]}
+secrets: inherit
+
+linux-jammy-cuda12_8-py3_10-gcc11-test-distributed-b200:
+name: linux-jammy-cuda12.8-py3.10-gcc11-test-b200
+uses: ./.github/workflows/_linux-test.yml
+needs:
+- linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200
+with:
+timeout-minutes: 1200
+build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
+docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.docker-image }}
+test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.test-matrix }}
+aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+secrets: inherit
23  .github/workflows/build-vllm-wheel.yml  (vendored)
@@ -27,9 +27,8 @@ jobs:
 fail-fast: false
 matrix:
 python-version: [ '3.12' ]
-# TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
 platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
-device: [ 'cu128', 'cu129' ]
+device: [ 'cu128', 'cu129', 'cu130' ]
 include:
 - platform: manylinux_2_28_x86_64
 device: cu128
@@ -39,6 +38,10 @@ jobs:
 device: cu129
 manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
 runner: linux.12xlarge.memory
+- platform: manylinux_2_28_x86_64
+device: cu130
+manylinux-image: 'pytorch/manylinux2_28-builder:cuda13.0'
+runner: linux.12xlarge.memory
 - platform: manylinux_2_28_aarch64
 device: cu128
 manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
@@ -47,6 +50,11 @@ jobs:
 device: cu129
 manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
 runner: linux.arm64.r7g.12xlarge.memory
+exclude:
+# TODO (huydhn): Add cu130 aarch64 once PyTorch is on 2.9+ and
+# xformers is update to support 13.0
+- platform: manylinux_2_28_aarch64
+device: cu130
 name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
 runs-on: ${{ matrix.runner }}
 timeout-minutes: 480
@@ -118,13 +126,13 @@ jobs:
 "${MANYLINUX_IMAGE}"
 )
 
-docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \
+docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install \
 --pre torch torchvision torchaudio \
 --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"
 
 # I wonder if there is a command to both download and install the wheels
 # in one go
-docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip download \
+docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip download \
 --pre torch torchvision torchaudio \
 --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"
 
@@ -169,7 +177,12 @@ jobs:
 fail-fast: false
 matrix:
 platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
-device: [ 'cu128', 'cu129' ]
+device: [ 'cu128', 'cu129', 'cu130' ]
+exclude:
+# TODO (huydhn): Add cu130 aarch64 once PyTorch is on 2.9+ and
+# xformers is update to support 13.0
+- platform: manylinux_2_28_aarch64
+device: cu130
 env:
 PLATFORM: ${{ matrix.platform }}
 BUILD_DEVICE: ${{ matrix.device }}
14  .github/workflows/generated-macos-arm64-binary-wheel-nightly.yml  (generated, vendored)
@@ -106,7 +106,7 @@ jobs:
 SMOKE_TEST_PARAMS=""
 
 # shellcheck disable=SC2086
-python -mvenv test_venv
+python -m venv test_venv
 source test_venv/bin/activate
 pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
 
@@ -216,7 +216,7 @@ jobs:
 SMOKE_TEST_PARAMS=""
 
 # shellcheck disable=SC2086
-python -mvenv test_venv
+python -m venv test_venv
 source test_venv/bin/activate
 pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
 
@@ -326,7 +326,7 @@ jobs:
 SMOKE_TEST_PARAMS=""
 
 # shellcheck disable=SC2086
-python -mvenv test_venv
+python -m venv test_venv
 source test_venv/bin/activate
 pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
 
@@ -436,7 +436,7 @@ jobs:
 SMOKE_TEST_PARAMS=""
 
 # shellcheck disable=SC2086
-python -mvenv test_venv
+python -m venv test_venv
 source test_venv/bin/activate
 pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
 
@@ -546,7 +546,7 @@ jobs:
 SMOKE_TEST_PARAMS=""
 
 # shellcheck disable=SC2086
-python -mvenv test_venv
+python -m venv test_venv
 source test_venv/bin/activate
 pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
 
@@ -656,7 +656,7 @@ jobs:
 SMOKE_TEST_PARAMS=""
 
 # shellcheck disable=SC2086
-python -mvenv test_venv
+python -m venv test_venv
 source test_venv/bin/activate
 pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
 
@@ -766,7 +766,7 @@ jobs:
 SMOKE_TEST_PARAMS=""
 
 # shellcheck disable=SC2086
-python -mvenv test_venv
+python -m venv test_venv
 source test_venv/bin/activate
 pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
 
250
.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
generated
vendored
250
.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
generated
vendored
@ -788,256 +788,6 @@ jobs:
|
|||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
uses: ./.github/workflows/_binary-upload.yml
|
uses: ./.github/workflows/_binary-upload.yml
|
||||||
libtorch-cuda12_9-shared-with-deps-debug-build:
|
|
||||||
if: ${{ github.repository_owner == 'pytorch' }}
|
|
||||||
needs: get-label-type
|
|
||||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
|
||||||
timeout-minutes: 360
|
|
||||||
env:
|
|
||||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
|
||||||
PACKAGE_TYPE: libtorch
|
|
||||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
|
||||||
# favor of GPU_ARCH_VERSION
|
|
||||||
DESIRED_CUDA: cu129
|
|
||||||
GPU_ARCH_VERSION: "12.9"
|
|
||||||
GPU_ARCH_TYPE: cuda
|
|
||||||
SKIP_ALL_TESTS: 1
|
|
||||||
LIBTORCH_CONFIG: debug
|
|
||||||
LIBTORCH_VARIANT: shared-with-deps
|
|
||||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
|
||||||
# without this value pip does not get installed for some reason
|
|
||||||
DESIRED_PYTHON: "3.10"
|
|
||||||
steps:
|
|
||||||
# NOTE: These environment variables are put here so that they can be applied on every job equally
|
|
||||||
# They are also here because setting them at a workflow level doesn't give us access to the
|
|
||||||
# runner.temp variable, which we need.
|
|
||||||
- name: Populate binary env
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
|
|
||||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
|
||||||
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
|
|
||||||
- name: Display EC2 information
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
function get_ec2_metadata() {
|
|
||||||
# Pulled from instance metadata endpoint for EC2
|
|
||||||
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
|
|
||||||
category=$1
|
|
||||||
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
|
|
||||||
}
|
|
||||||
echo "ami-id: $(get_ec2_metadata ami-id)"
|
|
||||||
echo "instance-id: $(get_ec2_metadata instance-id)"
|
|
||||||
echo "instance-type: $(get_ec2_metadata instance-type)"
|
|
||||||
echo "system info $(uname -a)"
|
|
||||||
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
|
|
||||||
uses: pytorch/test-infra/.github/actions/setup-ssh@main
|
|
||||||
continue-on-error: true
|
|
||||||
with:
|
|
||||||
github-secret: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
git config --global core.longpaths true
|
|
||||||
git config --global core.symlinks true
|
|
||||||
|
|
||||||
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
|
|
||||||
# the directory on Windows and prevent GHA from checking out as reported
|
|
||||||
# in https://github.com/actions/checkout/issues/1018
|
|
||||||
git config --global core.fsmonitor false
|
|
||||||
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
|
|
||||||
- name: Enable long paths on Windows
|
|
||||||
shell: powershell
|
|
||||||
run: |
|
|
||||||
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
|
|
||||||
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
|
|
||||||
# removed once Windows Defender is removed from the AMI
|
|
||||||
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
|
|
||||||
continue-on-error: true
|
|
||||||
shell: powershell
|
|
||||||
run: |
|
|
||||||
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
|
|
||||||
# Let's both exclude the path and disable Windows Defender completely just to be sure
|
|
||||||
# that it doesn't interfere
|
|
||||||
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
|
|
||||||
- name: Checkout PyTorch
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
with:
|
|
||||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
|
||||||
submodules: recursive
|
|
||||||
path: pytorch
|
|
||||||
show-progress: false
|
|
||||||
- name: Clean PyTorch checkout
|
|
||||||
run: |
|
|
||||||
# Remove any artifacts from the previous checkouts
|
|
||||||
git clean -fxd
|
|
||||||
working-directory: pytorch
|
|
||||||
- name: Populate binary env
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
|
||||||
- name: Build PyTorch binary
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
|
|
||||||
- uses: actions/upload-artifact@v4.4.0
|
|
||||||
if: always()
|
|
||||||
with:
|
|
||||||
name: libtorch-cuda12_9-shared-with-deps-debug
|
|
||||||
retention-days: 14
|
|
||||||
if-no-files-found: error
|
|
||||||
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
|
|
||||||
- name: Wait until all sessions have drained
|
|
||||||
shell: powershell
|
|
||||||
working-directory: pytorch
|
|
||||||
if: always()
|
|
||||||
timeout-minutes: 120
|
|
||||||
run: |
|
|
||||||
.github\scripts\wait_for_ssh_to_drain.ps1
|
|
||||||
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
|
|
||||||
shell: powershell
|
|
||||||
working-directory: pytorch
|
|
||||||
if: always()
|
|
||||||
run: |
|
|
||||||
.github\scripts\kill_active_ssh_sessions.ps1
|
|
||||||
|
|
||||||
libtorch-cuda12_9-shared-with-deps-debug-test: # Testing
|
|
||||||
if: ${{ github.repository_owner == 'pytorch' }}
|
|
||||||
needs:
|
|
||||||
- libtorch-cuda12_9-shared-with-deps-debug-build
|
|
||||||
- get-label-type
|
|
||||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
|
|
||||||
timeout-minutes: 360
|
|
||||||
env:
|
|
||||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
|
||||||
PACKAGE_TYPE: libtorch
|
|
||||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
|
||||||
# favor of GPU_ARCH_VERSION
|
|
||||||
DESIRED_CUDA: cu129
|
|
||||||
GPU_ARCH_VERSION: "12.9"
|
|
||||||
GPU_ARCH_TYPE: cuda
|
|
||||||
SKIP_ALL_TESTS: 1
|
|
||||||
LIBTORCH_CONFIG: debug
|
|
||||||
LIBTORCH_VARIANT: shared-with-deps
|
|
||||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
|
||||||
# without this value pip does not get installed for some reason
|
|
||||||
DESIRED_PYTHON: "3.10"
|
|
||||||
steps:
|
|
||||||
- name: Display EC2 information
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
set -euo pipefail
|
|
||||||
function get_ec2_metadata() {
|
|
||||||
# Pulled from instance metadata endpoint for EC2
|
|
||||||
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
|
|
||||||
category=$1
|
|
||||||
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
|
|
||||||
}
|
|
||||||
echo "ami-id: $(get_ec2_metadata ami-id)"
|
|
||||||
echo "instance-id: $(get_ec2_metadata instance-id)"
|
|
||||||
echo "instance-type: $(get_ec2_metadata instance-type)"
|
|
||||||
echo "system info $(uname -a)"
|
|
||||||
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
|
|
||||||
uses: pytorch/test-infra/.github/actions/setup-ssh@main
|
|
||||||
continue-on-error: true
|
|
||||||
with:
|
|
||||||
github-secret: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
git config --global core.longpaths true
|
|
||||||
git config --global core.symlinks true
|
|
||||||
|
|
||||||
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
|
|
||||||
# the directory on Windows and prevent GHA from checking out as reported
|
|
||||||
# in https://github.com/actions/checkout/issues/1018
|
|
||||||
git config --global core.fsmonitor false
|
|
||||||
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
|
|
||||||
- name: Enable long paths on Windows
|
|
||||||
shell: powershell
|
|
||||||
run: |
|
|
||||||
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
|
|
||||||
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
|
|
||||||
# removed once Windows Defender is removed from the AMI
|
|
||||||
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
|
|
||||||
continue-on-error: true
|
|
||||||
shell: powershell
|
|
||||||
run: |
|
|
||||||
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
|
|
||||||
# Let's both exclude the path and disable Windows Defender completely just to be sure
|
|
||||||
# that it doesn't interfere
|
|
||||||
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
|
|
||||||
- name: Checkout PyTorch
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
with:
|
|
||||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
|
||||||
submodules: recursive
|
|
||||||
path: pytorch
|
|
||||||
show-progress: false
|
|
||||||
- name: Clean PyTorch checkout
|
|
||||||
run: |
|
|
||||||
# Remove any artifacts from the previous checkouts
|
|
||||||
git clean -fxd
|
|
||||||
working-directory: pytorch
|
|
||||||
# NOTE: These environment variables are put here so that they can be applied on every job equally
|
|
||||||
# They are also here because setting them at a workflow level doesn't give us access to the
|
|
||||||
# runner.temp variable, which we need.
|
|
||||||
- name: Populate binary env
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
|
|
||||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
|
||||||
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
|
|
||||||
- uses: actions/download-artifact@v4.1.7
|
|
||||||
name: Download Build Artifacts
|
|
||||||
with:
|
|
||||||
name: libtorch-cuda12_9-shared-with-deps-debug
|
|
||||||
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
|
|
||||||
- name: Populate binary env
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
|
||||||
- name: Test PyTorch binary
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
|
|
||||||
- name: Wait until all sessions have drained
|
|
||||||
shell: powershell
|
|
||||||
working-directory: pytorch
|
|
||||||
if: always()
|
|
||||||
timeout-minutes: 120
|
|
||||||
run: |
|
|
||||||
.github\scripts\wait_for_ssh_to_drain.ps1
|
|
||||||
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
|
|
||||||
shell: powershell
|
|
||||||
working-directory: pytorch
|
|
||||||
if: always()
|
|
||||||
run: |
|
|
||||||
.github\scripts\kill_active_ssh_sessions.ps1
|
|
||||||
libtorch-cuda12_9-shared-with-deps-debug-upload: # Uploading
|
|
||||||
if: ${{ github.repository_owner == 'pytorch' }}
|
|
||||||
permissions:
|
|
||||||
id-token: write
|
|
||||||
contents: read
|
|
||||||
needs: libtorch-cuda12_9-shared-with-deps-debug-test
|
|
||||||
with:
|
|
||||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
|
||||||
PACKAGE_TYPE: libtorch
|
|
||||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
|
||||||
# favor of GPU_ARCH_VERSION
|
|
||||||
DESIRED_CUDA: cu129
|
|
||||||
GPU_ARCH_VERSION: "12.9"
|
|
||||||
GPU_ARCH_TYPE: cuda
|
|
||||||
LIBTORCH_CONFIG: debug
|
|
||||||
LIBTORCH_VARIANT: shared-with-deps
|
|
||||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
|
||||||
# without this value pip does not get installed for some reason
|
|
||||||
DESIRED_PYTHON: "3.10"
|
|
||||||
build_name: libtorch-cuda12_9-shared-with-deps-debug
|
|
||||||
secrets:
|
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
uses: ./.github/workflows/_binary-upload.yml
|
|
||||||
libtorch-cuda13_0-shared-with-deps-debug-build:
|
libtorch-cuda13_0-shared-with-deps-debug-build:
|
||||||
if: ${{ github.repository_owner == 'pytorch' }}
|
if: ${{ github.repository_owner == 'pytorch' }}
|
||||||
needs: get-label-type
|
needs: get-label-type
|
||||||
.github/workflows/generated-windows-binary-libtorch-release-nightly.yml (generated, vendored, 250 lines changed)
@@ -788,256 +788,6 @@ jobs:
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda12_9-shared-with-deps-release-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
-    timeout-minutes: 360
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      # favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.10"
-    steps:
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      # They are also here because setting them at a workflow level doesn't give us access to the
-      # runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        continue-on-error: true
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
-        shell: bash
-        run: |
-          git config --global core.longpaths true
-          git config --global core.symlinks true
-
-          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
-          # the directory on Windows and prevent GHA from checking out as reported
-          # in https://github.com/actions/checkout/issues/1018
-          git config --global core.fsmonitor false
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
-      # removed once Windows Defender is removed from the AMI
-      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
-        continue-on-error: true
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
-          # Let's both exclude the path and disable Windows Defender completely just to be sure
-          # that it doesn't interfere
-          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v4.4.0
-        if: always()
-        with:
-          name: libtorch-cuda12_9-shared-with-deps-release
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-
-  libtorch-cuda12_9-shared-with-deps-release-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - libtorch-cuda12_9-shared-with-deps-release-build
-      - get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
-    timeout-minutes: 360
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      # favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.10"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        continue-on-error: true
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
-        shell: bash
-        run: |
-          git config --global core.longpaths true
-          git config --global core.symlinks true
-
-          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
-          # the directory on Windows and prevent GHA from checking out as reported
-          # in https://github.com/actions/checkout/issues/1018
-          git config --global core.fsmonitor false
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
-      # removed once Windows Defender is removed from the AMI
-      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
-        continue-on-error: true
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
-          # Let's both exclude the path and disable Windows Defender completely just to be sure
-          # that it doesn't interfere
-          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      # They are also here because setting them at a workflow level doesn't give us access to the
-      # runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v4.1.7
-        name: Download Build Artifacts
-        with:
-          name: libtorch-cuda12_9-shared-with-deps-release
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda12_9-shared-with-deps-release-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: libtorch-cuda12_9-shared-with-deps-release-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      # favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
-      GPU_ARCH_TYPE: cuda
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.10"
-      build_name: libtorch-cuda12_9-shared-with-deps-release
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
  libtorch-cuda13_0-shared-with-deps-release-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: get-label-type
.github/workflows/generated-windows-binary-wheel-nightly.yml (generated, vendored, 1666 lines changed)
File diff suppressed because it is too large
.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml (vendored, new file, 132 lines)
@@ -0,0 +1,132 @@
+name: inductor-perf-nightly-rocm-mi300
+
+on:
+  push:
+    tags:
+      - ciflow/inductor-perf-test-nightly-rocm-mi300/*
+  schedule:
+    - cron: 15 0 * * *
+  # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
+  # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
+  workflow_dispatch:
+    inputs:
+      training:
+        description: Run training (on by default)?
+        required: false
+        type: boolean
+        default: true
+      inference:
+        description: Run inference (on by default)?
+        required: false
+        type: boolean
+        default: true
+      default:
+        description: Run inductor_default?
+        required: false
+        type: boolean
+        default: false
+      dynamic:
+        description: Run inductor_dynamic_shapes?
+        required: false
+        type: boolean
+        default: false
+      cppwrapper:
+        description: Run inductor_cpp_wrapper?
+        required: false
+        type: boolean
+        default: false
+      cudagraphs:
+        description: Run inductor_cudagraphs?
+        required: false
+        type: boolean
+        default: true
+      freezing_cudagraphs:
+        description: Run inductor_cudagraphs with freezing for inference?
+        required: false
+        type: boolean
+        default: false
+      aotinductor:
+        description: Run aot_inductor for inference?
+        required: false
+        type: boolean
+        default: false
+      maxautotune:
+        description: Run inductor_max_autotune?
+        required: false
+        type: boolean
+        default: false
+      benchmark_configs:
+        description: The list of configs used the benchmark
+        required: false
+        type: string
+        default: inductor_huggingface_perf_rocm_mi300,inductor_timm_perf_rocm_mi300,inductor_torchbench_perf_rocm_mi300
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions: read-all
+
+jobs:
+  get-label-type:
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+      opt_out_experiments: lf
+
+  linux-jammy-rocm-py3_10-inductor-benchmark-build:
+    if: github.repository_owner == 'pytorch'
+    name: rocm-py3_10-inductor-benchmark-build
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-jammy-rocm-py3_10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
+      test-matrix: |
+        { include: [
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 1, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 2, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 3, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 4, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 5, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 6, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 7, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_torchbench_perf_rocm_mi300", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_torchbench_perf_rocm_mi300", shard: 2, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_torchbench_perf_rocm_mi300", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_torchbench_perf_rocm_mi300", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_torchbench_perf_rocm_mi300", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_torchbench_perf_rocm_mi300", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_torchbench_perf_rocm_mi300", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_torchbench_perf_rocm_mi300", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_torchbench_perf_rocm_mi300", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-rocm-py3_10-inductor-benchmark-test:
+    permissions:
+      id-token: write
+      contents: read
+    name: rocm-py3_10-inductor-benchmark-test
+    uses: ./.github/workflows/_rocm-test.yml
+    needs: linux-jammy-rocm-py3_10-inductor-benchmark-build
+    with:
+      build-environment: linux-jammy-rocm-py3_10
+      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
+      docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.test-matrix }}
+      timeout-minutes: 720
+      # Disable monitor in perf tests for more investigation
+      disable-monitor: true
+      monitor-log-interval: 10
+      monitor-data-collect-interval: 2
+    secrets: inherit
@@ -1,11 +1,11 @@
-name: inductor-perf-nightly-rocm
+name: inductor-perf-nightly-rocm-mi355

on:
  push:
    tags:
-      - ciflow/inductor-perf-test-nightly-rocm/*
+      - ciflow/inductor-perf-test-nightly-rocm-mi355/*
  schedule:
-    - cron: 0 7 * * 0,3
+    - cron: 15 0 * * *
  # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
  # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
  workflow_dispatch:
@@ -59,7 +59,7 @@ on:
        description: The list of configs used the benchmark
        required: false
        type: string
-        default: inductor_huggingface_perf_rocm,inductor_timm_perf_rocm,inductor_torchbench_perf_rocm
+        default: inductor_huggingface_perf_rocm_mi355,inductor_timm_perf_rocm_mi355,inductor_torchbench_perf_rocm_mi355

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@@ -88,23 +88,27 @@ jobs:
      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
      test-matrix: |
        { include: [
-          { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_huggingface_perf_rocm_mi355", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_huggingface_perf_rocm_mi355", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_huggingface_perf_rocm_mi355", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_huggingface_perf_rocm_mi355", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_huggingface_perf_rocm_mi355", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_timm_perf_rocm_mi355", shard: 1, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_timm_perf_rocm_mi355", shard: 2, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_timm_perf_rocm_mi355", shard: 3, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_timm_perf_rocm_mi355", shard: 4, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_timm_perf_rocm_mi355", shard: 5, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_timm_perf_rocm_mi355", shard: 6, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_timm_perf_rocm_mi355", shard: 7, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_torchbench_perf_rocm_mi355", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_torchbench_perf_rocm_mi355", shard: 2, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_torchbench_perf_rocm_mi355", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_torchbench_perf_rocm_mi355", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_torchbench_perf_rocm_mi355", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_torchbench_perf_rocm_mi355", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_torchbench_perf_rocm_mi355", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_torchbench_perf_rocm_mi355", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "inductor_torchbench_perf_rocm_mi355", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
        ]}
    secrets: inherit
.github/workflows/lint.yml (vendored, 4 lines changed)
@@ -118,9 +118,9 @@ jobs:
          CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
          echo "Running all other linters"
          if [ "$CHANGED_FILES" = '*' ]; then
-            ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
+            ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT,PYREFLY --all-files" .github/scripts/lintrunner.sh
          else
-            ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh
+            ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT,PYREFLY ${CHANGED_FILES}" .github/scripts/lintrunner.sh
          fi

  quick-checks:
.github/workflows/operator_benchmark.yml (vendored, 61 lines changed)
@@ -7,9 +7,11 @@ on:
  workflow_dispatch:
    inputs:
      test_mode:
-        required: false
-        type: string
-        default: 'short'
+        type: choice
+        options:
+          - 'short'
+          - 'long'
+          - 'all'
        description: tag filter for operator benchmarks, options from long, short, all
  schedule:
    # Run at 07:00 UTC every Sunday
@@ -28,38 +30,49 @@ permissions:
  contents: read

jobs:
-  opbenchmark-build:
+  x86-opbenchmark-build:
    if: github.repository_owner == 'pytorch'
-    name: opbenchmark-build
+    name: x86-opbenchmark-build
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-jammy-py3.10-gcc11-build
      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      test-matrix: |
        { include: [
-          { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
+          { config: "cpu_operator_benchmark_${{ inputs.test_mode || 'short' }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
        ]}
    secrets: inherit

-  opbenchmark-on-demand-build:
-    if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }}
-    name: opbenchmark-on-demand-build
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-jammy-py3.10-gcc11-build
-      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
-      test-matrix: |
-        { include: [
-          { config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
-        ]}
-    secrets: inherit
-
-  opbenchmark-test:
-    name: opbenchmark-test
+  x86-opbenchmark-test:
+    name: x86-opbenchmark-test
    uses: ./.github/workflows/_linux-test.yml
-    needs: opbenchmark-build
+    needs: x86-opbenchmark-build
    with:
      build-environment: linux-jammy-py3.10-gcc11-build
-      docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
-      test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
+      docker-image: ${{ needs.x86-opbenchmark-build.outputs.docker-image }}
+      test-matrix: ${{ needs.x86-opbenchmark-build.outputs.test-matrix }}
+    secrets: inherit
+
+  aarch64-opbenchmark-build:
+    if: github.repository_owner == 'pytorch'
+    name: aarch64-opbenchmark-build
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-jammy-aarch64-py3.10
+      runner: linux.arm64.m7g.4xlarge
+      docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11
+      test-matrix: |
+        { include: [
+          { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" },
+        ]}
+    secrets: inherit
+
+  aarch64-opbenchmark-test:
+    name: aarch64-opbenchmark-test
+    uses: ./.github/workflows/_linux-test.yml
+    needs: aarch64-opbenchmark-build
+    with:
+      build-environment: linux-jammy-aarch64-py3.10
+      docker-image: ${{ needs.aarch64-opbenchmark-build.outputs.docker-image }}
+      test-matrix: ${{ needs.aarch64-opbenchmark-build.outputs.test-matrix }}
    secrets: inherit
.github/workflows/rocm-mi355.yml (vendored, 12 lines changed)
@@ -45,12 +45,12 @@ jobs:
      sync-tag: rocm-build
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
-          { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
-          { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
-          { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
-          { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
-          { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
+          { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
+          { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" },
        ]}
    secrets: inherit
.github/workflows/trunk.yml (vendored, 17 lines changed)
@@ -200,6 +200,23 @@ jobs:
      cuda-arch-list: '8.0'
    secrets: inherit

+  # Test cross-compiled models with Windows libs extracted from wheel
+  cross-compile-linux-test:
+    name: cross-compile-linux-test
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda12_8-py3_10-gcc11-build
+      - get-label-type
+      - win-vs2022-cuda12_8-py3-build
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }}
+      test-matrix: |
+        { include: [
+          { config: "aoti_cross_compile_for_windows", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", win_torch_wheel_artifact: "win-vs2022-cuda12.8-py3" },
+        ]}
+    secrets: inherit
+
  verify-cachebench-cpu-build:
    name: verify-cachebench-cpu-build
    uses: ./.github/workflows/_linux-build.yml
@@ -209,6 +209,46 @@ command = [
    '@{{PATHSFILE}}'
]

+
+[[linter]]
+code = 'PYREFLY'
+include_patterns = [
+    'torch/**/*.py',
+    'torch/**/*.pyi',
+    'torchgen/**/*.py',
+    'torchgen/**/*.pyi',
+    'functorch/**/*.py',
+    'functorch/**/*.pyi',
+]
+exclude_patterns = []
+command = [
+    'python3',
+    'tools/linter/adapters/pyrefly_linter.py',
+    '--config=pyrefly.toml',
+]
+init_command = [
+    'python3',
+    'tools/linter/adapters/pip_init.py',
+    '--dry-run={{DRYRUN}}',
+    'numpy==2.1.0 ; python_version >= "3.12"',
+    'expecttest==0.3.0',
+    'pyrefly==0.36.2',
+    'sympy==1.13.3',
+    'types-requests==2.27.25',
+    'types-pyyaml==6.0.2',
+    'types-tabulate==0.8.8',
+    'types-protobuf==5.29.1.20250403',
+    'types-setuptools==79.0.0.20250422',
+    'types-jinja2==2.11.9',
+    'types-colorama==0.4.6',
+    'filelock==3.18.0',
+    'junitparser==2.1.1',
+    'rich==14.1.0',
+    'optree==0.17.0',
+    'types-openpyxl==3.1.5.20250919',
+    'types-python-dateutil==2.9.0.20251008'
+]
+
[[linter]]
code = 'CLANGTIDY'
include_patterns = [
@@ -39,7 +39,7 @@ RUN chmod +x ~/miniconda.sh && \
    bash ~/miniconda.sh -b -p /opt/conda && \
    rm ~/miniconda.sh && \
    /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
-    /opt/conda/bin/python -mpip install -r requirements.txt && \
+    /opt/conda/bin/python -m pip install -r requirements.txt && \
    /opt/conda/bin/conda clean -ya

FROM dev-base as submodule-update
@@ -256,6 +256,7 @@ endif()
IF(USE_FBGEMM_GENAI)
  set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/)
  set(FBGEMM_GENAI_SRCS ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)
+
  if(USE_CUDA)
    # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
    # If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
@@ -292,58 +293,64 @@ IF(USE_FBGEMM_GENAI)
      "${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
    )

-    target_include_directories(fbgemm_genai PUBLIC
+    target_include_directories(fbgemm_genai PRIVATE
      ${FBGEMM_THIRD_PARTY}/cutlass/include
      ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
      ${fbgemm_genai_mx8mx8bf16_grouped}
      ${FBGEMM_GENAI_SRCS}/common/include/   # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
      ${FBGEMM_GENAI_SRCS}/include/          # includes fbgemm_gpu/torch_ops.h
    )
-  else()
-    if(USE_ROCM)
-      # Only include the kernels we want to build to avoid increasing binary size.
-      file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
-        "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
-        "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
-      set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
-
-      # Add additional HIPCC compiler flags for performance
-      set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-        -mllvm
-        -amdgpu-coerce-illegal-types=1
-        -mllvm
-        -enable-post-misched=0
-        -mllvm
-        -greedy-reverse-local-assignment=1
-        -fhip-new-launch-api)
-
-      # Only compile for gfx942 for now.
-      # This is rather hacky, I could not figure out a clean solution :(
-      set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
-      string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
-      if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
-        list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
-      endif()
-      set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
-
-      hip_add_library(
-        fbgemm_genai STATIC
-        ${fbgemm_genai_native_rocm_hip}
-        HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
-      set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
-      set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
-      target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
-
-      target_include_directories(fbgemm_genai PUBLIC
-        # FBGEMM version of Composable Kernel is used due to some customizations
-        ${FBGEMM_THIRD_PARTY}/composable_kernel/include
-        ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
-        ${FBGEMM_THIRD_PARTY}/cutlass/include
-        ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
-        ${FBGEMM_GENAI_SRCS}/common/include/   # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
-        ${FBGEMM_GENAI_SRCS}/include/          # includes fbgemm_gpu/torch_ops.h
-      )
+
+    # Add FBGEMM_GENAI include directories for torch_ops.h
+    list(APPEND ATen_CUDA_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
+    list(APPEND ATen_CUDA_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
+  elseif(USE_ROCM)
+    # Only include the kernels we want to build to avoid increasing binary size.
+    file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
+      "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
+      "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
+    set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
+
+    # Add additional HIPCC compiler flags for performance
+    set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
+      -mllvm
+      -amdgpu-coerce-illegal-types=1
+      -mllvm
+      -enable-post-misched=0
+      -mllvm
+      -greedy-reverse-local-assignment=1
+      -fhip-new-launch-api)
+
+    # Only compile for gfx942 for now.
+    # This is rather hacky, I could not figure out a clean solution :(
+    set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
+    string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
+    if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
+      list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
    endif()
+    set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
+
+    hip_add_library(
+      fbgemm_genai STATIC
+      ${fbgemm_genai_native_rocm_hip}
+      HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
+    set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
+    set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
+
+    target_include_directories(fbgemm_genai PRIVATE
+      # FBGEMM version of Composable Kernel is used due to some customizations
+      ${FBGEMM_THIRD_PARTY}/composable_kernel/include
+      ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
+      ${FBGEMM_THIRD_PARTY}/cutlass/include
+      ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
+      ${FBGEMM_GENAI_SRCS}/common/include/   # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
+      ${FBGEMM_GENAI_SRCS}/include/          # includes fbgemm_gpu/torch_ops.h
+    )
+
+    # Add FBGEMM_GENAI include directories for torch_ops.h
+    list(APPEND ATen_HIP_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
+    list(APPEND ATen_HIP_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
  endif()
endif()

@@ -692,12 +699,6 @@ if(USE_CUDA AND NOT USE_ROCM)
  list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include)
  list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include)

-  # Add FBGEMM_GENAI include directories for torch_ops.h
-  if(USE_FBGEMM_GENAI)
-    list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
-    list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
-  endif()
-
  if($ENV{ATEN_STATIC_CUDA})
    if(CUDA_VERSION VERSION_LESS_EQUAL 12.9)
      list(APPEND ATen_CUDA_DEPENDENCY_LIBS
@@ -389,37 +389,16 @@ void fillVersion<DLManagedTensorVersioned>(
// constructed out of ATen tensor
template <class T>
T* toDLPackImpl(const Tensor& src) {
-  auto view = src;
-
-  // Detect whether there is need to normalize the strides
-  // Background: gh-83069
-  //
-  // However, normalizing strides can come at a high-cost
-  // to slow down toDLPack conversion 3x, so we
-  // only normalize if needed.
-  //
-  // The following code detects whether the src follows
-  // a continuous pattern. If the src follows such pattern (common-case)
-  // then we do not need to normalize the strides.
-  bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1;
-  // less common case, try normalizing the strides
-  if (need_normalize_strides) {
-    // create a new tensor with possibly normalized strides
-    // gh-83069
-    auto shape = src.sizes();
-    view = src.as_strided(shape, {1}, src.storage_offset());
-  }
-
  ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);
-  atDLMTensor->handle = view;
+  atDLMTensor->handle = src;
  atDLMTensor->tensor.manager_ctx = atDLMTensor;
  atDLMTensor->tensor.deleter = &deleter<T>;
-  atDLMTensor->tensor.dl_tensor.data = view.data_ptr();
+  atDLMTensor->tensor.dl_tensor.data = src.data_ptr();
  atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device());
  atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim());
  atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src);
-  atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(view.sizes().data());
-  atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(view.strides().data());
+  atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(src.sizes().data());
+  atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(src.strides().data());
  atDLMTensor->tensor.dl_tensor.byte_offset = 0;
  fillVersion(&atDLMTensor->tensor);

@@ -229,10 +229,10 @@ private:
  }


-  static const uint32_t kPhilox10A = 0x9E3779B9;
-  static const uint32_t kPhilox10B = 0xBB67AE85;
-  static const uint32_t kPhiloxSA = 0xD2511F53;
-  static const uint32_t kPhiloxSB = 0xCD9E8D57;
+  static constexpr uint32_t kPhilox10A = 0x9E3779B9;
+  static constexpr uint32_t kPhilox10B = 0xBB67AE85;
+  static constexpr uint32_t kPhiloxSA = 0xD2511F53;
+  static constexpr uint32_t kPhiloxSB = 0xCD9E8D57;
};

typedef philox_engine Philox4_32;
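For context, a brief hedged sketch (illustrative only, not part of the diff; the struct and names below are invented) of what class-scope constexpr constants buy over plain static const members: they are usable in constant expressions and, since C++17, are implicitly inline, so no out-of-line definition is needed.

#include <cstdint>

struct PhiloxLikeConstants {
  // constexpr members are readable at compile time and need no .cpp definition
  static constexpr uint32_t kA = 0x9E3779B9;
  static constexpr uint32_t kB = 0xBB67AE85;
};

// compile-time use; taking the address would also work without a separate definition
static_assert(PhiloxLikeConstants::kA == 0x9E3779B9, "usable in a constant expression");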
@ -624,7 +624,14 @@ struct TORCH_API IValue final {
|
|||||||
IValue(const c10::SymBool& i) {
|
IValue(const c10::SymBool& i) {
|
||||||
if (auto mi = i.maybe_as_bool()) {
|
if (auto mi = i.maybe_as_bool()) {
|
||||||
tag = Tag::Bool;
|
tag = Tag::Bool;
|
||||||
|
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||||
payload.u.as_int = *mi;
|
payload.u.as_int = *mi;
|
||||||
|
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||||
|
/* due to byteorder if value assigned as_int, as_bool actually is not set correctly */
|
||||||
|
payload.u.as_bool = *mi;
|
||||||
|
#else
|
||||||
|
#error Unexpected or undefined __BYTE_ORDER__
|
||||||
|
#endif
|
||||||
} else {
|
} else {
|
||||||
tag = Tag::SymBool;
|
tag = Tag::SymBool;
|
||||||
payload.u.as_intrusive_ptr = i.toSymNodeImpl().release();
|
payload.u.as_intrusive_ptr = i.toSymNodeImpl().release();
|
||||||
|
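A minimal sketch (illustrative, not part of the diff) of the byte-order issue the comment above describes: when a small value is written through the wide integer member of a union, the byte that an overlapping bool member sees differs between little- and big-endian hosts.

#include <cstdint>
#include <cstdio>

union Payload {
  int64_t as_int;
  bool as_bool;  // overlaps the first byte of as_int's storage
};

int main() {
  Payload p{};
  p.as_int = 1;  // write through the 64-bit member (reading as_bool afterwards is for illustration)
  // Little-endian: the first byte holds 0x01, so as_bool reads as true.
  // Big-endian: the first byte holds 0x00, so as_bool reads as false.
  std::printf("as_bool = %d\n", static_cast<int>(p.as_bool));
  return 0;
}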
@ -8,6 +8,7 @@
|
|||||||
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
|
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
|
||||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||||
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
|
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
|
||||||
|
#include <ATen/cpu/vec/vec128/vec128_int_aarch64.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
||||||
aten/src/ATen/cpu/vec/vec128/vec128_int_aarch64.h (new file, 794 lines)
@@ -0,0 +1,794 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <ATen/cpu/vec/intrinsics.h>
|
||||||
|
#include <ATen/cpu/vec/vec_base.h>
|
||||||
|
#include <c10/macros/Macros.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
|
namespace at::vec {
|
||||||
|
// Note [CPU_CAPABILITY namespace]
|
||||||
|
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
// This header, and all of its subheaders, will be compiled with
|
||||||
|
// different architecture flags for each supported set of vector
|
||||||
|
// intrinsics. So we need to make sure they aren't inadvertently
|
||||||
|
// linked together. We do this by declaring objects in an `inline
|
||||||
|
// namespace` which changes the name mangling, but can still be
|
||||||
|
// accessed as `at::vec`.
|
||||||
|
inline namespace CPU_CAPABILITY {
|
||||||
|
|
||||||
|
#define VEC_INT_NEON_TEMPLATE(vl, bit) \
|
||||||
|
template <> \
|
||||||
|
struct is_vec_specialized_for<int##bit##_t> : std::bool_constant<true> {}; \
|
||||||
|
\
|
||||||
|
template <> \
|
||||||
|
class Vectorized<int##bit##_t> { \
  using neon_type = int##bit##x##vl##_t; \
 \
 private: \
  neon_type values; \
 \
 public: \
  using value_type = int##bit##_t; \
  using size_type = int; \
  static constexpr size_type size() { \
    return vl; \
  } \
  Vectorized() { \
    values = vdupq_n_s##bit(0); \
  } \
  Vectorized(neon_type v) : values(v) {} \
  Vectorized(int##bit##_t val); \
  template < \
      typename... Args, \
      typename = std::enable_if_t<(sizeof...(Args) == size())>> \
  Vectorized(Args... vals) { \
    __at_align__ int##bit##_t buffer[size()] = {vals...}; \
    values = vld1q_s##bit(buffer); \
  } \
  operator neon_type() const { \
    return values; \
  } \
  static Vectorized<int##bit##_t> loadu( \
      const void* ptr, \
      int64_t count = size()); \
  void store(void* ptr, int64_t count = size()) const; \
  template <int64_t mask> \
  static Vectorized<int##bit##_t> blend( \
      const Vectorized<int##bit##_t>& a, \
      const Vectorized<int##bit##_t>& b); \
  static Vectorized<int##bit##_t> blendv( \
      const Vectorized<int##bit##_t>& a, \
      const Vectorized<int##bit##_t>& b, \
      const Vectorized<int##bit##_t>& mask_) { \
    return vbslq_s##bit(vreinterpretq_u##bit##_s##bit(mask_.values), b, a); \
  } \
  template <typename step_t> \
  static Vectorized<int##bit##_t> arange( \
      value_type base = 0, \
      step_t step = static_cast<step_t>(1)); \
  static Vectorized<int##bit##_t> set( \
      const Vectorized<int##bit##_t>& a, \
      const Vectorized<int##bit##_t>& b, \
      int64_t count = size()); \
  const int##bit##_t& operator[](int idx) const = delete; \
  int##bit##_t& operator[](int idx) = delete; \
  Vectorized<int##bit##_t> abs() const { \
    return vabsq_s##bit(values); \
  } \
  Vectorized<int##bit##_t> real() const { \
    return values; \
  } \
  Vectorized<int##bit##_t> imag() const { \
    return vdupq_n_s##bit(0); \
  } \
  Vectorized<int##bit##_t> conj() const { \
    return values; \
  } \
  Vectorized<int##bit##_t> neg() const { \
    return vnegq_s##bit(values); \
  } \
  int##bit##_t reduce_add() const { \
    return vaddvq_s##bit(values); \
  } \
  int##bit##_t reduce_max() const; \
  Vectorized<int##bit##_t> operator==( \
      const Vectorized<int##bit##_t>& other) const { \
    return Vectorized<value_type>( \
        vreinterpretq_s##bit##_u##bit(vceqq_s##bit(values, other.values))); \
  } \
  Vectorized<int##bit##_t> operator!=( \
      const Vectorized<int##bit##_t>& other) const; \
  Vectorized<int##bit##_t> operator<( \
      const Vectorized<int##bit##_t>& other) const { \
    return Vectorized<value_type>( \
        vreinterpretq_s##bit##_u##bit(vcltq_s##bit(values, other.values))); \
  } \
  Vectorized<int##bit##_t> operator<=( \
      const Vectorized<int##bit##_t>& other) const { \
    return Vectorized<value_type>( \
        vreinterpretq_s##bit##_u##bit(vcleq_s##bit(values, other.values))); \
  } \
  Vectorized<int##bit##_t> operator>( \
      const Vectorized<int##bit##_t>& other) const { \
    return Vectorized<value_type>( \
        vreinterpretq_s##bit##_u##bit(vcgtq_s##bit(values, other.values))); \
  } \
  Vectorized<int##bit##_t> operator>=( \
      const Vectorized<int##bit##_t>& other) const { \
    return Vectorized<value_type>( \
        vreinterpretq_s##bit##_u##bit(vcgeq_s##bit(values, other.values))); \
  } \
  Vectorized<int##bit##_t> eq(const Vectorized<int##bit##_t>& other) const; \
  Vectorized<int##bit##_t> ne(const Vectorized<int##bit##_t>& other) const; \
  Vectorized<int##bit##_t> gt(const Vectorized<int##bit##_t>& other) const; \
  Vectorized<int##bit##_t> ge(const Vectorized<int##bit##_t>& other) const; \
  Vectorized<int##bit##_t> lt(const Vectorized<int##bit##_t>& other) const; \
  Vectorized<int##bit##_t> le(const Vectorized<int##bit##_t>& other) const; \
}; \
template <> \
Vectorized<int##bit##_t> inline operator+( \
    const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
  return vaddq_s##bit(a, b); \
} \
template <> \
Vectorized<int##bit##_t> inline operator-( \
    const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
  return vsubq_s##bit(a, b); \
} \
template <> \
Vectorized<int##bit##_t> inline operator&( \
    const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
  return vandq_s##bit(a, b); \
} \
template <> \
Vectorized<int##bit##_t> inline operator|( \
    const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
  return vorrq_s##bit(a, b); \
} \
template <> \
Vectorized<int##bit##_t> inline operator^( \
    const Vectorized<int##bit##_t>& a, const Vectorized<int##bit##_t>& b) { \
  return veorq_s##bit(a, b); \
} \
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::eq( \
    const Vectorized<int##bit##_t>& other) const { \
  return (*this == other) & Vectorized<int##bit##_t>(1); \
} \
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::ne( \
    const Vectorized<int##bit##_t>& other) const { \
  return (*this != other) & Vectorized<int##bit##_t>(1); \
} \
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::gt( \
    const Vectorized<int##bit##_t>& other) const { \
  return (*this > other) & Vectorized<int##bit##_t>(1); \
} \
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::ge( \
    const Vectorized<int##bit##_t>& other) const { \
  return (*this >= other) & Vectorized<int##bit##_t>(1); \
} \
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::lt( \
    const Vectorized<int##bit##_t>& other) const { \
  return (*this < other) & Vectorized<int##bit##_t>(1); \
} \
Vectorized<int##bit##_t> inline Vectorized<int##bit##_t>::le( \
    const Vectorized<int##bit##_t>& other) const { \
  return (*this <= other) & Vectorized<int##bit##_t>(1); \
}

VEC_INT_NEON_TEMPLATE(2, 64)
VEC_INT_NEON_TEMPLATE(4, 32)
VEC_INT_NEON_TEMPLATE(8, 16)
VEC_INT_NEON_TEMPLATE(16, 8)

inline int32_t Vectorized<int32_t>::reduce_max() const {
  return vmaxvq_s32(values);
}

inline int16_t Vectorized<int16_t>::reduce_max() const {
  return vmaxvq_s16(values);
}

inline int8_t Vectorized<int8_t>::reduce_max() const {
  return vmaxvq_s8(values);
}

template <>
Vectorized<int32_t> inline operator*(
    const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
  return vmulq_s32(a, b);
}

template <>
Vectorized<int16_t> inline operator*(
    const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
  return vmulq_s16(a, b);
}

template <>
Vectorized<int8_t> inline operator*(
    const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
  return vmulq_s8(a, b);
}

template <>
inline Vectorized<int64_t> operator~(const Vectorized<int64_t>& a) {
  int64x2_t val = a;
  return ~val;
}

template <>
inline Vectorized<int32_t> operator~(const Vectorized<int32_t>& a) {
  return vmvnq_s32(a);
}

template <>
inline Vectorized<int16_t> operator~(const Vectorized<int16_t>& a) {
  return vmvnq_s16(a);
}

template <>
inline Vectorized<int8_t> operator~(const Vectorized<int8_t>& a) {
  return vmvnq_s8(a);
}

inline Vectorized<int64_t> Vectorized<int64_t>::operator!=(
    const Vectorized<int64_t>& other) const {
  return ~(*this == other);
}

inline Vectorized<int32_t> Vectorized<int32_t>::operator!=(
    const Vectorized<int32_t>& other) const {
  return ~(*this == other);
}

inline Vectorized<int16_t> Vectorized<int16_t>::operator!=(
    const Vectorized<int16_t>& other) const {
  return ~(*this == other);
}

inline Vectorized<int8_t> Vectorized<int8_t>::operator!=(
    const Vectorized<int8_t>& other) const {
  return ~(*this == other);
}

template <>
Vectorized<int32_t> inline minimum(
    const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
  return vminq_s32(a, b);
}

template <>
Vectorized<int16_t> inline minimum(
    const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
  return vminq_s16(a, b);
}

template <>
Vectorized<int8_t> inline minimum(
    const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
  return vminq_s8(a, b);
}

template <>
Vectorized<int32_t> inline maximum(
    const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
  return vmaxq_s32(a, b);
}

template <>
Vectorized<int16_t> inline maximum(
    const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
  return vmaxq_s16(a, b);
}

template <>
Vectorized<int8_t> inline maximum(
    const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
  return vmaxq_s8(a, b);
}

template <int64_t mask>
Vectorized<int64_t> Vectorized<int64_t>::blend(
    const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
  // Build an array of flags: each bit of element is 1 if the corresponding bit
  // in 'mask' is set, 0 otherwise.
  uint64x2_t maskArray = {
      (mask & 1LL) ? 0xFFFFFFFFFFFFFFFF : 0,
      (mask & 2LL) ? 0xFFFFFFFFFFFFFFFF : 0};
  // Use BSL to select elements from b where the mask is 1, else from a
  return vbslq_s64(maskArray, b.values, a.values);
}

template <int64_t mask>
Vectorized<int32_t> Vectorized<int32_t>::blend(
    const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
  // Build an array of flags: each bit of element is 1 if the corresponding bit
  // in 'mask' is set, 0 otherwise.
  uint32x4_t maskArray = {
      (mask & 1LL) ? 0xFFFFFFFF : 0,
      (mask & 2LL) ? 0xFFFFFFFF : 0,
      (mask & 4LL) ? 0xFFFFFFFF : 0,
      (mask & 8LL) ? 0xFFFFFFFF : 0};
  // Use BSL to select elements from b where the mask is 1, else from a
  return vbslq_s32(maskArray, b.values, a.values);
}

template <int64_t mask>
Vectorized<int16_t> Vectorized<int16_t>::blend(
    const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
  // Build an array of flags: each bit of element is 1 if the corresponding bit
  // in 'mask' is set, 0 otherwise.
  uint16x8_t maskArray = {
      (mask & 1LL) ? 0xFFFF : 0,  (mask & 2LL) ? 0xFFFF : 0,
      (mask & 4LL) ? 0xFFFF : 0,  (mask & 8LL) ? 0xFFFF : 0,
      (mask & 16LL) ? 0xFFFF : 0, (mask & 32LL) ? 0xFFFF : 0,
      (mask & 64LL) ? 0xFFFF : 0, (mask & 128LL) ? 0xFFFF : 0};
  // Use BSL to select elements from b where the mask is 1, else from a
  return vbslq_s16(maskArray, b.values, a.values);
}

template <int64_t mask>
Vectorized<int8_t> Vectorized<int8_t>::blend(
    const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
  // Build an array of flags: each bit of element is 1 if the corresponding bit
  // in 'mask' is set, 0 otherwise.
  uint8x16_t maskArray = {
      (mask & 1LL) ? 0xFF : 0,     (mask & 2LL) ? 0xFF : 0,
      (mask & 4LL) ? 0xFF : 0,     (mask & 8LL) ? 0xFF : 0,
      (mask & 16LL) ? 0xFF : 0,    (mask & 32LL) ? 0xFF : 0,
      (mask & 64LL) ? 0xFF : 0,    (mask & 128LL) ? 0xFF : 0,
      (mask & 256LL) ? 0xFF : 0,   (mask & 512LL) ? 0xFF : 0,
      (mask & 1024LL) ? 0xFF : 0,  (mask & 2048LL) ? 0xFF : 0,
      (mask & 4096LL) ? 0xFF : 0,  (mask & 8192LL) ? 0xFF : 0,
      (mask & 16384LL) ? 0xFF : 0, (mask & 32768LL) ? 0xFF : 0};
  // Use BSL to select elements from b where the mask is 1, else from a
  return vbslq_s8(maskArray, b.values, a.values);
}

#define VEC_INT_NEON_OPS(vl, bit) \
  inline Vectorized<int##bit##_t>::Vectorized(int##bit##_t val) { \
    values = vdupq_n_s##bit(val); \
  } \
  inline Vectorized<int##bit##_t> Vectorized<int##bit##_t>::loadu( \
      const void* ptr, int64_t count) { \
    if (count == size()) { \
      return vld1q_s##bit(reinterpret_cast<const int##bit##_t*>(ptr)); \
    } else { \
      __at_align__ int##bit##_t tmp_values[size()]; \
      for (const auto i : c10::irange(size())) { \
        tmp_values[i] = 0; \
      } \
      std::memcpy( \
          tmp_values, \
          reinterpret_cast<const int##bit##_t*>(ptr), \
          count * sizeof(int##bit##_t)); \
      return vld1q_s##bit(reinterpret_cast<const int##bit##_t*>(tmp_values)); \
    } \
  } \
  inline void Vectorized<int##bit##_t>::store(void* ptr, int64_t count) \
      const { \
    if (count == size()) { \
      vst1q_s##bit(reinterpret_cast<int##bit##_t*>(ptr), values); \
    } else { \
      int##bit##_t tmp_values[size()]; \
      vst1q_s##bit(reinterpret_cast<int##bit##_t*>(tmp_values), values); \
      std::memcpy(ptr, tmp_values, count * sizeof(int##bit##_t)); \
    } \
  }

VEC_INT_NEON_OPS(2, 64)
VEC_INT_NEON_OPS(4, 32)
VEC_INT_NEON_OPS(8, 16)
VEC_INT_NEON_OPS(16, 8)

template <>
Vectorized<int64_t> inline operator*(
    const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
  int64x2_t x = a;
  int64x2_t y = b;
  return x * y;
}

template <>
Vectorized<int64_t> inline operator/(
    const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
  int64x2_t x = a;
  int64x2_t y = b;
  return x / y;
}

template <>
Vectorized<int32_t> inline operator/(
    const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
  int32x4_t x = a;
  int32x4_t y = b;
  return x / y;
}

inline int64_t Vectorized<int64_t>::reduce_max() const {
  return std::max(values[0], values[1]);
}

template <>
Vectorized<int64_t> inline minimum(
    const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
  int64x2_t x = a;
  int64x2_t y = b;
  return {std::min(x[0], y[0]), std::min(x[1], y[1])};
}

template <>
Vectorized<int64_t> inline maximum(
    const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
  int64x2_t x = a;
  int64x2_t y = b;
  return {std::max(x[0], y[0]), std::max(x[1], y[1])};
}

template <typename step_t>
inline Vectorized<int64_t> Vectorized<int64_t>::arange(
    int64_t base, step_t step) {
  const Vectorized<int64_t> base_vec(base);
  const Vectorized<int64_t> step_vec(step);
  const int64x2_t step_sizes = {0, 1};
  return base_vec.values + step_sizes * step_vec.values;
}

template <typename step_t>
inline Vectorized<int32_t> Vectorized<int32_t>::arange(
    int32_t base, step_t step) {
  const Vectorized<int32_t> base_vec(base);
  const Vectorized<int32_t> step_vec(step);
  const int32x4_t step_sizes = {0, 1, 2, 3};
  return vmlaq_s32(base_vec, step_sizes, step_vec);
}

template <typename step_t>
inline Vectorized<int16_t> Vectorized<int16_t>::arange(
    int16_t base, step_t step) {
  const Vectorized<int16_t> base_vec(base);
  const Vectorized<int16_t> step_vec(step);
  const int16x8_t step_sizes = {0, 1, 2, 3, 4, 5, 6, 7};
  return vmlaq_s16(base_vec, step_sizes, step_vec);
}

template <typename step_t>
inline Vectorized<int8_t> Vectorized<int8_t>::arange(int8_t base, step_t step) {
  const Vectorized<int8_t> base_vec(base);
  const Vectorized<int8_t> step_vec(step);
  const int8x16_t step_sizes = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  return vmlaq_s8(base_vec, step_sizes, step_vec);
}

template <>
Vectorized<int64_t> inline operator>>(
    const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
  int64x2_t x = a;
  int64x2_t y = b;
  uint64x2_t u = vreinterpretq_u64_s64(y);
  uint64x2_t z = {std::min(u[0], (uint64_t)63), std::min(u[1], (uint64_t)63)};
  return x >> vreinterpretq_s64_u64(z);
}

template <>
Vectorized<int32_t> inline operator>>(
    const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
  int32x4_t x = a;
  int32x4_t y = b;
  uint32x4_t bound = vdupq_n_u32(31);
  uint32x4_t z = vminq_u32(vreinterpretq_u32_s32(y), bound);
  return x >> vreinterpretq_s32_u32(z);
}

template <>
Vectorized<int16_t> inline operator>>(
    const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
  int16x8_t x = a;
  int16x8_t y = b;
  uint16x8_t bound = vdupq_n_u16(15);
  uint16x8_t z = vminq_u16(vreinterpretq_u16_s16(y), bound);
  return x >> vreinterpretq_s16_u16(z);
}

template <>
Vectorized<int8_t> inline operator>>(
    const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
  int8x16_t x = a;
  int8x16_t y = b;
  uint8x16_t bound = vdupq_n_u8(7);
  int8x16_t z = vreinterpretq_s8_u8(vminq_u8(vreinterpretq_u8_s8(y), bound));
  return x >> z;
}

template <>
Vectorized<int64_t> inline operator<<(
    const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
  int64x2_t y = b;
  uint64x2_t u = vreinterpretq_u64_s64(y);
  uint64x2_t z = {std::min(u[0], (uint64_t)64), std::min(u[1], (uint64_t)64)};
  return vshlq_s64(a, vreinterpretq_s64_u64(z));
}

template <>
Vectorized<int32_t> inline operator<<(
    const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
  int32x4_t y = b;
  uint32x4_t bound = vdupq_n_u32(32);
  uint32x4_t z = vminq_u32(vreinterpretq_u32_s32(y), bound);
  return vshlq_s32(a, vreinterpretq_s32_u32(z));
}

template <>
Vectorized<int16_t> inline operator<<(
    const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
  int16x8_t y = b;
  uint16x8_t bound = vdupq_n_u16(16);
  uint16x8_t z = vminq_u16(vreinterpretq_u16_s16(y), bound);
  return vshlq_s16(a, vreinterpretq_s16_u16(z));
}

template <>
Vectorized<int8_t> inline operator<<(
    const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
  int8x16_t y = b;
  uint8x16_t bound = vdupq_n_u8(8);
  int8x16_t z = vreinterpretq_s8_u8(vminq_u8(vreinterpretq_u8_s8(y), bound));
  return vshlq_s8(a, z);
}

inline Vectorized<int64_t> Vectorized<int64_t>::set(
    const Vectorized<int64_t>& a,
    const Vectorized<int64_t>& b,
    int64_t count) {
  if (count == 0) {
    return a;
  } else if (count >= 2) {
    return b;
  } else {
    int64x2_t c = {b.values[0], a.values[1]};
    return c;
  }
}

inline Vectorized<int32_t> Vectorized<int32_t>::set(
    const Vectorized<int32_t>& a,
    const Vectorized<int32_t>& b,
    int64_t count) {
  if (count == 0) {
    return a;
  } else if (count >= 4) {
    return b;
  } else {
    // Build an array of flags: each bit of element is 1 if the corresponding
    // bit in 'mask' is set, 0 otherwise.
    uint32x4_t maskArray = {
        (count >= 1LL) ? 0xFFFFFFFF : 0,
        (count >= 2LL) ? 0xFFFFFFFF : 0,
        (count >= 3LL) ? 0xFFFFFFFF : 0,
        0};
    // Use BSL to select elements from b where the mask is 1, else from a
    return vbslq_s32(maskArray, b.values, a.values);
  }
}

inline Vectorized<int16_t> Vectorized<int16_t>::set(
    const Vectorized<int16_t>& a,
    const Vectorized<int16_t>& b,
    int64_t count) {
  if (count == 0) {
    return a;
  } else if (count >= 8) {
    return b;
  } else {
    // Build an array of flags: each bit of element is 1 if the corresponding
    // bit in 'mask' is set, 0 otherwise.
    uint16x8_t maskArray = {
        static_cast<uint16_t>((count >= 1LL) ? 0xFFFF : 0),
        static_cast<uint16_t>((count >= 2LL) ? 0xFFFF : 0),
        static_cast<uint16_t>((count >= 3LL) ? 0xFFFF : 0),
        static_cast<uint16_t>((count >= 4LL) ? 0xFFFF : 0),
        static_cast<uint16_t>((count >= 5LL) ? 0xFFFF : 0),
        static_cast<uint16_t>((count >= 6LL) ? 0xFFFF : 0),
        static_cast<uint16_t>((count >= 7LL) ? 0xFFFF : 0),
        0};
    // Use BSL to select elements from b where the mask is 1, else from a
    return vbslq_s16(maskArray, b.values, a.values);
  }
}

inline Vectorized<int8_t> Vectorized<int8_t>::set(
    const Vectorized<int8_t>& a,
    const Vectorized<int8_t>& b,
    int64_t count) {
  if (count == 0) {
    return a;
  } else if (count >= 16) {
    return b;
  } else {
    // Build an array of flags: each bit of element is 1 if the corresponding
    // bit in 'mask' is set, 0 otherwise.
    uint8x16_t maskArray = {
        static_cast<uint8_t>((count >= 1LL) ? 0xFF : 0),
        static_cast<uint8_t>((count >= 2LL) ? 0xFF : 0),
        static_cast<uint8_t>((count >= 3LL) ? 0xFF : 0),
        static_cast<uint8_t>((count >= 4LL) ? 0xFF : 0),
        static_cast<uint8_t>((count >= 5LL) ? 0xFF : 0),
        static_cast<uint8_t>((count >= 6LL) ? 0xFF : 0),
        static_cast<uint8_t>((count >= 7LL) ? 0xFF : 0),
        static_cast<uint8_t>((count >= 8LL) ? 0xFF : 0),
        static_cast<uint8_t>((count >= 9LL) ? 0xFF : 0),
        static_cast<uint8_t>((count >= 10LL) ? 0xFF : 0),
        static_cast<uint8_t>((count >= 11LL) ? 0xFF : 0),
        static_cast<uint8_t>((count >= 12LL) ? 0xFF : 0),
        static_cast<uint8_t>((count >= 13LL) ? 0xFF : 0),
        static_cast<uint8_t>((count >= 14LL) ? 0xFF : 0),
        static_cast<uint8_t>((count >= 15LL) ? 0xFF : 0),
        0};

    // Use BSL to select elements from b where the mask is 1, else from a
    return vbslq_s8(maskArray, b.values, a.values);
  }
}

template <>
Vectorized<int16_t> inline operator/(
    const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
  Vectorized<int32_t> highBitsA = vmovl_high_s16(a);
  Vectorized<int32_t> highBitsB = vmovl_high_s16(b);
  Vectorized<int32_t> lowBitsA = vmovl_s16(vget_low_s16(a));
  Vectorized<int32_t> lowBitsB = vmovl_s16(vget_low_s16(b));
  int32x4_t highBitsResult = highBitsA / highBitsB;
  int32x4_t lowBitsResult = lowBitsA / lowBitsB;
  return vuzp1q_s16(
      vreinterpretq_s16_s32(lowBitsResult),
      vreinterpretq_s16_s32(highBitsResult));
}

template <>
Vectorized<int8_t> inline operator/(
    const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
  Vectorized<int16_t> highBitsA = vmovl_high_s8(a);
  Vectorized<int16_t> highBitsB = vmovl_high_s8(b);
  Vectorized<int16_t> lowBitsA = vmovl_s8(vget_low_s8(a));
  Vectorized<int16_t> lowBitsB = vmovl_s8(vget_low_s8(b));
  int16x8_t highBitsResult = highBitsA / highBitsB;
  int16x8_t lowBitsResult = lowBitsA / lowBitsB;
  return vuzp1q_s8(
      vreinterpretq_s8_s16(lowBitsResult),
      vreinterpretq_s8_s16(highBitsResult));
}

template <>
Vectorized<int64_t> inline clamp(
    const Vectorized<int64_t>& a,
    const Vectorized<int64_t>& min,
    const Vectorized<int64_t>& max) {
  return minimum(max, maximum(min, a));
}

template <>
Vectorized<int32_t> inline clamp(
    const Vectorized<int32_t>& a,
    const Vectorized<int32_t>& min,
    const Vectorized<int32_t>& max) {
  return minimum(max, maximum(min, a));
}

template <>
Vectorized<int16_t> inline clamp(
    const Vectorized<int16_t>& a,
    const Vectorized<int16_t>& min,
    const Vectorized<int16_t>& max) {
  return minimum(max, maximum(min, a));
}

template <>
Vectorized<int8_t> inline clamp(
    const Vectorized<int8_t>& a,
    const Vectorized<int8_t>& min,
    const Vectorized<int8_t>& max) {
  return minimum(max, maximum(min, a));
}

template <>
Vectorized<int64_t> inline clamp_max(
    const Vectorized<int64_t>& a, const Vectorized<int64_t>& max) {
  return minimum(max, a);
}

template <>
Vectorized<int32_t> inline clamp_max(
    const Vectorized<int32_t>& a, const Vectorized<int32_t>& max) {
  return minimum(max, a);
}

template <>
Vectorized<int16_t> inline clamp_max(
    const Vectorized<int16_t>& a, const Vectorized<int16_t>& max) {
  return minimum(max, a);
}

template <>
Vectorized<int8_t> inline clamp_max(
    const Vectorized<int8_t>& a, const Vectorized<int8_t>& max) {
  return minimum(max, a);
}

template <>
Vectorized<int64_t> inline clamp_min(
    const Vectorized<int64_t>& a, const Vectorized<int64_t>& min) {
  return maximum(min, a);
}

template <>
Vectorized<int32_t> inline clamp_min(
    const Vectorized<int32_t>& a, const Vectorized<int32_t>& min) {
  return maximum(min, a);
}

template <>
Vectorized<int16_t> inline clamp_min(
    const Vectorized<int16_t>& a, const Vectorized<int16_t>& min) {
  return maximum(min, a);
}

template <>
Vectorized<int8_t> inline clamp_min(
    const Vectorized<int8_t>& a, const Vectorized<int8_t>& min) {
  return maximum(min, a);
}

} // namespace CPU_CAPABILITY
} // namespace at::vec
@@ -1377,7 +1377,7 @@ Vectorized<c10::quint8> inline maximum(
 #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
 std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
     at::vec::Vectorized<int8_t> src) {
-  auto s8x8 = vld1_s8(src.operator const int8_t*());
+  auto s8x8 = vget_low_s8(src);
   auto s16x8 = vmovl_s8(s8x8);
 
   auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8));
@@ -1402,7 +1402,7 @@ std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
 
 Vectorized<float> inline convert_int8_half_register_to_float(
     at::vec::Vectorized<int8_t> src) {
-  auto s8x8 = vld1_s8(src.operator const int8_t*());
+  auto s8x8 = vget_low_s8(src);
   auto s16x8 = vmovl_s8(s8x8);
 
   auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));
@@ -16,6 +16,8 @@
 #include <c10/util/irange.h>
 #include <c10/core/ScalarType.h>
 
+#include <ATen/cuda/detail/BLASConstants.h>
+
 #ifdef USE_ROCM
 #include <c10/cuda/CUDAStream.h>
 #include <hipblaslt/hipblaslt-ext.hpp>
@@ -1954,13 +1956,15 @@ void scaled_gemm(
     const void *result_scale_ptr,
     int64_t result_ld,
     ScalarType result_dtype,
-    bool use_fast_accum) {
+    bool use_fast_accum,
+    const std::optional<Tensor>& alpha) {
   // Note: see `cublasCommonArgs` for various non-intuitive manupulations
   // of input arguments to this function.
   const auto computeType = CUBLAS_COMPUTE_32F;
   const auto scaleType = CUDA_R_32F;
-  const float alpha_val = 1.0;
-  const float beta_val = 0.0;
+  // Note: alpha_val may change later depending on user-passed argument
+  float alpha_val = 1.0;
+  float beta_val = 0.0;
   CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType);
   computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, _cublasOpFromChar(transa));
   computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
@@ -2031,6 +2035,33 @@ void scaled_gemm(
     computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, CUBLASLT_EPILOGUE_BIAS);
     computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype));
   }
+
+  // Handle user-passed alpha
+  float *alpha_ptr = &alpha_val;
+  float *beta_ptr = &beta_val;
+
+  if (alpha.has_value()) {
+    auto& a = alpha.value();
+
+    // if device-tensor
+    if (a.is_cuda()) {
+      // NOTE: there are lifetime requirements on device-side pointers for alpha/beta -- the value must be
+      // valid & correct until the cublas call finishes (not is scheduled like host-side values). Thus
+      // we need to use allocations for alpha/beta that have some guarantees on lifetime - a statically
+      // managed 4B buffer for alpha that we'll copy the passed alpha value into, and constant memory
+      // for beta respectively.
+      float *user_alpha_ptr = at::cuda::detail::get_user_alpha_ptr();
+      at::Tensor user_alpha = at::from_blob(user_alpha_ptr, {1}, TensorOptions().device(kCUDA).dtype(kFloat));
+      user_alpha.copy_(a);
+      // Tell cublasLt we're using device-side pointers for alpha/beta
+      auto pointer_mode = CUBLASLT_POINTER_MODE_DEVICE;
+      computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_POINTER_MODE, pointer_mode);
+      alpha_ptr = user_alpha.data_ptr<float>();
+      beta_ptr = at::cuda::detail::get_cublas_device_zero();
+    } else {
+      alpha_val = a.item<float>();
+    }
+  }
   // For other data types, use the get_scale_mode function based on scaling type
   // The SCALE_MODE attrs only exist in cuBLAS 12.8+/ROCm 7.0 or in recent hipblaslt,
   // but we must invoke get_scale_mode anyways to trigger the version checks.
@@ -2048,6 +2079,7 @@ void scaled_gemm(
   cublasLtMatmulHeuristicResult_t heuristicResult = {};
   int returnedResult = 0;
   cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle();
+
   TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
       ltHandle,
       computeDesc.descriptor(),
@@ -2088,10 +2120,10 @@ void scaled_gemm(
       auto is_valid_status = hipblaslt_ext::matmulIsAlgoSupported(
           ltHandle,
           computeDesc.descriptor(),
-          &alpha_val,
+          alpha_ptr,
           Adesc.descriptor(),
           Bdesc.descriptor(),
-          &beta_val,
+          beta_ptr,
           Cdesc.descriptor(),
           Ddesc.descriptor(),
           all_algos[i].algo,
@@ -2110,17 +2142,14 @@ void scaled_gemm(
   cublasStatus_t cublasStatus = cublasLtMatmul(
       ltHandle,
       computeDesc.descriptor(),
-      &alpha_val,
+      alpha_ptr,
       mat1_ptr,
       Adesc.descriptor(),
       mat2_ptr,
       Bdesc.descriptor(),
-      &beta_val,
-#ifdef USE_ROCM
-      result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr
-#else
-      nullptr,
-#endif // ifdef USE_ROCM
+      beta_ptr,
+      // NOTE: always use result_ptr here, because cuBLASLt w/device beta=0 can't handle nullptr either
+      result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr
       Cdesc.descriptor(),
       result_ptr,
       Ddesc.descriptor(),
@@ -161,7 +161,8 @@ void scaled_gemm(
     const void* result_scale_ptr,
     int64_t result_ld,
     ScalarType result_dtype,
-    bool use_fast_accum);
+    bool use_fast_accum,
+    const std::optional<Tensor>& alpha);
 
 #define CUDABLAS_BGEMM_ARGTYPES(Dtype) CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype)
 
@@ -325,9 +325,9 @@ uint64_t CUDAGeneratorImpl::seed() {
  */
 c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
   // The RNG state comprises the seed, and an offset used for Philox.
-  static const size_t seed_size = sizeof(uint64_t);
-  static const size_t offset_size = sizeof(int64_t);
-  static const size_t total_size = seed_size + offset_size;
+  constexpr size_t seed_size = sizeof(uint64_t);
+  constexpr size_t offset_size = sizeof(int64_t);
+  constexpr size_t total_size = seed_size + offset_size;
 
   auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
   auto rng_state = state_tensor.data_ptr<uint8_t>();
@@ -346,9 +346,9 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
  * and size of the internal state.
  */
 void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
-  static const size_t seed_size = sizeof(uint64_t);
-  static const size_t offset_size = sizeof(int64_t);
-  static const size_t total_size = seed_size + offset_size;
+  constexpr size_t seed_size = sizeof(uint64_t);
+  constexpr size_t offset_size = sizeof(int64_t);
+  constexpr size_t total_size = seed_size + offset_size;
 
   detail::check_rng_state(new_state);
 
aten/src/ATen/cuda/detail/BLASConstants.cu (new file, +54)
@@ -0,0 +1,54 @@
+#include <ATen/Functions.h>
+#include <ATen/Tensor.h>
+#include <ATen/cuda/Exceptions.h>
+
+#include <mutex>
+
+namespace at {
+namespace cuda {
+namespace detail {
+
+__device__ __constant__ float cublas_one_device;
+__device__ __constant__ float cublas_zero_device;
+
+float *get_cublas_device_one() {
+  static c10::once_flag init_flag;
+
+  c10::call_once(init_flag, []() {
+    const float one = 1.f;
+    AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_one_device, &one, sizeof(float)));
+  });
+
+  float *ptr;
+  AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
+  return ptr;
+}
+
+float *get_cublas_device_zero() {
+  static c10::once_flag init_flag;
+
+  c10::call_once(init_flag, []() {
+    const float zero = 0.f;
+    AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_zero_device, &zero, sizeof(float)));
+  });
+
+  float *ptr;
+  AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
+  return ptr;
+}
+
+float *get_user_alpha_ptr() {
+  static float *alpha_ptr;
+
+  static c10::once_flag init_flag;
+
+  c10::call_once(init_flag, []() {
+    AT_CUDA_CHECK(cudaMalloc(&alpha_ptr, sizeof(float)));
+  });
+
+  return alpha_ptr;
+}
+
+} // namespace detail
+} // namespace cuda
+} // namespace at
aten/src/ATen/cuda/detail/BLASConstants.h (new file, +11)
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <ATen/core/TensorBase.h>
+
+namespace at::cuda::detail {
+
+float *get_cublas_device_one();
+float *get_cublas_device_zero();
+float *get_user_alpha_ptr();
+
+} // namespace at::cuda::detail
@@ -13,6 +13,7 @@
 #include <c10/core/ScalarType.h>
 
 #include <ATen/cuda/tunable/TunableOp.h>
+#include <ATen/cuda/tunable/Tunable.h>
 #include <ATen/cuda/CUDABlas.h>
 #include <ATen/cuda/Exceptions.h>
 #include <c10/util/StringUtil.h>
@@ -150,6 +151,7 @@ inline std::string ScalarTypeToBLASType(c10::ScalarType scalar_type) {
       BLASType = "unknown";
   }
   return BLASType;
+
 }
 
 // Similar to Compute Type in GemmRocblas.h
@@ -244,33 +246,25 @@ inline std::string to_string_epilogue(const at::cuda::blas::GEMMAndBiasActivatio
 
 namespace detail {
 
-static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size) {
+static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size, const NumericalCheckConfig& config) {
+
+  if (!config.enabled) {
+    return true; // skip when disabled
+  }
+
   auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA);
-  // comparison done as 1D tensor
   at::Tensor ref = at::from_blob(c, {size}, options);
   at::Tensor oth = at::from_blob(other_c, {size}, options);
   at::Tensor ref_float = ref.to(at::kFloat);
   at::Tensor oth_float = oth.to(at::kFloat);
-  std::vector<double> atols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5};
-  std::vector<double> rtols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5};
-  double last_succeed_atol = 1;
-  double last_succeed_rtol = 1;
-  for (auto& atol : atols) {
-    for (auto& rtol : rtols) {
-      if (at::allclose(ref_float, oth_float, rtol, atol)) {
-        last_succeed_atol = atol;
-        last_succeed_rtol = rtol;
-      }
-    }
-  }
-  if (last_succeed_atol == 1) {
-    return false;
-  }
-  else {
-    TUNABLE_LOG3("├──verify numerics: atol=", last_succeed_atol, ", rtol=", last_succeed_rtol);
-  }
-
-  return true;
+  const bool ok = at::allclose(ref_float, oth_float, config.rtol, config.atol);
+  if (ok) {
+    TUNABLE_LOG3("├──verify numerics: PASSED with atol=", config.atol, ", rtol=", config.rtol);
+  } else {
+    TUNABLE_LOG3("├──verify numerics: FAILED with atol=", config.atol, ", rtol=", config.rtol);
+  }
+  return ok;
 }
 
 }
@@ -355,8 +349,10 @@ struct GemmParams : OpParams {
   }
 
   TuningStatus NumericalCheck(GemmParams<T> *other) {
+    auto* ctx = getTuningContext();
+    auto cfg = ctx->GetNumericalCheckConfig();
     auto c_dtype = c10::CppTypeToScalarType<T>::value;
-    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL;
+    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL;
   }
 
   char transa{};
@@ -449,8 +445,10 @@ struct GemmAndBiasParams : OpParams {
   }
 
   TuningStatus NumericalCheck(GemmAndBiasParams<T> *other) {
+    auto* ctx = getTuningContext();
+    auto cfg = ctx->GetNumericalCheckConfig();
     auto c_dtype = c10::CppTypeToScalarType<T>::value;
-    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL;
+    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL;
   }
 
   char transa{};
@@ -546,8 +544,10 @@ struct GemmStridedBatchedParams : OpParams {
   }
 
   TuningStatus NumericalCheck(GemmStridedBatchedParams<T> *other) {
+    auto* ctx = getTuningContext();
+    auto cfg = ctx->GetNumericalCheckConfig();
     auto c_dtype = c10::CppTypeToScalarType<C_Dtype>::value;
-    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL;
+    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL;
   }
 
   char transa{};
@@ -663,7 +663,9 @@ struct ScaledGemmParams : OpParams {
   }
 
   TuningStatus NumericalCheck(ScaledGemmParams<T> *other) {
-    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL;
+    auto* ctx = getTuningContext();
+    auto cfg = ctx->GetNumericalCheckConfig();
+    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL;
   }
 
   char transa{};
@@ -145,7 +145,7 @@ programmatically since the settings become fixed. Use the C++ or Python APIs instead.
 | PYTORCH_TUNABLEOP_VERBOSE | Default is 0. Set to 1 to enable basic logging. 2 for basic tuning status. 3 for full trace. |
 | PYTORCH_TUNABLEOP_VERBOSE_FILENAME | Default is "err" for stderr. Set to "out" for stdout or a filename for capturing verbose logging. |
 | PYTORCH_TUNABLEOP_FILENAME | Default is 'tunableop_results.csv'. |
-| PYTORCH_TUNABLEOP_NUMERICAL_CHECK | Default is 0. Set to 1 to enable. |
+| PYTORCH_TUNABLEOP_NUMERICAL_CHECK | Default is off. Set 'atol_rtol' to enable, for example "1e-5_1e-5". |
 | PYTORCH_TUNABLEOP_ROCBLAS_ENABLED | Default is 1. Set to 0 to disable rocblas being considered during tuning. |
 | PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED | Default is 1. Set to 0 to disable hipblaslt being considered during tuning. |
 | PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS | Default is 30. Unit is milliseconds. |
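
As a rough usage sketch of the new 'atol_rtol' environment-variable format (the tolerance values below are arbitrary placeholders, not recommendations), one could set the variable from Python before importing torch so TunableOp picks it up:

    import os

    # "atol_rtol" string as documented above; values here are only examples
    os.environ["PYTORCH_TUNABLEOP_NUMERICAL_CHECK"] = "1e-4_1e-4"
    os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"

    import torch  # import after the env vars are set

    a = torch.randn(256, 256, device="cuda")
    b = torch.randn(256, 256, device="cuda")
    c = a @ b  # GEMMs tuned by TunableOp are numerically checked against the default solution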
@@ -173,10 +173,9 @@ All python APIs exist in the `torch.cuda.tunable` module.
 | get_max_tuning_iterations() -> int | |
 | set_filename(filename: str, insert_device_ordinal: bool = False) -> None | |
 | get_filename() -> str | |
+| set_numerical_check_tolerances(enable: bool, atol: float, rtol: float) -> None | Enable or disable numerical checking; atol and rtol default to 1e-5. |
 | get_results() -> Tuple[str, str, str, float] | |
 | get_validators() -> Tuple[str, str] | |
-| write_file_on_exit(val: bool) -> None | Default is True. |
-| write_file(filename: Optional[str] = None) -> None | If filename not given, it will call get_filename(). |
 | read_file(filename: Optional[str] = None) -> None | If filename not given, it will call get_filename(). |
 | tune_gemm_in_file(filename: str) -> None | read an untuned file and tune GEMMs in it. |
 | mgpu_tune_gemm_in_file(filename_pattern: str, num_gpus: int) -> None: -> None | read one or more untuned files and tune all unique GEMMs on one or more GPUs. |
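
A minimal Python sketch of how the new set_numerical_check_tolerances API slots in next to the existing TunableOp toggles (tolerances are illustrative only):

    import torch
    import torch.cuda.tunable as tunable

    tunable.enable(True)          # turn TunableOp on
    tunable.tuning_enable(True)   # allow new solutions to be tuned
    tunable.set_numerical_check_tolerances(True, 1e-4, 1e-4)  # API added in this change

    x = torch.randn(512, 512, device="cuda")
    y = torch.randn(512, 512, device="cuda")
    z = x @ y  # candidate solutions are compared against the default within atol/rtol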
@@ -107,14 +107,30 @@ void TuningResultsManager::AddImpl(const std::string& op_signature,
 }
 
 void TuningResultsManager::Add(const std::string& op_signature, const std::string& params_signature, ResultEntry best) {
-  std::scoped_lock l{lock_};
-
-  auto it = results_.find(op_signature);
-  if (it == results_.end()) {
-    it = results_.insert({op_signature, {}}).first;
-  }
-
-  AddImpl(op_signature, params_signature, std::move(best), it->second);
+  bool is_new = false;
+  ResultEntry inserted = ResultEntry::Null();
+
+  // ---- mutate maps under results lock ----
+  {
+    std::scoped_lock l{lock_};
+    auto& km = results_[op_signature]; // creates if missing
+    is_new = (km.find(params_signature) == km.end());
+    AddImpl(op_signature, params_signature, std::move(best), km);
+    if (is_new) {
+      inserted = km.at(params_signature); // snapshot for I/O after unlocking
+    }
+  }
+  if (!is_new) return; // only write once per unique (op, params)
+
+  TuningContext* ctx = getTuningContext();
+  if (ctx->IsTuningEnabled() && !ctx->IsRecordUntunedEnabled()) {
+    InitRealtimeAppend(ctx->GetFilename(), ctx->GetTuningResultsValidator().GetAllValidators());
+
+    if (is_new && realtime_out_ && realtime_out_->good()) {
+      AppendResultLine(op_signature, params_signature, inserted);
+    }
+  }
 }
 
 void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature,
@@ -150,6 +166,77 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature,
   }
 }
 
+void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const std::unordered_map<std::string, std::string>& validators) {
+  std::scoped_lock fl{realtime_file_mutex_};
+
+  if (realtime_out_ && realtime_out_->good() && realtime_filename_ == filename) {
+    return;
+  }
+
+  if (realtime_out_ && realtime_filename_ != filename) {
+    realtime_out_->flush();
+    realtime_out_->close();
+    realtime_out_.reset();
+    validators_written_ = false;
+  }
+
+  bool file_exists = false;
+  bool file_empty = true;
+
+  {
+    std::ifstream check_file(filename);
+    if (check_file.good()) {
+      file_exists = true;
+      file_empty = (check_file.peek() == std::ifstream::traits_type::eof());
+    }
+  }
+
+  realtime_out_ = std::make_unique<std::ofstream>(filename, std::ios::out | std::ios::app);
+
+  if (!realtime_out_->good()) {
+    TORCH_WARN("TunableOp realtime append: failed to open '", filename,"'");
+    realtime_out_.reset();
+    return;
+  }
+
+  if(!file_exists || file_empty) {
+    for(const auto& [key, val] : validators) {
+      (*realtime_out_) << "Validator," << key << "," << val << std::endl;
+      realtime_out_->flush();
+    }
+    validators_written_ = true;
+
+    TUNABLE_LOG2("Wrote validators to realtime output file");
+  }
+
+  realtime_filename_ = filename;
+}
+
+void TuningResultsManager::AppendResultLine(const std::string& op_sig, const std::string& param_sig, const ResultEntry& result) {
+  std::scoped_lock fl{realtime_file_mutex_};
+
+  if(!realtime_out_ || !realtime_out_->good()) {
+    return;
+  }
+
+  (*realtime_out_) << op_sig << "," << param_sig << "," << result << std::endl;
+  realtime_out_->flush(); // ensure immediate write to disk
+
+  TUNABLE_LOG3("Realtime append: ", op_sig, "(", param_sig, ") -> ", result);
+}
+
+void TuningResultsManager::CloseRealtimeAppend() {
+  std::scoped_lock fl{realtime_file_mutex_};
+
+  if(realtime_out_) {
+    realtime_out_->flush();
+    realtime_out_->close();
+    realtime_out_.reset();
+    TUNABLE_LOG2("Closed realtime output file");
+  }
+}
+
 void TuningResultsManager::Delete(const std::string& op_signature, const std::string& params_signature) {
   std::scoped_lock l{lock_};
 
@@ -396,7 +483,6 @@ TuningContext::TuningContext() :
     tuning_enable_{true},
     record_untuned_enable_{false},
     manager_initialized_{false},
-    write_file_on_exit_{true},
     numerics_check_enable_{false},
     max_tuning_duration_ms_{30},
     max_tuning_iterations_{100},
@@ -417,20 +503,8 @@ TuningContext::~TuningContext() {
     // but doesn't do any computation itself.
     return;
   }
-  auto filename = GetFilename();
-  if (IsTunableOpEnabled() && IsTuningEnabled() && !filename.empty() && write_file_on_exit_) {
-    if (results_count_from_input_file_ < GetTuningResultsManager().GetSize()) {
-      if (results_count_from_input_file_ > 0) {
-        TUNABLE_LOG1("additional tuning results available, rewriting file ", filename);
-      }
-      else {
-        TUNABLE_LOG1("writing file ", filename);
-      }
-      if (!WriteFile(filename)) {
-        TUNABLE_LOG1("failed to write file ", filename);
-      }
-    }
-  }
+  TUNABLE_LOG1("Closing File");
+  GetTuningResultsManager().CloseRealtimeAppend(); // Since, we do instant logging by default now.
 
   if (untuned_file_.good()) {
     untuned_file_.close();
@@ -511,20 +585,54 @@ std::ofstream& TuningContext::GetUntunedFile(){
   return untuned_file_;
 }

-void TuningContext::WriteFileOnExit(bool value) {
-  write_file_on_exit_ = value;
-}
-
 void TuningContext::EnableNumericsCheck(bool value) {
   numerics_check_enable_ = value;
 }

-bool TuningContext::IsNumericsCheckEnabled() const {
-  const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_NUMERICAL_CHECK");
-  if (env == "1") {
-    return true;
+NumericalCheckConfig TuningContext::GetNumericalCheckConfig() const {
+  const auto env_opt = c10::utils::get_env("PYTORCH_TUNABLEOP_NUMERICAL_CHECK");
+  if (!env_opt.has_value()) {
+    return numerics_cfg_;
   }
-  return numerics_check_enable_;
+
+  const std::string& env = env_opt.value();
+
+  if (env == "0") {
+    return NumericalCheckConfig(false, 1e-5, 1e-5);
+  }
+
+  const size_t underscore = env.find('_');
+
+  TORCH_CHECK(
+      underscore != std::string::npos,
+      "Invalid PYTORCH_TUNABLEOP_NUMERICAL_CHECK format. "
+      "Expected 'atol_rtol', got: ",
+      env);
+
+  double atol = 0.0;
+  double rtol = 0.0;
+
+  try {
+    atol = std::stod(env.substr(0, underscore));
+    rtol = std::stod(env.substr(underscore + 1));
+  } catch (const std::exception& e) {
+    TORCH_CHECK(false, "Failed to parse PYTORCH_TUNABLEOP_NUMERICAL_CHECK: ", e.what());
+  }
+
+  TORCH_CHECK( atol > 0.0 && rtol > 0.0, "Tolerance values must be positive. atol=", atol, ", rtol=", rtol);
+  return NumericalCheckConfig(true, atol, rtol);
+}
+
+void TuningContext::SetNumericalCheckConfig(bool enabled, double atol, double rtol) {
+  TORCH_CHECK(atol > 0.0 && rtol > 0.0, "Numerical check tolerances must be positive");
+  numerics_cfg_ = {enabled, atol, rtol};
+}
+
+bool TuningContext::IsNumericsCheckEnabled() const {
+  const auto cfg = GetNumericalCheckConfig();
+  return cfg.enabled || numerics_check_enable_;
 }

 void TuningContext::SetMaxTuningDurationMs(int max_duration_ms) {
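For reference, the "atol_rtol" format accepted above can be exercised outside of PyTorch with a few lines of standard C++. This is only a sketch of the same parsing steps shown in the hunk, with a made-up environment value, not part of the patch:

// Illustrative sketch only: parse an "atol_rtol" string the way the hunk above
// parses PYTORCH_TUNABLEOP_NUMERICAL_CHECK (e.g. "1e-4_1e-3").
#include <iostream>
#include <stdexcept>
#include <string>

int main() {
  const std::string env = "1e-4_1e-3";   // hypothetical value of the variable
  const size_t underscore = env.find('_');
  if (underscore == std::string::npos) {
    throw std::runtime_error("expected 'atol_rtol'");
  }
  const double atol = std::stod(env.substr(0, underscore));
  const double rtol = std::stod(env.substr(underscore + 1));
  std::cout << "atol=" << atol << " rtol=" << rtol << "\n";  // prints atol=0.0001 rtol=0.001
  return 0;
}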
@@ -634,11 +742,6 @@ TuningResultsManager& TuningContext::GetTuningResultsManager() {
       auto filename = GetFilename();
       if (!filename.empty() && !IsRecordUntunedEnabled()) {
         ReadFile(filename);
-        // attempt immediately to open file for writing to catch errors early
-        std::ofstream file(filename, std::ios::out | std::ios::app);
-        if (!file.good()) {
-          TORCH_WARN("failed to open file '", filename, "' for writing; your tuning results will not be saved");
-        }
       }
   });
   return manager_;
@@ -744,27 +847,6 @@ bool TuningContext::ReadFile(const std::string& filename_) {
   return true;
 }

-bool TuningContext::WriteFile(const std::string& filename_) {
-  std::string filename = filename_.empty() ? GetFilename() : filename_;
-  std::ofstream file(filename, std::ios::out | std::ios::trunc);
-  if (!file.good()) {
-    TUNABLE_LOG1("error opening tuning results file for writing ", filename);
-    return false;
-  }
-  auto validators = GetTuningResultsValidator().GetAllValidators();
-  for (const auto& [key, val] : validators) {
-    file << "Validator," << key << "," << val << std::endl;
-  }
-  auto results = GetTuningResultsManager().Dump();
-  for (const auto& [op_sig, kernelmap] : results) {
-    for (const auto& [param_sig, result] : kernelmap) {
-      file << op_sig << "," << param_sig << "," << result << std::endl;
-    }
-  }
-  file.close();
-  return true;
-}
-
 namespace {

 struct MaybeDelete {
@@ -103,10 +103,24 @@ class TORCH_CUDA_CPP_API TuningResultsManager {

     void RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature,
       const std::string& params_signature, const std::string& blas_signature);

+    void InitRealtimeAppend(
+        const std::string& filename,
+        const std::unordered_map<std::string, std::string>& validators);
+
+    void AppendResultLine(const std::string& op_sig,
+                          const std::string& param_sig,
+                          const ResultEntry& result);
+
+    void CloseRealtimeAppend(); // For clean shutdown
+
   private:
     std::mutex lock_;
+    std::mutex realtime_file_mutex_;
+    std::unique_ptr<std::ofstream> realtime_out_;
+    std::string realtime_filename_;
     ResultsMap results_;
     UntunedMap untuned_results_;
+    bool validators_written_ = false;

 };
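The members added above suggest a mutex-guarded append stream. The following is only an illustration of that general pattern with an invented RealtimeAppender type and invented behavior; it is not the patch's implementation of InitRealtimeAppend/AppendResultLine:

// Illustration of the general pattern only: stream one CSV line per result
// under a mutex so concurrent tuning threads don't interleave partial lines.
#include <fstream>
#include <mutex>
#include <string>

struct RealtimeAppender {
  std::mutex mu;
  std::ofstream out;

  void open(const std::string& filename) {
    std::lock_guard<std::mutex> guard(mu);
    out.open(filename, std::ios::out | std::ios::app);
  }
  void append_line(const std::string& op_sig, const std::string& param_sig,
                   const std::string& result) {
    std::lock_guard<std::mutex> guard(mu);
    out << op_sig << ',' << param_sig << ',' << result << '\n';
    out.flush();  // make the line visible immediately, in the spirit of "instant logging"
  }
};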
@@ -134,6 +148,16 @@ class TORCH_CUDA_CPP_API TuningResultsValidator {
   GetValidateFuncs validators_;
 };

+struct NumericalCheckConfig {
+  bool enabled{false};
+  double atol{1e-5};
+  double rtol{1e-5};
+
+  NumericalCheckConfig() = default;
+  NumericalCheckConfig(bool e, double a, double r) : enabled(e), atol(a), rtol(r) {}
+};
+
+
 class TORCH_CUDA_CPP_API TuningContext {
  public:
   TuningContext();

@@ -155,6 +179,8 @@ class TORCH_CUDA_CPP_API TuningContext {

   void EnableNumericsCheck(bool value);
   bool IsNumericsCheckEnabled() const;
+  void SetNumericalCheckConfig(bool enabled, double atol, double rtol);
+  NumericalCheckConfig GetNumericalCheckConfig() const;

   void SetMaxTuningDurationMs(int max_duration_ms);
   int GetMaxTuningDurationMs() const;

@@ -185,10 +211,7 @@ class TORCH_CUDA_CPP_API TuningContext {
   void SetFilename(const std::string& filename, bool insert_device_ordinal=false);
   std::string GetFilename() const;

-  void WriteFileOnExit(bool value);
-
   bool ReadFile(const std::string& filename={});
-  bool WriteFile(const std::string& filename={});

   template<class... Types>
   void Log(int level, Types... args) {

@@ -207,7 +230,6 @@ class TORCH_CUDA_CPP_API TuningContext {
   bool tuning_enable_;
   bool record_untuned_enable_;
   bool manager_initialized_;
-  bool write_file_on_exit_;
   bool numerics_check_enable_;
   int max_tuning_duration_ms_;
   int max_tuning_iterations_;

@@ -222,6 +244,8 @@ class TORCH_CUDA_CPP_API TuningContext {
   std::ofstream untuned_file_;
   size_t results_count_from_input_file_;
   bool is_shutting_down_;
+
+  NumericalCheckConfig numerics_cfg_{};
 };

 TORCH_CUDA_CPP_API TuningContext* getTuningContext();
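A caller-side sketch of the new configuration API declared above, assuming the usual at::cuda::tunable namespace for these declarations (the namespace and call site are assumptions of this sketch, not taken from the patch):

// Hypothetical usage sketch: enable the numerics check programmatically with
// explicit tolerances instead of PYTORCH_TUNABLEOP_NUMERICAL_CHECK.
// (namespace assumed; the member functions match the declarations above)
at::cuda::tunable::TuningContext* ctx = at::cuda::tunable::getTuningContext();
ctx->SetNumericalCheckConfig(/*enabled=*/true, /*atol=*/1e-4, /*rtol=*/1e-3);
at::cuda::tunable::NumericalCheckConfig cfg = ctx->GetNumericalCheckConfig();
TORCH_CHECK(cfg.enabled && cfg.atol == 1e-4 && cfg.rtol == 1e-3);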
@@ -109,7 +109,8 @@ class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
         params->c_scale_ptr,
         params->ldc,
         params->c_dtype,
-        params->use_fast_accum);
+        params->use_fast_accum,
+        std::nullopt /* alpha */);
     return OK;
   }
 };
@@ -267,27 +267,10 @@ class TunableOp {
       for (size_t i = 0; i < op_names_.size(); i++) {
         auto* candidate = ops_[op_names_[i]].get(); // borrow pointer

-        if (do_numerics_check) {
-          ParamsT* numerical_params = params->DeepCopy(false);
-          auto status = candidate->Call(numerical_params);
-          if (status != OK) {
-            numerical_params->Delete();
-            TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
-            continue;
-          }
-          status = reference_params->NumericalCheck(numerical_params);
-          numerical_params->Delete();
-          if (status != OK) {
-            TUNABLE_LOG3("├──numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
-            continue;
-          }
-        }
-        else {
-          auto status = candidate->Call(reusable_params[0]);
-          if (status != OK) {
-            TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
-            continue;
-          }
+        auto status = candidate->Call(reusable_params[0]);
+        if (status != OK) {
+          TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+          continue;
         }

         // collect a small profile

@@ -310,6 +293,22 @@ class TunableOp {
           continue;
         }

+        if (do_numerics_check) {
+          ParamsT* numerical_params = params->DeepCopy(false);
+          auto status = candidate->Call(numerical_params);
+          if (status != OK) {
+            numerical_params->Delete();
+            TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+            continue;
+          }
+          status = reference_params->NumericalCheck(numerical_params);
+          numerical_params->Delete();
+          if (status != OK) {
+            TUNABLE_LOG3("├──numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+            continue;
+          }
+        }
+
         // for warmup does user set max duration, max iters, or both?
         // warmup is skipped by default, i.e. warmup_iter = 0
         // warmup will be set to the non-zero value of max_warmup_duration
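The numerics check above compares a candidate kernel's output against a reference under atol/rtol tolerances. A self-contained sketch of that kind of tolerance test, with invented values and a hypothetical allclose helper (the library's actual NumericalCheck lives elsewhere and is not reproduced here):

// Sketch of an elementwise |a - b| <= atol + rtol*|b| test of the kind the
// numerics check relies on; values below are made up for illustration.
#include <cmath>
#include <cstdio>
#include <vector>

static bool allclose(const std::vector<double>& a, const std::vector<double>& b,
                     double atol, double rtol) {
  if (a.size() != b.size()) return false;
  for (size_t i = 0; i < a.size(); ++i) {
    if (std::fabs(a[i] - b[i]) > atol + rtol * std::fabs(b[i])) return false;
  }
  return true;
}

int main() {
  std::vector<double> ref{1.0, 2.0, 3.0};
  std::vector<double> candidate{1.00001, 1.99999, 3.00002};
  std::printf("%s\n", allclose(ref, candidate, 1e-4, 1e-3) ? "OK" : "FAILED");
}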
@@ -213,40 +213,22 @@ static cudnn_grid_sample_backward_batch_rule(
   return grid_sample_backward_helper_out(std::move(bw_out), 0, 0, bdim_size);
 }

-// TODO: replace with targetable functionalization
+// uses functional formulation for one_hot under vmap to be compatible with
+// fakeTensor/dynamic shapes and compiled functorch transforms.
+// mirrors the meta path in aten/src/ATen/native/Onehot.cpp,
+// but requires explicit positive num_classes under vmap to avoid
+// data-dependent output shapes.
 static Tensor one_hot_decomposition_hack(const Tensor &self, int64_t num_classes) {
   TORCH_CHECK(self.dtype() == kLong, "one_hot is only applicable to index tensor.");
-  auto shape = self.sym_sizes().vec();
-
-  // empty tensor could be converted to one hot representation,
-  // but shape inference is not possible.
-  if (self.sym_numel() == 0) {
-    if (num_classes <= 0) {
-      TORCH_CHECK(false, "Can not infer total number of classes from empty tensor.");
-    } else {
-      shape.emplace_back(num_classes);
-      return at::empty_symint(shape, self.options());
-    }
-  }
-
+  // disallow implicit inference under vmap; this would be data-dependent
+  // and is intentionally guarded by Dynamo in torch/_dynamo/variables/torch.py.
   TORCH_CHECK(num_classes > 0, "When vmap-ing torch.nn.functional.one_hot, please "
       "provide an explicit positive num_classes argument.");

-  // Disabling all of the following checks. This is OK because scatter has checks too.
-  // Maybe one_hot should be a primitive wrt autograd so we don't have to deal with this.
-  // // non-empty tensor
-  // if (self.device().type() != at::kCUDA) {
-  //   //for cuda, rely on device assert thrown by scatter
-  //   TORCH_CHECK(self.min().item().toLong() >= 0, "Class values must be non-negative.");
-  // }
-  // if (self.device().type() != at::kCUDA) {
-  //   //rely on device asserts from scatter to avoid sync here
-  //   TORCH_CHECK(num_classes > self.max().item().toLong(), "Class values must be smaller than num_classes.");
-  // }
-
-  shape.emplace_back(num_classes);
-  Tensor ret = at::zeros_symint(shape, self.options());
-  return ret.scatter(-1, self.unsqueeze(-1), 1);
+  const auto options = self.options();
+  at::Tensor index = at::arange(num_classes, options);
+  return at::eq(self.unsqueeze(-1), index).to(at::kLong);
 }

 template <typename A, A a, typename C>

@@ -240,8 +240,8 @@ TORCH_META_FUNC(gelu_backward) (

 namespace at::native {

-static const double SELU_ALPHA = 1.6732632423543772848170429916717;
-static const double SELU_SCALE = 1.0507009873554804934193349852946;
+static constexpr double SELU_ALPHA = 1.6732632423543772848170429916717;
+static constexpr double SELU_SCALE = 1.0507009873554804934193349852946;

 DEFINE_DISPATCH(elu_stub);
 DEFINE_DISPATCH(elu_backward_stub);

@@ -286,7 +286,7 @@ template void scal_fast_path<scalar_t>(int *n, scalar_t *a, scalar_t *x, int *in
 #if AT_BUILD_WITH_BLAS()
 template <>
 bool scal_use_fast_path<double>(int64_t n, int64_t incx) {
-  auto intmax = std::numeric_limits<int>::max();
+  auto constexpr intmax = std::numeric_limits<int>::max();
   return n <= intmax && incx <= intmax;
 }

@@ -315,7 +315,7 @@ bool gemv_use_fast_path<float>(
     int64_t incx,
     [[maybe_unused]] float beta,
     int64_t incy) {
-  auto intmax = std::numeric_limits<int>::max();
+  auto constexpr intmax = std::numeric_limits<int>::max();
   return (m <= intmax) && (n <= intmax) && (lda <= intmax) &&
       (incx > 0) && (incx <= intmax) && (incy > 0) && (incy <= intmax);
 }
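Both one_hot formulations that appear in this diff can be written side by side; the sketch below assumes a working ATen build and only restates the two code paths already shown (the eq/arange formulation used under vmap, and the scatter formulation it replaces there):

// Minimal libtorch sketch: for long indices {0, 2, 1} and num_classes = 3 both
// helpers should produce the same 3x3 one-hot matrix of kLong.
#include <ATen/ATen.h>

at::Tensor one_hot_functional(const at::Tensor& self, int64_t num_classes) {
  at::Tensor index = at::arange(num_classes, self.options());
  return at::eq(self.unsqueeze(-1), index).to(at::kLong);
}

at::Tensor one_hot_scatter(const at::Tensor& self, int64_t num_classes) {
  auto shape = self.sizes().vec();
  shape.push_back(num_classes);
  at::Tensor ret = at::zeros(shape, self.options());
  return ret.scatter(-1, self.unsqueeze(-1), 1);
}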
@@ -1,5 +1,6 @@
 #pragma once

+#include <array>
 #include <ATen/native/Math.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/MathConstants.h>

@@ -127,7 +128,7 @@ C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler<accscalar_t, unifor

 template<typename scalar_t>
 C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) {
-  const static scalar_t kTailValues[] = {
+  constexpr static scalar_t kTailValues[] = {
     0.0810614667953272,
     0.0413406959554092,
     0.0276779256849983,

@@ -139,7 +140,7 @@ C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) {
     0.00925546218271273,
     0.00833056343336287
   };
-  if (k <= 9) {
+  if (k < std::size(kTailValues)) {
     return kTailValues[static_cast<size_t>(k)];
   }
   scalar_t kp1sq = (k + 1) * (k + 1);

@@ -581,7 +581,7 @@ scalar_t ratevl(scalar_t x, const scalar_t num[], int64_t M,
 template <typename scalar_t>
 static scalar_t lanczos_sum_expg_scaled(scalar_t x) {
   // lanczos approximation
-  static const scalar_t lanczos_sum_expg_scaled_num[13] = {
+  static constexpr scalar_t lanczos_sum_expg_scaled_num[13] = {
     0.006061842346248906525783753964555936883222,
     0.5098416655656676188125178644804694509993,
     19.51992788247617482847860966235652136208,

@@ -596,7 +596,7 @@ static scalar_t lanczos_sum_expg_scaled(scalar_t x) {
     103794043.1163445451906271053616070238554,
     56906521.91347156388090791033559122686859
   };
-  static const scalar_t lanczos_sum_expg_scaled_denom[13] = {
+  static constexpr scalar_t lanczos_sum_expg_scaled_denom[13] = {
     1.,
     66.,
     1925.,

@@ -712,7 +712,7 @@ static scalar_t _igamc_helper_series(scalar_t a, scalar_t x) {
 template <typename scalar_t>
 static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) {
   // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1]
-  static const scalar_t d[25][25] =
+  static constexpr scalar_t d[25][25] =
     {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2,
       1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4,
       3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6,

@@ -62,7 +62,7 @@
 #include <utility>
 #include <vector>

-static const int MIOPEN_DIM_MAX = 5;
+static constexpr int MIOPEN_DIM_MAX = 5;

 namespace at::meta {
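The bound-check change above replaces a hard-coded limit with std::size on the table itself, so the guard stays correct if the table ever changes length. A tiny self-contained sketch of that idiom, with placeholder values rather than the real Stirling tail table:

// Sketch of the std::size bound-check idiom (placeholder values, not the real table).
#include <array>
#include <cstdio>

constexpr double kTail[] = {0.08, 0.04, 0.03};

double tail(double k) {
  if (k < std::size(kTail)) {            // instead of a hard-coded "k <= 2"
    return kTail[static_cast<size_t>(k)];
  }
  return 1.0 / (12.0 * (k + 1.0));       // illustrative asymptotic fallback only
}

int main() { std::printf("%f %f\n", tail(1.0), tail(10.0)); }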
@@ -34,16 +34,16 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) {
     }
   }

-  auto shape = self.sizes().vec();
+  auto shape = self.sym_sizes().vec();

   // empty tensor could be converted to one hot representation,
   // but shape inference is not possible.
-  if (self.numel() == 0) {
+  if (self.sym_numel() == 0) {
     if (num_classes <= 0) {
       TORCH_CHECK(false, "Can not infer total number of classes from empty tensor.");
     } else {
-      shape.push_back(num_classes);
-      return at::empty(shape, self.options());
+      shape.emplace_back(num_classes);
+      return at::empty_symint(shape, self.options());
     }
   }

@@ -66,8 +66,8 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) {
     }
   }

-  shape.push_back(num_classes);
-  Tensor ret = at::zeros(shape, self.options());
+  shape.emplace_back(num_classes);
+  Tensor ret = at::zeros_symint(shape, self.options());
   ret.scatter_(-1, self.unsqueeze(-1), 1);
   return ret;
 }

@@ -1906,11 +1906,9 @@ Tensor& index_fill_(
         "This also applies to advanced indexing e.g. tensor[mask] = scalar");
   }

-  if (!self.is_complex() && source.isComplex()) {
-    TORCH_CHECK(
-        false,
-        "index_fill_(): Converting complex Scalar to non-complex type is not supported");
-  }
+  TORCH_CHECK(
+      self.is_complex() || !source.isComplex(),
+      "index_fill_(): Converting complex Scalar to non-complex type is not supported");

   // Handle the case when `self` is 0-dim
   Tensor self_nonzero_dim = (self.dim() == 0) ? self.unsqueeze(-1) : self;

@@ -77,7 +77,7 @@ inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) {
   // next broadcast all index tensors together
   try {
     indices = expand_outplace(indices);
-  } catch (std::exception& e) {
+  } catch (std::exception&) {
     TORCH_CHECK_INDEX(
         false,
         "shape mismatch: indexing tensors could not be broadcast together"

@@ -120,7 +120,7 @@ static void pow_tensor_scalar_kernel(
   } else if (dtype == ScalarType::Half) {
     [&]() {
       using scalar_t =
-          decltype(c10::impl::ScalarTypeToCPPType<ScalarType::Half>::t);
+          c10::impl::ScalarTypeToCPPTypeT<ScalarType::Half>;
       const auto exp = exp_scalar.to<scalar_t>();
       using Vec = Vectorized<scalar_t>;
       cpu_kernel_vec(iter,

@@ -1038,7 +1038,7 @@ struct HelperInterpNearest : public HelperInterpBase {
   // We keep this structure for BC and consider as deprecated.
   // See HelperInterpNearestExact as replacement

-  static const int interp_size = 1;
+  static constexpr int interp_size = 1;

   static inline void init_indices_weights(
     at::ScalarType output_type,

@@ -1155,7 +1155,7 @@ struct HelperInterpNearestExact : public HelperInterpNearest {

 struct HelperInterpLinear : public HelperInterpBase {

-  static const int interp_size = 2;
+  static constexpr int interp_size = 2;

   // Compute indices and weights for each interpolated dimension
   // indices_weights = {

@@ -1275,7 +1275,7 @@ struct HelperInterpLinear : public HelperInterpBase {

 struct HelperInterpCubic : public HelperInterpBase {

-  static const int interp_size = 4;
+  static constexpr int interp_size = 4;

   // Compute indices and weights for each interpolated dimension
   // indices_weights = {

[diff for one file suppressed because it is too large]
@@ -856,9 +856,13 @@ struct type_specialized_kernel_launcher {
       out_calc_t output_offset_calculator,
       loader_t loader,
       storer_t storer) {
-    if (ret_t == rt_binary_specializations[arg_index][0] &&
-        arg0_t == rt_binary_specializations[arg_index][1] &&
-        arg1_t == rt_binary_specializations[arg_index][2])
+    constexpr ScalarType sret_t = rt_binary_specializations[arg_index][0];
+    constexpr ScalarType sarg0_t = rt_binary_specializations[arg_index][1];
+    constexpr ScalarType sarg1_t = rt_binary_specializations[arg_index][2];
+    if (ret_t == sret_t && arg0_t == sarg0_t && arg1_t == sarg1_t) {
+      using cret_t = c10::impl::ScalarTypeToCPPTypeT<sret_t>;
+      using carg0_t = c10::impl::ScalarTypeToCPPTypeT<sarg0_t>;
+      using carg1_t = c10::impl::ScalarTypeToCPPTypeT<sarg1_t>;
       launch_vectorized_templated_kernel<
           func_t,
           array_t,

@@ -866,12 +870,9 @@ struct type_specialized_kernel_launcher {
           out_calc_t,
           loader_t,
           storer_t,
-          decltype(c10::impl::ScalarTypeToCPPType<
-                   rt_binary_specializations[arg_index][0]>::t),
-          decltype(c10::impl::ScalarTypeToCPPType<
-                   rt_binary_specializations[arg_index][1]>::t),
-          decltype(c10::impl::ScalarTypeToCPPType<
-                   rt_binary_specializations[arg_index][2]>::t)>(
+          cret_t,
+          carg0_t,
+          carg1_t>(
           numel,
           f,
           data,

@@ -879,6 +880,7 @@ struct type_specialized_kernel_launcher {
           output_offset_calculator,
           loader,
           storer);
+    }
   }
 };
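The refactor above names the compile-time ScalarType values once and maps each to its C++ type through a type alias. A generic, self-contained illustration of that mapping technique, with invented names rather than the c10 machinery:

// Generic illustration: map an enum value known at compile time to a C++ type
// once, then reuse the alias (Scalar, ToCpp, and ToCppT are invented for this sketch).
#include <cstdint>
#include <type_traits>

enum class Scalar { Float, Int };

template <Scalar S> struct ToCpp;
template <> struct ToCpp<Scalar::Float> { using type = float; };
template <> struct ToCpp<Scalar::Int>   { using type = int32_t; };

template <Scalar S>
using ToCppT = typename ToCpp<S>::type;

static_assert(std::is_same_v<ToCppT<Scalar::Float>, float>);
static_assert(std::is_same_v<ToCppT<Scalar::Int>, int32_t>);

int main() { return 0; }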
@@ -38,12 +38,41 @@ __device__ inline int min(int a, int b) {
 #define BLOCK_STRIDE_BWD 2 // increasing block_stride to lower # of blocks launched
 #endif

-static __device__ inline int p_start(int size, int pad, int kernel, int dilation, int stride) {
-  return (size + pad < ((kernel - 1) * dilation + 1)) ? 0 : (size + pad - ((kernel - 1) * dilation + 1)) / stride + 1;
+template <typename index_t>
+static __device__ inline index_t p_start(index_t size, int pad, int kernel, int dilation, int stride) {
+  const auto kernel_extent = static_cast<index_t>((kernel - 1) * dilation + 1);
+  return (size + pad < kernel_extent) ? index_t(0) : (size + pad - kernel_extent) / stride + 1;
 }

-static __device__ inline int p_end(int size, int pad, int pooled_size, int stride) {
-  return min((size + pad) / stride + 1, pooled_size);
+template <typename index_t>
+static __device__ inline index_t p_end(index_t size, int pad, index_t pooled_size, int stride) {
+  return std::min((size + pad) / stride + 1, pooled_size);
+}
+
+static inline bool can_use_int32_nhwc(
+    int64_t nbatch, int64_t channels,
+    int64_t height, int64_t width,
+    int64_t pooled_height, int64_t pooled_width,
+    int64_t in_stride_n, int64_t in_stride_c,
+    int64_t in_stride_h, int64_t in_stride_w)
+{
+  constexpr int64_t int_max = std::numeric_limits<int>::max();
+
+  int64_t max_intra_batch =
+      (height ? (height - 1) * in_stride_h : 0) +
+      (width ? (width - 1) * in_stride_w : 0) +
+      (channels? (channels - 1) * in_stride_c : 0);
+
+  int64_t max_input_offset = (nbatch ? (nbatch - 1) * in_stride_n : 0) + max_intra_batch;
+
+  if (max_input_offset > int_max) return false;
+
+  int64_t out_batch_stride = pooled_height * pooled_width * channels;
+  if ((nbatch ? (nbatch - 1) * out_batch_stride : 0) > int_max) return false;
+
+  if (height * width > int_max) return false;
+
+  return true;
 }

 // kernels borrowed from Caffe

@@ -85,21 +114,25 @@ __global__ void max_pool_forward_nchw(const int nthreads, const scalar_t* bottom
   }
 }

-template <typename scalar_t>
+template <typename scalar_t, typename index_t>
 C10_LAUNCH_BOUNDS_1(CUDA_MAX_THREADS)
-__global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nbatch,
-                                   const int64_t channels, const int64_t height,
-                                   const int64_t width, const int pooled_height, const int pooled_width,
-                                   const int kernel_h, const int kernel_w, const int stride_h,
-                                   const int stride_w, const int pad_h, const int pad_w,
-                                   const int dilation_h, const int dilation_w,
-                                   const int in_stride_n, const int in_stride_c,
-                                   const int in_stride_h, const int in_stride_w,
-                                   const int kernel_stride_C, const int kernel_size_C,
-                                   scalar_t* top_data, int64_t* top_mask) {
-  extern __shared__ int smem[];
-  int *out_mask_cached = smem;
-  scalar_t *out_cached = reinterpret_cast<scalar_t*>(&out_mask_cached[kernel_size_C*blockDim.x*blockDim.y*blockDim.z]);
+__global__ void max_pool_forward_nhwc(
+    const scalar_t* bottom_data,
+    const int nbatch,
+    const index_t channels, const index_t height, const index_t width,
+    const index_t pooled_height, const index_t pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, const int pad_h, const int pad_w,
+    const int dilation_h, const int dilation_w,
+    const index_t in_stride_n, const index_t in_stride_c,
+    const index_t in_stride_h, const index_t in_stride_w,
+    const int kernel_stride_C, const int kernel_size_C,
+    scalar_t* top_data, int64_t* top_mask) {
+
+  extern __shared__ unsigned char smem_raw[];
+  index_t *out_mask_cached = reinterpret_cast<index_t*>(smem_raw);
+  scalar_t *out_cached = reinterpret_cast<scalar_t*>(
+      out_mask_cached + kernel_size_C*blockDim.x*blockDim.y*blockDim.z);

   // flattening cta for pre-computation & smem initialization;
   int thread_id = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

@@ -118,26 +151,26 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba
   int channel_id = blockIdx.x / nbatch;
   int channel_offset = threadIdx.x + channel_id * blockDim.x;

-  top_data = top_data + batch_id * pooled_height * pooled_width * channels;
-  top_mask = top_mask + batch_id * pooled_height * pooled_width * channels;
-  bottom_data = bottom_data + batch_id * in_stride_n;
+  top_data = top_data + static_cast<index_t>(batch_id) * (pooled_height * pooled_width * channels);
+  top_mask = top_mask + static_cast<index_t>(batch_id) * (pooled_height * pooled_width * channels);
+  bottom_data = bottom_data + static_cast<index_t>(batch_id) * in_stride_n;

-  out_cached = &out_cached[(threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x];
-  out_mask_cached = &out_mask_cached[(threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x];
+  out_cached += (threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x;
+  out_mask_cached += (threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x;

-  int oH = (pooled_height + gridDim.z-1) / gridDim.z;
-  int oW = (pooled_width + gridDim.y-1) / gridDim.y;
+  int oH = (static_cast<int>(pooled_height) + gridDim.z - 1) / gridDim.z;
+  int oW = (static_cast<int>(pooled_width) + gridDim.y - 1) / gridDim.y;
   int ostartH = threadIdx.z + blockIdx.z*oH;
-  int oendH = ::min(ostartH+oH, pooled_height);
+  int oendH = ::min(ostartH+oH, static_cast<int>(pooled_height));
   int ostartW = threadIdx.y + blockIdx.y*oW;
-  int oendW = ::min(ostartW+oW, pooled_width);
+  int oendW = ::min(ostartW+oW, static_cast<int>(pooled_width));

   for (int oh = ostartH; oh < oendH; oh+=blockDim.z) {
-    int hstart = oh * stride_h - pad_h;
-    int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height);
+    index_t hstart = static_cast<index_t>(oh) * stride_h - pad_h;
+    index_t hend = std::min(hstart + static_cast<index_t>((kernel_h - 1) * dilation_h + 1), height);
     for (int ow = ostartW; ow < oendW; ow+=blockDim.y) {
-      int wstart = ow * stride_w - pad_w;
-      int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width);
+      index_t wstart = static_cast<index_t>(ow) * stride_w - pad_w;
+      index_t wend = std::min(wstart + static_cast<index_t>((kernel_w - 1) * dilation_w + 1), width);
       while(hstart < 0)
         hstart += dilation_h;
       while(wstart < 0)

@@ -185,12 +218,12 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba
       // Else do it Non-Prefetch...
       else
 #endif
-      for (int ih = hstart; ih < hend; ih += dilation_h) {
-        for (int iw = wstart; iw < wend; iw += dilation_w) {
+      for (index_t ih = hstart; ih < hend; ih += dilation_h) {
+        for (index_t iw = wstart; iw < wend; iw += dilation_w) {
           int cached_index = threadIdx.x;
           const scalar_t *ptr_input = bottom_data + ih * in_stride_h + iw * in_stride_w;
-          for(int c = channel_offset; c < channels; c+= blockDim.x*kernel_stride_C) {
-            scalar_t val = ptr_input[c*in_stride_c];
+          for (index_t c = channel_offset; c < channels; c += static_cast<index_t>(blockDim.x) * kernel_stride_C) {
+            scalar_t val = ptr_input[c * in_stride_c];
             if ((val > out_cached[cached_index]) || at::_isnan(val)) {
               out_cached[cached_index] = val;
               out_mask_cached[cached_index] = ih * width + iw;

@@ -200,15 +233,15 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba
         }
       }

-      scalar_t *ptr_output_data = top_data + (oh * pooled_width + ow) * channels;
-      int64_t *ptr_output_mask = top_mask + (oh * pooled_width + ow) * channels;
+      scalar_t *ptr_output_data = top_data + (static_cast<index_t>(oh) * pooled_width + ow) * channels;
+      int64_t *ptr_output_mask = top_mask + (static_cast<index_t>(oh) * pooled_width + ow) * channels;

       int cached_index = threadIdx.x;
-      for(int c = channel_offset; c < channels; c+= blockDim.x*kernel_stride_C) {
+      for (index_t c = channel_offset; c < channels; c += static_cast<index_t>(blockDim.x) * kernel_stride_C) {
         ptr_output_data[c] = out_cached[cached_index];
-        ptr_output_mask[c] = out_mask_cached[cached_index];
+        ptr_output_mask[c] = static_cast<int64_t>(out_mask_cached[cached_index]);
         out_cached[cached_index] = at::numeric_limits<scalar_t>::lower_bound();
-        out_mask_cached[cached_index] = 0;
+        out_mask_cached[cached_index] = index_t(0);
         cached_index += blockDim.x;
       }
     }

@@ -216,7 +249,7 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba
 }


-static const int BLOCK_THREADS = 256;
+static constexpr int BLOCK_THREADS = 256;

 template <typename scalar_t, typename accscalar_t>
 #if defined (USE_ROCM)
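The can_use_int32_nhwc test above checks whether the largest reachable element offset fits in a 32-bit int before choosing the narrower index type. A host-side sketch with concrete, made-up sizes showing when the test fails:

// Sketch of the overflow test with concrete numbers: a 256 x 64 x 512 x 512
// packed NHWC input reaches offsets just past INT_MAX, so 64-bit indexing is needed.
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
  int64_t n = 256, c = 64, h = 512, w = 512;
  int64_t stride_w = c, stride_h = w * c, stride_n = h * w * c;   // packed NHWC strides
  int64_t max_offset = (n - 1) * stride_n + (h - 1) * stride_h + (w - 1) * stride_w + (c - 1);
  bool fits = max_offset <= std::numeric_limits<int>::max();
  std::printf("max_offset=%lld fits_in_int32=%d\n", (long long)max_offset, (int)fits);
  return 0;
}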
@@ -462,6 +495,11 @@ const Tensor& indices) {
           maxThreadsDim[0], std::min<int>(lastPow2(nInputPlane), max_threads / block_y / block_z));
       const dim3 block(block_x, block_y, block_z);

+      bool use_int32 = can_use_int32_nhwc(
+          nbatch, nInputPlane, inputHeight, inputWidth,
+          outputHeight, outputWidth,
+          in_stride_n, in_stride_c, in_stride_h, in_stride_w);
+
       int kernel_stride_C = ceil_div(
           safe_downcast<int, int64_t>(nInputPlane), block_x * 4);
       int kernel_size_C = ceil_div(

@@ -476,18 +514,41 @@ const Tensor& indices) {
           ceil_div(safe_downcast<int, int64_t>(outputHeight), block_z*BLOCK_STRIDE_FWD));
       const dim3 grid(grid_x, grid_y, grid_z);

-      size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * (sizeof(int) + sizeof(scalar_t));
-      AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock);
-
-      max_pool_forward_nhwc<scalar_t>
-      <<<grid, block, shmem_size, at::cuda::getCurrentCUDAStream()>>>(
-          input_data, nbatch,
-          nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth,
-          kH, kW, dH, dW, padH, padW, dilationH, dilationW,
-          in_stride_n, in_stride_c,
-          in_stride_h, in_stride_w,
-          kernel_stride_C, kernel_size_C,
-          output_data, indices_data);
+      size_t shmem_size;
+      size_t mask_elems = static_cast<size_t>(kernel_size_C) * block_x * block_y * block_z;
+
+      if (use_int32) {
+        shmem_size = mask_elems * (sizeof(int32_t) + sizeof(scalar_t));
+        TORCH_CHECK(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock,
+                    "shared memory too small");
+        max_pool_forward_nhwc<scalar_t, int32_t>
+          <<<grid, block, shmem_size, at::cuda::getCurrentCUDAStream()>>>(
+            input_data, static_cast<int>(nbatch),
+            static_cast<int32_t>(nInputPlane),
+            static_cast<int32_t>(inputHeight),
+            static_cast<int32_t>(inputWidth),
+            static_cast<int32_t>(outputHeight),
+            static_cast<int32_t>(outputWidth),
+            kH, kW, dH, dW, padH, padW, dilationH, dilationW,
+            static_cast<int32_t>(in_stride_n),
+            static_cast<int32_t>(in_stride_c),
+            static_cast<int32_t>(in_stride_h),
+            static_cast<int32_t>(in_stride_w),
+            kernel_stride_C, kernel_size_C,
+            output_data, indices_data);
+      } else {
+        shmem_size = mask_elems * (sizeof(int64_t) + sizeof(scalar_t));
+        TORCH_CHECK(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock,
+                    "shared memory too small");
+        max_pool_forward_nhwc<scalar_t, int64_t>
+          <<<grid, block, shmem_size, at::cuda::getCurrentCUDAStream()>>>(
+            input_data, static_cast<int>(nbatch),
+            nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth,
+            kH, kW, dH, dW, padH, padW, dilationH, dilationW,
+            in_stride_n, in_stride_c, in_stride_h, in_stride_w,
+            kernel_stride_C, kernel_size_C,
+            output_data, indices_data);
+      }
       C10_CUDA_KERNEL_LAUNCH_CHECK();
       break;
     }
@@ -36,9 +36,9 @@ namespace at::native {
 namespace {

 #if defined(USE_ROCM)
-static const int BLOCKDIMY = 16;
+static constexpr int BLOCKDIMY = 16;
 #else
-static const int BLOCKDIMY = 32;
+static constexpr int BLOCKDIMY = 32;
 #endif

 template

@@ -82,7 +82,7 @@ __host__ __device__ scalar_t lanczos_sum_expg_scaled(scalar_t x) {
   // lanczos approximation
   using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;

-  static const accscalar_t lanczos_sum_expg_scaled_num[13] = {
+  constexpr accscalar_t lanczos_sum_expg_scaled_num[13] = {
     0.006061842346248906525783753964555936883222,
     0.5098416655656676188125178644804694509993,
     19.51992788247617482847860966235652136208,

@@ -97,7 +97,7 @@ __host__ __device__ scalar_t lanczos_sum_expg_scaled(scalar_t x) {
     103794043.1163445451906271053616070238554,
     56906521.91347156388090791033559122686859
   };
-  static const accscalar_t lanczos_sum_expg_scaled_denom[13] = {
+  constexpr accscalar_t lanczos_sum_expg_scaled_denom[13] = {
     1.,
     66.,
     1925.,

@@ -126,10 +126,10 @@ __host__ __device__ scalar_t _igam_helper_fac(scalar_t a, scalar_t x) {

   using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
   accscalar_t ax, fac, res, num, numfac;
-  static const accscalar_t MAXLOG = std::is_same_v<accscalar_t,double> ?
+  constexpr accscalar_t MAXLOG = std::is_same_v<accscalar_t,double> ?
     7.09782712893383996843E2 : 88.72283905206835;
-  static const accscalar_t EXP1 = 2.718281828459045;
-  static const accscalar_t lanczos_g = 6.024680040776729583740234375;
+  constexpr accscalar_t EXP1 = 2.718281828459045;
+  constexpr accscalar_t lanczos_g = 6.024680040776729583740234375;

   if (::fabs(a - x) > 0.4 * ::fabs(a)) {
     ax = a * ::log(x) - x - ::lgamma(a);

@@ -158,9 +158,9 @@ __host__ __device__ scalar_t _igam_helper_series(scalar_t a, scalar_t x) {
   // Compute igam using DLMF 8.11.4. [igam1]

   using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
-  static const accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
+  constexpr accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
     1.11022302462515654042E-16 : 5.9604644775390625E-8;
-  static const int MAXITER = 2000;
+  constexpr int MAXITER = 2000;

   int i;
   accscalar_t ans, ax, c, r;

@@ -196,8 +196,8 @@ __host__ __device__ scalar_t _igamc_helper_series(scalar_t a, scalar_t x) {
   accscalar_t fac = 1;
   accscalar_t sum = 0;
   accscalar_t term, logx;
-  static const int MAXITER = 2000;
-  static const accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
+  constexpr int MAXITER = 2000;
+  constexpr accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
     1.11022302462515654042E-16 : 5.9604644775390625E-8;

   for (n = 1; n < MAXITER; n++) {

@@ -219,7 +219,7 @@ __host__ __device__ scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t
   // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1]

   using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
-  static const accscalar_t d[25][25] =
+  constexpr accscalar_t d[25][25] =
     {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, 8.296711340953086e-7, -1.7665952736826079e-7, 6.7078535434014986e-9, 1.0261809784240308e-8, -4.3820360184533532e-9, 9.1476995822367902e-10, -2.551419399494625e-11, -5.8307721325504251e-11, 2.4361948020667416e-11, -5.0276692801141756e-12, 1.1004392031956135e-13, 3.3717632624009854e-13, -1.3923887224181621e-13, 2.8534893807047443e-14, -5.1391118342425726e-16, -1.9752288294349443e-15},
     {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, -4.13125571381061e-15},
     {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, -1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6, -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, 8.8592218725911273e-15},

@@ -248,7 +248,7 @@ __host__ __device__ scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t

   int k, n, sgn;
   int maxpow = 0;
-  static const accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
+  constexpr accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
     1.11022302462515654042E-16 : 5.9604644775390625E-8;
   accscalar_t lambda = x / a;
   accscalar_t sigma = (x - a) / a;

@@ -314,12 +314,12 @@ __host__ __device__ scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar
   int i;
   accscalar_t ans, ax, c, yc, r, t, y, z;
   accscalar_t pk, pkm1, pkm2, qk, qkm1, qkm2;
-  static const int MAXITER = 2000;
-  static const accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
+  constexpr int MAXITER = 2000;
+  constexpr accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
     1.11022302462515654042E-16 : 5.9604644775390625E-8;
-  static const accscalar_t BIG = std::is_same_v<accscalar_t,double> ?
+  constexpr accscalar_t BIG = std::is_same_v<accscalar_t,double> ?
     4.503599627370496e15 : 16777216.;
-  static const accscalar_t BIGINV = std::is_same_v<accscalar_t,double> ?
+  constexpr accscalar_t BIGINV = std::is_same_v<accscalar_t,double> ?
     2.22044604925031308085e-16 : 5.9604644775390625E-8;

   ax = _igam_helper_fac(a, x);

@@ -385,10 +385,10 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) {
   using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
   accscalar_t absxma_a;

-  static const accscalar_t SMALL = 20.0;
-  static const accscalar_t LARGE = 200.0;
-  static const accscalar_t SMALLRATIO = 0.3;
-  static const accscalar_t LARGERATIO = 4.5;
+  constexpr accscalar_t SMALL = 20.0;
+  constexpr accscalar_t LARGE = 200.0;
+  constexpr accscalar_t SMALLRATIO = 0.3;
+  constexpr accscalar_t LARGERATIO = 4.5;

   if ((x < 0) || (a < 0)) {
     // out of defined-region of the function

@@ -467,10 +467,10 @@ __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) {

   using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
   accscalar_t absxma_a;
-  static const accscalar_t SMALL = 20.0;
-  static const accscalar_t LARGE = 200.0;
-  static const accscalar_t SMALLRATIO = 0.3;
-  static const accscalar_t LARGERATIO = 4.5;
+  constexpr accscalar_t SMALL = 20.0;
+  constexpr accscalar_t LARGE = 200.0;
+  constexpr accscalar_t SMALLRATIO = 0.3;
+  constexpr accscalar_t LARGERATIO = 4.5;

   // boundary values following SciPy
   if ((x < 0) || (a < 0)) {

@@ -231,7 +231,7 @@ const auto lcm_string = jiterator_stringify(
 const auto digamma_string = jiterator_stringify(
   template <typename T>
   T digamma(T x) {
-    static const double PI_f64 = 3.14159265358979323846;
+    static constexpr double PI_f64 = 3.14159265358979323846;

     // Short-circuits if x is +/- 0 and returns -/+ ∞ per the C++ standard
     if (x == 0) {

@@ -3072,9 +3072,9 @@ template <typename scalar_t>
 static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) {
   // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma
   using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
-  static const double PI_f64 = 3.14159265358979323846;
-  const accscalar_t PSI_10 = 2.25175258906672110764;
-  const accscalar_t A[] = {
+  static constexpr double PI_f64 = 3.14159265358979323846;
+  constexpr accscalar_t PSI_10 = 2.25175258906672110764;
+  constexpr accscalar_t A[] = {
     8.33333333333333333333E-2,
     -2.10927960927960927961E-2,
     7.57575757575757575758E-3,
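The ReduceOp hunk that follows switches the CUDA build to a decreasing-offset intra-warp reduction. A stand-alone CUDA sketch of that shuffle pattern (illustrative device helper, not taken from the patch):

// Decreasing-offset warp reduction: each step folds the upper half of the
// active lanes into the lower half, so lane 0 ends up with the warp's sum.
__inline__ __device__ float warp_reduce_sum(float v) {
  for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
    v += __shfl_down_sync(0xffffffff, v, offset);   // lane i accumulates lane i+offset
  }
  return v;   // valid in lane 0
}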
@ -655,8 +655,14 @@ struct ReduceOp {
|
|||||||
}
|
}
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
// Intra-warp reduction, fix CUDA to have offset decreasing for better numerics
|
||||||
|
// matching Triton, etc.
|
||||||
|
// todo for AMD
|
||||||
|
#ifdef USE_ROCM
|
||||||
for (int offset = 1; offset < dim_x; offset <<= 1) {
|
for (int offset = 1; offset < dim_x; offset <<= 1) {
|
||||||
|
#else
|
||||||
|
for (int offset = dim_x >> 1; offset > 0; offset >>= 1) {
|
||||||
|
#endif
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < output_vec_size; i++) {
|
for (int i = 0; i < output_vec_size; i++) {
|
||||||
arg_t other = ops.warp_shfl_down(value[i], offset);
|
arg_t other = ops.warp_shfl_down(value[i], offset);
|
||||||
@ -1091,11 +1097,7 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
|
|||||||
// threads with different threadIdx.x are independent and will produce results for different outputs.
|
// threads with different threadIdx.x are independent and will produce results for different outputs.
|
||||||
// In such case, values in each loaded vector always correspond to different outputs.
|
// In such case, values in each loaded vector always correspond to different outputs.
|
||||||
if (fastest_moving_stride == sizeof(scalar_t)) {
|
if (fastest_moving_stride == sizeof(scalar_t)) {
|
||||||
#ifdef USE_ROCM
|
|
||||||
if (reduction_on_fastest_striding_dimension && dim0 >= 128 && iter.num_reduce_dims() == 1) {
|
if (reduction_on_fastest_striding_dimension && dim0 >= 128 && iter.num_reduce_dims() == 1) {
|
||||||
#else
|
|
||||||
if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= input_vec_size) {
|
|
||||||
#endif
|
|
||||||
// Case 1: "vectorize along input"
|
// Case 1: "vectorize along input"
|
||||||
// Note that if vt0 < ReduceConfig::vec_size, then this means the register pressure could be high, in such case,
|
// Note that if vt0 < ReduceConfig::vec_size, then this means the register pressure could be high, in such case,
|
||||||
// we should avoid vectorization.
|
// we should avoid vectorization.
|
||||||
|
@@ -39,9 +39,14 @@ static void std_var_kernel_cuda(TensorIterator& iter, double correction, bool ta
 template <typename scalar_t, typename acc_t=scalar_t, typename out_t=scalar_t>
 void mean_kernel_impl(TensorIterator& iter) {
   //  returns acc_t for all non-complex dtypes and returns T for c10::complex<T>
+  constexpr bool is_16_bits = sizeof(scalar_t) == 2;
   using factor_t = typename c10::scalar_value_type<acc_t>::type;
   factor_t factor = static_cast<factor_t>(iter.num_output_elements()) / iter.numel();
-  gpu_reduce_kernel<scalar_t, out_t>(iter, MeanOps<scalar_t, acc_t, factor_t, out_t> {factor});
+  if constexpr (is_16_bits) {
+    gpu_reduce_kernel<scalar_t, out_t, /*vt0=*/4, /*input_vec_size=*/8>(iter, MeanOps<scalar_t, acc_t, factor_t, out_t> {factor});
+  } else {
+    gpu_reduce_kernel<scalar_t, out_t>(iter, MeanOps<scalar_t, acc_t, factor_t, out_t> {factor});
+  }
 }

 static void mean_kernel_cuda(TensorIterator& iter) {
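A side note on why `/*vt0=*/4, /*input_vec_size=*/8` is paired with 16-bit element types here: eight Half/BFloat16 values occupy 16 bytes, i.e. a single 128-bit load (the `*_DWORDX4`-style access the old sum kernel comment below mentions). A compile-time sketch with a hypothetical helper, using `uint16_t` as a stand-in for a 2-byte scalar:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Bytes moved per vectorized load for a given element type and vector width.
template <typename scalar_t, int vec_size>
constexpr std::size_t bytes_per_load() {
  return sizeof(scalar_t) * vec_size;
}

int main() {
  // 2-byte elements (Half/BFloat16) with vec_size 8 -> 16 bytes, one 128-bit load.
  static_assert(bytes_per_load<std::uint16_t, 8>() == 16, "8 x 16-bit = 128 bits");
  // 4-byte elements already reach a 128-bit load with vec_size 4.
  static_assert(bytes_per_load<float, 4>() == 16, "4 x 32-bit = 128 bits");
  std::printf("ok\n");
  return 0;
}
```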
@@ -13,24 +13,19 @@ namespace at::native {
 template <typename scalar_t, typename acc_t = scalar_t, typename out_t = scalar_t>
 struct sum_functor {
   void operator()(TensorIterator& iter) {
-#ifdef USE_ROCM
-    // Half and BFloat16 can be packed in groups of up to 8 elements and
-    // can use *_DWORDX4 instructions to achieve that.
-    const bool is_16_bits =
-      ( (std::is_same<at::Half, scalar_t>::value) ||
-        (std::is_same<at::BFloat16, scalar_t>::value) );
-    if (is_16_bits) {
+    const auto sum_combine = [] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t {
+      return a + b;
+    };
+    constexpr bool is_16_bits = sizeof(scalar_t) == 2;
+    if constexpr (is_16_bits) {
       gpu_reduce_kernel<scalar_t, out_t, /*vt0=*/4, /*input_vec_size=*/8>(
-          iter, func_wrapper<out_t>([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t {
-            return a + b;
-          }));
-      return;
+          iter, func_wrapper<out_t>(sum_combine)
+      );
+    } else {
+      gpu_reduce_kernel<scalar_t, out_t>(
+          iter, func_wrapper<out_t>(sum_combine)
+      );
     }
-#endif
-    gpu_reduce_kernel<scalar_t, out_t>(
-        iter, func_wrapper<out_t>([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t {
-          return a + b;
-        }));
   }
 };

@@ -77,8 +72,8 @@ struct nansum_functor_complex {
 #if AT_USE_JITERATOR()
   void operator()(TensorIterator& iter) {
     std::string func = jiterator_stringify(
-        arg_t combine(arg_t a, scalar_t b) {
-          return a + (std::isnan(b) ? arg_t{0.} : arg_t{b});
+        arg_t combine(arg_t a, arg_t b) {
+          return a + (std::isnan(b) ? arg_t{0.} : b);
         }
     );
     jitted_gpu_reduce_kernel<nansum_name, scalar_t, scalar_t>(
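The signature fix above matters because `combine` also merges partial results from different blocks, so its second argument is an accumulator value rather than a raw `scalar_t` input. A small host-side sketch of the same NaN-ignoring combine (plain C++, not the jiterator string):

```cpp
#include <cmath>
#include <cstdio>

// Host-side sketch of the nansum combine: both arguments are accumulator
// values, and a NaN contribution is treated as zero.
double nansum_combine(double a, double b) {
  return a + (std::isnan(b) ? 0.0 : b);
}

int main() {
  const double data[] = {1.0, NAN, 2.5, NAN, 4.0};
  double acc = 0.0;
  for (double x : data) acc = nansum_combine(acc, x);            // reduce raw inputs
  double total = nansum_combine(acc, nansum_combine(0.0, NAN));  // merge a partial result
  std::printf("%g\n", total);  // 7.5
  return 0;
}
```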
@@ -464,6 +464,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
   }
 #endif
   int32_t trailingSize;
+  int nDimsLocal = nDims;
   TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
   if (isInOutAligned) {
     // in this case we can and should flatten the tensors after the cat dim
@@ -477,7 +478,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
     // and divide all strides except last by elems_per_vec (last stride is 1 always)
     // for input, we will fix up the sizes and strides in the kernel directly
     kernelOutputParam = outputParam;
-    nDims = dimension + 1;
+    nDimsLocal = dimension + 1;
     constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
     auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
     kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
@@ -494,7 +495,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
     case 0:
       break;
     case 1:
-      cat_dim = nDims - cat_dim;
+      cat_dim = nDimsLocal - cat_dim;
       break;
     default:
       cat_dim--;
@@ -525,7 +526,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
         data, catMetaData, outputParam, cat_dim, outputParam.tensorStride[cat_dim]);\
   }\
   C10_CUDA_KERNEL_LAUNCH_CHECK();
-  switch (nDims) {
+  switch (nDimsLocal) {
     case 1:
       HANDLE_CASE(1);
       break;
@@ -21,9 +21,15 @@ namespace {
 struct offset_t {
   int stride;
   int begin;
-  __device__ int operator[](int i) {
+  __device__ int operator[](int i) const {
     return stride * (begin + i);
   }
+#if CCCL_VERSION >= 3001000
+  __device__ offset_t& operator+=(int i) {
+    begin += i;
+    return *this;
+  }
+#endif
 };
 // Segmented sort by full sort algorithm:.
 // Say we are sorting a (2, 3) tensor. We have in flattened form:
@@ -277,7 +277,7 @@ struct BilinearFilterFunctor {
     return 0;
   }

-  static const int size = 2;
+  static constexpr int size = 2;
 };

 // taken from
@@ -301,7 +301,7 @@ struct BicubicFilterFunctor {
     return 0;
   }

-  static const int size = 4;
+  static constexpr int size = 4;
 };

 template <typename accscalar_t>
@@ -127,6 +127,29 @@ __global__ void upsample_bilinear2d_nhwc_out_frame(
   }
 }

+#ifdef USE_ROCM
+// Helper function to compute output pixel range that can contribute to input pixel
+template <typename accscalar_t>
+__device__ __forceinline__ void compute_output_range(
+    int input_pos,
+    accscalar_t scale,
+    int output_size,
+    bool align_corners,
+    int& min_output,
+    int& max_output) {
+  accscalar_t lo, hi;
+  if (align_corners) {
+    lo = static_cast<accscalar_t>(input_pos - 1) / scale;
+    hi = static_cast<accscalar_t>(input_pos + 1) / scale;
+  } else {
+    lo = (input_pos - static_cast<accscalar_t>(0.5)) / scale - static_cast<accscalar_t>(0.5);
+    hi = (input_pos + static_cast<accscalar_t>(1.5)) / scale - static_cast<accscalar_t>(0.5);
+  }
+  min_output = max(0, static_cast<int>(ceil(lo)));
+  max_output = min(output_size - 1, static_cast<int>(floor(hi)));
+}
+#endif
+
 // Backward (adjoint) operation 1 <- 2 (accumulates)
 template <typename scalar_t, typename accscalar_t>
 C10_LAUNCH_BOUNDS_1(1024)
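A host-side sketch of the inverse mapping `compute_output_range` performs for the align_corners=false case (assuming the usual pixel-area mapping `src = (dst + 0.5) * scale - 0.5`; the real kernel goes through `area_pixel_compute_source_index`), useful for sanity-checking which output rows can read a given input row:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// Output pixel h2 samples src = (h2 + 0.5) * scale - 0.5 and reads input rows
// floor(src) and floor(src) + 1. Inverting that bounds the output pixels that
// can touch input row h1: src must lie in (h1 - 1, h1 + 1).
void compute_output_range(int input_pos, float scale, int output_size,
                          int& min_output, int& max_output) {
  float lo = (input_pos - 0.5f) / scale - 0.5f;
  float hi = (input_pos + 1.5f) / scale - 0.5f;
  min_output = std::max(0, static_cast<int>(std::ceil(lo)));
  max_output = std::min(output_size - 1, static_cast<int>(std::floor(hi)));
}

int main() {
  const int in_h = 4, out_h = 8;
  const float scale = static_cast<float>(in_h) / out_h;  // toy stand-in for rheight
  for (int h1 = 0; h1 < in_h; ++h1) {
    int lo, hi;
    compute_output_range(h1, scale, out_h, lo, hi);
    std::printf("input row %d is read by output rows [%d, %d]\n", h1, lo, hi);
  }
  return 0;
}
```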
@@ -141,8 +164,74 @@ __global__ void upsample_bilinear2d_backward_out_frame(
     const bool align_corners,
     scalar_t* __restrict__ idata,
     const scalar_t* __restrict__ odata) {
-  const size_t o_numel = nc * width2 * height2;
+  // In C++, integer multiplication, like in standard arithmetic, is generally commutative.
   const size_t i_numel = nc * width1 * height1;
+#ifdef USE_ROCM
+  for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < i_numel;
+       index += blockDim.x * gridDim.x) {
+    // Decode input pixel coordinates
+    size_t index_temp = index;
+    const int w1 = index_temp % width1;
+    index_temp /= width1;
+    const int h1 = index_temp % height1;
+    const size_t nc_idx = index_temp / height1;
+
+    accscalar_t grad_sum = 0;
+
+    // Find range of output pixels that could interpolate from this input pixel
+    int h2_min, h2_max, w2_min, w2_max;
+    compute_output_range<accscalar_t>(h1, rheight, height2, align_corners, h2_min, h2_max);
+    compute_output_range<accscalar_t>(w1, rwidth, width2, align_corners, w2_min, w2_max);
+
+    // Iterate over potential output pixels
+    for (int h2 = h2_min; h2 <= h2_max; h2++) {
+      for (int w2 = w2_min; w2 <= w2_max; w2++) {
+        // Compute source coordinates for this output pixel
+        const accscalar_t h1r = area_pixel_compute_source_index<accscalar_t>(
+            rheight, h2, align_corners, /*cubic=*/false);
+        const int h1_base = (int)h1r;
+        const int h1p = (h1_base < height1 - 1) ? 1 : 0;
+        const accscalar_t h1lambda = h1r - h1_base;
+        const accscalar_t h0lambda = static_cast<accscalar_t>(1) - h1lambda;
+
+        const accscalar_t w1r = area_pixel_compute_source_index<accscalar_t>(
+            rwidth, w2, align_corners, /*cubic=*/false);
+        const int w1_base = (int)w1r;
+        const int w1p = (w1_base < width1 - 1) ? 1 : 0;
+        const accscalar_t w1lambda = w1r - w1_base;
+        const accscalar_t w0lambda = static_cast<accscalar_t>(1) - w1lambda;
+
+        // Check if our input pixel participates in this interpolation and accumulate all weights
+        // At boundaries, h1p=0 or w1p=0 causes some sampling positions to collapse
+        // to the same pixel, so we need to accumulate weights from all matching positions
+        accscalar_t weight = 0;
+
+        // Check all four interpolation positions and accumulate weights
+        if (h1 == h1_base && w1 == w1_base) {
+          weight += h0lambda * w0lambda; // top-left
+        }
+        if (h1 == h1_base && w1 == w1_base + w1p) {
+          weight += h0lambda * w1lambda; // top-right (may be same as top-left if w1p=0)
+        }
+        if (h1 == h1_base + h1p && w1 == w1_base) {
+          weight += h1lambda * w0lambda; // bottom-left (may be same as top-left if h1p=0)
+        }
+        if (h1 == h1_base + h1p && w1 == w1_base + w1p) {
+          weight += h1lambda * w1lambda; // bottom-right (may collapse to other positions)
+        }
+
+        if (weight > 0) {
+          const size_t output_idx = nc_idx * height2 * width2 + h2 * width2 + w2;
+          grad_sum += weight * static_cast<accscalar_t>(odata[output_idx]);
+        }
+      }
+    }
+
+    // Write accumulated gradient (no atomics needed)
+    idata[index] = static_cast<scalar_t>(grad_sum);
+  }
+#else
+  const size_t o_numel = nc * width2 * height2;
   for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < o_numel;
        index += blockDim.x * gridDim.x) {
     size_t index_temp = index;
@@ -191,6 +280,7 @@ __global__ void upsample_bilinear2d_backward_out_frame(
         static_cast<scalar_t>(h1lambda * w1lambda * d2val),
         true);
   }
+#endif
 }

 template <typename scalar_t, typename accscalar_t>
@@ -387,7 +477,6 @@ static void upsample_bilinear2d_backward_out_cuda_template(
   // threads are not covering the whole input tensor.
   grad_input.zero_();

-  const size_t num_kernels = nbatch * channels * output_height * output_width;
   const int num_threads = std::min(
       at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -397,6 +486,12 @@ static void upsample_bilinear2d_backward_out_cuda_template(
     return;
   }

+#ifdef USE_ROCM
+  constexpr bool use_input = true;
+#else
+  constexpr bool use_input = false;
+#endif
+
   AT_DISPATCH_FLOATING_TYPES_AND2(
       at::ScalarType::Half, at::ScalarType::BFloat16,
       grad_output_.scalar_type(), "upsample_bilinear2d_backward_out_frame", [&] {
@@ -414,6 +509,8 @@ static void upsample_bilinear2d_backward_out_cuda_template(
         const accscalar_t rwidth = area_pixel_compute_scale<accscalar_t>(
             input_width, output_width, align_corners, scales_w);

+        const size_t num_kernels = nbatch * channels * output_height * output_width;
+
         upsample_bilinear2d_backward_nhwc_out_frame<scalar_t, accscalar_t>
             <<<ceil_div(num_kernels, static_cast<size_t>(num_threads)), num_threads, 0, stream>>>(
                 input_height,
@@ -444,6 +541,8 @@ static void upsample_bilinear2d_backward_out_cuda_template(
         const accscalar_t rwidth = area_pixel_compute_scale<accscalar_t>(
             input_width, output_width, align_corners, scales_w);

+        const size_t num_kernels = nbatch * channels * (use_input ? input_height * input_width : output_height * output_width);
+
         upsample_bilinear2d_backward_out_frame<scalar_t, accscalar_t>
             <<<ceil_div(num_kernels, static_cast<size_t>(num_threads)),
                num_threads,
@@ -141,7 +141,11 @@ WelfordDataLN cuWelfordOnlineSum(
   if constexpr (!rms_norm){
     U delta = val - curr_sum.mean;
     U new_count = curr_sum.count + 1.f;
+#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
+    U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count);
+#else
     U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster
+#endif
     return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count};
   } else{
     return {0.f, curr_sum.sigma2 + val * val, 0};
@@ -159,7 +163,11 @@ WelfordDataLN cuWelfordCombine(
   U count = dataA.count + dataB.count;
   U mean, sigma2;
   if (count > decltype(dataB.count){0}) {
+#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
+    auto coef = __builtin_amdgcn_rcpf(count);
+#else
     auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division
+#endif
     auto nA = dataA.count * coef;
     auto nB = dataB.count * coef;
     mean = nA*dataA.mean + nB*dataB.mean;
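A host-side C++ sketch of the Welford update and merge these kernels implement, with a plain `1.f/count` standing in for `__builtin_amdgcn_rcpf` (which is just a faster, approximate reciprocal on ROCm). The `sigma2` merge line is outside the hunk above, so the standard parallel-Welford combination is used here as an assumption:

```cpp
#include <cstdio>

struct Welford { float mean = 0.f, sigma2 = 0.f, count = 0.f; };  // sigma2 holds M2

Welford online_sum(Welford w, float val) {
  float delta = val - w.mean;
  float new_count = w.count + 1.f;
  float new_mean = w.mean + delta * (1.f / new_count);  // rcp(new_count) on the fast path
  return {new_mean, w.sigma2 + delta * (val - new_mean), new_count};
}

Welford combine(Welford a, Welford b) {
  float count = a.count + b.count;
  if (count <= 0.f) return a;
  float coef = 1.f / count;                              // rcp(count) on the fast path
  float nA = a.count * coef, nB = b.count * coef;
  float mean = nA * a.mean + nB * b.mean;
  float delta = b.mean - a.mean;
  float sigma2 = a.sigma2 + b.sigma2 + delta * delta * a.count * nB;  // Chan et al. merge
  return {mean, sigma2, count};
}

int main() {
  const float x[] = {1.f, 2.f, 4.f, 8.f, 16.f, 32.f};
  Welford full, lo, hi;
  for (int i = 0; i < 6; ++i) full = online_sum(full, x[i]);
  for (int i = 0; i < 3; ++i) lo = online_sum(lo, x[i]);
  for (int i = 3; i < 6; ++i) hi = online_sum(hi, x[i]);
  Welford merged = combine(lo, hi);
  std::printf("mean %.3f vs %.3f, M2 %.3f vs %.3f\n",
              full.mean, merged.mean, full.sigma2, merged.sigma2);
  return 0;
}
```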
@@ -466,7 +466,11 @@ struct ReduceJitOp {

   __syncthreads();

+#ifdef USE_ROCM
   for (int offset = 1; offset < dim_x; offset <<= 1) {
+#else
+  for (int offset = dim_x >> 1; offset > 0; offset >>= 1) {
+#endif
 #pragma unroll
     for (int i = 0; i < output_vec_size; i++) {
       arg_t other = reducer::warp_shfl_down(value[i], offset);
@@ -160,8 +160,12 @@ static bool mkldnn_conv_enabled_fpmath_mode_bf16(){
 }

 static bool mkldnn_conv_enabled_fpmath_mode_tf32(){
-  return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::CONV) == at::Float32Precision::TF32 &&
-      cpuinfo_has_x86_amx_fp16();
+#if defined(__x86_64__) || defined(_M_X64)
+  return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::CONV) == at::Float32Precision::TF32 &&
+      cpuinfo_has_x86_amx_fp16();
+#else
+  return false; //TF32 not supported on power system
+#endif
 }

 static inline at::MemoryFormat mkldnn_convolution_memory_format(int64_t dims, bool is_channels_last) {
@@ -74,8 +74,12 @@ static bool use_mkldnn_bf32_linear() {
 }

 static bool use_mkldnn_tf32_linear() {
-  return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::TF32 &&
+#if defined(__x86_64__) || defined(_M_X64)
+  return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::TF32 &&
       cpuinfo_has_x86_amx_fp16();
+#else
+  return false; // TF32 not supported on power system
+#endif
 }

 Tensor mkldnn_linear(
@@ -114,8 +114,13 @@ static bool use_mkldnn_bf32_matmul() {
   return use_mkldnn_bf16_matmul() && at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::BF16;
 }

+
 static bool use_mkldnn_tf32_matmul() {
-  return cpuinfo_has_x86_amx_fp16() && at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::TF32;
+#if defined(__x86_64__) || defined(_M_X64)
+  return cpuinfo_has_x86_amx_fp16() && at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::TF32;
+#else
+  return false; // TF32 not supported on power system
+#endif
 }

 // returns an ideep::tensor
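The same guard pattern in isolation (a sketch with a hypothetical wrapper, not the mkldnn code): only x86-64 builds ever reach the runtime AMX query; every other target, e.g. ppc64le, compiles to `return false`.

```cpp
#include <cstdio>

// Compile-time architecture gate: non-x86-64 targets never evaluate the
// runtime CPU-feature query and simply report the TF32 path as unavailable.
static bool tf32_path_available(bool has_amx_fp16_runtime) {
#if defined(__x86_64__) || defined(_M_X64)
  return has_amx_fp16_runtime;
#else
  (void)has_amx_fp16_runtime;
  return false;
#endif
}

int main() {
  std::printf("TF32 path available: %d\n", tf32_path_available(true));
  return 0;
}
```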
@@ -411,7 +416,7 @@ static inline bool checksize(const Tensor& mat1, const Tensor& mat2){
   // else if dim = 3, mat1's size = (b * m * n), mat2's size = (b * n * k)
   // else called from aten::mv, mat1.size = (m * n), mat2.size = (n)
   // only m * n * b * k(if exist) are large enough we can get benefit from mkldnn optimized gemm kernel
-  static const int64_t mkldnn_gemm_min_size = 16 * 16 * 16;
+  constexpr int64_t mkldnn_gemm_min_size = 16 * 16 * 16;
   if (mat1.dim() == 1 && mat2.dim() == 1) {
     // aten::dot
     return mat1.size(0) > mkldnn_gemm_min_size;
@@ -712,7 +712,7 @@ Tensor wrapped_scalar_tensor_mps(const Scalar& scalar, const Device device) {
   } else if (scalar.isBoolean()) {
     tensor = at::scalar_tensor(scalar, at::device(device).dtype(at::kBool));
   } else if (scalar.isComplex()) {
-    tensor = at::scalar_tensor(scalar, at::device(device).dtype(at::kComplexDouble));
+    tensor = at::scalar_tensor(scalar, at::device(device).dtype(at::kComplexFloat));
   } else {
     TORCH_INTERNAL_ASSERT(scalar.isIntegral(false));
     tensor = at::scalar_tensor(scalar, at::device(device).dtype(at::kLong));
@@ -1,16 +1,16 @@
 #pragma once
 #include <c10/metal/common.h>

-template <unsigned N = c10::metal::max_ndim, typename idx_type_t = int64_t>
-struct CatLargeSharedParams {
+template <typename idx_type_t = int64_t, unsigned N = c10::metal::max_ndim>
+struct CatSharedParams {
   int32_t ndim;
   int32_t cat_dim;
   ::c10::metal::array<idx_type_t, N> output_strides;
   ::c10::metal::array<idx_type_t, N> output_sizes;
 };

-template <unsigned N = c10::metal::max_ndim, typename idx_type_t = int64_t>
-struct CatLargeInputParams {
+template <typename idx_type_t = int64_t, unsigned N = c10::metal::max_ndim>
+struct CatInputParams {
   idx_type_t cat_dim_offset;
   idx_type_t input_element_offset;
   ::c10::metal::array<idx_type_t, N> input_strides;
@@ -6,26 +6,25 @@
 using namespace metal;
 using namespace c10::metal;

-template <typename T_in, typename T_out>
-kernel void cat_large(
+template <typename I, typename T_in, typename T_out>
+kernel void cat(
     constant T_in* input [[buffer(0)]],
     device T_out* output [[buffer(1)]],
-    constant CatLargeSharedParams<>& shared_params [[buffer(2)]],
-    constant CatLargeInputParams<>& input_params [[buffer(3)]],
+    constant CatSharedParams<I>& shared_params [[buffer(2)]],
+    constant CatInputParams<I>& input_params [[buffer(3)]],
     uint tid [[thread_position_in_grid]]) {
   auto ndim = shared_params.ndim;
   auto cat_dim = shared_params.cat_dim;
   constant auto& output_strides = shared_params.output_strides;
-  constant auto& output_sizes = shared_params.output_sizes;

   auto cat_dim_offset = input_params.cat_dim_offset;
   auto input_element_offset = input_params.input_element_offset;
   constant auto& input_strides = input_params.input_strides;
   constant auto& input_sizes = input_params.input_sizes;

-  auto input_element_idx = static_cast<int64_t>(tid) + input_element_offset;
-  int64_t input_offset = 0;
-  int64_t output_offset = 0;
+  auto input_element_idx = static_cast<I>(tid) + input_element_offset;
+  I input_offset = 0;
+  I output_offset = 0;

   for (auto dim = ndim - 1; dim >= 0; dim--) {
     auto dim_size = input_sizes[dim];
@@ -42,41 +41,45 @@ kernel void cat_large(
     output[output_offset] = static_cast<T_out>(input[input_offset]);
 }

-#define REGISTER_CAT_LARGE_OP(T_in, T_out) \
-  template [[host_name("cat_large_" #T_in "_" #T_out)]] \
-  kernel void cat_large<T_in, T_out>( \
+#define REGISTER_CAT_OP(I, T_in, T_out) \
+  template [[host_name("cat_" #I "_" #T_in "_" #T_out)]] \
+  kernel void cat<I, T_in, T_out>( \
       constant T_in * input [[buffer(0)]], \
       device T_out * output [[buffer(1)]], \
-      constant CatLargeSharedParams<> & shared_params [[buffer(2)]], \
-      constant CatLargeInputParams<> & input_params [[buffer(3)]], \
+      constant CatSharedParams<I> & shared_params [[buffer(2)]], \
+      constant CatInputParams<I> & input_params [[buffer(3)]], \
       uint tid [[thread_position_in_grid]]);

-#define REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(T_out) \
-  REGISTER_CAT_LARGE_OP(float, T_out); \
-  REGISTER_CAT_LARGE_OP(half, T_out); \
-  REGISTER_CAT_LARGE_OP(bfloat, T_out); \
-  REGISTER_CAT_LARGE_OP(int, T_out); \
-  REGISTER_CAT_LARGE_OP(uint, T_out); \
-  REGISTER_CAT_LARGE_OP(long, T_out); \
-  REGISTER_CAT_LARGE_OP(ulong, T_out); \
-  REGISTER_CAT_LARGE_OP(short, T_out); \
-  REGISTER_CAT_LARGE_OP(ushort, T_out); \
-  REGISTER_CAT_LARGE_OP(char, T_out); \
-  REGISTER_CAT_LARGE_OP(uchar, T_out); \
-  REGISTER_CAT_LARGE_OP(bool, T_out);
+#define REGISTER_CAT_OP_ALL_INPUT_TYPES(I, T_out) \
+  REGISTER_CAT_OP(I, float, T_out); \
+  REGISTER_CAT_OP(I, half, T_out); \
+  REGISTER_CAT_OP(I, bfloat, T_out); \
+  REGISTER_CAT_OP(I, int, T_out); \
+  REGISTER_CAT_OP(I, uint, T_out); \
+  REGISTER_CAT_OP(I, long, T_out); \
+  REGISTER_CAT_OP(I, ulong, T_out); \
+  REGISTER_CAT_OP(I, short, T_out); \
+  REGISTER_CAT_OP(I, ushort, T_out); \
+  REGISTER_CAT_OP(I, char, T_out); \
+  REGISTER_CAT_OP(I, uchar, T_out); \
+  REGISTER_CAT_OP(I, bool, T_out);

-REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(float);
-REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(half);
-REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(bfloat);
-REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(int);
-REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(uint);
-REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(long);
-REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(ulong);
-REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(short);
-REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(ushort);
-REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(char);
-REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(uchar);
-REGISTER_CAT_LARGE_OP_ALL_INPUT_TYPES(bool);
+#define REGISTER_CAT_FOR_INDEX_TYPE(I) \
+  REGISTER_CAT_OP_ALL_INPUT_TYPES(I, float); \
+  REGISTER_CAT_OP_ALL_INPUT_TYPES(I, half); \
+  REGISTER_CAT_OP_ALL_INPUT_TYPES(I, bfloat); \
+  REGISTER_CAT_OP_ALL_INPUT_TYPES(I, int); \
+  REGISTER_CAT_OP_ALL_INPUT_TYPES(I, uint); \
+  REGISTER_CAT_OP_ALL_INPUT_TYPES(I, long); \
+  REGISTER_CAT_OP_ALL_INPUT_TYPES(I, ulong); \
+  REGISTER_CAT_OP_ALL_INPUT_TYPES(I, short); \
+  REGISTER_CAT_OP_ALL_INPUT_TYPES(I, ushort); \
+  REGISTER_CAT_OP_ALL_INPUT_TYPES(I, char); \
+  REGISTER_CAT_OP_ALL_INPUT_TYPES(I, uchar); \
+  REGISTER_CAT_OP_ALL_INPUT_TYPES(I, bool); \
+  \
+  REGISTER_CAT_OP(I, float2, float2); \
+  REGISTER_CAT_OP(I, half2, half2);

-REGISTER_CAT_LARGE_OP(float2, float2);
-REGISTER_CAT_LARGE_OP(half2, half2);
+REGISTER_CAT_FOR_INDEX_TYPE(int64_t);
+REGISTER_CAT_FOR_INDEX_TYPE(int32_t);
@@ -54,6 +54,10 @@ Tensor dot_mps(const Tensor& self, const Tensor& other) {
   using namespace mps;
   using CachedGraph = MPSBinaryCachedGraph;

+  if (self.numel() == 0 & other.numel() == 0) {
+    return zeros({}, self.options());
+  }
+
   dot_check(self, other);

   auto output = at::empty({}, self.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
@@ -907,6 +907,8 @@ Tensor& index_fill_mps_(Tensor& self, int64_t dim, const Tensor& index, const Te
   TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int,
               "index_fill_(): Expected dtype int32 or int64 for index");
   TORCH_CHECK(dim == 0 || dim < self.dim(), "index_fill_(): Indexing dim ", dim, " is out of bounds of tensor");
+  TORCH_CHECK(self.is_complex() || !source.is_complex(),
+              "index_fill_(): Converting complex Scalar to non-complex type is not supported");
   // MPS.scatter crashes if used with complex dtypes
   TORCH_CHECK(!c10::isComplexType(self.scalar_type()), "index_fill_(): Complex types are yet not supported");

@@ -3,6 +3,7 @@
 #include <ATen/MemoryOverlap.h>
 #include <ATen/WrapDimUtils.h>
 #include <ATen/mps/MPSProfiler.h>
+#include <ATen/native/Pool.h>
 #include <ATen/native/TensorShape.h>
 #include <ATen/native/TypeProperties.h>
 #include <ATen/native/mps/OperationUtils.h>
@@ -69,29 +70,40 @@ static void check_shape_except_dim(const Tensor& first, const Tensor& second, in
   }
 }

-// This implementation of cat is used only if one of the inputs or the output is
-// too large to use MPSGraph.
+template <typename T>
+std::string get_type_str();
+
+template <>
+std::string get_type_str<int64_t>() {
+  return "int64_t";
+}
+
+template <>
+std::string get_type_str<int32_t>() {
+  return "int32_t";
+}
+
 // NOTE: `output` is expected to already have the correct size.
-static void cat_out_large_tensor_mps(const ITensorListRef& inputs, int64_t dimension, const Tensor& output) {
-  CatLargeSharedParams shared_params;
+template <typename idx_type_t>
+static void cat_out_mps_impl(const ITensorListRef& inputs, int64_t dimension, const Tensor& output) {
+  CatSharedParams<idx_type_t> shared_params;

   shared_params.ndim = output.dim();
   shared_params.cat_dim = dimension;

   for (const auto dim : c10::irange(output.dim())) {
-    shared_params.output_strides[dim] = output.stride(dim);
-    shared_params.output_sizes[dim] = output.size(dim);
+    shared_params.output_strides[dim] = safe_downcast<idx_type_t, int64_t>(output.stride(dim));
+    shared_params.output_sizes[dim] = safe_downcast<idx_type_t, int64_t>(output.size(dim));
   }

-  int64_t cat_dim_offset = 0;
+  idx_type_t cat_dim_offset = 0;
   size_t input_idx = 0;
   MPSStream* stream = getCurrentMPSStream();

-  // Launch a separate kernels for each input. This will produce some overhead,
-  // but that should be relatively minimal since at least one of the inputs is
-  // very large. In order to launch only one kernel to process all inputs, we
-  // would have to copy all the input tensor data into a packed buffer, which
-  // would not be ideal.
+  // Launch a separate kernels for each input. This will produce some overhead.
+  // In order to launch only one kernel to process all inputs, we would have to
+  // copy all the input tensor data into a packed buffer, which would not be
+  // ideal.
   for (const Tensor& input : inputs) {
     if (input.numel() == 0) {
       continue;
@@ -104,21 +116,23 @@ static void cat_out_large_tensor_mps(const ITensorListRef& inputs, int64_t dimen

     for (int64_t numel_remaining = input.numel(); numel_remaining > 0; numel_remaining -= max_num_threads) {
       auto num_threads = std::min(max_num_threads, numel_remaining);
-      CatLargeInputParams input_params;
+      CatInputParams<idx_type_t> input_params;

-      input_params.cat_dim_offset = cat_dim_offset;
-      input_params.input_element_offset = input.numel() - numel_remaining;
+      input_params.cat_dim_offset = safe_downcast<idx_type_t, int64_t>(cat_dim_offset);
+      input_params.input_element_offset = safe_downcast<idx_type_t, int64_t>(input.numel() - numel_remaining);

       for (const auto dim : c10::irange(input.dim())) {
-        input_params.input_strides[dim] = input.stride(dim);
-        input_params.input_sizes[dim] = input.size(dim);
+        input_params.input_strides[dim] = safe_downcast<idx_type_t, int64_t>(input.stride(dim));
+        input_params.input_sizes[dim] = safe_downcast<idx_type_t, int64_t>(input.size(dim));
       }

       dispatch_sync_with_rethrow(stream->queue(), ^() {
         @autoreleasepool {
           id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
-          auto pipeline_state = lib.getPipelineStateForFunc(
-              fmt::format("cat_large_{}_{}", scalarToMetalTypeString(input), scalarToMetalTypeString(output)));
+          auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("cat_{}_{}_{}",
+                                                                        get_type_str<idx_type_t>(),
+                                                                        scalarToMetalTypeString(input),
+                                                                        scalarToMetalTypeString(output)));
           getMPSProfiler().beginProfileKernel(pipeline_state, "cat", {input});
           [computeEncoder setComputePipelineState:pipeline_state];
           mtl_setArgs(computeEncoder, input, output, shared_params, input_params);
@@ -294,13 +308,6 @@ TORCH_IMPL_FUNC(cat_out_mps)
               " and out is on ",
               out.device());

-  // TODO: For better performance by eliminating input tensor gathering and post transpose,
-  // TODO: it is better to keep the out tensor's memory format.
-  // TODO: dimension needs to be recomputed as:
-  // TODO: dim = 0 --> dim = 0; dim = 1 or 2 --> dim = out.dim()- dim; otherwise dim = dim-1
-  if (needsGather(out)) {
-    out.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
-  }
   std::vector<int64_t> size(notSkippedTensor.sizes().vec());

   // Compute size of the result in the cat dimension
@@ -331,82 +338,9 @@ TORCH_IMPL_FUNC(cat_out_mps)
   has_large_tensor |= isTooLargeForMPSGraph(out);

   if (has_large_tensor) {
-    return mps::cat_out_large_tensor_mps(materialized_inputs, dimension, out);
-  }
-
-  struct CachedGraph : public MPSCachedGraph {
-    CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
-    std::vector<MPSGraphTensor*> inputTensors_;
-    MPSGraphTensor* outputTensor_ = nil;
-  };
-
-  @autoreleasepool {
-    std::string key = "cat_out_mps:" + std::to_string(dimension) + ":" +
-        (memory_format == MemoryFormat::ChannelsLast ? "NHWC" : "NCHW");
-    if (!all_same_dtype) {
-      key += getTensorsStringKey(input_tensors, true, all_same_sizes_and_stride);
-    } else {
-      key += ":" + getMPSTypeString(input_tensors[0].scalar_type(), true) + ":" + std::to_string(inputs.size());
-    }
-    for (auto idx : skipped_tensor_indices) {
-      key += "," + std::to_string(idx);
-    }
-
-    auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
-      auto len_tensor_array = inputs.size() - skipped_tensor_indices.size();
-      std::vector<MPSGraphTensor*> castInputTensors(len_tensor_array);
-      newCachedGraph->inputTensors_.reserve(len_tensor_array);
-
-      for (const auto idx : c10::irange(len_tensor_array)) {
-        const Tensor& tensor = input_tensors[idx];
-        auto scalar_type = getMPSScalarType(tensor.scalar_type());
-        if (tensor.scalar_type() == kBool) {
-          scalar_type = MPSDataTypeInt8;
-        }
-        newCachedGraph->inputTensors_[idx] = mpsGraphUnrankedPlaceHolder(mpsGraph, scalar_type);
-        if (tensor.scalar_type() != out_dtype) {
-          castInputTensors[idx] = [mpsGraph castTensor:newCachedGraph->inputTensors_[idx]
-                                                toType:getMPSDataType(out_dtype)
-                                                  name:@"castInput"];
-        } else {
-          castInputTensors[idx] = newCachedGraph->inputTensors_[idx];
-        }
-      }
-
-      auto inputTensorsArray = [NSArray arrayWithObjects:castInputTensors.data() count:len_tensor_array];
-      MPSGraphTensor* outputTensor = [mpsGraph concatTensors:inputTensorsArray
-                                                   dimension:dimension // Maybe convert this from int64_t -> int32
-                                                        name:nil];
-      if (getMPSDataType(out_dtype) == MPSDataTypeBool) {
-        outputTensor = [mpsGraph castTensor:outputTensor toType:MPSDataTypeBool name:@"outputTensor"];
-      }
-      newCachedGraph->outputTensor_ = outputTensor;
-    });
-
-    std::vector<Placeholder> inputPlaceholders;
-    int i = 0;
-    int t_idx = 0;
-    for (const Tensor& tensor : materialized_inputs) {
-      if (std::find(skipped_tensor_indices.begin(), skipped_tensor_indices.end(), i) == skipped_tensor_indices.end()) {
-        auto scalar_type = getMPSScalarType(tensor.scalar_type());
-        if (tensor.scalar_type() == kBool) {
-          scalar_type = MPSDataTypeInt8;
-        }
-        inputPlaceholders.emplace_back(cachedGraph->inputTensors_[t_idx], tensor, nullptr, true, scalar_type);
-        t_idx++;
-      }
-      i++;
-    }
-
-    auto outputDataType = getMPSScalarType(out.scalar_type());
-    Placeholder outputPlaceholder =
-        Placeholder(cachedGraph->outputTensor_, out, /*mpsShape=*/nil, /*gatherTensorData=*/false, outputDataType);
-
-    NSMutableDictionary* feeds = [[NSMutableDictionary new] autorelease];
-    for (auto& inputPlaceholder : inputPlaceholders) {
-      feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData();
-    }
-    runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder);
+    return mps::cat_out_mps_impl<int64_t>(materialized_inputs, dimension, out);
+  } else {
+    return mps::cat_out_mps_impl<int32_t>(materialized_inputs, dimension, out);
   }
 }

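A sketch of the index-width decision this dispatch enables (a hypothetical helper, not the actual `isTooLargeForMPSGraph` test): 32-bit indexing is chosen whenever the element counts and offsets the kernel computes fit in `int32_t`, and the 64-bit instantiation is kept for genuinely large tensors.

```cpp
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

// Decide whether a flat element count (and therefore every offset derived
// from it) still fits in 32-bit indexing.
static bool fits_in_int32(const std::vector<std::int64_t>& sizes) {
  std::int64_t numel = 1;
  for (auto s : sizes) numel *= s;
  return numel <= std::numeric_limits<std::int32_t>::max();
}

int main() {
  std::vector<std::int64_t> small{8, 1024, 1024};        // ~8.4M elements
  std::vector<std::int64_t> large{4, 1024, 1024, 1024};  // ~4.3B elements
  std::printf("small -> %s indexing\n", fits_in_int32(small) ? "int32_t" : "int64_t");
  std::printf("large -> %s indexing\n", fits_in_int32(large) ? "int32_t" : "int64_t");
  return 0;
}
```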
@@ -6531,6 +6531,7 @@
   dispatch:
     CPU, CUDA: var
     MPS: var_mps
+    MTIA: var_mtia
   tags: core

 - func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
@@ -7183,6 +7184,12 @@
   dispatch:
     CUDA: _scaled_grouped_mm_cuda
   tags: needs_exact_strides

+- func: _scaled_grouped_mm_v2(Tensor self, Tensor mat2, Tensor[] scale_a, int[] recipe_a, int[] swizzle_a, Tensor[] scale_b, int[] recipe_b, int[] swizzle_b, Tensor? offs=None, Tensor? bias=None, ScalarType? out_dtype=None, int[] contraction_dim=[], bool use_fast_accum=False) -> Tensor
+  variants: function
+  dispatch:
+    CUDA: _scaled_grouped_mm_cuda_v2
+  tags: needs_exact_strides
+
 - func: _grouped_mm(Tensor self, Tensor mat2, Tensor? offs=None, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensor
   variants: function
   dispatch:
@@ -7378,7 +7385,7 @@
 - func: sparse_mask(Tensor self, Tensor mask) -> Tensor
   variants: method
   dispatch:
-    SparseCPU, SparseCUDA: sparse_mask
+    SparseCPU, SparseCUDA, SparseMPS: sparse_mask
     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_mask_sparse_compressed
   autogen: sparse_mask.out

@@ -184,15 +184,23 @@ std::tuple<Tensor, Tensor, Tensor> _fake_quantize_learnable_per_tensor_affine_ba
      0 & \text{ else }
    \end{cases}
   */
-  float scale_val = scale[0].item<float>();
-  float inv_scale_val = 1.0f / scale_val;
-  int64_t zero_point_val = native::_get_zero_point_from_tensor(zero_point, quant_min, quant_max, false);
-
-  TORCH_CHECK(dY.scalar_type() == ScalarType::Float);
-  TORCH_CHECK(X.scalar_type() == ScalarType::Float);
-  TORCH_CHECK(scale.scalar_type() == ScalarType::Float);
-  TORCH_CHECK(zero_point.scalar_type() == ScalarType::Float);
-  TORCH_CHECK(X.numel() == dY.numel(), "`X` and `dY` are not the same size");
+  bool is_bfloat16 = (X.scalar_type() == at::kBFloat16);
+
+  at::Tensor X_ = is_bfloat16 ? X.to(ScalarType::Float) : X;
+  at::Tensor dY_ = is_bfloat16 ? dY.to(ScalarType::Float) : dY;
+  at::Tensor scale_ = is_bfloat16 ? scale.to(ScalarType::Float) : scale;
+  at::Tensor zero_point_ = is_bfloat16 ? zero_point.to(ScalarType::Float) : zero_point;
+
+  float scale_val = scale_[0].item<float>();
+  float inv_scale_val = 1.0f / scale_val;
+  int64_t zero_point_val = native::_get_zero_point_from_tensor(zero_point_, quant_min, quant_max, false);
+
+  TORCH_CHECK(dY_.scalar_type() == ScalarType::Float);
+  TORCH_CHECK(X_.scalar_type() == ScalarType::Float);
+  TORCH_CHECK(scale_.scalar_type() == ScalarType::Float);
+  TORCH_CHECK(zero_point_.scalar_type() == ScalarType::Float);
+  TORCH_CHECK(X_.numel() == dY_.numel(), "`X` and `dY` are not the same size");
   TORCH_CHECK(
       quant_min <= 0 && quant_max >= 0,
       "`quant_min` should be less than or \
@@ -200,28 +208,28 @@ std::tuple<Tensor, Tensor, Tensor> _fake_quantize_learnable_per_tensor_affine_ba
   TORCH_CHECK(
       zero_point_val >= quant_min && zero_point_val <= quant_max,
       "`zero_point` must be between `quant_min` and `quant_max`.");
-  if (X.numel() <= 0) {
+  if (X_.numel() <= 0) {
     return std::make_tuple(X, scale, zero_point);
   }

-  auto dX = at::empty_like(X, X.options(), MemoryFormat::Preserve);
-  auto dScale_vec = at::empty_like(X, X.options(), MemoryFormat::Preserve);
-  auto dZeroPoint_vec = at::empty_like(X, X.options(), MemoryFormat::Preserve);
+  auto dX = at::empty_like(X_, X_.options(), MemoryFormat::Preserve);
+  auto dScale_vec = at::empty_like(X_, X_.options(), MemoryFormat::Preserve);
+  auto dZeroPoint_vec = at::empty_like(X_, X_.options(), MemoryFormat::Preserve);

   auto iter = TensorIteratorConfig()
                   .add_output(dX)
                   .add_output(dScale_vec)
                   .add_output(dZeroPoint_vec)
-                  .add_input(X)
-                  .add_input(dY)
+                  .add_input(X_)
+                  .add_input(dY_)
                   .build();

   fake_quant_grad_learnable_tensor_stub(
-      X.device().type(), iter, scale_val, inv_scale_val, zero_point_val, quant_min, quant_max, grad_factor);
+      X_.device().type(), iter, scale_val, inv_scale_val, zero_point_val, quant_min, quant_max, grad_factor);

   // The total sums over the scale and zero point gradient vectors are what will be returned in the end.
-  auto dScale = dScale_vec.sum().unsqueeze(0).to(scale.device());
-  auto dZeroPoint = dZeroPoint_vec.sum().unsqueeze(0).to(zero_point.device());
+  auto dScale = dScale_vec.sum().unsqueeze(0).to(scale_.device());
+  auto dZeroPoint = dZeroPoint_vec.sum().unsqueeze(0).to(zero_point_.device());

   return std::make_tuple(dX, dScale, dZeroPoint);
 }
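A minimal sketch of the promotion pattern used above, outside ATen (the `FakeTensor` type and `to_float` helper are hypothetical stand-ins): BFloat16 operands are upcast to float once at the boundary, and every later check and kernel call sees the float copies (`X_`, `dY_`, `scale_`, `zero_point_`).

```cpp
#include <cstdio>
#include <vector>

// Hypothetical stand-in for a tensor whose values are already decoded to float
// but whose storage dtype may have been BFloat16.
struct FakeTensor {
  std::vector<float> data;
  bool is_bfloat16 = false;
};

// Mirror of `X_ = is_bfloat16 ? X.to(Float) : X`: materialize a float-tagged copy.
static FakeTensor to_float(const FakeTensor& t) {
  return FakeTensor{t.data, /*is_bfloat16=*/false};
}

int main() {
  FakeTensor X{{0.3f, -1.2f, 2.7f}, /*is_bfloat16=*/true};
  FakeTensor X_ = X.is_bfloat16 ? to_float(X) : X;
  const float inv_scale = 1.0f / 0.1f;
  for (float v : X_.data) std::printf("scaled: %g\n", v * inv_scale);  // float math only
  return 0;
}
```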
@@ -3551,7 +3551,7 @@ void dequantize_tensor_per_tensor_affine_cpu(

 #if defined(__ARM_NEON__) || defined(__aarch64__)

-const static int PARALLEL_THRESHOLD = 1 << 20;
+constexpr static int PARALLEL_THRESHOLD = 1 << 20;

 // Generic template defaults to naive quantize implementation
 template <typename T>
Some files were not shown because too many files have changed in this diff.