Compare commits

..

3 Commits

Author SHA1 Message Date
e4de72ea5d Experiment for cold start - fake tensor 2025-09-18 11:20:55 -07:00
d9258fb366 [functionalize] Avoid one more call to custom get_device on FunctionalTensorWrapper
ghstack-source-id: 801aa346f3a2519296f325c6a4b69c09cb484b95
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163019
2025-09-16 09:53:02 -07:00
ef78f99412 [functional] Use the saved device on storage instead for device_custom
ghstack-source-id: a2f54f448ccd8eb4c10f12243ccc8ecf98ae6036
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162987
2025-09-16 09:53:02 -07:00
619 changed files with 8088 additions and 23650 deletions

View File

@ -31,7 +31,8 @@ pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
@ -45,5 +46,6 @@ else
export USE_NVIDIA_PYPI_LIBS=1
fi
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi

View File

@ -317,7 +317,7 @@ if __name__ == "__main__":
).decode()
print("Building PyTorch wheel")
build_vars = ""
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "

View File

@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
try:
with socket.create_connection((addr, port), timeout=timeout):
return
except (ConnectionRefusedError, TimeoutError): # noqa: PERF203
except (ConnectionRefusedError, socket.timeout): # noqa: PERF203
if i == attempt_cnt - 1:
raise
time.sleep(timeout)

View File

@ -262,10 +262,13 @@ case "$tag" in
TRITON_CPU=yes
;;
pytorch-linux-jammy-linter)
PYTHON_VERSION=3.10
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
# would be to upgrade mypy to 1.0.0 with Python 3.11
PYTHON_VERSION=3.9
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter)
PYTHON_VERSION=3.10
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11)

View File

@ -1 +1 @@
e0dda9059d082537cee36be6c5e4fe3b18c880c0
56392aa978594cc155fa8af48cd949f5b5f1823a

View File

@ -1,2 +1,2 @@
transformers==4.56.0
transformers==4.54.0
soxr==0.5.0

View File

@ -1 +1 @@
bbb06c0334a6772b92d24bde54956e675c8c6604
5ae38bdb0dc066c5823e34dc9797afb9de42c866

View File

@ -42,27 +42,22 @@ install_pip_dependencies() {
# A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current
# numba and scipy version used in PyTorch CI
conda_run pip uninstall -y numba scipy
# Yaspin is needed for running CI test (get_benchmark_analysis_data.py)
pip_install yaspin==3.1.0
popd
}
setup_executorch() {
pushd executorch
export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON"
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
popd
}
if [ $# -eq 0 ]; then
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
pushd executorch
setup_executorch
popd
else
"$@"
fi
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
setup_executorch

View File

@ -93,9 +93,8 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
#Pinned versions:
#test that import:
mypy==1.16.0 ; platform_system != "Windows"
mypy==1.16.0
# Pin MyPy version because new errors are likely to appear with each release
# Skip on Windows as lots of type annotations are POSIX specific
#Description: linter
#Pinned versions: 1.16.0
#test that import: test_typing.py, test_type_hints.py

View File

@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably

View File

@ -41,6 +41,7 @@ def sample_vllm_test_library():
"pytest -v -s basic_correctness/test_cumem.py",
"pytest -v -s basic_correctness/test_basic_correctness.py",
"pytest -v -s basic_correctness/test_cpu_offload.py",
"VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
],
},
"vllm_basic_models_test": {
@ -67,12 +68,15 @@ def sample_vllm_test_library():
"-v",
"-s",
"entrypoints/llm",
"--ignore=entrypoints/llm/test_lazy_outlines.py",
"--ignore=entrypoints/llm/test_generate.py",
"--ignore=entrypoints/llm/test_generate_multiple_loras.py",
"--ignore=entrypoints/llm/test_collective_rpc.py",
]
),
"pytest -v -s entrypoints/llm/test_generate.py",
"pytest -v -s entrypoints/offline_mode",
"pytest -v -s entrypoints/llm/test_lazy_outlines.py",
"pytest -v -s entrypoints/llm/test_generate.py ",
"VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
],
},
"vllm_regression_test": {

View File

@ -0,0 +1,40 @@
#!/bin/bash
# This is where the local pytorch install in the docker image is located
pt_checkout="/var/lib/jenkins/workspace"
source "$pt_checkout/.ci/pytorch/common_utils.sh"
echo "functorch_doc_push_script.sh: Invoked with $*"
set -ex -o pipefail
version=${DOCS_VERSION:-nightly}
echo "version: $version"
# Build functorch docs
pushd $pt_checkout/functorch/docs
make html
popd
git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages
pushd functorch_ghpages
if [ "$version" == "main" ]; then
version=nightly
fi
git rm -rf "$version" || true
mv "$pt_checkout/functorch/docs/build/html" "$version"
git add "$version" || true
git status
git config user.email "soumith+bot@pytorch.org"
git config user.name "pytorchbot"
# If there aren't changes, don't make a commit; push is no-op
git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
git status
if [[ "${WITH_PUSH:-}" == true ]]; then
git push -u origin gh-pages
fi
popd

View File

@ -1,25 +0,0 @@
From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001
From: Michael Wang <13521008+isVoid@users.noreply.github.com>
Date: Tue, 1 Apr 2025 17:28:05 -0700
Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage
(#185)
Co-authored-by: isVoid <isVoid@users.noreply.github.com>
---
numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
index 1641bf77..233e9ed7 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -365,6 +365,9 @@ def _find_api(self, fname):
else:
variants = ('_v2', '')
+ if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
+ return getattr(self.lib, fname)
+
for variant in variants:
try:
return getattr(self.lib, f'{fname}{variant}')

View File

@ -32,16 +32,6 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
git config --global --add safe.directory /var/lib/jenkins/workspace
fi
# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
if [ -n "$NUMBA_CUDA_DIR" ]; then
NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
pushd "$NUMBA_CUDA_DIR"
patch -p4 <"$NUMBA_PATCH"
popd
fi
echo "Environment variables:"
env
@ -334,17 +324,11 @@ test_python() {
}
test_python_smoke() {
# Smoke tests for H100/B200
# Smoke tests for H100
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
test_python_smoke_b200() {
# Targeted smoke tests for B200 - staged approach to avoid too many failures
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
test_h100_distributed() {
# Distributed tests at H100
time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
@ -1556,10 +1540,14 @@ test_executorch() {
install_torchvision
install_torchaudio
INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh"
pushd /executorch
"${INSTALL_SCRIPT}" setup_executorch
export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
# from the PR
bash .ci/scripts/setup-linux.sh --build-tool cmake
echo "Run ExecuTorch unit tests"
pytest -v -n auto
@ -1573,6 +1561,10 @@ test_executorch() {
popd
# Test torchgen generated code for Executorch.
echo "Testing ExecuTorch op registration"
"$BUILD_BIN_DIR"/test_edge_op_registration
assert_git_not_dirty
}
@ -1580,7 +1572,6 @@ test_linux_aarch64() {
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
# Dynamo tests
@ -1779,8 +1770,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
test_xpu_bin
elif [[ "${TEST_CONFIG}" == smoke ]]; then
test_python_smoke
elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then
test_python_smoke_b200
elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
test_h100_distributed
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then

View File

@ -137,7 +137,7 @@ sccache --show-stats
python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
(
if "%BUILD_ENVIRONMENT%"=="" (
echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash.
echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash.
) else (
copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%"

View File

@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" (
) else (
set CONDA_PARENT_DIR=C:\Jenkins
)
set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3
:: Be conservative here when rolling out the new AMI with conda. This will try
:: to install conda as before if it couldn't find the conda installation. This
:: can be removed eventually after we gain enough confidence in the AMI
if not exist %CONDA_ROOT_DIR% (
if not exist %CONDA_PARENT_DIR%\Miniconda3 (
set INSTALL_FRESH_CONDA=1
)
@ -17,14 +17,10 @@ if "%INSTALL_FRESH_CONDA%"=="1" (
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR%
%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
)
:: Activate conda so that we can use its commands, i.e. conda, python, pip
call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%
:: Activate conda so that we can use its commands, i.e. conda, python, pip
call conda activate py_tmp
call pip install -r .ci/docker/requirements-ci.txt
call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3

View File

@ -14,7 +14,7 @@ if not errorlevel 0 exit /b
:: build\torch. Rather than changing all these references, making a copy of torch folder
:: from conda to the current workspace is easier. The workspace will be cleaned up after
:: the job anyway
xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
pushd .
if "%VC_VERSION%" == "" (

View File

@ -38,14 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
fi
# TODO: Move both of them to Windows AMI
python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments
# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node'
# scipy from 1.6.3 to 1.10
# expecttest from 0.1.3 to 0.3.0
# xdoctest from 1.0.2 to 1.3.0
python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42"
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
# Install Z3 optional dependency for Windows builds.
python -m pip install z3-solver==4.15.1.0
@ -59,6 +52,9 @@ python -m pip install parameterized==0.8.1
# Install pulp for testing ilps under torch\distributed\_tools
python -m pip install pulp==2.9.0
# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
python -m pip install expecttest==0.3.0
run_tests() {
# Run nvidia-smi if available
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do

View File

@ -264,7 +264,7 @@ def unzip_artifact_and_replace_files() -> None:
change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py")
for file in Path(f"artifacts/dist/{old_stem}").glob(
"*.dist-info/*",
"*.dist-info/**",
):
change_content_to_new_version(file)

View File

@ -6,12 +6,6 @@ inputs:
cuda-version:
description: which cuda version to install, 'cpu' for none
required: true
python-version:
required: false
type: string
default: "3.10"
description: |
The python version to be used. Will be 3.10 by default
runs:
using: composite
@ -44,24 +38,18 @@ runs:
CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat"
{
echo "CONDA=${CONDA}";
echo "CONDA_RUN=${CONDA} run --no-capture-output";
echo "CONDA_BUILD=${CONDA} run conda-build";
echo "CONDA_INSTALL=${CONDA} install";
} >> "${GITHUB_ENV}"
- name: Setup Python3
env:
PYTHON_VERSION: ${{ inputs.python-version }}
shell: bash
run: |
set +e
set -x
# Create new py_tmp env with python-version
${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp
PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
PYTHON3=$(${CONDA_RUN} which python3)
EXIT_CODE=$?
if [[ "${EXIT_CODE}" == "0" ]]; then
@ -74,7 +62,7 @@ runs:
# installation, which is Python 3 based. Its Python is default to Python 3. Further, there
# is also the Miniconda installation that is Python 2 based, and both can be installed if
# needed. In both cases, Python binary is just called python
PYTHON=$(${CONDA_RUN} -n py_tmp which python)
PYTHON=$(${CONDA_RUN} which python)
EXIT_CODE=$?
if [[ "${EXIT_CODE}" == "0" ]]; then

View File

@ -1 +1 @@
090197034faf3b193c4467cedeb9281e3078892d
5bcc153d7bf69ef34bc5788a33f60f1792cf2861

3
.github/labeler.yml vendored
View File

@ -130,6 +130,3 @@
- torch/csrc/inductor/aoti_include/**
- torchgen/aoti/**
- torchgen/gen_aoti_c_shim.py
"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt

View File

@ -36,7 +36,6 @@ ciflow_push_tags:
- ciflow/win-arm64
- ciflow/h100-symm-mem
- ciflow/h100-cutlass-backend
- ciflow/b200
retryable_workflows:
- pull
- trunk

View File

@ -135,7 +135,7 @@ ROCM_SMOKE_WORKFLOWS = [
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["6.4"],
python_versions=["3.10"],
python_versions=["3.9"],
),
ciflow_config=CIFlowConfig(
labels={

View File

@ -187,6 +187,8 @@ jobs:
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }}
if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}
- name: configure aws credentials

View File

@ -75,6 +75,10 @@ jobs:
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 30m to finish python docs unless there are issues
timeout-minutes: 30
- docs_type: functorch
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 15m to finish functorch docs unless there are issues
timeout-minutes: 15
# Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
# The current name requires updating the database last docs push query from test-infra every time the matrix is updated
name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
@ -207,6 +211,16 @@ jobs:
path: cppdocs/
s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs
- name: Upload functorch Docs Preview
uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }}
with:
retention-days: 14
s3-bucket: doc-previews
if-no-files-found: error
path: functorch_ghpages/nightly/
s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()

View File

@ -2,12 +2,6 @@ name: Get Changed Files
on:
workflow_call:
inputs:
all_files:
description: "Whether to return all files instead of just changed files"
required: false
type: boolean
default: false
outputs:
changed-files:
description: "List of changed files (space-separated) or '*' if not in a PR"
@ -32,23 +26,17 @@ jobs:
# Get the PR number from the github context
PR_NUMBER="${{ github.event.number }}"
# Check if all_files is requested
if [ "${{ inputs.all_files }}" = "true" ]; then
echo "all_files input is true, returning all files"
echo "changed-files=*" >> "$GITHUB_OUTPUT"
else
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"
fi
echo "Changed files: $CHANGED_FILES"
echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"
fi
echo "Changed files: $CHANGED_FILES"
echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
else
echo "Not in PR context, setting changed files to '*'"
echo "changed-files=*" >> "$GITHUB_OUTPUT"

View File

@ -169,7 +169,7 @@ jobs:
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }}
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
- name: Setup GPU_FLAG for docker run

View File

@ -151,7 +151,7 @@ jobs:
BUILD_WHEEL: 1
MAX_JOBS: 8
CUDA_VERSION: ${{ inputs.cuda-version }}
PYTHON_VERSION: "3.10"
PYTHON_VERSION: "3.9"
SCCACHE_BUCKET: "ossci-compiler-cache"
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SCCACHE_REGION: us-east-1

View File

@ -184,7 +184,7 @@ jobs:
env:
USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
INSTALL_WINDOWS_SDK: 1
PYTHON_VERSION: "3.10"
PYTHON_VERSION: 3.9
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}

View File

@ -178,7 +178,7 @@ jobs:
contents: read
container:
image: continuumio/miniconda3:4.12.0
environment: ${{ ((github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'nightly-wheel-upload' || '' }}
environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@ -70,8 +70,9 @@ jobs:
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter,
pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
# Executorch pin needs update
# pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu,
pytorch-linux-noble-riscv64-py3.12-gcc14
]

View File

@ -44,7 +44,7 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_10-rocm6_4-build:
manywheel-py3_9-rocm6_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@ -58,16 +58,16 @@ jobs:
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-rocm6_4
build_name: manywheel-py3_9-rocm6_4
build_environment: linux-binary-manywheel-rocm
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-rocm6_4-test: # Testing
manywheel-py3_9-rocm6_4-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_10-rocm6_4-build
- manywheel-py3_9-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
@ -82,14 +82,14 @@ jobs:
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: manywheel-py3_10-rocm6_4
name: manywheel-py3_9-rocm6_4
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4

View File

@ -31,8 +31,6 @@ jobs:
if: github.repository_owner == 'pytorch'
name: Get changed files
uses: ./.github/workflows/_get-changed-files.yml
with:
all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }}
lintrunner-clang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -55,7 +53,7 @@ jobs:
with:
timeout: 120
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter
docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0
@ -266,10 +264,10 @@ jobs:
with:
submodules: false
fetch-depth: 1
- name: Setup Python 3.10
- name: Setup Python 3.9
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.10'
python-version: '3.9'
architecture: x64
cache: pip
- name: Install dependencies

View File

@ -316,6 +316,32 @@ jobs:
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
if: false # Has been broken for a while
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml

View File

@ -1,76 +0,0 @@
# B200 Smoke Tests CI Workflow
#
# This workflow runs smoke tests on B200 hardware
#
# Flow:
# 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200
# 2. Runs smoke tests on linux.dgx.b200 runner
# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function
#
# Triggered by:
# - Pull requests modifying this workflow file
# - Manual dispatch
# - Schedule (every 6 hours)
# - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag)
name: B200 Smoke Tests
on:
pull_request:
paths:
- .github/workflows/test-b200.yml
workflow_dispatch:
schedule:
- cron: 0 4,10,16,22 * * * # every 6 hours
push:
tags:
- ciflow/b200/*
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_8-py3_10-gcc11-sm100-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
# config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm100-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit

View File

@ -259,27 +259,3 @@ jobs:
docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit

View File

@ -53,3 +53,27 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit

View File

@ -36,8 +36,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# When building vLLM, uv doesn't like that we rename wheel without changing the wheel metadata
allow-reuse-old-whl: false
build-additional-packages: "vision audio"
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11

3
.gitignore vendored
View File

@ -259,9 +259,6 @@ gen
.pytest_cache
aten/build/*
# Linker scripts for prioritized text optimization
cmake/linker_script.ld
# Bram
plsdontbreak

View File

@ -123,7 +123,6 @@ is_formatter = true
code = 'MYPY'
include_patterns = [
'setup.py',
'functorch/dim/**/*.py',
'torch/**/*.py',
'torch/**/*.pyi',
'caffe2/**/*.py',
@ -196,7 +195,6 @@ exclude_patterns = [
'tools/test/gen_operators_yaml_test.py',
'tools/test/gen_oplist_test.py',
'tools/test/test_selective_build.py',
'tools/experimental/dynamic_shapes/torchfuzz/**',
]
command = [
'python3',
@ -966,6 +964,7 @@ exclude_patterns = [
'test/jit/**', # should be run through test/test_jit.py
'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py
'test/fx/**', # should be run through test/test_fx.py
'test/bottleneck_test/**', # excluded by test/run_test.py
'test/package/**', # excluded by test/run_test.py
'test/distributed/argparse_util_test.py',
'test/distributed/bin/test_script.py',
@ -1411,6 +1410,8 @@ exclude_patterns = [
'torch/utils/benchmark/utils/timer.py',
'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
'torch/utils/bottleneck/__init__.py',
'torch/utils/bottleneck/__main__.py',
'torch/utils/bundled_inputs.py',
'torch/utils/checkpoint.py',
'torch/utils/collect_env.py',

View File

@ -1,4 +1,5 @@
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW)
# Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this
# sometimes makes XCode C compiler gets detected as "Clang", even when the C++
@ -379,13 +380,6 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF)
# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le.
set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
if(LINUX AND CPU_AARCH64)
set(USE_PRIORITIZED_TEXT_DEFAULT ON)
endif()
cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
"${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)
option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance
@ -663,11 +657,6 @@ endif(MSVC)
string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
# Set linker max-page-size to 64KiB on AArch64 Linux
if(LINUX AND CPU_AARCH64)
add_link_options_if_supported("-z,max-page-size=0x10000")
endif()
# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
# applicable to mobile are disabled by this variable. Setting
# `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@ -902,7 +891,7 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
endif()
# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
set(USE_FBGEMM_GENAI ON)
endif()
@ -1432,57 +1421,3 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif()
if(USE_PRIORITIZED_TEXT_FOR_LD)
add_compile_options(
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
)
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
add_custom_command(
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
COMMENT "Generating prioritized text linker files"
VERBATIM
)
add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
if(BUILD_PYTHON)
set(LINKER_OPT_TARGETS torch_python)
endif()
if(NOT BUILD_LIBTORCHLESS)
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
if(USE_CUDA)
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
endif()
if(USE_XPU)
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
endif()
if(USE_ROCM)
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
endif()
endif()
foreach(tgt IN LISTS LINKER_OPT_TARGETS)
if(TARGET ${tgt})
add_dependencies("${tgt}" generate_linker_script)
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
else()
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
endif()
endforeach()
else()
if(LINUX AND CPU_AARCH64)
message(WARNING [[
It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()

View File

@ -317,20 +317,10 @@ IF(USE_FBGEMM_GENAI)
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(
set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
endif()
set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)

View File

@ -65,24 +65,14 @@ DLDataType getDLDataType(const Tensor& t) {
break;
// TODO(#146647): use macro here instead of spelling out each shell dtype
case ScalarType::Float8_e5m2:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2;
break;
case ScalarType::Float8_e5m2fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2fnuz;
break;
case ScalarType::Float8_e4m3fn:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fn;
break;
case ScalarType::Float8_e4m3fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fnuz;
break;
case ScalarType::Float8_e8m0fnu:
dtype.code = DLDataTypeCode::kDLFloat8_e8m0fnu;
TORCH_CHECK_BUFFER(false, "float8 types are not supported by dlpack");
break;
case ScalarType::Float4_e2m1fn_x2:
dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn;
dtype.lanes = 2;
dtype.bits = 4;
TORCH_CHECK_BUFFER(false, "float4 types are not supported by dlpack");
break;
case ScalarType::QInt8:
case ScalarType::QUInt8:
@ -187,11 +177,7 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat
ScalarType toScalarType(const DLDataType& dtype) {
ScalarType stype = ScalarType::Undefined;
if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) {
TORCH_CHECK_BUFFER(
dtype.lanes == 1,
"ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code));
}
TORCH_CHECK_BUFFER(dtype.lanes == 1, "ATen does not support lanes != 1");
switch (dtype.code) {
case DLDataTypeCode::kDLUInt:
switch (dtype.bits) {
@ -283,73 +269,6 @@ ScalarType toScalarType(const DLDataType& dtype) {
false, "Unsupported kDLBool bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2 bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fn:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fn;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fn bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e8m0fnu:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e8m0fnu;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e8m0fnu bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat4_e2m1fn:
switch (dtype.bits) {
case 4:
switch (dtype.lanes) {
case 2:
stype = ScalarType::Float4_e2m1fn_x2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn lanes ", std::to_string(dtype.lanes));
}
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn bits ", std::to_string(dtype.bits));
}
break;
default:
TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code));
}
@ -401,13 +320,30 @@ T* toDLPackImpl(const Tensor& src) {
// The following code detects whether the src follows
// a continuous pattern. If the src follows such pattern (common-case)
// then we do not need to normalize the strides.
bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1;
bool need_normalize_strides = false;
int64_t expected_stride = 1;
for (int i = src.dim() - 1; i >= 0; i--) {
// detect if we do not meet continuous pattern
// and the size is 1, so there is opportunity to normalize
if (src.stride(i) != expected_stride && src.size(i) == 1) {
need_normalize_strides = true;
break;
}
expected_stride *= src.size(i);
}
// less common case, try normalizing the strides
if (need_normalize_strides) {
// create a new tensor with possibly normalized strides
// gh-83069
auto shape = src.sizes();
view = src.as_strided(shape, {1}, src.storage_offset());
auto strides = src.strides().vec();
for (int i = 0; i < src.dim(); i++) {
if (shape[i] < 2) {
strides[i] = 1;
}
}
view = src.as_strided(shape, strides, src.storage_offset());
}
ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);
@ -418,8 +354,8 @@ T* toDLPackImpl(const Tensor& src) {
atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device());
atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim());
atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src);
atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(view.sizes().data());
atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(view.strides().data());
atDLMTensor->tensor.dl_tensor.shape = view.sizes().data();
atDLMTensor->tensor.dl_tensor.strides = view.strides().data();
atDLMTensor->tensor.dl_tensor.byte_offset = 0;
fillVersion(&atDLMTensor->tensor);

View File

@ -1637,7 +1637,9 @@ bool gemm_and_bias(
if (activation == GEMMAndBiasActivationEpilogue::RELU) {
epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
} else if (activation == GEMMAndBiasActivationEpilogue::GELU) {
#if CUDA_VERSION >= 11040 || defined(USE_ROCM)
epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
#endif
}
if (bias != nullptr) {
@ -1929,6 +1931,7 @@ void scaled_gemm(
bool use_fast_accum) {
// Note: see `cublasCommonArgs` for various non-intuitive manupulations
// of input arguments to this function.
#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
const float alpha_val = 1.0;
@ -1951,8 +1954,8 @@ void scaled_gemm(
#if ROCM_VERSION >= 70000
if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) {
// TODO: add constraints based on hipblaslt internals
TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0),
"M, N must be multiples of 16 and K should be multiple of 128 for MX format. "
TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
"Matrix dimensions must be multiples of 32 for MX format. "
"Got m=", m, ", n=", n, ", k=", k);
}
#endif
@ -2130,6 +2133,8 @@ void scaled_gemm(
" scaleType ",
scaleType);
return;
#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM)
TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above");
}
void int8_gemm(

View File

@ -266,14 +266,11 @@ CUDAGeneratorImpl::CUDAGeneratorImpl(
* See Note [Acquire lock when using random generators]
*/
void CUDAGeneratorImpl::set_current_seed(uint64_t seed) {
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
state_->seed_ = seed;
state_->philox_offset_per_thread_ = 0;
no_reset_rnn_state_.clear();
} else {
TORCH_CHECK(state_->seed_ == seed, "CUDAGeneratorImpl::set_current_seed can be called during stream capture only if new seed is the same as the original seed.");
// no-op case
}
at::cuda::assertNotCapturing(
"Cannot call CUDAGeneratorImpl::set_current_seed");
state_->seed_ = seed;
state_->philox_offset_per_thread_ = 0;
no_reset_rnn_state_.clear();
}
/**
@ -302,6 +299,9 @@ uint64_t CUDAGeneratorImpl::get_offset() const {
* Gets the current seed of CUDAGeneratorImpl.
*/
uint64_t CUDAGeneratorImpl::current_seed() const {
// Debatable if current_seed() should be allowed in captured regions.
// Conservatively disallow it for now.
at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed");
return state_->seed_;
}
@ -346,6 +346,8 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
* and size of the internal state.
*/
void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
at::cuda::assertNotCapturing(
"Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing.");
static const size_t seed_size = sizeof(uint64_t);
static const size_t offset_size = sizeof(int64_t);
static const size_t total_size = seed_size + offset_size;
@ -400,27 +402,15 @@ c10::intrusive_ptr<c10::GeneratorImpl> CUDAGeneratorImpl::graphsafe_get_state()
*/
void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) {
// see Note [Why enforce RNG offset % 4 == 0?]
// Note: If you use CUDNN RNN's, calling
// set_philox_offset_per_thread instead of set_offset will cause the
// cudnn RNN rng state to become stale.
TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4");
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
state_->philox_offset_per_thread_ = offset;
} else {
state_->offset_intragraph_ = offset;
}
state_->philox_offset_per_thread_ = offset;
}
/**
* Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl.
*/
uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const {
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
return state_->philox_offset_per_thread_;
} else {
return state_->offset_intragraph_;
}
return state_->philox_offset_per_thread_;
}
/**

View File

@ -19,7 +19,7 @@
#define DLPACK_MAJOR_VERSION 1
/*! \brief The current minor version of dlpack */
#define DLPACK_MINOR_VERSION 1
#define DLPACK_MINOR_VERSION 0
/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
@ -32,7 +32,9 @@
#define DLPACK_DLL
#endif
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stdint.h>
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stddef.h>
#ifdef __cplusplus
@ -157,26 +159,6 @@ typedef enum {
kDLComplex = 5U,
/*! \brief boolean */
kDLBool = 6U,
/*! \brief FP8 data types */
kDLFloat8_e3m4 = 7U,
kDLFloat8_e4m3 = 8U,
kDLFloat8_e4m3b11fnuz = 9U,
kDLFloat8_e4m3fn = 10U,
kDLFloat8_e4m3fnuz = 11U,
kDLFloat8_e5m2 = 12U,
kDLFloat8_e5m2fnuz = 13U,
kDLFloat8_e8m0fnu = 14U,
/*! \brief FP6 data types
* Setting bits != 6 is currently unspecified, and the producer must ensure it is set
* while the consumer must stop importing if the value is unexpected.
*/
kDLFloat6_e2m3fn = 15U,
kDLFloat6_e3m2fn = 16U,
/*! \brief FP4 data types
* Setting bits != 4 is currently unspecified, and the producer must ensure it is set
* while the consumer must stop importing if the value is unexpected.
*/
kDLFloat4_e2m1fn = 17U,
} DLDataTypeCode;
/*!
@ -190,12 +172,6 @@ typedef enum {
* - int8: type_code = 0, bits = 8, lanes = 1
* - std::complex<float>: type_code = 5, bits = 64, lanes = 1
* - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
* - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory)
* - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory)
* - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory)
*
* When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e.,
* for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element.
*/
typedef struct {
/*!
@ -253,12 +229,12 @@ typedef struct {
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
int64_t* shape;
const int64_t* shape;
/*!
* \brief strides of the tensor (in number of elements, not bytes)
* can be NULL, indicating tensor is compact and row-majored.
*/
int64_t* strides;
const int64_t* strides;
/*! \brief The offset in bytes to the beginning pointer to data */
uint64_t byte_offset;
} DLTensor;
@ -293,7 +269,7 @@ typedef struct DLManagedTensor {
void (*deleter)(struct DLManagedTensor * self);
} DLManagedTensor;
// bit masks used in the DLManagedTensorVersioned
// bit masks used in in the DLManagedTensorVersioned
/*! \brief bit mask to indicate that the tensor is read only. */
#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
@ -306,14 +282,6 @@ typedef struct DLManagedTensor {
*/
#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)
/*
* \brief bit mask to indicate that whether a sub-byte type is packed or padded.
*
* The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
* be set by the producer to signal that a tensor of sub-byte type is padded.
*/
#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)
/*!
* \brief A versioned and managed C Tensor object, manage memory of DLTensor.
*

View File

@ -171,8 +171,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
POINTWISE_BOXED(fill_.Scalar);
POINTWISE_BOXED(zero_);
// This is special because this op doesn't return anything
m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata);
#undef UNARY_POINTWISE
#undef UNARY_POINTWISE_ALL

View File

@ -81,7 +81,7 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) {
// TODO: contiguous can be made to preserve the memory format
// of the input. However since the above reshape clobbers h and w
// it may not be safe to do that, since channels_last contiguous
// may think oc and the last dim correspond to h,w?
// may think oc and and the last dim correspond to h,w?
// It is not clear, however from initial looking around it feels that
// this may not be correct.
// In this case channels last will likely require custom implementation

View File

@ -1,4 +1,3 @@
#pragma once
#include <ATen/core/Tensor.h>
#include <ATen/Config.h>
#include <cstdint>

View File

@ -97,38 +97,43 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) {
int64_t nDims = self.dim();
TORCH_CHECK(nDims >= 2, "dimensions must larger than 1");
auto height = self.sym_size(0);
auto width = self.sym_size(1);
int64_t height = self.size(0);
int64_t width = self.size(1);
if (nDims > 2) {
int64_t dim1 = height;
for (const auto i : c10::irange(1, nDims)) {
if (self.sym_size(i) != height) {
if (self.size(i) != dim1) {
TORCH_CHECK(false, "all dimensions of input must be of equal length");
}
}
}
auto storage_offset = self.sym_storage_offset();
auto size = std::min(height, width);
int64_t storage_offset = self.storage_offset();
std::vector<int64_t> sizes;
std::vector<int64_t> strides;
int64_t size = std::min(height, width);
int64_t stride = 0;
for (const auto i : c10::irange(nDims)) {
stride += self.stride(i);
}
std::vector<SymInt> strides{stride};
std::vector<SymInt> sizes{size};
strides.push_back(stride);
sizes.push_back(size);
auto main_diag = self.as_strided_symint(sizes, strides, storage_offset);
auto main_diag = self.as_strided(sizes, strides, storage_offset);
main_diag.fill_(fill_value);
if (wrap && nDims == 2 && height > width + 1) {
auto step = width + 1;
auto wrap_size = ((self.numel() + step - 1) / step) - size;
std::vector<SymInt> wrap_sizes{wrap_size};
std::vector<int64_t> wrap_sizes;
auto offset = self.stride(0) * (width + 1);
int64_t step = width + 1;
int64_t wrap_size = ((self.numel() + step - 1) / step) - size;
wrap_sizes.push_back(wrap_size);
auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset);
int64_t offset = self.stride(0) * (width + 1);
auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset);
wrap_diag.fill_(fill_value);
}

View File

@ -23,6 +23,8 @@ Tensor& max_unpooling2d_forward_out_cpu(
// Nondeterministic with duplicate indices
at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out");
auto oheight = output_size[0];
auto owidth = output_size[1];
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices_.scalar_type());
@ -43,9 +45,6 @@ Tensor& max_unpooling2d_forward_out_cpu(
self_.sizes(), " with dimension ", i , " being empty.");
}
auto oheight = output_size[0];
auto owidth = output_size[1];
auto memory_format = self_.suggest_memory_format();
auto self = self_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format);

View File

@ -73,7 +73,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value)
for (const auto i : c10::irange((size_t)l_pad)) {
auto pad_idx = pad.size() - ((i + 1) * 2);
auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, "
"which is invalid. Check dimension ", l_diff + i, " of your input.");
new_shape.emplace_back(new_dim);

View File

@ -85,11 +85,11 @@ void cpu_max_unpool(
if constexpr (is_3d) {
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_depth,
"x", output_height, "x", output_width, ")");
"x", output_height, "x", output_width);
} else {
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_height,
"x", output_width, ")");
"x", output_width);
}
}

View File

@ -1138,14 +1138,9 @@ bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) {
bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) {
// TODO: We might want to enforce some structure on the shapes of the scale
// tensors
bool is_fp8_path = (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4));
bool is_packed_fp4_path = false;
#ifdef USE_ROCM
is_packed_fp4_path = (t.scalar_type() == ScalarType::Float4_e2m1fn_x2 && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1) * 2, 32), 4));
#endif
return (is_fp8_path || is_packed_fp4_path) && scale.is_contiguous();
return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4)
&& scale.is_contiguous());
}
bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) {
@ -1386,15 +1381,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
int packed_factor = 1;
if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
// For float4 data type, each byte stores two 4-bit floating-point values,
// effectively packing two elements into one byte.
packed_factor = 2;
}
TORCH_CHECK(mat1.size(0) % 16 == 0 && (mat1.size(1) * packed_factor) % 128 == 0 &&
mat2.size(1) % 16 == 0,
"M, N must be multiples of 16 and K must be multiple of 128 for block-wise scaling");
TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
"Matrix dimensions must be multiples of 32 for block-wise scaling");
TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
out.scalar_type() == ScalarType::Half,

View File

@ -999,41 +999,12 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
dtypes[i] = iter.dtype(i);
}
auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
#ifdef USE_ROCM
constexpr int grp_sz = 128;
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
if (unrl) {
auto offsets0 = offset_calc.get(idx);
auto offsets1 = offset_calc.get(idx + grp_sz);
auto offsets2 = offset_calc.get(idx + grp_sz * 2);
auto offsets3 = offset_calc.get(idx + grp_sz * 3);
void* out0 = data[0] + offsets0[0];
void* out1 = data[0] + offsets1[0];
void* out2 = data[0] + offsets2[0];
void* out3 = data[0] + offsets3[0];
arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1);
arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1);
arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1);
arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out0, result0);
c10::cast_and_store<arg0_t>(dtypes[0], out1, result1);
c10::cast_and_store<arg0_t>(dtypes[0], out2, result2);
c10::cast_and_store<arg0_t>(dtypes[0], out3, result3);
} else {
auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0];
arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out, result);
}
});
#else
launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) {
auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0];
arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out, result);
});
#endif
}
}

View File

@ -51,7 +51,7 @@ std::vector<Tensor> foreach_tensor_list_op(
Op<opmath_t>(),
alpha.to<opmath_t>());
return std::move(tensor_lists[2]);
return tensor_lists[2];
}
template <typename T, template <class> class Op>

View File

@ -45,7 +45,7 @@ std::vector<Tensor> foreach_binary_op(
/* res_arg_index */ 1>(),
Op<opmath_t>(),
scalar.to<opmath_t>());
return std::move(tensor_lists[1]);
return tensor_lists[1];
}
template <typename T, template <class> class Op>

View File

@ -33,7 +33,7 @@ std::vector<Tensor> foreach_binary_op(
}
tensor_lists.emplace_back(tensors.vec());
tensor_lists.emplace_back(std::move(vec_res));
tensor_lists.emplace_back(vec_res);
using opmath_t = at::opmath_type<T>;
multi_tensor_apply<2, opmath_t>(
@ -46,7 +46,7 @@ std::vector<Tensor> foreach_binary_op(
/* res_arg_index */ 1>(),
Op<opmath_t>());
return std::move(tensor_lists[1]);
return tensor_lists[1];
}
template <typename T, template <class> class Op>

View File

@ -56,7 +56,7 @@ std::vector<Tensor> foreach_binary_op(
Op<opmath_t>(),
scalar.data_ptr<T>(),
alpha.to<opmath_t>());
return std::move(tensor_lists[1]);
return tensor_lists[1];
}
template <typename T, template <class> class Op>

View File

@ -57,7 +57,7 @@ std::vector<Tensor> foreach_pointwise_op(
scalar.to<opmath_t>());
});
return std::move(tensor_lists[3]);
return tensor_lists[3];
}
template <template <class> class Op>
@ -160,7 +160,7 @@ std::vector<Tensor> foreach_pointwise_op(
Op<opmath_t>());
});
return std::move(tensor_lists[3]);
return tensor_lists[3];
}
#define FOREACH_POINTWISE_OP_SCALAR(NAME, OP) \

View File

@ -37,7 +37,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
vec_res.emplace_back(at::native::empty_like(t));
}
std::vector<std::vector<at::Tensor>> tensor_lists{
tensors1.vec(), tensors2.vec(), tensors3.vec(), std::move(vec_res)};
tensors1.vec(), tensors2.vec(), tensors3.vec(), vec_res};
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
@ -56,7 +56,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
LerpFunctor<opmath_t>());
});
return std::move(tensor_lists[3]);
return tensor_lists[3];
}
void foreach_tensor_lerp_ternary_cuda_(
@ -104,7 +104,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
vec_res.emplace_back(at::native::empty_like(t));
}
std::vector<std::vector<at::Tensor>> tensor_lists{
tensors1.vec(), tensors2.vec(), std::move(vec_res)};
tensors1.vec(), tensors2.vec(), vec_res};
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
@ -124,7 +124,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
weight.to<opmath_t>());
});
return std::move(tensor_lists[2]);
return tensor_lists[2];
}
void foreach_tensor_lerp_list_cuda_(
@ -173,7 +173,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda(
vec_res.emplace_back(at::native::empty_like(t));
}
std::vector<std::vector<at::Tensor>> tensor_lists{
tensors1.vec(), tensors2.vec(), std::move(vec_res)};
tensors1.vec(), tensors2.vec(), vec_res};
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
@ -193,7 +193,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda(
LerpFunctor<opmath_t>());
});
return std::move(tensor_lists[2]);
return tensor_lists[2];
}
void foreach_tensor_lerp_scalarlist_cuda_(

View File

@ -67,7 +67,7 @@ std::vector<Tensor> foreach_unary_op(TensorList tensors) {
/* res_arg_index */ 1>(),
Op<opmath_t>());
return std::move(tensor_lists[1]);
return tensor_lists[1];
}
template <typename scalar_t, template <class> class Op>

View File

@ -125,6 +125,8 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices_.scalar_type());
auto oheight = output_size[0];
auto owidth = output_size[1];
TensorArg output_arg{output, "output", 1}, self_arg{self_, "self_", 2},
indices_arg{indices_, "indices_", 3};
@ -147,9 +149,6 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
output_size.size() == 2,
"There should be exactly two elements (height, width) in output_size, but got ", output_size.size(), " elements.");
auto oheight = output_size[0];
auto owidth = output_size[1];
int64_t dimw = 2;
int64_t dimh = 1;
int64_t numBatch = 1;
@ -218,6 +217,9 @@ static void max_unpooling3d_shape_check(
IntArrayRef stride,
IntArrayRef padding,
const char *fn_name) {
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
TORCH_CHECK(
indices.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices.scalar_type());
@ -248,10 +250,6 @@ static void max_unpooling3d_shape_check(
"strides should be greater than zero, but got stride: ",
stride);
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
int dimw = 3;
int dimh = 2;
int dimt = 1;
@ -404,6 +402,8 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,
const Tensor& indices_,
IntArrayRef output_size,
Tensor& grad_input) {
int64_t oheight = output_size[0];
int64_t owidth = output_size[1];
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
@ -426,9 +426,6 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,
TORCH_CHECK(output_size.size() == 2, "output_size must have two elements, got size: ", output_size.size());
int64_t oheight = output_size[0];
int64_t owidth = output_size[1];
int64_t nInputCols, nInputRows, nInputPlane;
int dimw = 2;
@ -508,14 +505,13 @@ at::Tensor& max_unpooling3d_backward_out_cuda(const Tensor& grad_output_,
IntArrayRef padding,
Tensor& grad_input) {
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
max_unpooling3d_shape_check(
self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()");
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
max_unpooling3d_shape_check(
self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()");
int batchSize = 0;
int inputSlices = 0;
int inputTime = 0;

View File

@ -300,6 +300,8 @@ void nonzero_static_cuda_out_impl(
int64_t size,
int64_t fill_value,
Tensor& out) {
#if defined(CUDA_VERSION) || defined(USE_ROCM)
Tensor self_contiguous_ = self.contiguous();
// see comment in nonzero_cuda_out_impl on reqs for out
bool out_correct_size =
@ -375,6 +377,9 @@ void nonzero_static_cuda_out_impl(
if (need_to_copy) {
out.copy_(out_temp);
}
#else
TORCH_CHECK(false, "Nonzero_static is not supported for cuda <= 11.4");
#endif
}
Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out) {

View File

@ -226,38 +226,6 @@ __global__ void CatArrayBatchedCopy_contig(
}
}
template <typename T, typename IndexType, int Dims, int batch_size, int stride_size, int alignment, int elems_per_vec>
__global__ void CatArrayBatchedCopy_vectorized(
char* output,
CatArrInputTensorMetadata<T, IndexType, batch_size, stride_size> inputs,
TensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
const int concatDim,
IndexType trailingSize) {
IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
IndexType nElements = inputs.nElements[blockIdx.y] / elems_per_vec;
if(tid >= nElements) return;
const char * data = (char*)inputs.input[blockIdx.y];
IndexType offset = inputs.offset[blockIdx.y] * trailingSize / elems_per_vec;
IndexType dimSize = inputs.dimSize[blockIdx.y] * trailingSize / elems_per_vec;
int64_t dataOffset = (int64_t)offset * alignment; // in bytes
IndexType stride = gridDim.x * blockDim.x;
while( tid < nElements){
int64_t elementOffset = (int64_t)CatArrIndexToOffset<IndexType, Dims>::compute(
os.tensorSize, os.tensorStride, dimSize, concatDim, tid) * alignment; // in bytes
auto vec = at::native::memory::ld_vec<alignment>(data + (int64_t)alignment * tid);
at::native::memory::st_vec<alignment>(output + dataOffset + elementOffset, vec);
tid += stride;
}
}
/*
Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads
to improve memory bandwidth throughput.
@ -328,27 +296,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
scalar_t *data = (scalar_t *)(out.mutable_data_ptr());
CatArrInputTensorMetadata<scalar_t, unsigned int, batch_size, stride_size> catMetaData;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> outputParam;
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.
constexpr bool isContig = stride_size == 1;
bool isAligned = true;
constexpr int alignment = 16;
// Next, let's initialize the size, stride arrays for the output Tensor.
// for contig case, we'll canonicalize output strides, so that
// we don't have arbitrary strides for dims of size 0
size_t stride0 = 1;
if (memory_format == c10::MemoryFormat::Contiguous) {
for (int i = nDims - 1; i >= 0; --i) {
for (int i = 0; i < nDims; ++i) {
outputParam.tensorSize[i] = out.size(i);
if (isContig) {
outputParam.tensorStride[i] = stride0;
stride0 *= out.size(i);
} else {
outputParam.tensorStride[i] = out.stride(i);
}
outputParam.tensorStride[i] = out.stride(i);
}
} else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) {
// permute the semantics of dims from NCHW to NHWC so that the input
@ -367,15 +320,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.
// for channels last computing slice size correctly is much more involved, so we never send it
// on the fully vectorized path
// we need output stride in cat dimension to be multiple of alignment,
// if we ever use it to compute offsets
// for catting in 0th dimension it doesn't matter
bool isInOutAligned = isContig && at::native::memory::get_alignment(data) >= alignment &&
memory_format == c10::MemoryFormat::Contiguous && (dimension == 0 ||
outputParam.tensorStride[dimension - 1] * sizeof(scalar_t) % alignment == 0);
bool isContig = true;
bool isAligned = true;
unsigned int max_elements_per_tensor = 0;
// Now we loop
@ -391,16 +341,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
// high-dimensional tensor
if (inputs[i+batchCounter].get().numel() > 0) {
dimSize = inputs[i+batchCounter].get().size(dimension);
if (isInOutAligned) {
auto t = inputs[i+batchCounter].get();
// similarly to output stride, we cannot trust stride value to
// determine slice size if the corresponding dimension is 1
// we have to multiply all the subsequent sizes
int64_t slice_size = dimension == 0 ? t.numel() : t.sizes()[dimension - 1] != 1 ?
t.strides()[dimension - 1] : c10::multiply_integers(t.sizes().begin() + dimension, t.sizes().end());
slice_size *= sizeof(scalar_t);
isInOutAligned &= (slice_size % alignment == 0);
}
}
catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr());
@ -411,12 +351,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
#ifdef USE_ROCM
// On ROCm, CatArrayBatchedCopy_contig is faster
isAligned = false;
isInOutAligned = false;
#else
// If at least one of the inputs is not aligned, we can't call the
// CatArrayBatchedCopy_alignedK_contig
isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]);
isInOutAligned &= at::native::memory::get_alignment(catMetaData.input[batchCounter]) >= alignment;
#endif
if (stride_size > 1) {
@ -427,6 +365,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j];
}
catMetaData.isContiguous[batchCounter] = false;
isContig = false;
} else {
catMetaData.isContiguous[batchCounter] = true;
}
@ -449,13 +388,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
max_elements_per_tensor, batchCounter);
#else
dim3 applyBlock, catGrid;
if (isInOutAligned) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, alignment>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) > 2) {
if (isContig && sizeof(scalar_t) > 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_16>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) == 2) {
} else if (isContig && sizeof(scalar_t) == 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_8>(
max_elements_per_tensor, batchCounter);
} else {
@ -463,30 +399,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
getCatGrid(batchCounter, catGrid);
}
#endif
int32_t trailingSize;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
if (isInOutAligned) {
// in this case we can and should flatten the tensors after the cat dim
// we want to view the tensors as if consisting of `alignment`-sized elements
// however, we might not be able to cleanly divide just the last dim -
// it might not be the multiple of alignment.
// however, we know that the full concatted slice is multiple of alignment,
// so if we flatten all the dims after and including concat dim,
// it will be divisible by alignment
// then we need to divide last out size by elems_per_vec,
// and divide all strides except last by elems_per_vec (last stride is 1 always)
// for input, we will fix up the sizes and strides in the kernel directly
kernelOutputParam = outputParam;
nDims = dimension + 1;
constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
trailingSize = outputParam.tensorStride[dimension];
kernelOutputParam.tensorStride[dimension] = 1;
for (int i = 0; i < dimension; ++i) {
kernelOutputParam.tensorStride[i] /= elems_per_vec;
}
}
if (memory_format != c10::MemoryFormat::Contiguous) {
switch (dimension) {
@ -501,12 +413,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
}
// Template Declarations for dim = 1, 2, 3, 4
#define HANDLE_CASE(DIMS) \
if (isInOutAligned) {\
constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
(char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
} else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\

View File

@ -221,9 +221,22 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_
std::optional<CuFFTConfig> uncached_plan;
const CuFFTConfig * config = nullptr;
// Workaround for gh-63152, gh-58724
// Bluestein plans in CUDA 11.1 (cufft 10.3) cannot be re-used
// Bluestein's algorithm is only used when a size has large prime factors,
// sizes with only small prime factors can still be cached
if (plan_cache.max_size() > 0) {
bool use_caching = true;
#ifdef CUFFT_VERSION
if constexpr (10300 <= CUFFT_VERSION && CUFFT_VERSION < 10400) {
// Only cache plans for transforms with small prime factors
use_caching = std::none_of(
signal_size.begin() + 1, signal_size.end(), [](int64_t dim_size) {
return has_large_prime_factor(dim_size);
});
}
#endif
if (use_caching && plan_cache.max_size() > 0) {
guard.lock();
if (plan_cache.max_size() > 0) { // check again after acquiring the lock
config = &plan_cache.lookup(Params);

View File

@ -5,20 +5,12 @@
namespace at::native {
__global__ void weight_int8pack_mm_kernel(
const float* x,
const int8_t* w,
const float* scale,
float* out,
int B,
int K,
int N) {
__global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const float* scale, float* out, int B, int K, int N) {
// one thread per output element: [B, N]
int b = blockIdx.y * blockDim.y + threadIdx.y;
int n = blockIdx.x * blockDim.x + threadIdx.x;
if (b >= B || n >= N)
return;
if (b >= B || n >= N) return;
float acc = 0.0f;
for (int k = 0; k < K; ++k) {
@ -28,11 +20,7 @@ __global__ void weight_int8pack_mm_kernel(
out[b * N + n] = acc * scale[n];
}
void launch_weight_int8pack_mm_cuda_kernel(
const Tensor& x,
const Tensor& w_int8,
const Tensor& scale,
Tensor& out) {
void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8, const Tensor& scale, Tensor& out) {
const int B = x.size(0);
const int K = x.size(1);
const int N = w_int8.size(0);
@ -47,16 +35,12 @@ void launch_weight_int8pack_mm_cuda_kernel(
w_int8.data_ptr<int8_t>(),
scale.data_ptr<float>(),
out.data_ptr<float>(),
B,
K,
N);
B, K, N);
}
// Main GPU entry point
at::Tensor _weight_int8pack_mm_cuda(
const at::Tensor& x,
const at::Tensor& w_int8,
const at::Tensor& scale) {
at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int8, const at::Tensor& scale) {
// --- Check inputs ---
TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor");
TORCH_CHECK(w_int8.is_cuda(), "w must be a CUDA tensor");
@ -66,16 +50,12 @@ at::Tensor _weight_int8pack_mm_cuda(
TORCH_CHECK(w_int8.dim() == 2, "w must be 2D");
TORCH_CHECK(scale.dim() == 1, "scale must be 1D");
TORCH_CHECK(
x.size(1) == w_int8.size(1),
"K dimension mismatch: x.size(1) != w.size(1)");
TORCH_CHECK(
w_int8.size(0) == scale.size(0),
"Output dim mismatch: w.size(0) != scale.size(0)");
TORCH_CHECK(x.size(1) == w_int8.size(1), "K dimension mismatch: x.size(1) != w.size(1)");
TORCH_CHECK(w_int8.size(0) == scale.size(0), "Output dim mismatch: w.size(0) != scale.size(0)");
// --- Determine shapes ---
auto B = x.size(0); // batch size
auto N = w_int8.size(0); // output dim
auto B = x.size(0); // batch size
auto N = w_int8.size(0); // output dim
// Ensure inputs are in the correct types for the kernel
auto x_f32 = x.to(at::kFloat);
@ -83,13 +63,12 @@ at::Tensor _weight_int8pack_mm_cuda(
auto scale_f32 = scale.to(at::kFloat);
// --- Allocate output ---
auto out = at::empty({B, N}, x_f32.options());
auto out = at::empty({B, N}, x.options().dtype(at::kFloat));
// --- Launch kernel ---
launch_weight_int8pack_mm_cuda_kernel(
x_f32, w_int8_contiguous, scale_f32, out);
launch_weight_int8pack_mm_cuda_kernel(x_f32, w_int8_contiguous, scale_f32, out);
return out.to(x.dtype());
return out;
}
} // namespace at::native

View File

@ -482,9 +482,7 @@ auto build_graph(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA")
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
@ -704,9 +702,7 @@ auto build_graph_nestedtensor(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA_NESTEDTENSOR")
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale)
.set_seq_len_q(SEQ_LEN_Q_)

View File

@ -2,7 +2,6 @@
#include <ATen/core/Tensor.h>
#include <ATen/TensorUtils.h>
#include <ATen/div_rtn.h>
#include <c10/util/safe_numerics.h>
namespace at::native {
@ -55,14 +54,6 @@ inline void col2im_shape_check(
int64_t batch_dim = (ndim == 3) ? 0 : -1;
int64_t n_input_plane = input.size(batch_dim + 1);
uint64_t prod_kernel_size = 1;
TORCH_CHECK(!c10::mul_overflows(static_cast<uint64_t>(kernel_width), static_cast<uint64_t>(kernel_height), &prod_kernel_size),
"Given kernel_width = ",
kernel_width,
" and kernel_height = ",
kernel_height,
" the product of kernel_width and kernel_height overflowed.");
if (n_input_plane % (kernel_width * kernel_height) != 0) {
TORCH_CHECK(false,

View File

@ -559,60 +559,4 @@ Tensor _int_mm_xpu(const Tensor& self, const Tensor& mat2) {
at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt));
return _int_mm_out_xpu(self, mat2, result);
}
Tensor _weight_int8pack_mm_xpu(
const Tensor& A,
const Tensor& B,
const Tensor& scales) {
auto M = A.size(0);
auto N = B.size(0);
auto K = A.size(1);
TORCH_CHECK(
A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat,
" : expect A to be either 32-bit or 16-bit float tensor.");
TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor.");
TORCH_CHECK(
A.stride(1) == 1, " : A must be contiguous on the last dimension.");
TORCH_CHECK(B.dtype() == kChar, " : expect B to be int8 tensor.");
TORCH_CHECK(B.is_contiguous(), " : expect B to be contiguous.");
TORCH_CHECK(B.size(1) == K, " : expect B.size(1) == ", K);
TORCH_CHECK(
scales.dim() == 1 && scales.size(0) == N,
" : expect scales to be 1d tensor with size ",
N);
auto C = at::empty({M, N}, A.options());
// --- Launch kernel ---
Tensor bias = at::Tensor();
Tensor mat2_zero_points = at::Tensor();
Tensor non_const_scales = scales;
auto post_op_args = torch::List<std::optional<at::Scalar>>();
at::native::onednn::quantized_matmul(
A.contiguous(),
1.0,
0,
B,
non_const_scales,
mat2_zero_points,
bias,
C,
1.0,
0,
C.scalar_type(),
/*other*/ std::nullopt,
/*other scale*/ 1.0,
/*other zp*/ 0,
/*binary post op*/ "none",
/*binary alpha*/ 1.0,
/*post_op_name*/ "none",
post_op_args,
/*post_op_algorithm*/ "none",
/*m2_trans*/ false);
return C;
}
} // namespace at::native

View File

@ -110,9 +110,8 @@ void quantized_matmul(
// [Note] Quantized Matrix Multiplication at XPU
// The following code integrates oneDNN quantized gemm. The quantization
// config we support:
// activation: s8, u8, fp16, bf16, fp32; per tensor calibrated;
// symmetric&asymmetric weight: s8; per_tensor/per_channel calibrated;
// symmetric
// activation: s8&u8; per tensor calibrated; symmetric&asymmetric
// weight: s8; per_tensor/per_channel calibrated; symmetric
auto attr = Attr(static_cast<float>(1.0 / output_scale), output_zero_point);
construct_attr_by_post_op(
binary_post_op,

View File

@ -1,25 +0,0 @@
#pragma once
#include <c10/metal/common.h>
#ifdef __METAL__
enum class EmbeddingBagMode { SUM = 0, MEAN, MAX };
#else
#include <ATen/native/EmbeddingBag.h>
using at::native::EmbeddingBagMode;
#endif
template <typename idx_type_t = uint32_t>
struct EmbeddingBagParams {
::c10::metal::array<idx_type_t, 2> weight_strides;
::c10::metal::array<idx_type_t, 2> output_strides;
::c10::metal::array<idx_type_t, 2> max_indices_strides;
idx_type_t per_sample_weights_stride;
idx_type_t num_indices;
idx_type_t num_bags;
idx_type_t feature_size;
EmbeddingBagMode mode;
int64_t padding_idx;
};

View File

@ -1,261 +0,0 @@
#include <ATen/native/mps/kernels/EmbeddingBag.h>
#include <c10/metal/utils.h>
#include <metal_array>
#include <metal_stdlib>
using namespace metal;
using namespace c10::metal;
template <EmbeddingBagMode M, typename T>
struct ReductionOpInit {
inline opmath_t<T> operator()() {
return 0;
}
};
template <typename T>
struct ReductionOpInit<EmbeddingBagMode::MAX, T> {
inline opmath_t<T> operator()() {
return static_cast<opmath_t<T>>(-INFINITY);
}
};
template <EmbeddingBagMode M, typename T>
struct ReductionOp {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
opmath_t<T> out_val,
bool is_first) {
return weight_val + out_val;
}
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::MAX, T> {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
opmath_t<T> out_val,
bool is_first) {
return (is_first || weight_val > out_val) ? weight_val : out_val;
}
};
template <EmbeddingBagMode M, typename T>
struct MaybeApplyPerSampleWeight {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_stride) {
return weight_val;
}
};
template <typename T>
struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_stride) {
if (per_sample_weights_stride) {
T per_sample_weight = per_sample_weights
[per_sample_weights_stride * per_sample_weights_index];
return static_cast<opmath_t<T>>(per_sample_weight) * weight_val;
} else {
return weight_val;
}
}
};
template <EmbeddingBagMode M, typename T, typename I>
struct MaybeCalcMaxIndex {
inline void operator()(
opmath_t<T> weight_val,
opmath_t<T> out_val,
bool is_first,
thread I& max_idx,
I weight_idx,
bool pad) {}
};
template <typename T, typename I>
struct MaybeCalcMaxIndex<EmbeddingBagMode::MAX, T, I> {
inline void operator()(
opmath_t<T> weight_val,
opmath_t<T> out_val,
bool is_first,
thread I& max_idx,
I weight_idx,
bool pad) {
max_idx = !pad && (is_first || weight_val > out_val) ? weight_idx : max_idx;
}
};
template <EmbeddingBagMode M, typename T>
struct ReductionOpFinal {
inline T operator()(opmath_t<T> val, uint32_t) {
return static_cast<T>(val);
}
};
template <typename T>
struct ReductionOpFinal<EmbeddingBagMode::MEAN, T> {
inline T operator()(opmath_t<T> val, uint32_t count) {
auto out = val / count;
return static_cast<T>((count == 0) ? 0 : out);
}
};
template <typename T>
struct ReductionOpFinal<EmbeddingBagMode::MAX, T> {
inline T operator()(opmath_t<T> val, uint32_t count) {
return static_cast<T>((count == 0) ? 0 : val);
}
};
template <EmbeddingBagMode M, typename I>
struct MaybeWriteMaxIndex {
inline void operator()(
device I*,
const constant ::c10::metal::array<uint32_t, 2>&,
uint32_t,
uint32_t,
I) {}
};
template <typename I>
struct MaybeWriteMaxIndex<EmbeddingBagMode::MAX, I> {
inline void operator()(
device I* max_indices,
const constant ::c10::metal::array<uint32_t, 2>& max_indices_strides,
uint32_t bag_idx,
uint32_t feature_idx,
I max_idx) {
max_indices
[bag_idx * max_indices_strides[0] +
feature_idx * max_indices_strides[1]] = max_idx;
}
};
template <EmbeddingBagMode M, typename T, typename I>
void embedding_bag_impl(
constant T* weight,
constant I* indices,
constant I* offsets,
constant T* per_sample_weights,
device T* output,
device I* offset2bag,
device I* bag_size,
device I* max_indices,
constant EmbeddingBagParams<uint32_t>& params,
uint tid) {
auto num_indices = params.num_indices;
auto num_bags = params.num_bags;
auto feature_size = params.feature_size;
auto padding_idx = params.padding_idx;
auto per_sample_weights_stride = params.per_sample_weights_stride;
constant auto& output_strides = params.output_strides;
constant auto& weight_strides = params.weight_strides;
constant auto& max_indices_strides = params.max_indices_strides;
auto bag_idx = tid / feature_size;
auto feature_idx = tid % feature_size;
uint32_t offsets_end = min(bag_idx + 1, num_bags - 1);
bool is_last_bag = bag_idx + 1 == num_bags;
uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]);
uint32_t indices_end = is_last_bag * (num_indices) +
(!is_last_bag) * (static_cast<uint32_t>(offsets[offsets_end]));
auto out_val = ReductionOpInit<M, T>()();
uint32_t bag_size_ = 0;
I max_idx = 0;
for (uint32_t indices_idx = indices_start; indices_idx < indices_end;
indices_idx++) {
I weight_idx = indices[indices_idx];
bool pad = (weight_idx == padding_idx);
auto weight_val = static_cast<opmath_t<T>>(
weight
[static_cast<uint32_t>(weight_idx) * weight_strides[0] +
feature_idx * weight_strides[1]]);
weight_val = MaybeApplyPerSampleWeight<M, T>()(
weight_val, indices_idx, per_sample_weights, per_sample_weights_stride);
auto new_out_val = ReductionOp<M, T>()(weight_val, out_val, bag_size_ == 0);
MaybeCalcMaxIndex<M, T, I>()(
weight_val, out_val, bag_size_ == 0, max_idx, weight_idx, pad);
out_val = pad ? out_val : new_out_val;
offset2bag[indices_idx] = bag_idx;
bag_size_ += static_cast<uint32_t>(!pad);
}
output[bag_idx * output_strides[0] + feature_idx * output_strides[1]] =
ReductionOpFinal<M, T>()(out_val, bag_size_);
bag_size[bag_idx] = bag_size_;
MaybeWriteMaxIndex<M, I>()(
max_indices, max_indices_strides, bag_idx, feature_idx, max_idx);
}
#define DISPATCH_IMPL(MODE) \
return embedding_bag_impl<MODE>( \
weight, \
indices, \
offsets, \
per_sample_weights, \
output, \
offset2bag, \
bag_size, \
max_indices, \
params, \
tid)
template <typename T, typename I>
kernel void embedding_bag(
constant T* weight [[buffer(0)]],
constant I* indices [[buffer(1)]],
constant I* offsets [[buffer(2)]],
constant T* per_sample_weights [[buffer(3)]],
device T* output [[buffer(4)]],
device I* offset2bag [[buffer(5)]],
device I* bag_size [[buffer(6)]],
device I* max_indices [[buffer(7)]],
constant EmbeddingBagParams<uint32_t>& params [[buffer(8)]],
uint tid [[thread_position_in_grid]]) {
switch (params.mode) {
case EmbeddingBagMode::SUM:
DISPATCH_IMPL(EmbeddingBagMode::SUM);
case EmbeddingBagMode::MEAN:
DISPATCH_IMPL(EmbeddingBagMode::MEAN);
case EmbeddingBagMode::MAX:
DISPATCH_IMPL(EmbeddingBagMode::MAX);
}
}
#define REGISTER_EMBEDDING_BAG_OP(T, I) \
template [[host_name("embedding_bag_" #T "_" #I)]] \
kernel void embedding_bag<T, I>( \
constant T * weight [[buffer(0)]], \
constant I * indices [[buffer(1)]], \
constant I * offsets [[buffer(2)]], \
constant T * per_sample_weights [[buffer(3)]], \
device T * output [[buffer(4)]], \
device I * offset2bag [[buffer(5)]], \
device I * bag_size [[buffer(6)]], \
device I * max_indices [[buffer(7)]], \
constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
uint tid [[thread_position_in_grid]]);
REGISTER_EMBEDDING_BAG_OP(float, int);
REGISTER_EMBEDDING_BAG_OP(float, long);
REGISTER_EMBEDDING_BAG_OP(half, int);
REGISTER_EMBEDDING_BAG_OP(half, long);
REGISTER_EMBEDDING_BAG_OP(bfloat, int);
REGISTER_EMBEDDING_BAG_OP(bfloat, long);

View File

@ -198,7 +198,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
if (input_t.is_contiguous(memory_format) && output_t.is_contiguous(memory_format) && is_macOS_15_0_or_newer) {
inputNDArray = getMPSNDArray(input_t, inputShape);
outputNDArray = getMPSNDArray(output_t, outputShape);
outputNDArray = getMPSNDArray(*output, outputShape);
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
@ -302,7 +302,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
}
}
auto outputPlaceholder = outputNDArray ? Placeholder(cachedGraph->outputTensor_, outputNDArray)
: Placeholder(cachedGraph->outputTensor_, output_t);
: Placeholder(cachedGraph->outputTensor_, *output);
NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =
[[[NSMutableDictionary alloc] initWithCapacity:3] autorelease];
@ -315,7 +315,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
}
return output_t;
return *output;
}
Tensor _mps_convolution(const Tensor& input_t,

View File

@ -1,180 +0,0 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/TensorUtils.h>
#include <ATen/core/Tensor.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/EmbeddingBag.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/EmbeddingBag.h>
#include <fmt/format.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_embedding_bag_forward_only_native.h>
#include <ATen/ops/_embedding_bag_native.h>
#include <ATen/ops/empty.h>
#endif
namespace at::native {
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/EmbeddingBag_metallib.h>
#endif
namespace {
std::pair<Tensor, Tensor> promoteIndicesAndOffsets(const Tensor& indices, const Tensor& offsets) {
const auto commonType = promoteTypes(offsets.scalar_type(), indices.scalar_type());
return {indices.scalar_type() == commonType ? indices : indices.toType(commonType),
offsets.scalar_type() == commonType ? offsets : offsets.toType(commonType)};
}
} // namespace
namespace mps {
static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
const Tensor& weight,
const Tensor& indices_,
const Tensor& offsets_,
const bool scale_grad_by_freq,
const int64_t mode,
bool sparse,
const std::optional<Tensor>& per_sample_weights_opt,
bool include_last_offset,
int64_t padding_idx) {
TORCH_CHECK(indices_.dim() == 1, "input has to be a 1D Tensor, but got Tensor of dimension ", indices_.dim());
if (indices_.dim() == 1) {
TORCH_CHECK(offsets_.dim() == 1, "offsets has to be a 1D Tensor, but got Tensor of dimension ", offsets_.dim());
}
TORCH_CHECK(weight.dim() == 2, "weight has to be a 2D Tensor, but got Tensor of dimension ", weight.dim());
Tensor indices, offsets;
std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_);
auto indices_arg = TensorArg(indices, "indices", 1);
checkScalarTypes("embedding_bag_mps", indices_arg, {kLong, kInt});
auto offsets_arg = TensorArg(offsets, "offsets", 1);
checkScalarTypes("embedding_bag_mps", offsets_arg, {kLong, kInt});
checkSameType("embedding_bag_mps", indices_arg, offsets_arg);
auto weight_arg = TensorArg(weight, "weight", 1);
int64_t num_indices = indices.size(0);
int64_t num_bags = offsets.size(0);
if (include_last_offset) {
TORCH_CHECK(num_bags >= 1, "include_last_offset: number of offsets should be at least 1");
num_bags -= 1;
}
int64_t feature_size = weight.size(1);
auto bag_size = at::empty({num_bags}, indices.options());
auto offset2bag = at::empty({indices.size(0)}, indices.options());
auto output = at::empty({num_bags, feature_size}, weight.options());
Tensor max_indices;
if (mode == EmbeddingBagMode::MAX) {
max_indices = at::empty({num_bags, feature_size}, indices.options());
} else {
max_indices = at::empty({0}, indices.options());
}
EmbeddingBagParams<uint32_t> params;
for (const auto dim : c10::irange(weight.dim())) {
params.weight_strides[dim] = safe_downcast<uint32_t, int64_t>(weight.stride(dim));
params.output_strides[dim] = safe_downcast<uint32_t, int64_t>(output.stride(dim));
if (mode == EmbeddingBagMode::MAX) {
params.max_indices_strides[dim] = safe_downcast<uint32_t, int64_t>(max_indices.stride(dim));
}
}
bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
params.num_indices = num_indices;
params.num_bags = num_bags;
params.feature_size = feature_size;
params.mode = static_cast<EmbeddingBagMode>(mode);
params.padding_idx = padding_idx;
auto num_threads = output.numel();
MPSStream* stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
auto pipeline_state = lib.getPipelineStateForFunc(
fmt::format("embedding_bag_{}_{}", scalarToMetalTypeString(weight), scalarToMetalTypeString(indices)));
getMPSProfiler().beginProfileKernel(pipeline_state, "embedding_bag", {weight, indices, offsets});
[computeEncoder setComputePipelineState:pipeline_state];
mtl_setArgs(computeEncoder,
weight,
indices,
offsets,
use_per_sample_weights ? per_sample_weights_opt : std::nullopt,
output,
offset2bag,
bag_size,
max_indices,
params);
mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
getMPSProfiler().endProfileKernel(pipeline_state);
}
});
return std::tuple<Tensor, Tensor, Tensor, Tensor>(
std::move(output), std::move(offset2bag), std::move(bag_size), std::move(max_indices));
}
} // namespace mps
std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps(const Tensor& weight,
const Tensor& indices,
const Tensor& offsets,
const bool scale_grad_by_freq,
const int64_t mode,
bool sparse,
const std::optional<Tensor>& per_sample_weights_opt,
bool include_last_offset,
int64_t padding_idx) {
return mps::_embedding_bag_mps_impl(weight,
indices,
offsets,
scale_grad_by_freq,
mode,
sparse,
per_sample_weights_opt,
include_last_offset,
padding_idx);
}
std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_forward_only_mps(
const Tensor& weight,
const Tensor& indices,
const Tensor& offsets,
const bool scale_grad_by_freq,
const int64_t mode,
bool sparse,
const std::optional<Tensor>& per_sample_weights_opt,
bool include_last_offset,
int64_t padding_idx) {
return _embedding_bag_mps(weight,
indices,
offsets,
scale_grad_by_freq,
mode,
sparse,
per_sample_weights_opt,
include_last_offset,
padding_idx);
}
} // namespace at::native

View File

@ -20,7 +20,6 @@
#include <ATen/ops/baddbmm_native.h>
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/cholesky_native.h>
#include <ATen/ops/eye_native.h>
#include <ATen/ops/linalg_cholesky_ex_native.h>
#include <ATen/ops/linalg_inv_ex_native.h>
#include <ATen/ops/linalg_lu_factor_ex_native.h>
@ -497,24 +496,26 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const
using namespace mps;
TORCH_CHECK(result.is_mps(), "Output tensor is not MPS");
TORCH_CHECK(!A.is_complex(), "linalg_inv: not supported for complex types yet!");
using CachedGraph = MPSUnaryCachedGraph;
MPSStream* stream = getCurrentMPSStream();
info.zero_();
if (A.numel() == 0) {
return;
}
if (!result.is_contiguous()) {
result.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
}
auto A_sizes = A.sizes();
int ndim = A.dim();
Tensor LU = empty_like(A, MemoryFormat::Contiguous);
Tensor identity = eye(A.size(-2), A.size(-1), A.scalar_type(), A.options().layout(), A.device()).expand_as(A);
Tensor LU = empty_like(A);
Tensor identity = zeros_like(A);
Tensor pivots = empty({A_sizes.begin(), A_sizes.end() - 1}, A.options().dtype(kInt));
// need to do this to keep the strides of the result tensor
// mps's solve expects row major layout, while inductor
// expects result to be column major
Tensor tmp = empty_like(A, MemoryFormat::Contiguous);
linalg_solve_out_mps_impl(A, identity, true, check_errors, tmp, LU, pivots, info);
result.copy_(tmp);
(ndim == 2 ? identity.diagonal() : identity.diagonal(0, -2, -1)).fill_(1);
linalg_solve_out_mps_impl(A, identity, true, check_errors, result, LU, pivots, info);
}
static Tensor& mm_out_mps_impl(const Tensor& self, const Tensor& other, Tensor& output) {

View File

@ -519,13 +519,6 @@ static void max_unpool_out_mps_template(const Tensor& input,
Tensor& output,
const int32_t pooling_dims,
const std::string& op_name) {
TORCH_CHECK(output_size_.size() == static_cast<size_t>(pooling_dims),
op_name,
"There should be exactly ",
pooling_dims,
" elements but got ",
output_size_.size());
auto dims = input.dim();
auto leading_dims = input.dim() - pooling_dims;
@ -541,18 +534,6 @@ static void max_unpool_out_mps_template(const Tensor& input,
output.resize_(output_size, memory_format);
output.fill_(0);
if (indices.defined() && indices.numel() > 0) {
auto output_image_size = c10::multiply_integers(output_size_);
int64_t min_idx = indices.min().item<int64_t>();
int64_t max_idx = indices.max().item<int64_t>();
if (min_idx < 0 || max_idx >= output_image_size) {
int64_t error_idx = (min_idx < 0) ? min_idx : max_idx;
TORCH_CHECK(false, "Found an invalid max index: ", error_idx, " for output tensor of shape ", output_size_);
}
}
id<MTLDevice> device = MPSDevice::getInstance()->device();
MPSStream* mpsStream = getCurrentMPSStream();
const auto numThreads = input.numel();

View File

@ -2351,7 +2351,6 @@
dispatch:
CPU: _embedding_bag_forward_only_cpu
CUDA: _embedding_bag_forward_only_cuda
MPS: _embedding_bag_forward_only_mps
autogen: _embedding_bag_forward_only.out
- func: _rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor)
@ -2373,7 +2372,6 @@
dispatch:
CPU: _embedding_bag_cpu
CUDA: _embedding_bag_cuda
MPS: _embedding_bag_mps
autogen: _embedding_bag.out
tags: core
@ -3858,7 +3856,7 @@
device_check: NoCheck # TensorIterator
structured: True
dispatch:
CPU, CUDA, MTIA: aminmax_out
CPU, CUDA: aminmax_out
MPS: aminmax_out_mps
- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
@ -3909,7 +3907,7 @@
- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA, MTIA: amax_out
CPU, CUDA: amax_out
MPS: amax_out_mps
# Return: (Tensor output, Tensor indices)
@ -4090,7 +4088,7 @@
- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA, MTIA: amin_out
CPU, CUDA: amin_out
MPS: amin_out_mps
# TODO: Add this function to MPS dispatch key so that we avoid declaring it in
@ -4243,7 +4241,6 @@
CPU: _weight_int8pack_mm_cpu
CUDA: _weight_int8pack_mm_cuda
MPS: _weight_int8pack_mm_mps
XPU: _weight_int8pack_mm_xpu
- func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
python_module: sparse
@ -4375,7 +4372,7 @@
variants: function, method
dispatch:
CPU: narrow_copy_dense_cpu
SparseCPU, SparseCUDA, SparseMPS: narrow_copy_sparse
SparseCPU, SparseCUDA: narrow_copy_sparse
CompositeExplicitAutogradNonFunctional: narrow_copy_dense_symint
tags: view_copy
@ -6663,7 +6660,7 @@
- func: zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeExplicitAutograd: zeros_out
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: zeros_sparse_out
SparseCPU, SparseCUDA, SparseMeta: zeros_sparse_out
- func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
dispatch:
@ -10702,7 +10699,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow
CUDA: foreach_tensor_div_list_kernel_cuda
MTIA: foreach_tensor_div_list_kernel_mtia
- func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@ -10710,7 +10706,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_
CUDA: foreach_tensor_div_list_kernel_cuda_
MTIA: foreach_tensor_div_list_kernel_mtia_
autogen: _foreach_div.List_out
- func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
@ -10734,7 +10729,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow
CUDA: foreach_tensor_div_tensor_kernel_cuda
MTIA: foreach_tensor_div_tensor_kernel_mtia
- func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@ -10742,7 +10736,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_
CUDA: foreach_tensor_div_tensor_kernel_cuda_
MTIA: foreach_tensor_div_tensor_kernel_mtia_
autogen: _foreach_div.Tensor_out
- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
@ -10849,7 +10842,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
MTIA: foreach_tensor_maximum_scalar_kernel_mtia_
autogen: _foreach_maximum.Scalar_out
# foreach_minimum/maximum dispatches to clamp_max/min

View File

@ -1,6 +1,5 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/ceil_div.h>
#include <ATen/native/cuda/Loops.cuh>
#include <c10/cuda/CUDAGuard.h>
@ -22,11 +21,10 @@
namespace at::native {
namespace {
template <typename T>
__global__ void ChooseQuantizationParamsKernelImpl(
const int64_t* fake_quant_on,
const T* x_min,
const T* x_max,
const float* x_min,
const float* x_max,
int32_t qmin,
int32_t qmax,
int size,
@ -95,44 +93,34 @@ __global__ void ChooseQuantizationParamsKernelImpl(
}
}
__device__ inline bool isinf_device(float v) {
return ::isinf(v);
}
__device__ inline bool isinf_device(c10::BFloat16 v) {
return ::isinf(static_cast<float>(v));
}
// CUDA kernel to compute Moving Average Min/Max of the tensor.
// It uses the running_min and running_max along with averaging const, c.
// The formula used to compute the new min/max is as follows
//
// running_min = (1 - c) * running_min + c * x_min, if running_min != inf
// running_min = x_min, if running_min == inf
template <typename T>
__global__ void MovingAverageMinMax(
const int64_t* observer_on,
const T* x_min,
const T* x_max,
T* running_min,
T* running_max,
const float* x_min,
const float* x_max,
float* running_min,
float* running_max,
const float averaging_const,
const int size) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (*observer_on == 1) {
if (i < size) {
T curr_min = x_min[i];
T curr_max = x_max[i];
float curr_min = x_min[i];
float curr_max = x_max[i];
T averaging_const_t = static_cast<T>(averaging_const);
float adjusted_min = ::isinf(running_min[i])
? curr_min
: (running_min[i]) + averaging_const * (curr_min - (running_min[i]));
T adjusted_min = isinf_device(running_min[i]) ? curr_min
: (running_min[i]) +
averaging_const_t * (curr_min - (running_min[i]));
T adjusted_max = isinf_device(running_max[i]) ? curr_max
: (running_max[i]) +
averaging_const_t * (curr_max - (running_max[i]));
float adjusted_max = ::isinf(running_max[i])
? curr_max
: (running_max[i]) + averaging_const * (curr_max - (running_max[i]));
running_min[i] = adjusted_min;
running_max[i] = adjusted_max;
@ -154,51 +142,40 @@ void _calculate_moving_average(
at::Tensor x_min, x_max;
int64_t* observer_on_data = observer_on.data_ptr<int64_t>();
float* running_min_data = running_min.data_ptr<float>();
float* running_max_data = running_max.data_ptr<float>();
cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
if (per_row_fq) {
std::tie(x_min, x_max) = at::aminmax(x, 1);
float* x_min_data = x_min.data_ptr<float>();
float* x_max_data = x_max.data_ptr<float>();
int num_threads = std::min(size, (int64_t)512);
const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
scalar_t* x_max_data = x_max.data_ptr<scalar_t>();
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<num_blocks, num_threads, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
size);
});
// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<num_blocks, num_threads, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
size);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
std::tie(x_min, x_max) = at::aminmax(x);
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
scalar_t* x_max_data = x_max.data_ptr<scalar_t>();
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<1, 1, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
1 /*size*/);
});
float* x_min_data = x_min.data_ptr<float>();
float* x_max_data = x_max.data_ptr<float>();
// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<1, 1, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
1 /*size*/);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
@ -221,44 +198,34 @@ void _calc_moving_avg_qparams_helper(
cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
int64_t* fake_quant_on_data = fake_quant_on.data_ptr<int64_t>();
if (per_row_fq) {
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
int num_threads = std::min(size, (int64_t)512);
const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
ChooseQuantizationParamsKernelImpl<<<
num_blocks,
num_threads,
0,
cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
size,
symmetric_quant,
scale_ptr,
zp_ptr);
});
float* running_min_data = running_min.data_ptr<float>();
float* running_max_data = running_max.data_ptr<float>();
int num_threads = std::min(size, (int64_t)512);
const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
ChooseQuantizationParamsKernelImpl<<<num_blocks, num_threads, 0, cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
size,
symmetric_quant,
scale_ptr,
zp_ptr);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
1, // size
symmetric_quant, // preserve_sparsity
scale_ptr,
zp_ptr);
});
float* running_min_data = running_min.data_ptr<float>();
float* running_max_data = running_max.data_ptr<float>();
ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
1, // size
symmetric_quant, // preserve_sparsity
scale_ptr,
zp_ptr);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}

View File

@ -64,6 +64,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
// create sparse descriptor, dtype
cusparseLtMatDescriptor_t sparse_input_descriptor;
cudaDataType type;
auto compression_factor = 9;
#ifdef USE_ROCM
TORCH_CHECK(isHipSparseLtSupported());
@ -72,6 +73,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
switch (sparse_input.scalar_type()) {
case at::ScalarType::Char:
type = CUDA_R_8I;
compression_factor = 10;
break;
case at::ScalarType::Half:
type = CUDA_R_16F;
@ -87,6 +89,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
#if defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602 && !defined(USE_ROCM)
case at::ScalarType::Float8_e4m3fn:
type = CUDA_R_8F_E4M3;
compression_factor = 10;
break;
#endif
default:
@ -94,6 +97,10 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
break;
}
// create a new compressed tensor with the same dtype as
auto compressed_tensor =
sparse_input.new_empty(sparse_input.numel() * compression_factor / 16);
TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit(
&handle,
&sparse_input_descriptor,
@ -114,15 +121,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
&compressed_size,
&compressed_buffer_size));
// create a new compressed tensor with the same dtype as the input,
// and with packed data/metadata stored in an array with original
// number of rows, and sufficient columns to provide compressed_size
// buffer (in bytes)
size_t orig_m = sparse_input.size(0);
size_t div = orig_m * sparse_input.itemsize();
size_t new_n = (compressed_size + div - 1) / div; // floor
auto compressed_tensor = sparse_input.new_empty({(int64_t)orig_m, (int64_t)new_n});
auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
auto compressedBufferPtr = allocator.allocate(compressed_buffer_size);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -167,6 +165,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
cudaDataType output_type;
cudaDataType C_type;
cusparseComputeType compute_type;
auto compression_factor = 9;
#ifdef USE_ROCM
TORCH_CHECK(isHipSparseLtSupported());
@ -178,6 +177,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
output_type = CUDA_R_8I;
C_type = CUDA_R_8I;
compute_type = CUSPARSE_COMPUTE_32I;
compression_factor = 10;
break;
// cuSPARSELt v0.5.2 onwards changes CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUT_16F
@ -210,6 +210,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
output_type = CUDA_R_8F_E4M3;
C_type = CUDA_R_16F;
compute_type = CUSPARSE_COMPUTE_32F;
compression_factor = 10;
break;
#endif
// cuSPARSELt <= v0.5.2 uses CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUTE_16F
@ -299,10 +300,9 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
}
}
TORCH_INTERNAL_ASSERT(compressed_A.dim() == 2); // encoded M x S
int64_t k = dense_B.size(0);
int64_t n = dense_B.size(1);
int64_t m = compressed_A.size(0);
int64_t m = (compressed_A.numel() * 16 / compression_factor) / k;
// initialize sparse descriptor
cusparseLtMatDescriptor_t sparse_input_descriptor;

View File

@ -5,6 +5,51 @@
#include <ATen/test/allocator_clone_test.h>
#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
TEST(AllocatorTestCUDA, test_clone) {
test_allocator_clone(c10::cuda::CUDACachingAllocator::get());
}
static int called_dummy_free_0 = 0;
static int called_dummy_free_1 = 0;
void* dummy_alloc_0(size_t size, int device, void* stream) {return nullptr;}
void dummy_free_0(void* data, size_t size, int device, void* stream) {
called_dummy_free_0++;
}
void dummy_free_1(void* data, size_t size, int device, void* stream) {
called_dummy_free_1++;
}
// Tests that data_ptrs have their respective deleters
// when mixing allocators
TEST(AllocatorTestCUDA, test_pluggable_allocator_deleters) {
// Create a tensor with dummy_allocator_0, where dummy_free_0 is the deleter
auto dummy_allocator_0 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_0);
c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_0.get());
at::Tensor a = at::empty({0}, at::TensorOptions().device(at::kCUDA));
// Create a tensor with dummy_allocator_1, where dummy_free_1 is the deleter
auto dummy_allocator_1 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_1);
c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_1.get());
at::Tensor b = at::empty({0}, at::TensorOptions().device(at::kCUDA));
// Manually use a's deleter
auto* ctx = a.storage().data_ptr().get_context();
a.storage().data_ptr().get_deleter()(ctx);
a.storage().mutable_data_ptr().release_context();
// a's deleter is dummy_free_0
// dummy_free_0 should be called above, so called_dummy_free_0 should be 1
ASSERT_TRUE(called_dummy_free_0 == 1);
// Manually use b's deleter
ctx = b.storage().data_ptr().get_context();
b.storage().data_ptr().get_deleter()(ctx);
b.storage().mutable_data_ptr().release_context();
// b's deleter is dummy_free_1
// dummy_free_1 should be called above, so called_dummy_free_1 should be 1
ASSERT_TRUE(called_dummy_free_1 == 1);
}

View File

@ -42,7 +42,7 @@ TEST(MPSObjCInterfaceTest, MPSCustomKernel) {
id<MTLLibrary> customKernelLibrary = [device newLibraryWithSource: [NSString stringWithUTF8String:CUSTOM_KERNEL]
options: nil
error: &error];
TORCH_CHECK(customKernelLibrary, "Failed to create custom kernel library, error: ", error.localizedDescription.UTF8String);
TORCH_CHECK(customKernelLibrary, "Failed to to create custom kernel library, error: ", error.localizedDescription.UTF8String);
id<MTLFunction> customFunction = [customKernelLibrary newFunctionWithName: @"add_arrays"];
TORCH_CHECK(customFunction, "Failed to create function state object for the kernel");

View File

@ -76,23 +76,4 @@ int32_t getGlobalIdxFromDevice(DeviceIndex device) {
return device_global_idxs[device];
}
// Check if a device can access the memory of a peer device directly.
bool canDeviceAccessPeer(DeviceIndex device, DeviceIndex peer) {
if (device == -1) {
device = c10::xpu::current_device();
}
if (peer == -1) {
peer = c10::xpu::current_device();
}
check_device_index(device);
check_device_index(peer);
// A device can always access itself
if (device == peer) {
return true;
}
return c10::xpu::get_raw_device(device).ext_oneapi_can_access_peer(
c10::xpu::get_raw_device(peer),
sycl::ext::oneapi::peer_access::access_supported);
}
} // namespace at::xpu

View File

@ -17,6 +17,4 @@ TORCH_XPU_API DeviceProp* getDeviceProperties(DeviceIndex device);
TORCH_XPU_API int32_t getGlobalIdxFromDevice(DeviceIndex device);
TORCH_XPU_API bool canDeviceAccessPeer(DeviceIndex device, DeviceIndex peer);
} // namespace at::xpu

View File

@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5

1 name accuracy graph_breaks
174
175
176
177
178
179
180
181
182
183
184
186
187
188
189
190
191
192
193

View File

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11

1 name accuracy graph_breaks
162
163
164
165
166
167
168
178
179
180
181
182
183
184

View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25

1 name accuracy graph_breaks
110
111
112
113
114
115
116

View File

@ -170,15 +170,15 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_to_run,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_to_run,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_to_run,0
google/gemma-3-4b-it,fail_accuracy,0
@ -186,4 +186,4 @@ openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0

1 name accuracy graph_breaks
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
186
187
188
189

View File

@ -138,7 +138,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@ -158,7 +158,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8

1 name accuracy graph_breaks
138
139
140
141
142
143
144
158
159
160
161
162
163
164

View File

@ -138,7 +138,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@ -158,7 +158,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8

1 name accuracy graph_breaks
138
139
140
141
142
143
144
158
159
160
161
162
163
164

View File

@ -138,7 +138,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@ -158,7 +158,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8

1 name accuracy graph_breaks
138
139
140
141
142
143
144
158
159
160
161
162
163
164

View File

@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5

1 name accuracy graph_breaks
174
175
176
177
178
179
180
181
182
183
184
186
187
188
189
190
191
192
193

View File

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11

1 name accuracy graph_breaks
162
163
164
165
166
167
168
178
179
180
181
182
183
184

View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25

1 name accuracy graph_breaks
110
111
112
113
114
115
116

View File

@ -122,7 +122,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@ -142,7 +142,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8

1 name accuracy graph_breaks
122
123
124
125
126
127
128
142
143
144
145
146
147
148

View File

@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5

1 name accuracy graph_breaks
174
175
176
177
178
179
180
181
182
183
184
186
187
188
189
190
191
192
193

View File

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11

1 name accuracy graph_breaks
162
163
164
165
166
167
168
178
179
180
181
182
183
184

View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25

1 name accuracy graph_breaks
110
111
112
113
114
115
116

View File

@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5

1 name accuracy graph_breaks
174
175
176
177
178
179
180
181
182
183
184
186
187
188
189
190
191
192
193

Some files were not shown because too many files have changed in this diff Show More