Compare commits

..

3 Commits

Author SHA1 Message Date
61afc9387b [BE][1/6] fix typos in test/
ghstack-source-id: 3bca652122b2eb7e52fddbc52c76b1d19f41b8ec
Pull-Request: https://github.com/pytorch/pytorch/pull/157635
2025-10-19 18:28:29 +08:00
860b057a7f [BE][5/5] fix typos in aten/ (aten/src/ATen/)
ghstack-source-id: aa87bfe7b8281e9eeb7a79079ebad2ad2bac6d1a
Pull-Request: https://github.com/pytorch/pytorch/pull/157554
2025-10-19 18:28:28 +08:00
0e32b4a985 [BE][4/5] fix typos in aten/ (aten/src/ATen/native/)
ghstack-source-id: 288daaa86090e465421c81ae8373b0f6f63eab4a
Pull-Request: https://github.com/pytorch/pytorch/pull/157553
2025-10-19 18:28:27 +08:00
482 changed files with 3276 additions and 5039 deletions

View File

@ -83,6 +83,10 @@ function build_cpython {
py_suffix=${py_ver::-1}
py_folder=$py_suffix
fi
# Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4
if [ "$py_suffix" == "3.14.0" ]; then
py_suffix="3.14.0rc2"
fi
wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz
do_cpython_build $py_ver Python-$py_suffix

View File

@ -1,11 +1,15 @@
sphinx==7.2.6
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 7.2.6
#Pinned versions: 5.3.0
pytorch_sphinx_theme2==0.1.0
#Description: This is needed to generate PyTorch docs
#Pinned versions: 0.1.0
standard-imghdr==3.13.0; python_version >= "3.13"
#Description: This is needed by Sphinx, so it needs to be added here.
# The reasons are as follows:
# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
# something related to Docker setup. We can investigate this later.
@ -32,17 +36,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 2.13.0
breathe==4.36.0
breathe==4.34.0
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 4.36.0
#Pinned versions: 4.34.0
exhale==0.3.7
exhale==0.2.3
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.3.7
#Pinned versions: 0.2.3
docutils==0.20
docutils==0.16
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.20
#Pinned versions: 0.16
bs4==0.0.1
#Description: This is used to generate PyTorch C++ docs
@ -52,13 +56,13 @@ IPython==8.12.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 8.12.0
myst-nb==1.3.0
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Pinned versions: 1.3.0
#Pinned versions: 0.17.2
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
sphinx-copybutton==0.5.0
sphinx-design==0.6.1
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
myst-parser==4.0.1
myst-parser==0.18.1

View File

@ -102,18 +102,8 @@ if [ "$is_main_doc" = true ]; then
echo coverage output not found
exit 1
elif [ $undocumented -gt 0 ]; then
echo "======================================"
echo "ERROR: $undocumented undocumented objects found!"
echo "======================================"
echo ""
echo "Full coverage report:"
echo undocumented objects found:
cat build/coverage/python.txt
echo ""
echo "======================================"
echo "Undocumented modules/objects (lines after TOTAL):"
tail -n +$((lines - undocumented + 1)) build/coverage/python.txt
echo "======================================"
echo ""
echo "Make sure you've updated relevant .rsts in docs/source!"
echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
exit 1

View File

@ -163,13 +163,8 @@ if [[ "$(uname)" != Darwin ]]; then
MEMORY_LIMIT_MAX_JOBS=12
NUM_CPUS=$(( $(nproc) - 2 ))
if [[ "$(uname)" == Linux ]]; then
# Defaults here for **binary** linux builds so they can be changed in one place
export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
else
# For other builds
export MAX_JOBS=${NUM_CPUS}
fi
# Defaults here for **binary** linux builds so they can be changed in one place
export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
cat >>"$envfile" <<EOL
export MAX_JOBS="${MAX_JOBS}"

View File

@ -7,7 +7,7 @@ max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
ignore =
E203,E305,E402,E501,E704,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,

View File

@ -33,7 +33,6 @@ ciflow_push_tags:
- ciflow/rocm
- ciflow/rocm-mi300
- ciflow/rocm-mi355
- ciflow/rocm-navi31
- ciflow/s390
- ciflow/slow
- ciflow/torchbench

View File

@ -26,8 +26,9 @@ name: !{{ build_environment }}
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "!{{ py_ver.strip('t') + ('.4' if '3.14' not in py_ver else '.0') }}"
python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
{%- endmacro %}

View File

@ -79,9 +79,9 @@ jobs:
runs-on: "windows-11-arm64-preview"
{%- else %}
{%- if branches == "nightly" %}
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
{%- else %}
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge.nonephemeral"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
{%- endif %}
{%- endif %}
timeout-minutes: !{{ common.timeout_minutes_windows_binary }}

View File

@ -63,6 +63,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false

View File

@ -59,6 +59,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false
@ -168,6 +169,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.11.4"
freethreaded: false
@ -277,6 +279,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.12.4"
freethreaded: false
@ -386,6 +389,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: false
@ -495,6 +499,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: true
@ -604,8 +609,9 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0"
python-version: "3.14.0-rc.2"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
@ -713,8 +719,9 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0"
python-version: "3.14.0-rc.2"
freethreaded: true
- name: Checkout PyTorch
uses: actions/checkout@v4

View File

@ -44,7 +44,7 @@ jobs:
libtorch-cpu-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -291,7 +291,7 @@ jobs:
libtorch-cuda12_6-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -541,7 +541,7 @@ jobs:
libtorch-cuda12_8-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -791,7 +791,7 @@ jobs:
libtorch-cuda13_0-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch

View File

@ -44,7 +44,7 @@ jobs:
libtorch-cpu-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -291,7 +291,7 @@ jobs:
libtorch-cuda12_6-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -541,7 +541,7 @@ jobs:
libtorch-cuda12_8-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -791,7 +791,7 @@ jobs:
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch

View File

@ -44,7 +44,7 @@ jobs:
wheel-py3_10-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -279,7 +279,7 @@ jobs:
wheel-py3_10-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -517,7 +517,7 @@ jobs:
wheel-py3_10-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -755,7 +755,7 @@ jobs:
wheel-py3_10-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -993,7 +993,7 @@ jobs:
wheel-py3_10-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -1229,7 +1229,7 @@ jobs:
wheel-py3_11-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -1464,7 +1464,7 @@ jobs:
wheel-py3_11-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -1702,7 +1702,7 @@ jobs:
wheel-py3_11-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -1940,7 +1940,7 @@ jobs:
wheel-py3_11-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -2178,7 +2178,7 @@ jobs:
wheel-py3_11-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -2414,7 +2414,7 @@ jobs:
wheel-py3_12-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -2649,7 +2649,7 @@ jobs:
wheel-py3_12-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -2887,7 +2887,7 @@ jobs:
wheel-py3_12-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -3125,7 +3125,7 @@ jobs:
wheel-py3_12-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -3363,7 +3363,7 @@ jobs:
wheel-py3_12-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -3599,7 +3599,7 @@ jobs:
wheel-py3_13-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -3834,7 +3834,7 @@ jobs:
wheel-py3_13-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -4072,7 +4072,7 @@ jobs:
wheel-py3_13-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -4310,7 +4310,7 @@ jobs:
wheel-py3_13-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -4548,7 +4548,7 @@ jobs:
wheel-py3_13-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -4784,7 +4784,7 @@ jobs:
wheel-py3_13t-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5019,7 +5019,7 @@ jobs:
wheel-py3_13t-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5257,7 +5257,7 @@ jobs:
wheel-py3_13t-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5495,7 +5495,7 @@ jobs:
wheel-py3_13t-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5733,7 +5733,7 @@ jobs:
wheel-py3_13t-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5969,7 +5969,7 @@ jobs:
wheel-py3_14-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -6204,7 +6204,7 @@ jobs:
wheel-py3_14-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -6442,7 +6442,7 @@ jobs:
wheel-py3_14-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -6680,7 +6680,7 @@ jobs:
wheel-py3_14-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -6918,7 +6918,7 @@ jobs:
wheel-py3_14-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -7154,7 +7154,7 @@ jobs:
wheel-py3_14t-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -7389,7 +7389,7 @@ jobs:
wheel-py3_14t-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -7627,7 +7627,7 @@ jobs:
wheel-py3_14t-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -7865,7 +7865,7 @@ jobs:
wheel-py3_14t-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -8103,7 +8103,7 @@ jobs:
wheel-py3_14t-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch

View File

@ -1,63 +0,0 @@
name: rocm-navi31
on:
push:
tags:
- ciflow/rocm-navi31/*
workflow_dispatch:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 */2 * * 1-5
- cron: 45 4,12 * * 0,6
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
target-determination:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/target_determination.yml
permissions:
id-token: write
contents: read
linux-jammy-rocm-py3_10-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3_10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: >-
${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
inductor/test_flex_attention inductor/test_max_autotune' || '' }}
secrets: inherit

View File

@ -59,3 +59,29 @@ jobs:
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-gfx1100-test:
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3_10-gfx1100
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
]}
tests-to-include: >
test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
inductor/test_flex_attention inductor/test_max_autotune
secrets: inherit

View File

@ -190,40 +190,6 @@ jobs:
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
secrets: inherit
linux-jammy-rocm-py3_10-build:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
secrets: inherit
inductor-build:
name: inductor-build
uses: ./.github/workflows/_linux-build.yml

View File

@ -1202,12 +1202,6 @@ exclude_patterns = [
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'torch/utils/model_dump/preact.mjs',
# These files are all grandfathered in, feel free to remove from this list
# as necessary
# NOTE: remove the patterns in the order they are listed
'aten/src/ATen/native/[a-pA-P]*/**',
'aten/src/ATen/[a-mA-M]*/**',
'test/**',
]
init_command = [
'python3',

View File

@ -289,14 +289,15 @@ IF(USE_FBGEMM_GENAI)
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
set(fbgemm_genai_mx8mx8bf16_grouped
set(fbgemm_genai_cuh
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
"${FBGEMM_GENAI_SRCS}/"
)
target_include_directories(fbgemm_genai PRIVATE
${FBGEMM_THIRD_PARTY}/cutlass/include
${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
${fbgemm_genai_mx8mx8bf16_grouped}
${fbgemm_genai_cuh}
${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
)
@ -313,14 +314,13 @@ IF(USE_FBGEMM_GENAI)
# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
if(DEFINED ROCM_VERSION_DEV AND ROCM_VERSION_DEV VERSION_LESS "7.2.0")
list(PREPEND FBGEMM_GENAI_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1)
endif()
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(

View File

@ -39,7 +39,7 @@ struct HostBlock {
};
template <typename B>
struct alignas(hardware_destructive_interference_size) FreeBlockList {
struct alignas(64) FreeBlockList {
std::mutex mutex_;
std::deque<B*> list_;
};
@ -94,11 +94,11 @@ struct PinnedReserveSegment {
struct TORCH_API HostStats {
// COUNT: total allocations (active)
Stat active_requests;
// SUM: bytes allocated/reserved by this memory alocator. (active)
// SUM: bytes allocated/reserved by this memory allocator. (active)
Stat active_bytes;
// COUNT: total allocations (active + free)
Stat allocations;
// SUM: bytes allocated/reserved by this memory alocator. This accounts
// SUM: bytes allocated/reserved by this memory allocator. This accounts
// for both free and in-use blocks.
Stat allocated_bytes;
@ -122,12 +122,12 @@ struct TORCH_API HostStats {
// Struct containing memory allocator summary statistics for host, as they
// are staged for reporting. This is a temporary struct that is used to
// avoid locking the allocator while collecting stats.
struct alignas(hardware_destructive_interference_size) HostStatsStaged {
struct alignas(64) HostStatsStaged {
std::mutex timing_mutex_;
// COUNT: total allocations (active + free)
// LOCK: access to this stat is protected by the allocator's blocks_mutex_
Stat allocations;
// SUM: bytes allocated/reserved by this memory alocator. This accounts
// SUM: bytes allocated/reserved by this memory allocator. This accounts
// for both free and in-use blocks.
Stat allocated_bytes;
// COUNT: number of allocations per bucket (active)
@ -455,7 +455,7 @@ struct CachingHostAllocatorImpl {
}
void resetAccumulatedStats() {
// Reseting accumulated memory stats requires concurrently holding both the
// Resetting accumulated memory stats requires concurrently holding both the
// free list mutexes and the blocks mutex. Previously, this was only done in
// empty_cache function.
for (size_t i = 0; i < free_list_.size(); ++i) {
@ -482,7 +482,7 @@ struct CachingHostAllocatorImpl {
}
void resetPeakStats() {
// Reseting peak memory stats requires concurrently holding both the
// Resetting peak memory stats requires concurrently holding both the
// free list mutexes and the blocks mutex. Previously, this was only done in
// empty_cache function.
for (size_t i = 0; i < free_list_.size(); ++i) {
@ -669,7 +669,7 @@ struct CachingHostAllocatorImpl {
TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event");
}
alignas(hardware_destructive_interference_size) std::mutex blocks_mutex_;
alignas(64) std::mutex blocks_mutex_;
ska::flat_hash_set<B*> blocks_; // block list
ska::flat_hash_map<void*, B*> ptr_to_block_;
@ -677,17 +677,17 @@ struct CachingHostAllocatorImpl {
// size. This allows us to quickly find a free block of the right size.
// We use deque to store per size free list and guard the list with its own
// mutex.
alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>> free_list_ =
alignas(64) std::vector<FreeBlockList<B>> free_list_ =
std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
alignas(64) std::mutex events_mutex_;
std::deque<std::pair<E, B*>> events_; // event queue paired with block
// Indicates whether the object is active.
// Set to false in the destructor to signal background threads to stop.
std::atomic<bool> active_{true};
protected:
alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
alignas(64) HostStatsStaged stats_;
};
struct TORCH_API HostAllocator : public at::Allocator {

View File

@ -3,7 +3,7 @@
namespace at {
// Re-declaring 'DimVector' type and size inside 'at' namespace.
// Redeclaring 'DimVector' type and size inside 'at' namespace.
// This is done to avoid modifying every use into their 'c10'
// equivalent.

View File

@ -16,7 +16,7 @@ _GeneratorRegister::_GeneratorRegister(const GeneratorFuncType& func) {
TORCH_WARN_DEPRECATION(
"REGISTER_GENERATOR_PRIVATEUSE1 is deprecated. \
Please derive PrivateUse1HooksInterface to implememt getNewGenerator instead.")
Please derive PrivateUse1HooksInterface to implement getNewGenerator instead.")
TORCH_CHECK(
!GetGeneratorPrivate().has_value(),

View File

@ -149,7 +149,7 @@
* First, keep in mind that we assume that boxed containers will
* have to deal with `IValue` (e.g. `c10::List`). In this context,
* what may be happening is that `IValue` doesn't store internally
* your type `T`. Instead, it constructs a type new `T` everytime
* your type `T`. Instead, it constructs a type new `T` every time
* you try to get `T` for it (see `IListRef<at::OptinalTensorRef>`).
*/
@ -186,7 +186,7 @@ class IListRef;
* This macro is useful because it allows us to handle different
* types (that correspond to different tags) to be implemented
* only once. We can do it even when the implementation of the
* different tags aren't syntatically the same, by dispatching
* different tags aren't syntactically the same, by dispatching
* it to a function (e.g. `ImplT::<dispatch-function>(this_)`).
*/
#define TORCH_ILISTREF_UNWRAP(TAG, BODY) \

View File

@ -42,7 +42,7 @@ class IListRefTagImplBase<IListRefTag::Unboxed, T, ListElemT> {
/*
* We have these function (besides the `unwrap`s above) because the
* implementation for both `IListRef::operator[]` and `IListRefIterator::operator*`
* weren't syntatically equal for the existing tags at the time
* weren't syntactically equal for the existing tags at the time
* (`Unboxed` and `Boxed`).
*/
static IListRefConstRef<T> front(const list_type& lst) {

View File

@ -12,7 +12,7 @@ namespace at {
// in order. This is most commonly used in autogenerated code,
// where it is convenient to have a function that can uniformly
// take arguments of different types. If your arguments
// are homogenous consider using a std::initializer_list instead.
// are homogeneous consider using a std::initializer_list instead.
//
// For examples of this in use, see torch/csrc/utils/variadic.h
template <typename F>

View File

@ -148,7 +148,7 @@ struct TORCH_API ClassType : public NamedType {
void checkNotExist(const std::string& name, const std::string& what) const;
// Attributes are stored in a specific slot at runtime for effiency.
// Attributes are stored in a specific slot at runtime for efficiency.
// When emitting instructions we specify the slot so that attribute access is
// a constant lookup
std::optional<size_t> findAttributeSlot(const std::string& name) const {
@ -412,7 +412,7 @@ struct TORCH_API ClassType : public NamedType {
// Holds method attributes
std::weak_ptr<CompilationUnit> compilation_unit_;
// Holds all atrributes, attribute details are found on ClassAttribute
// Holds all attributes, attribute details are found on ClassAttribute
std::vector<ClassAttribute> attributes_;
// Construct mirroring attributes_, only around due to the fact that `containedTypes()` method returns an ArrayRef.
// Never fill this without using the appropriate provideNewClassAttribute method

View File

@ -111,7 +111,7 @@ void Dispatcher::waitForDef(const FunctionSchema& schema) {
TORCH_INTERNAL_ASSERT(r,
"Expected main interpreter to define ", schema.operator_name(),
", but this didn't happen within timeout. Are you trying to load "
"different models in the same torchdeploy/multipy instance? You "
"different models in the same torchdeploy/multipy instance? You " // codespell:ignore
"must warmup each interpreter identically, e.g., import all "
"the same dependencies.");
}
@ -129,7 +129,7 @@ void Dispatcher::waitForImpl(const OperatorName& op_name, std::optional<c10::Dis
TORCH_INTERNAL_ASSERT(r,
"Expected main interpreter to implement ", dk, " for ", op_name,
", but this didn't happen within timeout. Are you trying to load "
"different models in the same torchdeploy/multipy instance? You "
"different models in the same torchdeploy/multipy instance? You " // codespell:ignore
"must warmup each interpreter identically, e.g., import all "
"the same dependencies.");
}
@ -531,7 +531,7 @@ int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchK
// Note: this records a sequence number for both Autograd keys, and for
// non-Autograd keys where the dispatchKeySet still contains an autograd key.
// This means that we might collect the same sequence nubmer two different
// This means that we might collect the same sequence number two different
// events if they all occurred above Autograd and still had the Autograd
// dispatch key in the dispatch key set.
// However, this usually doesn't happen: normally the first call will

View File

@ -222,7 +222,8 @@ class TORCH_API Dispatcher final {
return backendFallbackKernels_[dispatch_ix].kernel.isValid();
}
// Used by torchdeploy/multipy for multiple interpreters racing.
// Used by torchdeploy/multipy for multiple // codespell:ignore: multipy
// interpreters racing.
void waitForDef(const FunctionSchema& schema);
void waitForImpl(
const OperatorName& op_name,
@ -414,7 +415,7 @@ class TORCH_API Dispatcher final {
std::unique_ptr<detail::RegistrationListenerList> listeners_;
// This condition variable gets notified whenever we add a new def/impl to the
// dispatch table. This is primarily used by multipy/torchdeploy, when
// dispatch table. This is primarily used by multiply/torchdeploy, when
// we have multiple interpreters trying to register to the dispatch table.
// In this situation, whenever the non-primary interpreter would have tried
// to register to the dispatch table, instead it will check to see if the
@ -585,7 +586,7 @@ class TORCH_API OperatorHandle {
// We need to store this iterator in order to make
// Dispatcher::cleanup() fast -- it runs a lot on program
// termination (and presuambly library unloading).
// termination (and presumably library unloading).
std::list<Dispatcher::OperatorDef>::iterator operatorIterator_;
};

View File

@ -261,7 +261,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
//
// There are 2 cases
// 1. something like 'aten::items.str(Dict(str, t) self) -> ((str, t)[])'.
// without the extra parenthesis, the c++ schem parser can not parse it.
// without the extra parenthesis, the c++ scheme parser can not parse it.
// 2. something like '-> ((str, str))'. Need extra parenthesis so the return
// type is a single tuple rather than two strings.
// PR (https://github.com/pytorch/pytorch/pull/23204) has more context about

View File

@ -1176,7 +1176,7 @@ struct TORCH_API IValue final {
using HashIdentityIValueMap =
std::unordered_map<IValue, IValue, HashIdentityIValue, CompIdentityIValues>;
// Chechs if this and rhs has a subvalues in common.
// Checks if this and rhs has a subvalues in common.
// [t1,t2] and [t2, t3] returns true.
bool overlaps(const IValue& rhs) const;

View File

@ -990,7 +990,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
std::unique_lock<std::mutex> lock(mutex_);
if (completed_) {
// This should be rare and shouldn't cause log spew. Its important to
// log errors and thats why we have this log here.
// log errors and that's why we have this log here.
std::string msg = c10::str(
"Skipping setting following error on the Future since "
"it is already marked completed (this is not necessarily "
@ -1501,7 +1501,7 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target {
// However, the CompilationUnit holds ownership of the type's graphs, so
// inserting a constant object into a Graph would create a reference cycle if
// that constant object held a shared_ptr to its CU. For these objects we
// instatiate them with non-owning references to its CU
// instantiate them with non-owning references to its CU
Object(WeakOrStrongTypePtr type, size_t numSlots) : type_(std::move(type)) {
slots_.resize(numSlots);
}

View File

@ -374,7 +374,7 @@ struct TORCH_API SymbolicShape {
// Unranked shape constructor.
SymbolicShape() : dims_(std::nullopt) {}
// Known rank but unknown dimentions.
// Known rank but unknown dimensions.
SymbolicShape(std::optional<size_t> rank) : dims_(std::nullopt) {
if(!rank) {
return;
@ -891,10 +891,10 @@ struct TORCH_API ListType
// global singleton
// Given an inner type T and an identifier,
// this function wil return the global singleton type pointer
// this function will return the global singleton type pointer
// the type List<T>.
// The extra "identifier" argument is needed beccause we have multiple container types
// that all re-use this function (List<T>, array<T, N>, etc.)
// The extra "identifier" argument is needed because we have multiple container types
// that all reuse this function (List<T>, array<T, N>, etc.)
static TypePtr get(const std::string& identifier, TypePtr inner);
// common cast List[Tensor]
@ -992,7 +992,7 @@ struct TORCH_API DictType : public SharedType {
// this function will return the global singleton type pointer
// the type List<T>.
// The extra "identifier" argument is needed because we have multiple container types
// that all re-use this function (Dict<K, V> and unordered_map<K, V>)
// that all reuse this function (Dict<K, V> and unordered_map<K, V>)
static TypePtr get(const std::string& identifier, TypePtr key, TypePtr val);
private:

View File

@ -21,7 +21,7 @@ namespace c10 {
namespace detail {
// The first argument of the schema might be of type DispatchKeySet, in which case we remove it.
// We do this because every argument in a function schema is expected to be convertable
// We do this because every argument in a function schema is expected to be convertible
// to an ivalue, but DispatchKeySet is not a type we want the jit to be aware of.
// See Note [Plumbing Keys Through The Dispatcher]
template<class KernelFunctor>

View File

@ -172,7 +172,7 @@ VaryingShape<Stride> TensorType::computeStrideProps(
// The logic below follows what TensorIterator uses in its logic:
// 1. Fast_set_up is the short-cut to identify a. channels_last and
// b. contiguous format, which is what we have in the below logic.
// 2. In more generla cases, it does best effort to preserve permutatoin.
// 2. In more general cases, it does best effort to preserve permutatoin.
if (is_channels_last_strides_2d(sizes, strides) || is_channels_last_strides_3d(sizes, strides)) {
// case 1.a. short cut channels last
std::iota(stride_indices.rbegin() + 1, stride_indices.rend() - 1, 2);

View File

@ -679,7 +679,7 @@ TORCH_API bool elementTypeCanBeInferredFromMembers(const TypePtr& elem_type) {
return false;
}
if (elem_type->kind() == AnyType::Kind) {
// List of Any can contains heterogenous types
// List of Any can contains heterogeneous types
return false;
}
return true;

View File

@ -234,7 +234,7 @@ class Vectorized<c10::Half> : public Vectorized16<
vshlq_u16(vandq_u16(is_zero_vec, vdupq_n_u16(1)), shift);
return vaddvq_u16(bits_vec);
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// use known working implmentation.
// use known working implementation.
__at_align__ value_type tmp[size()];
store(tmp);
int mask = 0;

View File

@ -1740,7 +1740,7 @@ Vectorized<int16_t> inline shift_256_16(
// Control masks for shuffle operation, treating 256 bits as an
// array of 16-bit elements, and considering pairs of neighboring
// elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
// M!=N) is set so that shuffle will move element with index M from
// input pair into element with index N in output pair, and element
// with index M in output pair will be set to all 0s.
@ -1875,7 +1875,7 @@ Vectorized<T> inline shift_256_8(
// Control masks for shuffle operation, treating 256 bits as an
// array of 8-bit elements, and considering quadruples of
// neighboring elements. Specifially, a mask named "ctl_M_N" (M,N
// neighboring elements. Specifically, a mask named "ctl_M_N" (M,N
// in [0,1,2,3], and M!=N) is set so that shuffle will move element
// with index M from input quadruple into element with index N in
// output quadruple, and other elements in output quadruple will be

View File

@ -143,7 +143,7 @@ class Vectorized<double> {
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
return {
vec_sel(a._vec0, b._vec0, mask._vecb0),

View File

@ -142,7 +142,7 @@ class Vectorized<float> {
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
// assuming this we can use the same mask directly with vec_sel
return {
vec_sel(a._vec0, b._vec0, mask._vecb0),

View File

@ -202,7 +202,7 @@ class Vectorized<int16_t> {
const Vectorized<int16_t>& a,
const Vectorized<int16_t>& b,
const Vectorized<int16_t>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
// assuming this we can use the same mask directly with vec_sel
// warning intel style mask will not work properly
return {

View File

@ -155,7 +155,7 @@ class Vectorized<int32_t> {
const Vectorized<int32_t>& a,
const Vectorized<int32_t>& b,
const Vectorized<int32_t>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
// assuming this we can use the same mask directly with vec_sel
// warning intel style mask will not work properly
return {

View File

@ -119,7 +119,7 @@ class Vectorized<int64_t> {
const Vectorized<int64_t>& a,
const Vectorized<int64_t>& b,
const Vectorized<int64_t>& mask) {
// the mask used here returned by comparision of vec256
// the mask used here returned by comparison of vec256
return {
vec_sel(a._vec0, b._vec0, mask._vecb0),

View File

@ -397,7 +397,7 @@ inline Vectorized<bool> operator&&(
const __m512i* other_ = reinterpret_cast<const __m512i*>(other.as_bytes());
__m512i out = _mm512_and_si512(*self_, *other_);
Vectorized<bool> ret;
// We do not have a constructer that takes __m512i, so we need to memcpy
// We do not have a constructor that takes __m512i, so we need to memcpy
std::memcpy(ret, &out, ret.size() * sizeof(bool));
return ret;
}

View File

@ -498,8 +498,8 @@ static inline Vectorized<T> binary_fp8_op_as_fp32(
// Refer to
// https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353 FP8 +,
// -, *, /, planed to be deleted in the future and here is just to make compiler
// happy
// -, *, /, planned to be deleted in the future and here is just to make
// compiler happy
Vectorized<Float8_e4m3fn> inline operator+(
const Vectorized<Float8_e4m3fn>& a,
const Vectorized<Float8_e4m3fn>& b) {
@ -585,8 +585,8 @@ class Vectorized<Float8_e5m2> : public Vectorizedf8<Float8_e5m2> {
// Refer to
// https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353 FP8 +,
// -, *, /, planed to be deleted in the future and here is just to make compiler
// happy
// -, *, /, planned to be deleted in the future and here is just to make
// compiler happy
Vectorized<Float8_e5m2> inline operator+(
const Vectorized<Float8_e5m2>& a,
const Vectorized<Float8_e5m2>& b) {

View File

@ -1852,7 +1852,7 @@ Vectorized<T> inline shift_512_8(
// Control masks for shuffle operation, treating 512 bits as an
// array of 8-bit elements, and considering pairs of neighboring
// elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
// M!=N) is set so that shuffle will move element with index M from
// input pair into element with index N in output pair, and element
// with index M in output pair will be set to all 0s.

View File

@ -1958,7 +1958,7 @@ void scaled_gemm(
ScalarType result_dtype,
bool use_fast_accum,
const std::optional<Tensor>& alpha) {
// Note: see `cublasCommonArgs` for various non-intuitive manupulations
// Note: see `cublasCommonArgs` for various non-intuitive manipulations
// of input arguments to this function.
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;

View File

@ -311,7 +311,7 @@ CUDAGraph::~CUDAGraph() {
// There are recent HIP changes where hipGraphExecDestroy doesn't immediately free memory.
// They wait for next sync point in order to free the memory, this is to ensure that all
// hipGraphLaunch are finished before we release any memory. This feature was enabled in rocm6.2.
// We need to ensure all async opreations finish before deleting the object.
// We need to ensure all async operations finish before deleting the object.
#if (defined(USE_ROCM) && ROCM_VERSION >= 60200)
if (capture_dev_ != UNDEFINED_DEVICE) // check if capture_dev_ contains the real device id
{

View File

@ -179,7 +179,7 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6
batch_offset * values_batch_stride * values.itemsize(),
index_type, // data type of row offsets index
index_type, // data type of col indices
CUSPARSE_INDEX_BASE_ZERO, // base index of row offset and col indes
CUSPARSE_INDEX_BASE_ZERO, // base index of row offset and col index
value_type // data type of values
));

View File

@ -137,7 +137,7 @@ struct CUDACachingHostAllocatorImpl
void free_block_slowpath(Block* block) {
auto start = std::chrono::steady_clock::now();
// Users may change the allocator config at will. torch unit tests do this.
// However, allocations using cudaHostRegister should use corresonding
// However, allocations using cudaHostRegister should use corresponding
// cudaHostUnregister and similarly for cudaHostAlloc / cudaFreeHost.
void* ptr = block->ptr_;
bool use_register = false;
@ -183,6 +183,11 @@ struct CUDACachingHostAllocatorImpl
return true;
}
bool pinned_use_background_threads() override {
return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
pinned_use_background_threads();
}
EventPool::Event create_event_internal(DeviceIndex idx) {
// Leak the event pool to avoid shutdown issue.
static auto* event_pool = new EventPool();

View File

@ -10,7 +10,7 @@ namespace at::cuda {
//
// A caching allocator for CUDA host allocations (pinned memory).
//
// This provides a drop-in replacement for THCudaHostAllocator, which re-uses
// This provides a drop-in replacement for THCudaHostAllocator, which reuses
// freed pinned (page-locked) memory allocations. This avoids device
// synchronizations due to cudaFreeHost calls.
//
@ -26,7 +26,7 @@ inline TORCH_CUDA_CPP_API at::HostAllocator* getCachingHostAllocator() {
}
// Records an event in the specified stream. The allocation corresponding to the
// input `ptr`/`ctx` will not be re-used until the event has occurred.
// input `ptr`/`ctx` will not be reused until the event has occurred.
C10_DEPRECATED_MESSAGE(
"at::cuda::CachingHostAllocator_recordEvent(...) is deprecated. Please use at::getHostAllocator(at::kCUDA)->record_event(...) instead.")
inline TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent(

View File

@ -4,7 +4,7 @@
#include <ATen/cuda/CUDAConfig.h>
// NOTE: These templates are intentionally not defined in this header,
// which aviods re-compiling them for each translation unit. If you get
// which avoids re-compiling them for each translation unit. If you get
// a link error, you need to add an explicit instantiation for your
// types in cub.cu

View File

@ -93,7 +93,7 @@ struct IndexToOffset {
}
};
// Uses dynamic (runtime) instead of static (compiletime) dims
// Uses dynamic (runtime) instead of static (compile time) dims
template <typename T, typename IndexType>
struct IndexToOffset<T, IndexType, -1> {
static inline __host__ __device__ IndexType get(

View File

@ -32,7 +32,7 @@ static inline void launch_jitted_vectorized_kernel_dynamic(
// Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements)
// fn_ptr is set to the appropriate function based on the vec size and GPU used
// TODO: Memory use can probably be optimized by re-using kernels across GPUs with
// TODO: Memory use can probably be optimized by reusing kernels across GPUs with
// the same compute capability
std::string f_inputs_type_str = at::cuda::jit::typeName(common_dtype);

View File

@ -38,7 +38,7 @@ GemmTunableOp_float_NT,nt_25088_4096_64,1219,1.262
GemmTunableOp_float_NT,nt_4096_4096_64,1216,0.033
```
Note the "Validator" lines. If you change a library verison, or ROCm version, or PyTorch version, TunableOp will detect
Note the "Validator" lines. If you change a library version, or ROCm version, or PyTorch version, TunableOp will detect
this and reject the tunings file because the prior tunings are likely affected by other software changes.
The remaining lines are the tuned solutions for each TunableOp encountered during your execution. Each line consists of

View File

@ -235,7 +235,7 @@ class TunableOp {
// numeric check option is controlled by non-static env var, so check it once per tuned operator
bool do_numerics_check = ctx->IsNumericsCheckEnabled();
// calcaulte a reference answer for numerical check
// calculate a reference answer for numerical check
if (do_numerics_check) {
reference_params = params->DeepCopy(false);
TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK);

View File

@ -12,7 +12,7 @@ namespace at {
// AcceleratorHooksInterface is a shared interface provided by all
// accelerators to allow generic code.
// This inferface is hook-based as it corresponds to all the functions
// This interface is hook-based as it corresponds to all the functions
// that are going to be called in a generic way from the CPU code.
struct TORCH_API AcceleratorHooksInterface {

View File

@ -38,7 +38,7 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
Generator getNewGenerator(
[[maybe_unused]] DeviceIndex device_index = -1) const override {
// TODO(FFFrog): Perserved for BC and will be removed in the future.
// TODO(FFFrog): Preserved for BC and will be removed in the future.
if (at::GetGeneratorPrivate().has_value())
return at::GetGeneratorForPrivateuse1(device_index);

View File

@ -283,7 +283,7 @@ inline void boxed_existing_bdim_all_batch_rule(
// Use when all tensors arguments accept one (normal) batch dim.
// This batching rule expands the batch dim on all Tensors, reshapes it into
// dim 0, calls the op, and then reshapes the batch dim out of dim 0.
// This is not the most efficient thing; if there are alternatives, plese try
// This is not the most efficient thing; if there are alternatives, please try
// to use them. Use this only as a last resort.
#define EXISTING_BDIM_ALL_BOXED(op) \
m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_existing_bdim_all_batch_rule>());

View File

@ -384,7 +384,7 @@ fourOutputs solve_ex_batch_rule(
// NOTE [ solve_ex Batch Rule Contiguity ]
// A determines whether or not linalg_solve takes an optimized path. We need the check on A_ to match the one run on
// A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behvaior
// A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behavior
// differs based on whether or not the optimized path was taken
const auto batched_A_was_contiguous = A_bdim.has_value() ? at::select(A, *A_bdim, 0).is_contiguous() : A.is_contiguous();
if (batched_A_was_contiguous && !A.is_complex()) {

View File

@ -282,7 +282,7 @@ static std::tuple<Tensor, std::optional<int64_t>> _softmax_backward_batch_rule(
dim = getPhysicalDim(output_, /*has_batch_dim*/true, dim);
// Not sure why output_ needs to be marked as .contiguous(). Someting must
// Not sure why output_ needs to be marked as .contiguous(). Something must
// have changed in PyTorch (and output of softmax is probably always contiguous)
return std::make_tuple(at::_softmax_backward_data(grad_output_, output_.contiguous(), dim, input_dtype), 0);
}

View File

@ -224,7 +224,7 @@ static Tensor safeStack(TensorList tensors) {
// is possible for the backward function to return an undefined grad for some
// grad_input for each example. In that case, we return an undefined grad.
//
// It is theoretically posssible for *some* of the examples to produce an
// It is theoretically possible for *some* of the examples to produce an
// undefined grad (a kernel could peek at the gradient values and return an
// undefined tensor if it determines the gradient is full of zeros). We
// could handle this by treating the undefined grad as a zero-filled tensor

View File

@ -113,7 +113,7 @@ SymIntArrayRef BatchedTensorImpl::sym_sizes_custom() const {
return sym_sizes_default();
}
// The following are publically exposed as methods of Tensor
// The following are publicly exposed as methods of Tensor
IntArrayRef BatchedTensorImpl::strides_custom() const {
return strides_default();

View File

@ -37,7 +37,7 @@ namespace at::functorch {
// how to perform the transform.
//
// TODO: we can excise DynamicLayer in favor of Interpreter,
// But I am going to leave it for now as a compatiblity shim to avoid
// But I am going to leave it for now as a compatibility shim to avoid
// needing to refactor a lot of callsites...
struct TORCH_API DynamicLayer {
explicit DynamicLayer(

View File

@ -88,7 +88,7 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t);
// >>> VmapInterpreterPtr(&interpreter).batchSize()
//
// Finally, Interpreter::process switches on the type of the interpreter
// and calls one of {Transform}Intepreter::processImpl under the hood.
// and calls one of {Transform}Interpreter::processImpl under the hood.
// Same for Interpreter::sendToNextInterpreter :)
struct VmapInterpreterMeta {

View File

@ -143,7 +143,7 @@ struct TORCH_API VmapPhysicalView {
// mapping a physical tensor to a new logical tensor (BatchedTensor)
VmapPhysicalToLogicalMap getPhysicalToLogicalMap() const;
// Maps a logical shape to a physical shape by pre-pending the batch
// Maps a logical shape to a physical shape by prepending the batch
// sizes to the logical shape.
VmapDimVector getPhysicalShape(IntArrayRef logical_shape) const;
SymDimVector getPhysicalShape(c10::SymIntArrayRef logical_shape) const;

View File

@ -27,7 +27,7 @@ namespace at::functorch {
//
// There are alternative designs we could have chosen (e.g. each grad transform
// stores a weak map of Tensor -> AutogradMeta); the benefit of the TensorWrapper
// design is that we can re-use existing VariableType kernels (i.e. Autograd kernels)
// design is that we can reuse existing VariableType kernels (i.e. Autograd kernels)
// without much modification. Since a TensorWrapper looks like a regular Tensor,
// the VariableType kernel can pull out the AutogradMeta struct from where it
// expects and extend the autograd graph

View File

@ -158,7 +158,7 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t
endKernelCoalescing();
id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
// For some reason fillBufferfor stopped working for lengh > 4Gb on MacOS 26
// For some reason fillBufferfor stopped working for length > 4Gb on MacOS 26
// See https://github.com/pytorch/pytorch/issues/163962
// Workaround by batching copy commands into 4Gb chunks
constexpr size_t max_copy_size = 0x100000000; // 4GB

View File

@ -3620,7 +3620,7 @@ Tensor& _int_mm_out_cpu(const Tensor& self, const Tensor& mat2, Tensor& result)
try {
mkldnn_matmul_i8i8i32(self, mat2, result);
dispatched = true;
} catch ([[maybe_unused]] const std::exception& e) {
} catch (const std::exception& e) {
TORCH_WARN(func_name, " failed, switching to BLAS gemm: ", e.what());
}
}

View File

@ -128,7 +128,7 @@ at::Tensor PackedLinearWeight::apply_impl(
auto* input_tr_ptr =
reinterpret_cast<uint8_t*>(input_tr.data_ptr<c10::quint8>());
// TODO: Activation transpose before and after the kernel can be removed if we
// keep activation tensor always tranposed.
// keep activation tensor always transposed.
fbgemm::transpose_simd<uint8_t>(
batch_size, K, input_ptr, K, input_tr_ptr, batch_size);

View File

@ -34,7 +34,7 @@ struct Dist {
// finish : This tells what to do with the aggregated value to compute
// the norm. Generally this is the result of val ^ (1 / p).
// backward : This is the gradient for that norm. Arguments are pretty
// self explanitory.
// self explanatory.
//
// There are a few cases where these aren't used. The 0 norm has no backward,
// because it's always 0, so that's shortcircuited earlier. There's a special

View File

@ -74,7 +74,7 @@ it to sum up the entire array into a single value.
`ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which
compiler flags it is currently compiled. This allows the programmer to write
generic code, which will be compiled under multipled compilation settings.
generic code, which will be compiled under multiplied compilation settings.
`../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains
a generic definition of `sumImplAll`. This function allows the user to reduce

View File

@ -1017,7 +1017,7 @@ struct HelperInterpBase {
while (aligned_interp_size % sizeof(int32_t) != 0) {
aligned_interp_size += 1;
}
// assert that we wont go out of bounds
// assert that we won't go out of bounds
TORCH_INTERNAL_ASSERT(aligned_interp_size * sizeof(int16_t) < interp_size * sizeof(double));
}

View File

@ -655,7 +655,7 @@ void ImagingResampleHorizontalConvolution8u4x(
// last element
auto mmk = _mm256_set1_epi32(k[i]);
// For num_channels == 3 (3 bytes = one pixel) we tolerate to read 4 bytes
// lines 0, 1 and 2 wont go out of allocated memory bounds
// lines 0, 1 and 2 won't go out of allocated memory bounds
auto pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)),
mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1);
@ -889,7 +889,7 @@ void ImagingResampleHorizontalConvolution8u(
_mm_loadu_si128((__m128i *) (lineIn_min + stride * i))),
_mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1);
// Extract lower part of each lane, cast to epi16 and reoder RGBARGBA -> RRGGBBAA
// Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA
// RGBA: pix1 = [
// r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0
// r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0
@ -1312,7 +1312,7 @@ void ImagingResampleVerticalConvolution8u(
// Here we write 4 bytes to the output even if num_channels < 4, e.g o = {r,g,b,X} for num_channels=3
// It is OK to write 4th byte (e.g. X) as on the next step we will overwrite it with new data.
// We also wont go out of bounds of lineOut memory allocation
// We also won't go out of bounds of lineOut memory allocation
std::memcpy(lineOut + j, (uint8_t *) &o, 4);
}

View File

@ -240,7 +240,7 @@ _PS256_CONST(coscof_p2, 4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 8 sines at onces using AVX intrinsics
/* evaluation of 8 sines at once using AVX intrinsics
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to

View File

@ -311,7 +311,7 @@ void GroupNormKernelImplChannelsLastInternal(
const bool gamma_null = (gamma_data == nullptr);
const bool beta_null = beta_data == nullptr;
// NB: About algorithm choosen:
// NB: About algorithm chosen:
//
// On channels last, GroupNorm has a input shape of {N, H, W, GD},
// Mean and rstd are collected per each n and g, which involves reduction

View File

@ -930,7 +930,7 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
}
};
// Dynamically Quantize the float32 input to 8 bit assymetric
// Dynamically Quantize the float32 input to 8 bit asymmetric
input_quant_pack_8bit_channelwise(m, k, lhs_f32, (int8_t*)lhs_qa8dx);
const size_t lhs_stride =
@ -1163,7 +1163,7 @@ void dyn_quant_matmul_4bit_kernel(
const int64_t weight_packed_size =
kleidiai::kai_pack_rhs_int4_size(N, K, block_size);
if (weight_packed_size == packed_weights.numel()) {
// KleidiAI interface intenally handles the Channelwise and groupwise
// KleidiAI interface internally handles the Channelwise and groupwise
// distinction
kleidiai::kai_quant_pack_lhs_int4_mm(
output, inp, packed_weights, M, N, K, block_size);

View File

@ -705,7 +705,7 @@ namespace {
);
} while (!done && max_threads);
if (!done) {
TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accomodate sharedMemPerBlock limit");
TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accommodate sharedMemPerBlock limit");
}
break;
}

View File

@ -154,19 +154,19 @@ struct cublasCommonArgs {
const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
bool transpose_result = false, transpose_a = false, transpose_b = false;
result = prepare_matrix_for_cublas(c, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result); // codespell:ignore
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result); // codespell:ignore
// Handle scale tensors if provided
if (scale_a && scale_b) {
// By default since we return in row-major we run the gemm
// as B.T @ A.T, check transpose_result to determine if we flip the scales
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr(); // codespell:ignore
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type(); // codespell:ignore
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a; // codespell:ignore
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr(); // codespell:ignore
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type(); // codespell:ignore
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b; // codespell:ignore
}
if (scale_result) {
@ -180,17 +180,17 @@ struct cublasCommonArgs {
transpose_b = !transpose_b;
}
auto sizes_a = mata->sizes();
auto sizes_b = matb->sizes();
auto sizes_a = mata->sizes(); // codespell:ignore
auto sizes_b = matb->sizes(); // codespell:ignore
m = sizes_a[transpose_result ? 1 : 0];
k = sizes_a[transpose_result ? 0 : 1];
n = sizes_b[transpose_result ? 0 : 1];
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0); // codespell:ignore
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0); // codespell:ignore
result_ld = result->stride(transpose_result ? 0 : 1);
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n'; // codespell:ignore
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n'; // codespell:ignore
// cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
// if the gemm operands are in packed float4
@ -205,16 +205,16 @@ struct cublasCommonArgs {
char transa, transb;
int64_t m, n, k;
int64_t lda, ldb, result_ld;
c10::MaybeOwned<Tensor> mata, matb, result;
c10::MaybeOwned<Tensor> mata, matb, result; // codespell:ignore
// Scale members
void* scale_mata_ptr = nullptr;
void* scale_matb_ptr = nullptr;
void* scale_mata_ptr = nullptr; // codespell:ignore
void* scale_matb_ptr = nullptr; // codespell:ignore
void* scale_result_ptr = nullptr;
std::optional<c10::ScalarType> scale_mata_dtype;
std::optional<ScalingType> scaling_mata_type;
std::optional<c10::ScalarType> scale_matb_dtype;
std::optional<ScalingType> scaling_matb_type;
std::optional<c10::ScalarType> scale_mata_dtype; // codespell:ignore
std::optional<ScalingType> scaling_mata_type; // codespell:ignore
std::optional<c10::ScalarType> scale_matb_dtype; // codespell:ignore
std::optional<ScalingType> scaling_matb_type; // codespell:ignore
std::optional<c10::ScalarType> scale_result_dtype;
};
} // namespace
@ -362,7 +362,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
static bool disable_addmm_cuda_lt = getDisableAddmmCudaLt();
#endif
// if lt path fails, we recurse back into this function here and force the lt path to off
// we cannot update varible disable_addmm_cuda_lt from above since it is static and would be permanent
// we cannot update variable disable_addmm_cuda_lt from above since it is static and would be permanent
bool disable_addmm_cuda_lt_final = disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
#if defined(USE_ROCM) && ROCM_VERSION == 60400
// hipblaslt TT fp32 regression on ROCm 6.4, cannot use
@ -2886,7 +2886,7 @@ _scaled_grouped_mm_cuda_v2(
"Contraction dimensions (", dim_a, ",", dim_b, ") of mat_a and mat_b must match, got: ", mat_a.size(dim_a), " and ",
mat_b.size(dim_b));
// Note: only (-1, -2) is currently supported
TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Curently contraction dims must be (-1, -2) only");
TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Currently contraction dims must be (-1, -2) only");
} else {
TORCH_CHECK_VALUE(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}

View File

@ -298,7 +298,7 @@ static void jitted_gpu_kernel_impl(
at::opmath_type<f_inputs_type> scalar_val,
const std::tuple<ExtraArgs...>& extra_args) {
// TODO: Memory use can probably be optimized by re-using kernels across GPUs with
// TODO: Memory use can probably be optimized by reusing kernels across GPUs with
// the same compute capability
static std::mutex jiterator_mutex;
static std::vector<JittedKernelVariantCache> device_caches(c10::cuda::device_count());

View File

@ -494,7 +494,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen)
auto value = static_cast<scalar_t>(rand * range + from);
// reverse the bounds of curand4 from (0, 1] to [0, 1)
// Note that this method is from legacy THCTensorRandom and is likely to give
// you more 0-s, since, the probability of gettings 1-s is higher than 0-s and
// you more 0-s, since, the probability of getting 1-s is higher than 0-s and
// by reversing the bounds, we are flipping the probabilities of 1-s and 0-s.
// BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706
auto reverse_bound_value = value == to ? from : value;

View File

@ -75,7 +75,7 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo<const scalar_t, IndexType>
// We'll use this to actually cause vectorized loads later
LoadT *value = reinterpret_cast<LoadT*>(&src);
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for Halfs, so generate float for everything
// Note: need a new set of random values per 4 elements -- we'll handle VEC elements in this thread, so need ceil(VEC / 4)
// sets of rand.
if ((VEC >= 4) || (gridxvec_loop_state == 0)) {
@ -159,7 +159,7 @@ fused_dropout_kernel(cuda::detail::TensorInfo<const scalar_t, IndexType> a,
for (IndexType linearIndex = idx;
linearIndex < rounded_size;
linearIndex += gridDim.x * blockDim.x*UNROLL) {
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for Halfs, so generate float for everything
float4 rand = curand_uniform4(&state);
scalar_t src[UNROLL];
rand.x = rand.x < p;

View File

@ -24,7 +24,7 @@ namespace at::native {
namespace {
/* This code computes the sum of the weights in two-steps:
1) Each GPU warp sums `NROWS_PER_THREAD` number of row given by `indeces`
1) Each GPU warp sums `NROWS_PER_THREAD` number of row given by `indices`
2) Each partial-sum from 1) are summed and scatter into `grad_weight`
Notice, `NROWS_PER_THREAD` impacts the Achieved Occupancy of the

View File

@ -204,7 +204,7 @@ Scalar scalar_reciprocal(const Scalar& scalar) {
return Scalar(1. / scalar.toComplexDouble());
}
TORCH_INTERNAL_ASSERT(
false, "divison with ", scalar.type(), " not supported");
false, "division with ", scalar.type(), " not supported");
}
void foreach_tensor_div_scalar_kernel_cuda_(

View File

@ -57,7 +57,7 @@ namespace {
const index_t n = index / (out_H * out_W);
const index_t grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
// get the corresponding input x, y co-ordinates from grid
// get the corresponding input x, y coordinates from grid
opmath_t x = grid.data[grid_offset];
opmath_t y = grid.data[grid_offset + grid_sCoor];
@ -193,7 +193,7 @@ namespace {
const index_t n = index / (out_D * out_H * out_W);
const index_t grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
// get the corresponding input x, y, z co-ordinates from grid
// get the corresponding input x, y, z coordinates from grid
opmath_t x = grid.data[grid_offset];
opmath_t y = grid.data[grid_offset + grid_sCoor];
opmath_t z = grid.data[grid_offset + 2 * grid_sCoor];
@ -358,7 +358,7 @@ namespace {
const index_t n = index / (out_H * out_W);
const auto grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
// get the corresponding input x, y co-ordinates from grid
// get the corresponding input x, y coordinates from grid
scalar_t x = grid.data[grid_offset];
scalar_t y = grid.data[grid_offset + grid_sCoor];
@ -572,7 +572,7 @@ namespace {
const index_t n = index / (out_D * out_H * out_W);
const auto grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
// get the corresponding input x, y, z co-ordinates from grid
// get the corresponding input x, y, z coordinates from grid
scalar_t ix = grid.data[grid_offset];
scalar_t iy = grid.data[grid_offset + grid_sCoor];
scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor];

View File

@ -8,7 +8,7 @@
#include <c10/util/irange.h>
// Three warninngs in Cutlass included header files
// Three warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")

View File

@ -377,7 +377,7 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) {
* result at the boundary
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
* Large Parameter (see DLMF 8.12.4 [igam1])
* - if x > 1.1 and x < a, using the substraction from the regularized lower
* - if x > 1.1 and x < a, using the subtraction from the regularized lower
* incomplete gamma
* - otherwise, calculate the series from [igam2] eq (5)
*/
@ -460,7 +460,7 @@ __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) {
* result at the boundary
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
* Large Parameter (see DLMF 8.12.3 [igam1])
* - if x > 1 and x > a, using the substraction from the regularized upper
* - if x > 1 and x > a, using the subtraction from the regularized upper
* incomplete gamma
* - otherwise, calculate the series from [igam2] eq (4)
*/

View File

@ -332,7 +332,7 @@ void cuda_take_put_kernel(
const auto offset_calc = make_offset_calculator<2>(iter);
using uindex_t = std::make_unsigned_t<index_t>;
// OffsetCalculator needs the sizes and strides reveresed
// OffsetCalculator needs the sizes and strides reversed
const auto indexed_sizes = std::vector<int64_t>(indexed.sizes().rbegin(), indexed.sizes().rend());
const auto indexed_strides = std::vector<int64_t>(indexed.strides().rbegin(), indexed.strides().rend());
const auto* indexed_strides_data = indexed_strides.data();

View File

@ -1611,7 +1611,7 @@ void index_select_out_cuda_impl(
// SmallIndexKernel is more performant when the number of indices is small, and pre-loading
// the index reduces memory accesses. When the number of indices is large, we avoid that
// and increase parallellism by calling gather_out which is a generalization of index_select
// and increase parallelism by calling gather_out which is a generalization of index_select
if (cuda::detail::canUse32BitIndexMath(out) &&
cuda::detail::canUse32BitIndexMath(self) &&
cuda::detail::canUse32BitIndexMath(index) &&

View File

@ -273,7 +273,7 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd(
scalar_t* dst = self_ptr + index;
//pack coalseced bf16 and fp16
//pack coalesced bf16 and fp16
if constexpr (std::is_same<scalar_t, c10::BFloat16>::value || std::is_same<scalar_t, c10::Half>::value)
{
typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
@ -316,7 +316,7 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd(
}
}
// not coalsced, so now let try to capture lane-matches...
// not coalesced, so now let try to capture lane-matches...
if (numel > 16 /*<-hueristic threshold*/ * 64 ) {
// well shucks, unlikely to capture same-dest atomics in a wave.

View File

@ -343,7 +343,7 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
if (input_length == 0)
return;
// "first" row, the beta initialization before eq (10) (t=target_length - differes per batch)
// "first" row, the beta initialization before eq (10) (t=target_length - differs per batch)
for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) {
int64_t s = threadIdx.x + block_s;
scalar_t lb;

View File

@ -816,7 +816,7 @@ const auto erfcx_string = jiterator_stringify(
with the usual checks for overflow etcetera.
Performance-wise, it seems to be substantially faster than either
the SLATEC DERFC function [or an erfcx function derived therefrom]
the SLATEC DERFC function [or an erfcx function derived there from]
or Cody's CALERF function (from netlib.org/specfun), while
retaining near machine precision in accuracy.
*/

View File

@ -370,7 +370,7 @@ struct vectorized {
#ifdef USE_ROCM
// This is similar to vectorized policy above, but this one supports
// heterogenous input tensor types as templated parameters.
// heterogeneous input tensor types as templated parameters.
// Its use should be limited to frequently used heterogeneous data types
// as each instantiation will generate a separate kernel, leading to code
// bloating if applied to all combinations supported in PyTorch. Assumption: all

View File

@ -309,7 +309,7 @@ __global__ void sampleMultinomialOnce(
} else {
// This should address a rare bug where we don't select a valid index. This likely occurs when
// due to floating point arithmetic rounding errors, our cumulative sum does not add up to 1, but
// and our uniform sample is greater than this value. In this case we likely have unitialized memory
// and our uniform sample is greater than this value. In this case we likely have uninitialized memory
// in dest[curDist]. So basically we will loop through the distribution and pick the largest index
// where the distribution is non-zero. This is obviously terribly inefficient, but due to the
// rarity in which this occurs, this should not be an issue.

View File

@ -146,7 +146,6 @@ __global__ void nll_loss2d_backward_no_reduce_kernel(
int64_t batch_size = target.size(0);
int64_t H = target.size(1);
int64_t W = target.size(2);
int64_t n_classes = grad_input.size(1);
CUDA_KERNEL_LOOP(index, n_threads) {
const int64_t b = index % batch_size;
@ -157,7 +156,6 @@ __global__ void nll_loss2d_backward_no_reduce_kernel(
if (cur_target == ignore_index) {
continue;
}
CUDA_KERNEL_ASSERT(cur_target >= 0 && cur_target < n_classes);
scalar_t value = -(weight != nullptr ? weight[cur_target] : static_cast<scalar_t>(1));
grad_input[b][cur_target][h][w] = value * grad_output[b][h][w];
}

View File

@ -1623,7 +1623,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto stride = input.sizes()[1];
const auto reduction_size = input.numel() / stride;
// Input is guarunteed to be channels-last compatible
// Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);
dim3 block;
@ -1691,7 +1691,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto reduction_size = input.numel() / stride;
auto norm_fct = 1.0 / reduction_size;
// Input is guarunteed to be channels-last compatible
// Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);
dim3 block;

View File

@ -37,7 +37,7 @@ namespace at::native {
// threshold probability for having non-duplicate keys, then it can be proved that[1]
// the number of bits required is: ceil(log2(n - (6 n^2 + 1) / (12 log(q))))
//
// Then after sort, we lauch a separate kernel that additionally shuffles any islands
// Then after sort, we launch a separate kernel that additionally shuffles any islands
// of values whose keys matched. The algorithm of this kernel is as follows:
// Each thread reads its key and the keys of its neighbors to tell if it's part of an island.
// For each island, the first thread in the island sees a key match at index i+1 but not index i-1.

View File

@ -413,12 +413,14 @@ struct ReduceOp {
value = thread_reduce<output_vec_size>(input_slice);
}
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}
if (config.should_block_y_reduce()) {
value = block_y_reduce<output_vec_size>(value, shared_memory);
}
__syncthreads();
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}
using out_ptr_vec_t = std::array<out_scalar_t*, output_vec_size>;
using offset_vec_t = std::array<index_t, output_vec_size>;
offset_vec_t base_offsets;
@ -655,8 +657,8 @@ struct ReduceOp {
__syncthreads();
// Intra-warp reduction, fix CUDA to have offset decreasing for better numerics
// matching Triton, etc.
// TODO(PaulZhang12): AMD and internal
#if defined(USE_ROCM) || defined(FBCODE_CAFFE2)
// todo for AMD
#ifdef USE_ROCM
for (int offset = 1; offset < dim_x; offset <<= 1) {
#else
for (int offset = dim_x >> 1; offset > 0; offset >>= 1) {
@ -1086,12 +1088,12 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
// load instructions.
//
// Case 1: "vectorize along input"
// This case happens when we are reducing along fastest moving dimesion. In such case, threads
// This case happens when we are reducing along fastest moving dimension. In such case, threads
// with the same threadIdx.y works on the same reduction cooperatively and will produce results
// for the same output. In such case, values in each loaded vector always correspond to the same output.
//
// Case 2: "vectorize along output"
// This case happens when the fastest moving dimesion is not the dimension of reduction. In such case,
// This case happens when the fastest moving dimension is not the dimension of reduction. In such case,
// threads with different threadIdx.x are independent and will produce results for different outputs.
// In such case, values in each loaded vector always correspond to different outputs.
if (fastest_moving_stride == sizeof(scalar_t)) {

View File

@ -241,7 +241,7 @@ __global__ void reflection_pad2d_backward_det_out_kernel(
const int64_t dist_cols = ::abs(inp_col - (input_dim_x - 1));
// we were dist_rows after, now we want to be dist_rows before
// we were dist_cols before, now we wnat to be dist_cols after
// we were dist_cols before, now we want to be dist_cols after
const int64_t reflect_tr_out_row = (corner_tr_out_row - dist_rows);
const int64_t reflect_tr_out_col = (corner_tr_out_col + dist_cols);
const int64_t reflect_tr_out =

View File

@ -5,7 +5,7 @@
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
#include <c10/macros/Macros.h>
// Two warninngs in Cutlass included header files
// Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wmissing-field-initializers")

View File

@ -7,7 +7,7 @@
#include <c10/macros/Macros.h>
#include <c10/util/irange.h>
// Two warninngs in Cutlass included header files
// Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")

View File

@ -44,7 +44,7 @@ __global__ void triu_tril_kernel(
const int64_t k,
const int64_t N_padded,
const IndexType last_dim_padded) {
int64_t linear_idx = (((int64_t)blockIdx.x) * blockDim.x + threadIdx.x) * elements_per_thread;
int64_t linear_idx = (blockIdx.x * blockDim.x + threadIdx.x) * elements_per_thread;
if (linear_idx >= N_padded) {
return;
}

Some files were not shown because too many files have changed in this diff Show More