Mirror of https://github.com/pytorch/pytorch.git
Synced 2025-10-30 03:34:56 +08:00

Compare commits: v2.0.1-rc1 ... v2.0.1-rc4 (17 commits)

| SHA1 |
|---|
| 96ca226a73 |
| a78b5f6680 |
| 51b42d98d6 |
| 0bd6be966b |
| e9ebda29d8 |
| 9e8bd61836 |
| e4bdb86e80 |
| 55b4f95cd8 |
| 6943c4b15e |
| 91c455e85d |
| c83bbdc032 |
| 661fa0c5e7 |
| 0f49e97be9 |
| b90fd01221 |
| 3aae95a884 |
| 34dd578b91 |
| 654da19c7c |
@@ -175,7 +175,7 @@ function checkout_install_torchdeploy() {
  pushd multipy
  git checkout "${commit}"
  python multipy/runtime/example/generate_examples.py
- pip install -e . --install-option="--cudatests"
+ pip install -e .
  popd
  popd
}

@@ -184,7 +184,6 @@ function test_torch_deploy(){
  pushd ..
  pushd multipy
  ./multipy/runtime/build/test_deploy
- ./multipy/runtime/build/test_deploy_gpu
  popd
  popd
}

@@ -196,10 +196,11 @@ def generate_wheels_matrix(os: str,
    for python_version in python_versions:
        for arch_version in arches:
            gpu_arch_type = arch_type(arch_version)
            gpu_arch_version = "" if arch_version == "cpu" or arch_version == "cpu-cxx11-abi" else arch_version
            # Skip rocm 3.11 binaries for now as the docker image are not correct
            if python_version == "3.11" and gpu_arch_type == "rocm":
                continue
            gpu_arch_version = (
                ""
                if arch_version == "cpu" or arch_version == "cpu-cxx11-abi"
                else arch_version
            )

            # special 11.7 wheels package without dependencies
            # dependency downloaded via pip install
2 .github/workflows/_android-build-test.yml vendored
@@ -35,7 +35,7 @@ jobs:

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

2 .github/workflows/_bazel-build-test.yml vendored
@@ -35,7 +35,7 @@ jobs:

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

2 .github/workflows/_binary-build-linux.yml vendored
@@ -131,7 +131,7 @@ jobs:
        with:
          github-secret: ${{ secrets.github-token }}
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
      - name: Setup Linux
        uses: ./.github/actions/setup-linux
      - name: Chown workspace

2 .github/workflows/_binary-test-linux.yml vendored
@@ -128,7 +128,7 @@ jobs:
          github-secret: ${{ secrets.github-token }}
      # Setup the environment
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
      - name: Setup Linux
        uses: ./.github/actions/setup-linux
      - name: Chown workspace

2 .github/workflows/_binary-upload.yml vendored
@@ -96,7 +96,7 @@ jobs:
      SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
    steps:
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          no-sudo: true

2 .github/workflows/_buck-build-test.yml vendored
@@ -12,7 +12,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

      - name: Set up JDK 8
        uses: actions/setup-java@v3

@@ -22,7 +22,7 @@ jobs:
      docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
    steps:
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          submodules: false
          fetch-depth: 1

2 .github/workflows/_docs.yml vendored
@@ -76,7 +76,7 @@ jobs:

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

2 .github/workflows/_ios-build-test.yml vendored
@@ -38,7 +38,7 @@ jobs:
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

      - name: Populate CI build options
        run: |

2 .github/workflows/_linux-build.yml vendored
@@ -76,7 +76,7 @@ jobs:
      # checkout because when we run this action we don't *have* a local
      # checkout. In other cases you should prefer a local checkout.
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

4 .github/workflows/_linux-test.yml vendored
@@ -48,7 +48,7 @@ jobs:
      keep-going: ${{ steps.filter.outputs.keep-going }}
    steps:
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          fetch-depth: 1
          submodules: false

@@ -79,7 +79,7 @@ jobs:
          docker exec -it $(docker container ps --format '{{.ID}}') bash

      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

2 .github/workflows/_mac-build.yml vendored
@@ -79,7 +79,7 @@ jobs:

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

      - name: Set xcode version
        env:

4 .github/workflows/_mac-test.yml vendored
@@ -47,7 +47,7 @@ jobs:
      is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
    steps:
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          fetch-depth: 1
          submodules: false

@@ -96,7 +96,7 @@ jobs:

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

      - name: Download build artifacts
        uses: ./.github/actions/download-build-artifacts

4 .github/workflows/_rocm-test.yml vendored
@@ -54,7 +54,7 @@ jobs:
      is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
    steps:
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          fetch-depth: 1
          submodules: false

@@ -78,7 +78,7 @@ jobs:
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          no-sudo: true

2 .github/workflows/_run_android_tests.yml vendored
@@ -13,7 +13,7 @@ jobs:
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

      - name: Setup miniconda
        uses: pytorch/test-infra/.github/actions/setup-miniconda@main

2 .github/workflows/_win-build.yml vendored
@@ -68,7 +68,7 @@ jobs:

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          no-sudo: true

4 .github/workflows/_win-test.yml vendored
@@ -36,7 +36,7 @@ jobs:
      is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
    steps:
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          fetch-depth: 1
          submodules: false

@@ -119,7 +119,7 @@ jobs:

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          no-sudo: true

4 .github/workflows/build-triton-wheel.yml vendored
@@ -38,7 +38,7 @@ jobs:
          github-secret: ${{ secrets.GITHUB_TOKEN }}

      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          submodules: false

@@ -160,7 +160,7 @@ jobs:
          github-secret: ${{ secrets.GITHUB_TOKEN }}

      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          submodules: false

2 .github/workflows/check-labels.yml vendored
@@ -15,7 +15,7 @@ jobs:
    runs-on: linux.20_04.4x
    steps:
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          submodules: false
          fetch-depth: 1

2 .github/workflows/docker-builds.yml vendored
@@ -62,7 +62,7 @@ jobs:
      # [see note: pytorch repo ref]
      # deep clone (fetch-depth 0) required for git merge-base
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

      - name: Setup Linux
        uses: ./.github/actions/setup-linux
200 .github/workflows/generated-linux-binary-manywheel-nightly.yml generated vendored
@@ -1828,3 +1828,203 @@ jobs:
      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml
  manywheel-py3_11-rocm5_3-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
    with:
      PYTORCH_ROOT: /pytorch
      BUILDER_ROOT: /builder
      PACKAGE_TYPE: manywheel
      # TODO: This is a legacy variable that we eventually want to get rid of in
      # favor of GPU_ARCH_VERSION
      DESIRED_CUDA: rocm5.3
      GPU_ARCH_VERSION: 5.3
      GPU_ARCH_TYPE: rocm
      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
      DESIRED_PYTHON: "3.11"
      build_name: manywheel-py3_11-rocm5_3
      build_environment: linux-binary-manywheel
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}

  manywheel-py3_11-rocm5_3-test: # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: manywheel-py3_11-rocm5_3-build
    runs-on: linux.rocm.gpu
    timeout-minutes: 240
    env:
      PYTORCH_ROOT: /pytorch
      BUILDER_ROOT: /builder
      PACKAGE_TYPE: manywheel
      # TODO: This is a legacy variable that we eventually want to get rid of in
      # favor of GPU_ARCH_VERSION
      DESIRED_CUDA: rocm5.3
      GPU_ARCH_VERSION: 5.3
      GPU_ARCH_TYPE: rocm
      SKIP_ALL_TESTS: 1
      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
      DESIRED_PYTHON: "3.11"
    steps:
      - name: Setup ROCm
        uses: ./.github/actions/setup-rocm
      - uses: actions/download-artifact@v3
        name: Download Build Artifacts
        with:
          name: manywheel-py3_11-rocm5_3
          path: "${{ runner.temp }}/artifacts/"
      - name: Checkout PyTorch
        uses: malfet/checkout@silent-checkout
        with:
          submodules: recursive
          path: pytorch
          quiet-checkout: true
      - name: Clean PyTorch checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
        working-directory: pytorch
      - name: Checkout pytorch/builder
        uses: malfet/checkout@silent-checkout
        with:
          ref: release/2.0
          submodules: recursive
          repository: pytorch/builder
          path: builder
          quiet-checkout: true
      - name: Clean pytorch/builder checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
        working-directory: builder
      - name: ROCm set GPU_FLAG
        run: |
          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
      - name: Pull Docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: pytorch/manylinux-builder:rocm5.3
      - name: Test Pytorch binary
        uses: ./pytorch/.github/actions/test-pytorch-binary
      - name: Teardown ROCm
        uses: ./.github/actions/teardown-rocm
  manywheel-py3_11-rocm5_3-upload: # Uploading
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: manywheel-py3_11-rocm5_3-test
    with:
      PYTORCH_ROOT: /pytorch
      BUILDER_ROOT: /builder
      PACKAGE_TYPE: manywheel
      # TODO: This is a legacy variable that we eventually want to get rid of in
      # favor of GPU_ARCH_VERSION
      DESIRED_CUDA: rocm5.3
      GPU_ARCH_VERSION: 5.3
      GPU_ARCH_TYPE: rocm
      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
      DESIRED_PYTHON: "3.11"
      build_name: manywheel-py3_11-rocm5_3
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml
  manywheel-py3_11-rocm5_4_2-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
    with:
      PYTORCH_ROOT: /pytorch
      BUILDER_ROOT: /builder
      PACKAGE_TYPE: manywheel
      # TODO: This is a legacy variable that we eventually want to get rid of in
      # favor of GPU_ARCH_VERSION
      DESIRED_CUDA: rocm5.4.2
      GPU_ARCH_VERSION: 5.4.2
      GPU_ARCH_TYPE: rocm
      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
      DESIRED_PYTHON: "3.11"
      build_name: manywheel-py3_11-rocm5_4_2
      build_environment: linux-binary-manywheel
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}

  manywheel-py3_11-rocm5_4_2-test: # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: manywheel-py3_11-rocm5_4_2-build
    runs-on: linux.rocm.gpu
    timeout-minutes: 240
    env:
      PYTORCH_ROOT: /pytorch
      BUILDER_ROOT: /builder
      PACKAGE_TYPE: manywheel
      # TODO: This is a legacy variable that we eventually want to get rid of in
      # favor of GPU_ARCH_VERSION
      DESIRED_CUDA: rocm5.4.2
      GPU_ARCH_VERSION: 5.4.2
      GPU_ARCH_TYPE: rocm
      SKIP_ALL_TESTS: 1
      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
      DESIRED_PYTHON: "3.11"
    steps:
      - name: Setup ROCm
        uses: ./.github/actions/setup-rocm
      - uses: actions/download-artifact@v3
        name: Download Build Artifacts
        with:
          name: manywheel-py3_11-rocm5_4_2
          path: "${{ runner.temp }}/artifacts/"
      - name: Checkout PyTorch
        uses: malfet/checkout@silent-checkout
        with:
          submodules: recursive
          path: pytorch
          quiet-checkout: true
      - name: Clean PyTorch checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
        working-directory: pytorch
      - name: Checkout pytorch/builder
        uses: malfet/checkout@silent-checkout
        with:
          ref: release/2.0
          submodules: recursive
          repository: pytorch/builder
          path: builder
          quiet-checkout: true
      - name: Clean pytorch/builder checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
        working-directory: builder
      - name: ROCm set GPU_FLAG
        run: |
          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
      - name: Pull Docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: pytorch/manylinux-builder:rocm5.4.2
      - name: Test Pytorch binary
        uses: ./pytorch/.github/actions/test-pytorch-binary
      - name: Teardown ROCm
        uses: ./.github/actions/teardown-rocm
  manywheel-py3_11-rocm5_4_2-upload: # Uploading
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: manywheel-py3_11-rocm5_4_2-test
    with:
      PYTORCH_ROOT: /pytorch
      BUILDER_ROOT: /builder
      PACKAGE_TYPE: manywheel
      # TODO: This is a legacy variable that we eventually want to get rid of in
      # favor of GPU_ARCH_VERSION
      DESIRED_CUDA: rocm5.4.2
      GPU_ARCH_VERSION: 5.4.2
      GPU_ARCH_TYPE: rocm
      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
      DESIRED_PYTHON: "3.11"
      build_name: manywheel-py3_11-rocm5_4_2
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml
4 .github/workflows/lint.yml vendored
@@ -106,7 +106,7 @@ jobs:
    if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks')
    steps:
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          submodules: false
          fetch-depth: -1

@@ -216,7 +216,7 @@ jobs:
      # [see note: pytorch repo ref]
      # deep clone (fetch-depth 0) required, to allow us to use git log
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          submodules: false
          fetch-depth: 1

2 .github/workflows/update_pytorch_labels.yml vendored
@@ -14,7 +14,7 @@ jobs:
    if: ${{ github.repository == 'pytorch/pytorch' }}
    steps:
      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0
        with:
          fetch-depth: 1
          submodules: false

2 .github/workflows/upload-test-stats.yml vendored
@@ -37,7 +37,7 @@ jobs:
        run: echo "${TRIGGERING_WORKFLOW}"

      - name: Checkout PyTorch
-       uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+       uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.0

      - run: |
          pip3 install requests==2.26
13 Dockerfile
@@ -7,11 +7,11 @@
#
# For reference:
# https://docs.docker.com/develop/develop-images/build_enhancements/
- ARG BASE_IMAGE=ubuntu:18.04
+ ARG BASE_IMAGE=ubuntu:20.04
ARG PYTHON_VERSION=3.8

FROM ${BASE_IMAGE} as dev-base
- RUN apt-get update && apt-get install -y --no-install-recommends \
+ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \

@@ -82,15 +82,16 @@ ARG TRITON_VERSION
ARG TARGETPLATFORM
ARG CUDA_VERSION
LABEL com.nvidia.volumes.needed="nvidia_driver"
- RUN apt-get update && apt-get install -y --no-install-recommends \
+ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        libjpeg-dev \
-       libpng-dev
+       libpng-dev \
+       && rm -rf /var/lib/apt/lists/*
COPY --from=conda-installs /opt/conda /opt/conda
RUN if test -n "${TRITON_VERSION}" -a "${TARGETPLATFORM}" != "linux/arm64"; then \
-       apt install -y --no-install-recommends gcc; \
+       DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends gcc; \
        rm -rf /var/lib/apt/lists/*; \
    fi
RUN rm -rf /var/lib/apt/lists/*
ENV PATH /opt/conda/bin:$PATH
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
@@ -379,16 +379,33 @@ inline bool check_gpu_sm50_or_greater(sdp_params params, bool debug) {
  return true;
}

- inline bool check_gpu_sm86_head_dim_128(sdp_params params, bool debug) {
+ inline bool check_head_dim_gt64_and_sm_ge86(sdp_params params, bool debug) {
  // Memory Efficient Attention is throwing a cuda illegal memory error
- // on sm86 when head_dim is 128.
+ // on sm86 or newer when head_dim is greater than 64.
  auto dprops = at::cuda::getCurrentDeviceProperties();
- bool is_sm86 = (dprops->major == 8) && (dprops->minor == 6);
- if (is_sm86 && (params.query.size(-1) == 128)) {
+ bool is_sm86_or_newer = (dprops->major == 8) && (dprops->minor >= 6);
+ // Categorically disable sm90 as well. Will want to fix this once we have H100s available for testing.
+ is_sm86_or_newer = is_sm86_or_newer || (dprops->major > 8);
+ if (is_sm86_or_newer && (params.query.sym_size(-1) > 64)) {
    if (debug) {
      TORCH_WARN(
-         "Memory Efficient Attention does not currently support head_dim == 128 on sm86",
-         "because it is throwing a cuda illegal memory error on sm86 when head_dim is 128.");
+         "Memory Efficient Attention does not currently support head_dim greater than 64 on sm86 or newer");
    }
    return false;
  }
  return true;
}

+ inline bool check_requires_grad_and_head_dim_gt64_and_sm_ge86(
+     sdp_params params,
+     bool debug) {
+   // Flash Attention will raise an error in the backward pass if the head_dim
+   // size is greater than 64 And the device is sm86 or newer.
+   if (!check_requires_grad(params, false) &&
+       !check_head_dim_gt64_and_sm_ge86(params, false)) {
+     if (debug) {
+       TORCH_WARN(
+           "Flash attention currently doesn't support training with head_dim greater than 64 on sm86 or newer.");
+     }
+     return false;
+   }

@@ -422,13 +439,14 @@ inline bool use_flash_attention(sdp_params params, bool debug) {
  return false;
#endif
  // Define gate functions that determine if a flash kernel can be ran
- constexpr std::array<bool(*)(sdp_params, bool), 8> constraints {{
+ constexpr std::array<bool(*)(sdp_params, bool), 9> constraints {{
      check_runtime_disabled_flash,
      check_tensor_shapes,
      check_equal_batch_size_and_num_heads,
      check_for_attn_mask,
      check_head_dim_size,
      check_gpu_sm75_or_greater,
+     check_requires_grad_and_head_dim_gt64_and_sm_ge86,
      check_for_nested_inputs,
      check_for_seq_len_1_nested_tensor}};
  for (auto& constraint : constraints) {

@@ -465,7 +483,7 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
      check_equal_batch_size_and_num_heads,
      check_for_attn_mask,
      check_head_dim_size_mem_efficient,
-     check_gpu_sm86_head_dim_128,
+     check_head_dim_gt64_and_sm_ge86,
      check_for_seq_len_1_nested_tensor,
      check_for_non_zero_dropout,
      check_use_deterministic_algorithms}};
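The renamed gate above changes which shapes the memory-efficient SDPA kernel accepts: on sm86-or-newer GPUs it now rejects head_dim greater than 64 rather than only head_dim == 128. A minimal sketch of how that surfaces at the Python level, using the standard `torch.backends.cuda.sdp_kernel` context manager that the tests in this compare also use (illustrative only, assumes a CUDA build; not part of the patch):

```python
# Illustrative sketch: force the memory-efficient backend and observe the
# head_dim > 64 restriction on sm86/sm89 devices (the math backend still works).
import torch
import torch.nn.functional as F

q = k = v = torch.randn(2, 2, 4, 96, device="cuda", dtype=torch.float16)
major, minor = torch.cuda.get_device_capability()

with torch.backends.cuda.sdp_kernel(enable_mem_efficient=True, enable_flash=False, enable_math=False):
    try:
        F.scaled_dot_product_attention(q, k, v)
    except RuntimeError:
        # Expected on sm86 or newer when head_dim is greater than 64.
        print(f"mem-efficient kernel refused head_dim=96 on sm{major}{minor}")
```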
@@ -10,8 +10,8 @@ endif

CUDA_VERSION = 11.7.0
CUDNN_VERSION = 8
- BASE_RUNTIME = ubuntu:18.04
- BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu18.04
+ BASE_RUNTIME = ubuntu:20.04
+ BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu20.04

# The conda channel to use to install cudatoolkit
CUDA_CHANNEL = nvidia
@@ -51,11 +51,8 @@ class AotAutogradFallbackTests(torch._dynamo.test_case.TestCase):
        y = torch.randn(4)
        x = torch.nn.Parameter(torch.randn(4))
        aot_fn = torch._dynamo.optimize("aot_eager")(fn)
-       with self.assertRaisesRegex(
-           RuntimeError,
-           "a leaf Variable that requires grad is being used in an in-place operation.",
-       ):
-           aot_fn(x, y)
+       # This should not error: we mutated an autograd leaf under no_grad mode.
+       aot_fn(x, y)

    def test_mutation1(self):
        def fn(_stack0: torch.Tensor, diagonal_chunked_attention_scores: torch.Tensor):

@@ -179,11 +176,8 @@ class AotAutogradFallbackTests(torch._dynamo.test_case.TestCase):

        # Run exported graph with AOT
        aot_fn = torch._dynamo.optimize("aot_eager")(graph)
-       with self.assertRaisesRegex(
-           RuntimeError,
-           "a leaf Variable that requires grad is being used in an in-place operation.",
-       ):
-           aot_fn(x, y)
+       # This should not error: we mutated an autograd leaf under no_grad mode.
+       aot_fn(x, y)

    def test_call_fn_with_non_const_inputs_aot_unsafe_control_flow(self):
        class ModuleSpecialFwd(torch.nn.Module):
@@ -2294,7 +2294,6 @@ class MiscTests(torch._dynamo.test_case.TestCase):
        self.assertIs(x_ref(), None)

    def test_release_module_memory(self):

        mod = torch.nn.Linear(10, 10)
        x = torch.rand([10, 10])
        mod_weight_ref = weakref.ref(mod.weight)

@@ -2640,7 +2639,6 @@ class MiscTests(torch._dynamo.test_case.TestCase):
        self.names = []

    def forward(self, idx, targets=None):

        b, t = idx.size()
        assert (
            t <= self.block_size

@@ -3763,7 +3761,6 @@ class MiscTests(torch._dynamo.test_case.TestCase):
        self.assertTrue(same(ref, res))

    def test_disable_flag(self):

        cnt = torch._dynamo.testing.CompileCounter()

        with patch.dict(os.environ, {"TORCH_COMPILE_DISABLE": "1"}):

@@ -4046,6 +4043,23 @@ class MiscTests(torch._dynamo.test_case.TestCase):
        res = opt_fn(x, y)
        self.assertTrue(same(ref, res))

    def test_tuple_from_tuple_iter(self):
        def inner_fn(*args):
            acc = torch.ones(10, 10)
            for arg in args:
                acc.add_(arg)

            return acc

        @torch._dynamo.optimize("eager")
        def fn(inputs, params):
            y = tuple(inputs) + tuple(params)
            return inner_fn(*y)

        inputs = [torch.randn(10, 10) for _ in range(3)]

        fn(inputs, iter(tuple(inputs)))

    def test_torch_package_working_with_trace(self):
        # from torch._dynamo.test_case import run_tests
@@ -295,6 +295,31 @@ class ModuleList(torch.nn.Module):
        return x


class CustomGetItemModuleList(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = torch.nn.ModuleList(
            [
                torch.nn.Linear(10, 10),
                torch.nn.ReLU(),
                torch.nn.Linear(10, 10),
                torch.nn.ReLU(),
            ]
        )

    def __getitem__(self, idx: int):
        return self.layers[idx]

    def __len__(self) -> int:
        return len(self.layers)

    def forward(self, x):
        for i in range(len(self)):
            x = self[i](x)

        return x


class ModuleDict(torch.nn.Module):
    def __init__(self):
        super().__init__()

@@ -310,6 +335,23 @@ class ModuleDict(torch.nn.Module):
        return x


class CustomGetItemModuleDict(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = torch.nn.ModuleDict(
            {
                "0": torch.nn.Linear(10, 10),
            }
        )

    def __getitem__(self, key: str) -> torch.nn.Module:
        return self.layers[key]

    def forward(self, x):
        x = self["0"](x)
        return x


class TensorList(torch.nn.Module):
    def __init__(self):
        super().__init__()

@@ -728,7 +770,9 @@ class NNModuleTests(torch._dynamo.test_case.TestCase):
    test_cfgmod = make_test(CfgModule())
    test_stringmember = make_test(StringMember())
    test_modulelist = make_test(ModuleList())
+   test_modulelist = make_test(CustomGetItemModuleList())
    test_moduledict = make_test(ModuleDict())
+   test_moduledict = make_test(CustomGetItemModuleDict())
    test_super1 = make_test(SuperModule())
    test_super2 = make_test(SuperModule2())
    test_super_class_method = make_test(SuperChildCallsClassMethod())
@@ -1,5 +1,6 @@
# Owner(s): ["module: dynamo"]
import collections
+ import contextlib
import copy
import inspect
import itertools

@@ -2173,6 +2174,81 @@ class ReproTests(torch._dynamo.test_case.TestCase):
        self.assertEqual(cnt.frame_count, 2)
        self.assertEqual(cnt.op_count, 2)

    def test_exception_in_dynamo_handling(self):
        hit_handler = False

        # See https://github.com/pytorch/pytorch/pull/96488
        @contextlib.contextmanager
        def ctx():
            try:
                yield
            except RuntimeError:
                nonlocal hit_handler
                hit_handler = True

        @torch._dynamo.optimize("eager")
        def f():
            with ctx():
                h()

        def h():
            raise RuntimeError("boof")

        # Should not error
        f()
        self.assertTrue(hit_handler)

    def test_generator_dealloc(self):
        # See https://github.com/pytorch/pytorch/pull/96488
        #
        # NB: yes, [(...)] is intentional, this is a list containing a
        # generator
        generator_box = [(x for x in [1, 2, 3])]

        counter = torch._dynamo.testing.CompileCounter()

        def g(x):
            return x + 2

        # TODO: This test is pretty delicate. To test if it's actually doing
        # anything, rebuild eval_frame.c with '#define TORCHDYNAMO_DEBUG 1'
        # and then look at the logs for:
        #
        # TRACE[_custom_eval_frame:650] begin <genexpr> test_repros.py 2276 -1 0 0
        # TRACE[_custom_eval_frame:664] throw <genexpr>
        #
        # This means we're actually hitting the relevant codepath

        # NB: Make sure we don't actually Dynamo this frame; if we do Dynamo
        # this frame, Dynamo actually DOES understand list.clear and will
        # arrange for the generator deallocation to happen when the eval frame
        # handler is disabled, which will prevent the bug from happening (we
        # specifically want to trigger the generator deallocation WHILE the
        # dynamo eval frame handler is active), as that will cause the
        # generator to become exhausted and trigger the throw_flag == TRUE
        # case.
        @torch._dynamo.skip
        def f(x):
            generator_box.clear()
            return g(x)

        self.assertNoUnraisable(
            lambda: torch._dynamo.optimize(counter)(f)(torch.randn(3))
        )

        # Make sure the x + 2 is captured (a previous incorrect implementation
        # of this fix would have disabled the eval frame callback, which means
        # g wouldn't get traced
        self.assertEqual(counter.op_count, 1)

    def test_error_return_without_exception_set(self):
        # https://github.com/pytorch/pytorch/issues/93781
        @torch.compile
        def f():
            _generator_type = type((_ for _ in ()))

        self.assertNoUnraisable(f)

    @skip_if_pytest
    @torch._dynamo.config.patch("rewrite_assert_with_torch_assert", True)
    def test_rewrite_assert_with_msg(self):
@@ -495,6 +495,44 @@ def forward(self, primals_1, primals_2, primals_3):
        self.verify_aot_autograd(f, create_inp(True), test_mutation=True)
        self.verify_aot_autograd(f, create_inp(False), test_mutation=True)

    @patch("functorch.compile.config.use_fake_tensor", True)
    def test_input_mutation_requires_grad_detach(self):
        # Here, "a" requires grad, and gets mutated, so we append a copy_() to the end of the graph.
        # Its mutation doesn't take part in autograd though, because we mutated a detach'd view.
        # Need to make sure that this copy_() doesn't error, and doesn't participate in autograd either.
        def f(a):
            a.detach().mul_(2)
            return a + 3
        inp = [torch.ones(4, requires_grad=True)]
        self.verify_aot_autograd(f, inp, test_mutation=False)
        inp = [torch.ones(4, requires_grad=True)]
        # test_mutation=True will first do some compute on inp, so it is no longer an autograd leaf
        # by the time it becomes a graph input. Good to test both cases.
        self.verify_aot_autograd(f, inp, test_mutation=True)

    @patch("functorch.compile.config.use_fake_tensor", True)
    def test_input_mutation_requires_grad_no_grad(self):
        def f(a):
            with torch.no_grad():
                a.mul_(2)
            return a + 3
        inp = [torch.ones(4, requires_grad=True)]
        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=False)

    @patch("functorch.compile.config.use_fake_tensor", True)
    def test_input_mutation_requires_grad_no_grad_detach_mixed(self):
        # Perform a mix of mutations on a:
        # 1 normal, 1 in no_grad, 1 on a detach'd tensor.
        # Only the first should participate in gradient computation.
        def f(a):
            a.detach().mul_(2)
            a.mul_(3)
            with torch.no_grad():
                a.mul_(4)
            return a + 5
        inp = [torch.ones(4, requires_grad=True)]
        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)

    @patch("functorch.compile.config.use_fake_tensor", True)
    def test_input_mutation_metadata2(self):
        def f(a):
@@ -2512,6 +2512,43 @@ torch.cuda.synchronize()
            actual = actual.squeeze()
            self.assertEqual(state_control[k], actual)

    # Make sure that the parameters become nonsense when scaled gradients are finite
    # but they get invalidated before `optimizer.step`, after `GradScaler.unscale_`
    def test_params_invalidated_with_grads_invalidated_between_unscale_and_step(self):
        for optimizer_ctor, optimizer_kwargs in product(
            (torch.optim.Adam, torch.optim.AdamW),
            (
                {"foreach": False, "fused": False},
                {"foreach": True, "fused": False},
                {"foreach": False, "fused": True},
            ),
        ):
            with self.subTest(optimizer=optimizer_ctor, optimizer_kwargs=optimizer_kwargs):
                self._test_grads_invalidated_between_unscale_and_step(optimizer_ctor, optimizer_kwargs)

    def _test_grads_invalidated_between_unscale_and_step(self, optimizer_ctor, optimizer_kwargs):
        model, _, optimizer, _, data, loss_fn, _ = self._create_scaling_case(
            optimizer_ctor=optimizer_ctor, optimizer_kwargs=optimizer_kwargs,
        )
        scaler = torch.cuda.amp.GradScaler(init_scale=128.0)

        for input, target in data:
            optimizer.zero_grad()
            with torch.autocast('cuda', enabled=True):
                output = model(input)
                loss = loss_fn(output, target)
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)

            # deliberately break grads
            for j, param in enumerate(model.parameters()):
                param.grad.copy_(torch.inf if j % 2 else torch.nan)

            scaler.step(optimizer)
            scaler.update()

        self.assertTrue(all((p.isnan().any() or p.isinf().any()) for p in model.parameters()))

    def test_grad_scaling_clipping(self):
        def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api):
            max_norm = 0.2  # A reasonable value that actually has an effect, based on printouts of grads
@@ -3947,6 +3947,13 @@ class TestAsArray(TestCase):
        self.assertEqual(tensor.dim(), 0)
        self.assertEqual(tensor.item(), scalar.item())
        self.assertEqual(tensor.dtype, torch.float64)
        # Regression test for https://github.com/pytorch/pytorch/issues/97021
        zerodim_arr = np.array(1.)
        tensor = torch.asarray(zerodim_arr, dtype=torch.int32)
        self.assertEqual(tensor.dim(), 0)
        self.assertEqual(tensor.item(), zerodim_arr.item())
        self.assertEqual(tensor.dtype, torch.int32)


instantiate_device_type_tests(TestTensorCreation, globals())
instantiate_device_type_tests(TestRandomTensorCreation, globals())
@@ -2,6 +2,7 @@

import contextlib
from functools import partial
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F

@@ -55,7 +56,7 @@ def use_deterministic_algorithims(mode: bool, warn_only: bool):
default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float32: 1e-5}
default_rtol = {torch.float16: 1e-3, torch.bfloat16: 1.6e-2, torch.float32: 1.3e-6}

- isSM86Device = torch.cuda.is_available() and torch.cuda.get_device_capability() == (8, 6)
+ isSM86or89Device = torch.cuda.is_available() and torch.cuda.get_device_capability() in [(8, 6), (8, 9)]


def get_rtol(true_value: torch.Tensor, computed_value: torch.Tensor) -> float:

@@ -540,6 +541,105 @@ class TestTransformers(NNTestCase):
        with cm:
            _test(batch_first, training, enable_nested_tensor)

    @unittest.skipIf(sys.version_info < (3, 11), "not supported on pre-3.11 Python")
    def test_encoder_padding_and_src_mask_bool(self):
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=16,
            nhead=2,
            dim_feedforward=32,
            dropout=0.1,
            activation='relu',
            batch_first=True,
        )
        encoder_norm = nn.LayerNorm(16)
        encoder = nn.TransformerEncoder(
            encoder_layer, 2, encoder_norm
        )

        inputs = torch.randn(2, 3, 16)

        src_mask = torch.ones(3, 3, dtype=torch.bool).triu_(diagonal=1)
        input_seq_len = torch.tensor([3, 2])
        padding_mask = (
            torch.arange(3)[None, :].cpu() >= input_seq_len[:, None]
        )

        with self.assertNoLogs(None):
            encoder(
                inputs,
                mask=src_mask,
                src_key_padding_mask=padding_mask,
            )

    @unittest.skipIf(sys.version_info < (3, 11), "not supported on pre-3.11 Python")
    def test_decoder_padding_and_src_mask_bool(self):

        def transformer_decoder(inputs, input_seq_len, memory):
            decoder_layer = nn.TransformerDecoderLayer(
                d_model=16,
                nhead=2,
                dim_feedforward=32,
                dropout=0.1,
                activation='relu',
                batch_first=True,
            )
            decoder_norm = nn.LayerNorm(16)
            decoder = nn.TransformerDecoder(
                decoder_layer, 2, decoder_norm
            )

            src_mask = torch.ones(
                inputs.shape[1], inputs.shape[1], dtype=torch.bool
            ).triu_(diagonal=1)
            padding_mask = (
                torch.arange(inputs.shape[1])[None, :].cpu()
                >= input_seq_len[:, None]
            )

            return decoder(
                inputs,
                memory,
                tgt_mask=src_mask,
                tgt_key_padding_mask=padding_mask,
                memory_key_padding_mask=padding_mask,
            )

        inputs = torch.randn(2, 3, 16)
        memory = torch.randn(2, 3, 16)
        input_seq_len = torch.tensor([3, 2])

        with self.assertNoLogs(None):
            transformer_decoder(inputs, input_seq_len, memory)


    def test_padding_and_src_mask_bool(self):
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=16,
            nhead=2,
            dim_feedforward=32,
            dropout=0.1,
            activation='relu',
            batch_first=True,
        )
        encoder_norm = nn.LayerNorm(16)
        encoder = nn.TransformerEncoder(
            encoder_layer, 2, encoder_norm
        )

        inputs = torch.randn(2, 3, 16)

        src_mask = torch.ones(3, 3, dtype=torch.bool).triu_(diagonal=1)
        input_seq_len = torch.tensor([3, 2])
        padding_mask = (
            torch.arange(3)[None, :].cpu() >= input_seq_len[:, None]
        )

        encoder(
            inputs,
            mask=src_mask,
            src_key_padding_mask=padding_mask,
        )


    def test_encoder_is_causal(self):

@@ -1545,18 +1645,47 @@ class TestSDPA(NNTestCase):
        assert torch._fused_sdp_choice(query, key, value) == (
            SDPBackend.EFFICIENT_ATTENTION if warn_only else SDPBackend.MATH)

-   @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not isSM86Device, "CUDA unavailable")
-   def test_memory_efficeint_sm86_failure(self):
+   @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not isSM86or89Device,
+                    "Does not support fused SDPA or not SM86+ hardware")
+   @parametrize("head_dim", [72, 96, 128])
+   def test_memory_efficient_sm86_plus_failure(self, head_dim: int):
        device = 'cuda'
        dtype = torch.float16
        make_tensor = partial(self.rand_tensor, type="dense", device=device, dtype=dtype)
-       # See check_gpu_sm86_head_dim_128 in pytorch/aten/src/ATen/native/transformers/cuda/sdp_utils.h
-       size = (2, 2, 4, 128)
+       # See check_head_dim_gt64_and_sm_ge86 in pytorch/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+       size = (2, 2, 4, head_dim)
        q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
        with sdp_kernel(enable_mem_efficient=True, enable_flash=False, enable_math=False):
            self.assertRaises(RuntimeError, lambda: torch.nn.functional.scaled_dot_product_attention(
                q, k, v, None, 0.0, False))

+   @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not isSM86or89Device,
+                    "Does not support fused SDPA or not SM86+ hardware")
+   @parametrize("head_dim", [72, 96, 128])
+   def test_flash_backward_failure_sm86plus(self, head_dim: int):
+       device = 'cuda'
+       dtype = torch.float16
+       make_tensor = partial(self.rand_tensor, type="dense", device=device, dtype=dtype)
+       # See check_requires_grad_and_head_dim_gt64_and_sm_ge86 in pytorch/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+       size = (2, 2, 4, head_dim)
+       q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
+
+       with sdp_kernel(enable_mem_efficient=False, enable_flash=False, enable_math=True):
+           math_ref = torch.nn.functional.scaled_dot_product_attention(q, k, v, None, 0.0, False)
+
+       with sdp_kernel(enable_mem_efficient=False, enable_flash=True, enable_math=False):
+           # Should not fail because inputs don't require grad
+           flash_ref = torch.nn.functional.scaled_dot_product_attention(q, k, v, None, 0.0, False)
+
+       self.assertEqual(math_ref, flash_ref, atol=1e-3, rtol=1e-3)
+
+       # Should fail because inputs require grad
+       q = make_tensor(size, requires_grad=True)
+       k = make_tensor(size, requires_grad=True)
+       v = make_tensor(size, requires_grad=True)
+       self.assertRaises(RuntimeError, lambda: torch.nn.functional.scaled_dot_product_attention(
+           q, k, v, None, 0.0, False))

    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Does not support fused scaled dot product attention")
    def test_dispatch_fails_no_backend(self):
        dtype = torch.float16

@@ -1727,7 +1856,7 @@ class TestSDPA(NNTestCase):
    @parametrize("batch_size", [1, 8])
    @parametrize("seq_len_q", [4, 8, 64, 128, 256, 512, 1024, 2048])
    @parametrize("seq_len_k", [4, 8, 64, 128, 256, 512, 1024, 2048])
-   @parametrize("head_dim", [8, 16, 32, 64, 128])
+   @parametrize("head_dim", [8, 16, 32, 64, 72, 96, 128])
    @parametrize("is_causal", [True, False])
    @parametrize("dropout_p", [0.0]) # mem_efficient_attention does not support dropout
    @parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32])

@@ -1754,8 +1883,8 @@ class TestSDPA(NNTestCase):

        # Create real output
        with sdp_kernel(enable_mem_efficient=True, enable_flash=False, enable_math=False):
-           # See check_gpu_sm86_head_dim_128 in pytorch/aten/src/ATen/native/transformers/cuda/sdp_utils.h
-           if isSM86Device and head_dim == 128:
+           # See check_head_dim_gt64_and_sm_ge86 in pytorch/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+           if isSM86or89Device and head_dim in range(65, 129):
                self.assertRaises(RuntimeError, lambda: F.scaled_dot_product_attention(query, key, value,
                    dropout_p=dropout_p, is_causal=is_causal))
                return
@@ -65,12 +65,12 @@ from .dicts import (
)
from .functions import UserFunctionVariable
from .lists import (
-   ListIteratorVariable,
    ListVariable,
    NamedTupleVariable,
    RangeVariable,
    SizeVariable,
    SliceVariable,
+   TupleIteratorVariable,
    TupleVariable,
)
from .misc import (

@@ -265,7 +265,7 @@ class VariableBuilder:
            )(tuple_iterator_getitem(value, i)).add_guards(guards)
            for i in range(tuple_iterator_len(value))
        ]
-       return ListIteratorVariable(
+       return TupleIteratorVariable(
            output, mutable_local=MutableLocal(), guards=guards
        )
    elif istype(value, (slice, range)):
@@ -26,7 +26,7 @@ from ..utils import (
from .base import MutableLocal, typestr, VariableTracker
from .constant import ConstantVariable
from .dicts import ConstDictVariable
- from .lists import BaseListVariable, ListVariable, TupleVariable
+ from .lists import BaseListVariable, ListVariable, TupleIteratorVariable, TupleVariable
from .tensor import FakeItemVariable, SymNodeVariable, UnspecializedPythonVariable
from .user_defined import UserDefinedVariable

@@ -195,7 +195,7 @@ class BuiltinVariable(VariableTracker):

        # Override table contains: op_fn -> [list of handlers]
        op_handlers = {}
-       for (op, magic_method_names) in itertools.chain(
+       for op, magic_method_names in itertools.chain(
            BuiltinVariable._inplace_binops().items(),
            BuiltinVariable._reversible_binops().items(),
        ):

@@ -355,7 +355,7 @@ class BuiltinVariable(VariableTracker):
            return None

        # Return first handler that matches the type checks
-       for ((type1, type2), handler) in handlers[op]:
+       for (type1, type2), handler in handlers[op]:
            if isinstance(a, type1) and isinstance(b, type2):
                return handler

@@ -646,7 +646,6 @@ class BuiltinVariable(VariableTracker):
            )
            for i in [a, b]
        ):

            if any([isinstance(val, FakeItemVariable) for val in [a, b]]):
                return variables.FakeItemVariable.from_tensor_variable(result)

@@ -683,7 +682,6 @@ class BuiltinVariable(VariableTracker):
            )
            return SymNodeVariable.create(tx, proxy, None)
        else:

            unimplemented(f"unsupported min / max over args {str(a)}, {str(b)}")

    call_min = _call_min_max

@@ -739,7 +737,10 @@ class BuiltinVariable(VariableTracker):
        elif obj.has_unpack_var_sequence(tx):
            guards = set()
            if obj.source and not is_constant_source(obj.source):
-               guards.add(obj.source.make_guard(GuardBuilder.LIST_LENGTH))
+               if isinstance(obj, TupleIteratorVariable):
+                   guards.add(obj.source.make_guard(GuardBuilder.TUPLE_ITERATOR_LEN))
+               else:
+                   guards.add(obj.source.make_guard(GuardBuilder.LIST_LENGTH))
            return cls(
                list(obj.unpack_var_sequence(tx)),
                mutable_local=MutableLocal(),
@@ -534,3 +534,7 @@ class ListIteratorVariable(VariableTracker):
            create_instruction("BUILD_TUPLE", len(remaining_items)),
            create_instruction("GET_ITER"),
        ]


class TupleIteratorVariable(ListIteratorVariable):
    pass
@@ -399,12 +399,28 @@ class NNModuleVariable(VariableTracker):
            )
        elif name == "__getitem__":
            assert not kwargs and len(args) == 1
-           assert type(module).__getitem__ in (
+           builtin_supported = (
                torch.nn.ModuleDict.__getitem__,
                torch.nn.ModuleList.__getitem__,
                torch.nn.ParameterList.__getitem__,
                torch.nn.Sequential.__getitem__,
-           ), typestr(module)
+           )

+           if type(module).__getitem__ not in builtin_supported:
+               assert isinstance(args[0], variables.ConstantVariable), typestr(args[0])
+               key = args[0].as_python_constant()
+               assert isinstance(key, (str, int))
+               fn = getattr(module, name).__func__
+
+               assert isinstance(fn, types.FunctionType)
+
+               src = AttrSource(AttrSource(self.source, name), "__func__")
+               return tx.inline_user_function_return(
+                   variables.UserFunctionVariable(fn, source=src, **options),
+                   [self] + list(args),
+                   kwargs,
+               )
+
            assert self.source

            if isinstance(args[0], SliceVariable):
@@ -407,6 +407,7 @@ class OutputAliasInfo:
# This class tells us info about user inputs.
@dataclass(frozen=True)
class InputAliasInfo:
+   is_leaf: bool
    mutates_data: bool
    mutates_metadata: bool

@@ -632,6 +633,7 @@ def run_functionalized_fw_and_collect_metadata(
        mutates_metadata = False

        input_info.append(InputAliasInfo(
+           is_leaf=isinstance(arg, torch.Tensor) and arg.is_leaf,
            mutates_data=mutates_data,
            mutates_metadata=mutates_metadata
        ))

@@ -1967,7 +1969,22 @@ def create_runtime_wrapper(
                )
            else:
                assert meta.mutates_data
-               original_inpt.copy_(updated_inpt)
+               if meta.is_leaf and original_inpt.requires_grad:
+                   # We can hit this situation in this case:
+                   # def f(x):
+                   #     x.detach().mul_(2)
+                   #     return x + 1
+                   # AOTAutograd will see a mutation in the above case, and try to
+                   # apply a copy_() here, in the epilogue.
+                   # But if x required gradients, and is a leaf, then autograd
+                   # will yell at us for trying to mutate it.
+                   # However, it's only possible to end up in this scenario (like the above)
+                   # if all of the mutations to the leaf input were non-autograd-tracking mutations
+                   # (aka mutations under no_grad(), or on detached views).
+                   # In that case, we fully want to hide the mutation from autograd, so detaching is ok.
+                   original_inpt.detach().copy_(updated_inpt)
+               else:
+                   original_inpt.copy_(updated_inpt)
        else:
            fw_outs = all_outs
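The epilogue comment above describes the one situation where the copy_() back into a mutated input has to be detached: the leaf was mutated only through non-autograd-tracking operations. A minimal sketch of that situation under torch.compile with the aot_eager backend (illustrative only, not part of the patch):

```python
# Sketch of the case the comment describes: a leaf input is mutated only through
# a detached view, so AOTAutograd's epilogue copy_() must also be detached.
import torch

def f(x):
    x.detach().mul_(2)   # mutation hidden from autograd
    return x + 1

x = torch.ones(4, requires_grad=True)
out = torch.compile(f, backend="aot_eager")(x)
print(x)  # the leaf was mutated in place; with the fix this call no longer raises
```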
@@ -622,6 +622,32 @@ static PyObject* _custom_eval_frame(
      frame->f_lasti,
      frame->f_iblock,
      frame->f_executing);

  if (throw_flag) {
    // When unwinding generators, eval frame is called with throw_flag ==
    // true. Frame evaluation is supposed to continue unwinding by propagating
    // the exception. Dynamo doesn't really know how to do this, nor does it
    // really want to do this, because there's unlikely any code to capture
    // (you're going to immediately quit out of the frame, perhaps running
    // some unwinding logic along the way). So we just run the default
    // handler in this case.
    //
    // NB: A previous version of this patch returned NULL. This is wrong,
    // because returning NULL is *different* from unwinding an exception.
    // In particular, you will not execute things like context manager
    // __exit__ if you just return NULL.
    //
    // NB: It's /conceivable/ that you might want to actually still call the
    // Dynamo callback when throw_flag == TRUE, to give Dynamo a chance to
    // do any stack unwinding code. But this is not really useful because
    // (1) Dynamo doesn't actually know how to do stack unwinding, so it would
    // immediately skip the frame, and (2) even if it did, this would only
    // be profitable if there was tensor code in the unwinding code. Seems
    // unlikely.
    DEBUG_TRACE("throw %s", name(frame));
    return eval_frame_default(tstate, frame, throw_flag);
  }

  CacheEntry* extra = get_extra(frame->f_code);
  if (extra == SKIP_CODE || (callback == Py_False && extra == NULL)) {
    DEBUG_TRACE("skip %s", name(frame));

@@ -684,6 +710,10 @@ static PyObject* _custom_eval_frame(
    // internal exception, returning here will leak the exception into user code
    // this is useful for debugging -- but we dont want it to happen outside of
    // testing
    // NB: we intentionally DO NOT re-enable custom behavior to prevent
    // cascading failure from internal exceptions. The upshot is if
    // Dynamo barfs, that's it for Dynamo, even if you catch the exception
    // inside the torch.compile block we won't try to Dynamo anything else.
    return NULL;
  } else if (result != Py_None) {
    DEBUG_TRACE("create cache %s", name(frame));
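The throw_flag branch above is taken when a generator frame is unwound while Dynamo's eval-frame handler is installed. Condensed from the test_generator_dealloc test earlier in this compare, a sketch of the triggering pattern (illustrative only):

```python
# A generator stored in a list is deallocated inside a frame that Dynamo skips,
# which unwinds the generator with throw_flag set while the handler is active.
import torch

generator_box = [(x for x in [1, 2, 3])]

def g(x):
    return x + 2

@torch._dynamo.skip
def f(x):
    generator_box.clear()   # drops (and unwinds) the generator
    return g(x)

print(torch._dynamo.optimize("eager")(f)(torch.randn(3)))
```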
@@ -1608,7 +1608,10 @@ Tensor asarray(
  THPObjectPtr ptr;
  auto arr = obj;

- if (is_numpy_scalar) {
+ // PyArray_CheckScalar is true for both scalars and 0-dim arrays, per
+ // https://numpy.org/devdocs/reference/c-api/array.html#c.PyArray_CheckScalar
+ // But for 0-dim arrays no `PyArray_FromScalar` call is needed
+ if (is_numpy_scalar && !is_numpy_array) {
    TORCH_CHECK(
        !force_alias,
        "can't alias NumPy scalars. ",
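The C++ change distinguishes genuine NumPy scalars from 0-dim arrays (PyArray_CheckScalar is true for both). The user-visible behavior it fixes, per the regression test earlier in this compare (issue 97021), looks roughly like this (illustrative sketch):

```python
# torch.asarray on a 0-dim NumPy array with an explicit dtype should produce a
# 0-dim tensor of that dtype instead of erroring.
import numpy as np
import torch

zerodim = np.array(1.)                         # 0-dim array, not a np.float64 scalar
t = torch.asarray(zerodim, dtype=torch.int32)
assert t.dim() == 0 and t.dtype == torch.int32 and t.item() == 1
```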
@@ -336,6 +336,8 @@ class GradScaler:
        # and `found_inf` to the passed optimizer so that the optimizer can utilize those
        # to skip the parameter updates or unscale gradients before updating parameters in
        # the fused kernel, e.g. `FusedAdamMathFunctor`.
        # In this behavior, `GradScaler._check_inf_per_device` is called if `OptState.READY`,
        # while the method is expected to be called by users side, i.e. their optimizers.
        kwargs_ = kwargs
        has_grad_scaler_kwarg = "grad_scaler" in inspect.signature(optimizer.step).parameters
        if has_grad_scaler_kwarg:

@@ -346,11 +348,13 @@ class GradScaler:
                FutureWarning)
            kwargs_.update({"grad_scaler": self})
        else:
+           if optimizer_state["stage"] is OptState.READY:
+               self._check_inf_per_device(optimizer)
            scaler = self._get_scale_async()
            found_inf = cast(
                torch.Tensor,
                sum([
-                   t.to(scaler.device, non_blocking=True) for t in self._check_inf_per_device(optimizer).values()
+                   t.to(scaler.device, non_blocking=True) for t in optimizer_state["found_inf_per_device"].values()
                ])
            )
            optimizer.grad_scale = None if optimizer_state["stage"] == OptState.UNSCALED else scaler
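For optimizers whose step() consumes the scaler's grad_scale/found_inf directly, the change makes step() reuse the found_inf results recorded by an explicit unscale_() call instead of re-running the inf check. A hedged sketch of the unscale_-then-step pattern this affects, mirroring the new CUDA test above (illustrative; assumes a CUDA build and a fused-capable optimizer):

```python
# Standard pattern: scale -> backward -> unscale_ (records found_inf) -> step -> update.
import torch

model = torch.nn.Linear(8, 8).cuda()
optimizer = torch.optim.Adam(model.parameters(), fused=True)
scaler = torch.cuda.amp.GradScaler(init_scale=128.0)

for _ in range(3):
    optimizer.zero_grad()
    with torch.autocast("cuda"):
        loss = model(torch.randn(4, 8, device="cuda")).sum()
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)   # records found_inf_per_device for this optimizer
    scaler.step(optimizer)       # now consults the recorded state instead of re-checking
    scaler.update()
```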
@@ -255,7 +255,8 @@ def load_sharded_optimizer_state_dict(
        sharding_spec = ChunkShardingSpec(
            dim=0,
            placements=[
-               f"rank:{i}/cuda:{i}" for i in range(dist.get_world_size())
+               f"rank:{i}/cuda:{i % torch.cuda.device_count()}"
+               for i in range(dist.get_world_size())
            ],
        )
    else:
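The fix wraps the CUDA device index with the local device count so that world sizes larger than the number of GPUs per node no longer produce invalid placements. A small sketch of the placement strings being built (values are illustrative):

```python
# Ranks wrap around the local GPU count instead of assuming one GPU per rank.
world_size = 16       # e.g. dist.get_world_size()
device_count = 8      # e.g. torch.cuda.device_count()
placements = [f"rank:{i}/cuda:{i % device_count}" for i in range(world_size)]
print(placements[8])  # "rank:8/cuda:0" rather than the invalid "rank:8/cuda:8"
```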
@@ -5194,6 +5194,7 @@ def multi_head_attention_forward(
        check_other=False,
    )


    if key_padding_mask is not None:
        # We have the attn_mask, and use that to merge kpm into it.
        # Turn off use of is_causal hint, as the merged mask is no

@@ -5236,8 +5237,8 @@ def multi_head_attention_forward(
    attn_mask = _canonical_mask(
        mask=attn_mask,
        mask_name="attn_mask",
-       other_type=_none_or_dtype(key_padding_mask),
-       other_name="key_padding_mask",
+       other_type=None,
+       other_name="",
        target_type=q.dtype,
        check_other=False,
    )
@@ -898,11 +898,14 @@ class MultiheadAttention(Module):

    where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.

-   ``nn.MultiHeadAttention`` will use the optimized implementations of
-   ``scaled_dot_product_attention()`` when possible.
+   ``forward()`` will use the optimized implementations of
+   ``scaled_dot_product_attention()``.

-   - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor. This
-     restriction will be loosened in the future.)
+   In addition to support for the new ``scaled_dot_product_attention()``
+   function, for speeding up Inference, MHA will use
+   fastpath inference with support for Nested Tensors, iff:
+
+   - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor.
    - inputs are batched (3D) with ``batch_first==True``
    - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad``
    - training is disabled (using ``.eval()``)

@@ -914,7 +917,7 @@ class MultiheadAttention(Module):
      nor ``attn_mask`` is passed
    - autocast is disabled

-   If the optimized implementation is in use, a
+   If the optimized inference fastpath implementation is in use, a
    `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ can be passed for
    ``query``/``key``/``value`` to represent padding more efficiently than using a
    padding mask. In this case, a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_

@@ -945,6 +948,7 @@ class MultiheadAttention(Module):
    https://arxiv.org/abs/2205.14135

    """

    __constants__ = ['batch_first']
    bias_k: Optional[torch.Tensor]
    bias_v: Optional[torch.Tensor]

@@ -1090,6 +1094,16 @@ class MultiheadAttention(Module):
            target_type=query.dtype
        )

        attn_mask = F._canonical_mask(
            mask=attn_mask,
            mask_name="attn_mask",
            other_type=None,
            other_name="",
            target_type=query.dtype,
            check_other=False,
        )

        why_not_fast_path = ''
        if not is_batched:
            why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}"

@@ -1225,8 +1239,8 @@ class MultiheadAttention(Module):
            attn_mask = F._canonical_mask(
                mask=attn_mask,
                mask_name="attn_mask",
-               other_type=F._none_or_dtype(key_padding_mask),
-               other_name="key_padding_mask",
+               other_type=None,
+               other_name="",
                target_type=query.dtype,
                check_other=False,
            )
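The reworked docstring separates the always-available scaled_dot_product_attention path from the inference-only fastpath and lists the conditions for the latter. A sketch of a call that satisfies those conditions (batched 3D self-attention input, batch_first=True, eval mode, autograd disabled); illustrative only:

```python
import torch
import torch.nn as nn

mha = nn.MultiheadAttention(embed_dim=16, num_heads=2, batch_first=True).eval()
x = torch.randn(2, 5, 16)        # (batch, seq, embed): batched self-attention input

with torch.inference_mode():     # autograd disabled, as the fastpath requires
    out, _ = mha(x, x, x, need_weights=False)
print(out.shape)                 # torch.Size([2, 5, 16])
```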
@@ -219,6 +219,15 @@ class TransformerEncoder(Module):
            target_type=src.dtype
        )

        mask = F._canonical_mask(
            mask=mask,
            mask_name="mask",
            other_type=None,
            other_name="",
            target_type=src.dtype,
            check_other=False,
        )

        output = src
        convert_to_nested = False
        first_layer = self.layers[0]

@@ -492,6 +501,15 @@ class TransformerEncoderLayer(Module):
            target_type=src.dtype
        )

        src_mask = F._canonical_mask(
            mask=src_mask,
            mask_name="src_mask",
            other_type=None,
            other_name="",
            target_type=src.dtype,
            check_other=False,
        )

        # see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf
        why_not_sparsity_fast_path = ''
        if not src.dim() == 3:
@@ -3036,6 +3036,29 @@ class TestCase(expecttest.TestCase):
        else:
            return super().assertRaisesRegex(expected_exception, expected_regex, *args, **kwargs)

    # Verifies that no unraisable exceptions are raised by callable. Unlike regular
    # exceptions, these do not actually propagate to the caller and are
    # suppressed. We must test for them specially.
    def assertNoUnraisable(self, callable, *args, **kwargs):
        raised = None

        def record_unraisable(unraisable):
            nonlocal raised
            raised = unraisable

        # Disable GC when running the callable to prevent spurious flakiness
        # from unlucky GCs inside the callable
        prev = gc.isenabled()
        gc.disable()
        try:
            with unittest.mock.patch("sys.unraisablehook", record_unraisable):
                callable(*args, **kwargs)
        finally:
            if prev:
                gc.enable()

        self.assertIsNone(raised)

    # TODO: Support context manager interface
    # NB: The kwargs forwarding to callable robs the 'subname' parameter.
    # If you need it, manually apply your callable in a lambda instead.
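assertNoUnraisable works by temporarily swapping sys.unraisablehook. For context, a small sketch of what an "unraisable" exception is and how the hook observes it (illustrative, plain CPython behavior, not part of the patch):

```python
# Exceptions raised in places like __del__ never propagate to the caller;
# CPython reports them through sys.unraisablehook instead.
import sys

class Noisy:
    def __del__(self):
        raise RuntimeError("raised during deallocation")

seen = []
old_hook = sys.unraisablehook
sys.unraisablehook = lambda unraisable: seen.append(unraisable)
try:
    Noisy()              # created and immediately deallocated -> __del__ raises
finally:
    sys.unraisablehook = old_hook

print(len(seen))         # 1 -- the caller never saw the RuntimeError
```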