2023-12-04 nightly release (3fbfa8cd0a5cefadb3f116c5cd0d60e96ab8c99e)

pytorchbot
2023-12-04 07:33:35 +00:00
parent 6fc9d89863
commit a111e45dfe
51 changed files with 1726 additions and 335 deletions

View File

@@ -1 +1 @@
d96c49e878defa7cccf976b30ae67bc7b20f531a
5f72802710a941d53a49b28ae0c7386332d41935

View File

@@ -1 +1 @@
6e4932cda85fb9128728072c821bd54410a9b5be
bcad9dabe15021c53b6a88296e9d7a210044f108

View File

@@ -96,13 +96,7 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
)
elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
if [[ "$(uname -m)" == aarch64 ]]; then
# Using "extra-index-url" until all needed aarch64 dependencies are
# added to "https://download.pytorch.org/whl/"
pip install "\$pkg" --extra-index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
else
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
fi
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
fi
if [[ "$PACKAGE_TYPE" == libtorch ]]; then

View File

@@ -1 +1 @@
db624844f5c95bb7618fe5a5f532bf9b68efeb45
6518fa9b2c74e84d7eb1fc6e3eb51e43213f0c05

View File

@@ -67,10 +67,12 @@ def build_triton(
max_jobs = os.cpu_count() or 1
env["MAX_JOBS"] = str(max_jobs)
version_suffix = ""
if not release:
# Nightly binaries include the triton commit hash, i.e. 2.1.0+e6216047b8
# while release build should only include the version, i.e. 2.1.0
version = f"{version}+{commit_hash[:10]}"
version_suffix = f"+{commit_hash[:10]}"
version += version_suffix
with TemporaryDirectory() as tmpdir:
triton_basedir = Path(tmpdir) / "triton"
@@ -132,17 +134,21 @@ def build_triton(
shutil.copy(conda_path, Path.cwd())
return Path.cwd() / conda_path.name
patch_setup_py(
triton_pythondir / "setup.py",
name=triton_pkg_name,
version=f"{version}",
)
# change built wheel name and version
env["TRITON_WHEEL_NAME"] = triton_pkg_name
env["TRITON_WHEEL_VERSION_SUFFIX"] = version_suffix
patch_init_py(
triton_pythondir / "triton" / "__init__.py",
version=f"{version}",
)
if build_rocm:
# TODO: Remove me when ROCM triton is updated
patch_setup_py(
triton_pythondir / "setup.py",
name=triton_pkg_name,
version=f"{version}",
)
check_call("scripts/amd/setup_rocm_libs.sh", cwd=triton_basedir, shell=True)
print("ROCm libraries setup for triton installation...")

View File

@@ -174,7 +174,7 @@ LIBTORCH_CONTAINER_IMAGES: Dict[Tuple[str, str], str] = {
("cpu", CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}",
}
FULL_PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11"]
FULL_PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11", "3.12"]
def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
@@ -288,7 +288,7 @@ def generate_wheels_matrix(
package_type = "manywheel"
if python_versions is None:
python_versions = FULL_PYTHON_VERSIONS + ["3.12"]
python_versions = FULL_PYTHON_VERSIONS
if arches is None:
# Define default compute archivectures
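The hunk above folds "3.12" into FULL_PYTHON_VERSIONS so every matrix generator that falls back to the default picks it up, rather than generate_wheels_matrix appending it on its own. A hedged sketch of that defaulting pattern (resolve_python_versions is a made-up helper name; the constant mirrors the hunk):

from typing import List, Optional

FULL_PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11", "3.12"]

def resolve_python_versions(python_versions: Optional[List[str]] = None) -> List[str]:
    # Previously "3.12" was added only inside generate_wheels_matrix; now the
    # shared default covers wheel and conda matrices alike.
    return list(FULL_PYTHON_VERSIONS) if python_versions is None else python_versions

assert "3.12" in resolve_python_versions()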

View File

@@ -4,7 +4,6 @@ on:
push:
branches:
- main
- nightly
tags:
# NOTE: Binary build pipelines should only get triggered on release candidate builds
# Release candidate tags look like: v1.11.0-rc1
@@ -134,7 +133,7 @@ jobs:
needs: build-wheel
container:
image: continuumio/miniconda3:4.12.0
environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
steps:
- uses: actions/checkout@v3
@@ -145,7 +144,7 @@ jobs:
path: ${{ runner.temp }}/artifacts/
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/v'))) }}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
@@ -161,7 +160,7 @@ jobs:
echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
fi
# NB: This step is gated by DRY_RUN, which is enabled everywhere except nightly and release branches
# NB: This step is gated by DRY_RUN, which is enabled everywhere except main and release branches
- name: Upload binaries
env:
PACKAGE_TYPE: wheel
@@ -247,7 +246,7 @@ jobs:
needs: build-conda
container:
image: continuumio/miniconda3:4.12.0
environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
steps:
- uses: actions/checkout@v3
@@ -258,7 +257,7 @@ jobs:
path: ${{ runner.temp }}/artifacts/
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/v'))) }}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"

View File

@@ -763,3 +763,185 @@ jobs:
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
conda-py3_12-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu-main
DESIRED_PYTHON: "3.12"
build_name: conda-py3_12-cpu
build_environment: linux-binary-conda
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-py3_12-cpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_12-cpu-build
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu-main
DESIRED_PYTHON: "3.12"
build_name: conda-py3_12-cpu
build_environment: linux-binary-conda
runs_on: linux.4xlarge
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-py3_12-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_12-cpu-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu-main
DESIRED_PYTHON: "3.12"
build_name: conda-py3_12-cpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
conda-py3_12-cuda11_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-main
DESIRED_PYTHON: "3.12"
runs_on: linux.24xlarge
build_name: conda-py3_12-cuda11_8
build_environment: linux-binary-conda
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-py3_12-cuda11_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_12-cuda11_8-build
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-main
DESIRED_PYTHON: "3.12"
build_name: conda-py3_12-cuda11_8
build_environment: linux-binary-conda
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-py3_12-cuda11_8-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_12-cuda11_8-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/conda-builder:cuda11.8-main
DESIRED_PYTHON: "3.12"
build_name: conda-py3_12-cuda11_8
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
conda-py3_12-cuda12_1-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-main
DESIRED_PYTHON: "3.12"
runs_on: linux.24xlarge
build_name: conda-py3_12-cuda12_1
build_environment: linux-binary-conda
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-py3_12-cuda12_1-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_12-cuda12_1-build
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-main
DESIRED_PYTHON: "3.12"
build_name: conda-py3_12-cuda12_1
build_environment: linux-binary-conda
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-py3_12-cuda12_1-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_12-cuda12_1-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/conda-builder:cuda12.1-main
DESIRED_PYTHON: "3.12"
build_name: conda-py3_12-cuda12_1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml

View File

@@ -502,3 +502,120 @@ jobs:
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
conda-py3_12-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-12-xl
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
# shellcheck disable=SC2129
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
uses: nick-fields/retry@v2.8.2
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
with:
timeout_minutes: 5
max_attempts: 3
retry_wait_seconds: 90
command: |
sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
sudo chmod +x /usr/local/bin/sccache
echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
- uses: actions/upload-artifact@v3
if: always()
with:
name: conda-py3_12-cpu
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
conda-py3_12-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_12-cpu-build
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu-main
DESIRED_PYTHON: "3.12"
build_name: conda-py3_12-cpu
use_s3: False
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml

View File

@@ -500,3 +500,120 @@ jobs:
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
conda-py3_12-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-12-xl
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
# shellcheck disable=SC2129
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
uses: nick-fields/retry@v2.8.2
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
with:
timeout_minutes: 5
max_attempts: 3
retry_wait_seconds: 90
command: |
sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
sudo chmod +x /usr/local/bin/sccache
echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
- uses: actions/upload-artifact@v3
if: always()
with:
name: conda-py3_12-cpu
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
conda-py3_12-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_12-cpu-build
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu-main
DESIRED_PYTHON: "3.12"
build_name: conda-py3_12-cpu
use_s3: False
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml

View File

@@ -2948,3 +2948,732 @@ jobs:
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
conda-py3_12-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: windows.4xlarge.nonephemeral
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v3
if: always()
with:
name: conda-py3_12-cpu
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
conda-py3_12-cpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_12-cpu-build
runs-on: windows.4xlarge.nonephemeral
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: conda-py3_12-cpu
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
conda-py3_12-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_12-cpu-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DESIRED_PYTHON: "3.12"
build_name: conda-py3_12-cpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
conda-py3_12-cuda11_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: windows.4xlarge.nonephemeral
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v3
if: always()
with:
name: conda-py3_12-cuda11_8
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
conda-py3_12-cuda11_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_12-cuda11_8-build
runs-on: windows.8xlarge.nvidia.gpu
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: conda-py3_12-cuda11_8
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
conda-py3_12-cuda11_8-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_12-cuda11_8-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DESIRED_PYTHON: "3.12"
build_name: conda-py3_12-cuda11_8
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
conda-py3_12-cuda12_1-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: windows.4xlarge.nonephemeral
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v3
if: always()
with:
name: conda-py3_12-cuda12_1
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
conda-py3_12-cuda12_1-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_12-cuda12_1-build
runs-on: windows.8xlarge.nvidia.gpu
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: conda-py3_12-cuda12_1
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
conda-py3_12-cuda12_1-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_12-cuda12_1-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DESIRED_PYTHON: "3.12"
build_name: conda-py3_12-cuda12_1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml

View File

@@ -118,7 +118,7 @@ hf_T5,pass,0
hf_T5_generate,pass,20
hf_T5_generate,pass,18

(columns: name, accuracy, graph_breaks)

View File

@@ -118,7 +118,7 @@ hf_T5,pass,0
hf_T5_generate,fail_to_run,10
hf_T5_generate,fail_to_run,9

(columns: name, accuracy, graph_breaks)

View File

@@ -102,7 +102,7 @@ hf_Reformer,pass,27
hf_T5_base,OOM,7
hf_T5_base,pass,7

(columns: name, accuracy, graph_breaks)

View File

@@ -118,7 +118,7 @@ hf_T5,pass,0
hf_T5_generate,fail_to_run,10
hf_T5_generate,fail_to_run,9

(columns: name, accuracy, graph_breaks)

View File

@@ -118,7 +118,7 @@ hf_T5,pass,0
hf_T5_generate,pass,20
hf_T5_generate,pass,18

(columns: name, accuracy, graph_breaks)

View File

@@ -118,7 +118,7 @@ hf_T5,pass,0
hf_T5_generate,pass,20
hf_T5_generate,pass,18

(columns: name, accuracy, graph_breaks)

View File

@@ -69,6 +69,7 @@ BATCH_SIZE_DIVISORS = {
REQUIRE_HIGHER_TOLERANCE = {
"fbnetv3_b",
"gmixer_24_224",
"hrnet_w18",
"inception_v3",
"sebotnet33ts_256",

View File

@@ -1461,7 +1461,6 @@ def define_buck_targets(
"torch/csrc/jit/mobile/train/random.cpp",
"torch/csrc/jit/mobile/train/sequential.cpp",
":gen_aten_libtorch[autograd/generated/Functions.cpp]",
"torch/csrc/quantized/quantized_backward.cpp",
],
compiler_flags = get_pt_compiler_flags(),
exported_preprocessor_flags = get_pt_preprocessor_flags() + ["-DUSE_MOBILE_CLASSTYPE"],

View File

@@ -923,12 +923,12 @@ Here is an example that shows logging modes of each type::
class FunctionLog(TorchFunctionMode):
def __torch_function__(self, func, types, args, kwargs=None):
print(f"Function Log: {resolve_name(func)}(*{args}, **{kwargs})")
return func(*args, **kwargs or {})
return func(*args, **(kwargs or {}))
class DispatchLog(TorchDispatchMode):
def __torch_dispatch__(self, func, types, args, kwargs=None):
print(f"Dispatch Log: {func}(*{args}, **{kwargs})")
return func(*args, **kwargs or {})
return func(*args, **(kwargs or {}))
def f():
a = torch.rand(10, requires_grad=True)
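The docs hunk above only switches the forwarding to the explicit **(kwargs or {}) form. For context, a hedged usage sketch of the two logging modes it documents (imports reflect where these base classes currently live; the logged output is illustrative):

import torch
from torch.overrides import TorchFunctionMode, resolve_name
from torch.utils._python_dispatch import TorchDispatchMode

class FunctionLog(TorchFunctionMode):
    def __torch_function__(self, func, types, args=(), kwargs=None):
        print(f"Function Log: {resolve_name(func)}(*{args}, **{kwargs})")
        return func(*args, **(kwargs or {}))

class DispatchLog(TorchDispatchMode):
    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        print(f"Dispatch Log: {func}(*{args}, **{kwargs})")
        return func(*args, **(kwargs or {}))

# Entering the modes as context managers logs every torch-level call
# (FunctionLog) and every dispatched aten op (DispatchLog).
with FunctionLog(), DispatchLog():
    torch.add(torch.rand(3), torch.rand(3))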

View File

@@ -85,6 +85,9 @@ select = [
"PIE804",
"PIE807",
"PIE810",
"PLC0131", # type bivariance
"PLC0132", # type param mismatch
"PLC0205", # string as __slots__
"PLE",
"PLR0133", # constant comparison
"PLR0206", # property with params

View File

@@ -2679,6 +2679,18 @@ TEST_F(ModulesTest, AdaptiveLogSoftmaxWithLoss) {
ASSERT_TRUE(
torch::allclose(asfm(x, y).output.squeeze(0), asfm(x2, y2).output));
}
{
// test div_value
auto options =
AdaptiveLogSoftmaxWithLossOptions(16, 20, {4, 10, 15}).div_value(0.);
ASSERT_THROWS_WITH(
AdaptiveLogSoftmaxWithLoss(options),
"div_value should not be equal to 0");
options =
AdaptiveLogSoftmaxWithLossOptions(16, 20, {4, 10, 15}).div_value(0.25);
ASSERT_TRUE(AdaptiveLogSoftmaxWithLoss(options));
}
}
TEST_F(ModulesTest, Softmax2d) {
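The new assertions above reject div_value == 0 when constructing AdaptiveLogSoftmaxWithLoss. A hedged arithmetic sketch of the rationale, assuming the in_features // div_value**(i+1) tail-projection sizing used by the Python frontend (tail_projection_sizes is an illustrative helper, not a library API):

def tail_projection_sizes(in_features: int, n_clusters: int, div_value: float) -> list:
    if div_value == 0:
        # Mirrors the new check: "div_value should not be equal to 0".
        raise ValueError("div_value should not be equal to 0")
    return [int(in_features // (div_value ** (i + 1))) for i in range(n_clusters)]

# With the options from the test (in_features=16, cutoffs {4, 10, 15} -> 3 tail clusters):
print(tail_projection_sizes(16, 3, 4.0))   # shrinking projections: [4, 1, 0]
print(tail_projection_sizes(16, 3, 0.25))  # growing projections: [64, 256, 1024]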

View File

@@ -54,6 +54,7 @@ def make_dynamic_cls(cls):
test_classes[test_class.__name__] = test_class
# REMOVING THIS LINE WILL STOP TESTS FROM RUNNING
globals()[test_class.__name__] = test_class
test_class.__module__ = __name__
return test_class

View File

@@ -745,6 +745,9 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
dd["a"] = x + 1
dd[param] = 123
dd["c"] = x * 2
dd.update({"b": x * 3})
dd.update([["d", x - 2], ("e", x + 2)])
dd.update(zip("ab", [x + 3, x + 4]))
return dd["b"], dd
x = torch.randn(10, 10)
@@ -754,7 +757,10 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
self.assertTrue(same(ref[0], res[0]))
self.assertTrue(same(ref[1]["a"], res[1]["a"]))
self.assertTrue(same(ref[1]["b"], res[1]["b"]))
self.assertTrue(same(ref[1]["c"], res[1]["c"]))
self.assertTrue(same(ref[1]["d"], res[1]["d"]))
self.assertTrue(same(ref[1]["e"], res[1]["e"]))
self.assertTrue(same(ref[1][param], res[1][param]))
@make_test
@@ -802,6 +808,42 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
d2["c"] = x + 20
return d1["a"] + d2["c"] + 1
@make_test
def test_dict_fromkeys(x, y):
lst = ["a", "b"]
d = dict.fromkeys(lst)
d1 = dict.fromkeys(d, x + 1)
d2 = collections.defaultdict.fromkeys(iter(d1), x - 2)
d3 = collections.OrderedDict.fromkeys(tuple(lst), value=y)
return d1["a"] * d2["b"] + d2["a"] + d1["b"] + d3["a"] + d3["b"] + 1
@make_test
def test_dict_copy(x):
my_list = [("a", x), ("b", x + 1), ("c", x + 2)]
d1 = dict(my_list)
d1["a"] = x + 10
d2 = d1.copy()
d2["a"] = x - 5
d2["b"] = x + 3
d3 = collections.OrderedDict(my_list)
d3["c"] = x + 20
d4 = d3.copy()
d4["c"] = x - 10
return d1["a"] * d2["a"] + d2["b"] + d3["c"] * d4["c"] + 1
@make_test
def test_dict_update(x, y, z):
d = {"a": x, "b": y}
d.update({"a": y - 1})
d.update([("b", z + 1), ["c", z]])
d.update(zip("ab", [z + 3, y + 2]))
od = collections.OrderedDict(a=x * 3, b=y + 2)
od.update({"a": y + 5})
od.update([["b", z + 6], ("c", z - 7)])
od.update(zip("ab", [z - 3, x + 2]))
return d["a"] * od["a"] + od["c"] + d["b"] + od["b"] * d["c"]
@make_test
def test_min_max(a, b):
c = a + b

View File

@@ -2940,6 +2940,52 @@ class <lambda>(torch.nn.Module):
):
aot_export_module(mod, [inp], trace_joint=True, output_loss_index=1)
@unittest.skipIf(not torch._dynamo.is_dynamo_supported(), "Cond needs dynamo to run")
def test_aot_export_with_torch_cond(self):
class M(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
def true_fn(x):
y = x + 4
y.add_(5)
return x.cos()
def false_fn(x):
y = x + 5
y.add_(6)
return x.sin()
a = torch.cond(x.shape[0] > 4, true_fn, false_fn, [x])
return (a + 3, a + 4)
inp = torch.randn(3, 4)
gm, _ = aot_export_module(M(), (inp,), trace_joint=False)
self.assertExpectedInline(gm.code.strip(), """\
def forward(self, arg0_1):
true_graph_0 = self.true_graph_0
false_graph_0 = self.false_graph_0
conditional = torch.ops.higher_order.cond(False, true_graph_0, false_graph_0, [arg0_1]); true_graph_0 = false_graph_0 = arg0_1 = None
getitem = conditional[0]; conditional = None
add = torch.ops.aten.add.Tensor(getitem, 3)
add_1 = torch.ops.aten.add.Tensor(getitem, 4); getitem = None
return (add, add_1)""") # noqa: B950
self.assertExpectedInline(gm.true_graph_0.code.strip(), """\
def forward(self, arg0_1):
add = torch.ops.aten.add.Tensor(arg0_1, 4)
add_1 = torch.ops.aten.add.Tensor(add, 5); add = None
cos = torch.ops.aten.cos.default(arg0_1); arg0_1 = None
return (cos,)""")
self.assertExpectedInline(gm.false_graph_0.code.strip(), """\
def forward(self, arg0_1):
add = torch.ops.aten.add.Tensor(arg0_1, 5)
add_1 = torch.ops.aten.add.Tensor(add, 6); add = None
sin = torch.ops.aten.sin.default(arg0_1); arg0_1 = None
return (sin,)""")
def test_aot_export_simplified_input_mutations_banned(self):
def fn(x):
x.mul_(2)

View File

@@ -1422,6 +1422,27 @@ class AOTInductorTestsTemplate:
self.check_model(Model(), inputs)
def test_convolution(self):
class Model(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, x, w, b):
return torch.ops.aten.convolution(x, w, b, [4], [0], [1], True, [0], 1)
example_inputs = (
torch.randn([2, 32, 90], device=self.device),
torch.randn([32, 16, 8], device=self.device),
torch.randn([16], device=self.device),
)
with config.patch(
{
"max_autotune": True,
"max_autotune_gemm_backends": "Triton",
}
):
self.check_model(Model(), example_inputs)
def test_zero_size_weight(self):
class Model(torch.nn.Module):
def __init__(self, channel, r=8):
@@ -1470,8 +1491,8 @@ copy_tests(
# TODO: test_freezing_abi_compatible_cpu somehow fails on CI but not locally,
# NotImplementedError: Cannot access storage of OpaqueTensorImpl
"test_freezing": TestFailure(("abi_compatible_cpu",), is_skip=True),
# Need to support convolution
"test_missing_cubin": TestFailure(("abi_compatible_cpu",)),
# FIXME: failed with Segfault while exiting the Python runtime
"test_missing_cubin": TestFailure(("abi_compatible_cpu",), is_skip=True),
"test_normal_functional": TestFailure(("abi_compatible_cpu",)),
"test_poi_multiple_dynamic": TestFailure(("abi_compatible_cpu",)),
# There is a double-free issue which will be fixed in another PR
@@ -1501,8 +1522,6 @@ copy_tests(
# test_failures, xfail by default, set is_skip=True to skip
{
"test_dup_unbacked_sym_decl": TestFailure(("abi_compatible_cuda",)),
# Need to support convolution
"test_missing_cubin": TestFailure(("abi_compatible_cuda",)),
"test_normal_functional": TestFailure(("abi_compatible_cuda",)),
# There is a double-free issue which will be fixed in another PR
"test_repeat_output": TestFailure(("abi_compatible_cuda",), is_skip=True),

View File

@@ -2423,6 +2423,21 @@ class CPUReproTests(TestCase):
self.assertFalse("= as_strided(" in code)
self.assertEqual(run(*v), mod(*v))
def test_invalid_dropout_args(self):
class MyModel(torch.nn.Module):
def forward(self, x):
x = x * 2
x = torch.nn.functional.dropout(x, p=0.5)
x = torch.relu(x)
return x
example_inputs = torch.tensor([[1, 2, 3], [4, 5, 6]])
func = MyModel()
jit_func = torch.compile(func)
self.assertRaises(RuntimeError, lambda: func(example_inputs))
self.assertRaises(RuntimeError, lambda: jit_func(example_inputs))
@config.patch(inplace_buffers=True)
def test_in_out_buffer(self):
def fn(x, y):
@@ -2463,6 +2478,55 @@ class CPUReproTests(TestCase):
)
self.assertEqual(metrics.generated_kernel_count, 1)
def test_attention_size_mismatch(self):
class Attention(torch.nn.Module):
def __init__(self, hidden_size, num_heads):
super().__init__()
self.hidden_size = hidden_size
self.num_heads = num_heads
self.head_size = hidden_size // num_heads
self.query = torch.nn.Linear(hidden_size, hidden_size)
self.key = torch.nn.Linear(hidden_size, hidden_size)
self.value = torch.nn.Linear(hidden_size, hidden_size)
self.inv_scale = torch.nn.Parameter(
torch.Tensor([1 / self.head_size**0.5]), requires_grad=False
)
def forward(self, x):
query = self.query(x)
key = self.key(x)
value = self.value(x)
(batch_size, seq_len, hidden_size) = query.size()
query = query.view(
batch_size, seq_len, self.num_heads, self.head_size
).permute(0, 2, 1, 3)
key = key.view(
batch_size, seq_len, self.num_heads, self.head_size
).permute(0, 2, 3, 1)
value = value.view(
batch_size, seq_len, self.num_heads, self.head_size
).permute(0, 2, 1, 3)
attention_weights = (
torch.matmul(query, key).div(self.inv_scale).softmax(dim=-1)
)
output = torch.matmul(attention_weights, value)
return output
torch.manual_seed(123)
hidden_size = 16
num_heads = 1
seq_len = 4
batch_size = 1
x = torch.randn(batch_size, seq_len, hidden_size)
func = Attention(hidden_size, num_heads).to("cpu")
with torch.no_grad():
res1 = func(x)
jit_func = torch.compile(func)
res2 = jit_func(x)
self.assertEqual(res1, res2)
def test_scalar_mul_bfloat16(self):
def f(x):
return torch.ops.aten.mul.Tensor(x, 1.7015043497085571)

View File

@ -217,9 +217,9 @@ class TestPoitwiseOps(torch.nn.Module):
@requires_cuda()
@torch._inductor.config.patch(
post_grad_fusion_options={"group_linear": {"require_fbgemm": True}}
)
@torch._inductor.config.patch(post_grad_fusion_options={})
@torch._inductor.config.patch(pre_grad_fusion_options={})
@torch._inductor.config.patch(group_fusion=True, batch_fusion=True)
class TestGroupBatchFusion(TestCase):
def compare_dict_tensors(self, ref_dict, res_dict, rtol=1e-3, atol=1e-3):
if len(set(ref_dict.keys())) != len(set(res_dict.keys())):
@ -433,9 +433,7 @@ class TestBMMFusionModule(torch.nn.Module):
@requires_cuda()
@torch._inductor.config.patch(
post_grad_fusion_options={"batch_linear": {"require_fbgemm": False}}
)
@torch._inductor.config.patch(post_grad_fusion_options={"batch_linear_post_grad": {}})
class TestPostGradBatchLinearFusion(TestCase):
def test_batch_linear_post_grad_fusion(self):
pt1_module = TestBMMFusionModule().cuda()

View File

@ -4143,7 +4143,7 @@ SinBackward0, MulBackward0, torch::autograd::AccumulateGrad
# Don't use node.name() here as it is not consistent on windows
node_name = node.__class__.__name__ if node else "None"
pr.append(f"Running {func} from within {node_name}")
return func(*args, **kwargs or {})
return func(*args, **(kwargs or {}))
with MyMode():
pr.append("FW")

View File

@ -1095,6 +1095,10 @@ def native_dropout(input: Tensor, p: float, train: Optional[bool]):
if train and p != 0:
if p == 1:
return (torch.zeros_like(input), torch.zeros_like(input, dtype=torch.bool))
if not input.dtype.is_floating_point:
raise RuntimeError(
"result type Float can't be cast to the desired output type Long"
)
bool_mask = torch.rand_like(input) > p
res = bool_mask * input * float(1.0 / (1.0 - p))
return (res, bool_mask)
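A rough eager-mode reproduction of the behavior this new check mirrors (assumed repro; the exact error text may differ from the message raised above):

import torch

x = torch.tensor([[1, 2, 3], [4, 5, 6]])  # integer tensor, not floating point
try:
    torch.nn.functional.dropout(x, p=0.5)
except RuntimeError as e:
    print(e)  # e.g. result type Float can't be cast to the desired output type Long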

View File

@ -294,7 +294,12 @@ class VariableBuilder:
# NB: Careful not to close over self to avoid ref cycle from lru_cache
entries = [
(
(torch.Tensor, torch.nn.Parameter, torch._subclasses.FakeTensor),
(
torch.Tensor,
torch.nn.Parameter,
torch._subclasses.FakeTensor,
torch._subclasses.functional_tensor.FunctionalTensor,
),
cls.wrap_tensor,
),
((tuple, list, odict_values, collections.deque), cls.wrap_listlike),
@ -1005,6 +1010,7 @@ class VariableBuilder:
torch.Tensor,
torch.nn.Parameter,
torch._subclasses.fake_tensor.FakeTensor,
torch._subclasses.functional_tensor.FunctionalTensor,
) or is_traceable_wrapper_subclass(value), type(value)
subclass_type = None

View File

@ -6,6 +6,7 @@ import logging
import math
import operator
import types
from collections import defaultdict, OrderedDict
from typing import Dict, List
import torch
@ -39,7 +40,7 @@ from ..utils import (
from .base import MutableLocal, typestr, VariableTracker
from .constant import ConstantVariable
from .ctx_manager import EventVariable, StreamVariable
from .dicts import ConstDictVariable, SetVariable
from .dicts import ConstDictVariable, DefaultDictVariable, SetVariable
from .lists import (
BaseListVariable,
ListIteratorVariable,
@ -662,6 +663,17 @@ class BuiltinVariable(VariableTracker):
)
return super().call_function(tx, args, kwargs)
def call_method(
self,
tx,
name,
args: "List[VariableTracker]",
kwargs: "Dict[str, VariableTracker]",
) -> "VariableTracker":
if self.fn == dict and name == "fromkeys":
return BuiltinVariable.call_custom_dict_fromkeys(tx, dict, *args, **kwargs)
return super().call_method(tx, name, args, kwargs)
def _call_min_max(self, tx, *args):
if len(args) == 1 and args[0].has_unpack_var_sequence(tx):
# expand iterable
@ -898,7 +910,44 @@ class BuiltinVariable(VariableTracker):
return variables.ConstDictVariable(
dict(kwargs), user_cls=user_cls, mutable_local=MutableLocal()
)
unimplemented(f"dict(): {args} {kwargs}")
unimplemented(f"{user_cls.__name__}(): {args} {kwargs}")
@staticmethod
def call_custom_dict_fromkeys(tx, user_cls, *args, **kwargs):
assert user_cls in {dict, OrderedDict, defaultdict}
if kwargs:
# Only `OrderedDict.fromkeys` accepts `value` passed by keyword
assert user_cls is OrderedDict
assert len(args) == 1 and len(kwargs) == 1 and "value" in kwargs
args = (*args, kwargs.pop("value"))
if len(args) == 0:
raise UserError(TypeError, "fromkeys expected at least 1 argument, got 0")
if len(args) == 1:
args = (*args, ConstantVariable.create(None))
assert len(args) == 2
arg, value = args
DictVariableType = (
ConstDictVariable if user_cls is not defaultdict else DefaultDictVariable
)
if isinstance(arg, dict):
return DictVariableType(
dict.fromkeys(arg, value), user_cls, mutable_local=MutableLocal()
)
elif isinstance(
arg,
(
ConstDictVariable,
ListVariable,
TupleVariable,
ListIteratorVariable,
),
):
keys = [DictVariableType.get_key(x) for x in arg.unpack_var_sequence(tx)]
return DictVariableType(
dict.fromkeys(keys, value), user_cls, mutable_local=MutableLocal()
)
unimplemented(f"{user_cls.__name__}.fromkeys(): {args} {kwargs}")
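For reference, a sketch of the plain-Python fromkeys behavior this helper reproduces under tracing (illustrative values only):

from collections import OrderedDict, defaultdict

print(dict.fromkeys(["a", "b"]))             # {'a': None, 'b': None}
print(dict.fromkeys(["a", "b"], 0))          # {'a': 0, 'b': 0}
print(OrderedDict.fromkeys(["a"], value=1))  # only OrderedDict accepts value as a keyword
print(defaultdict.fromkeys(["a"], 1))        # defaultdict(None, {'a': 1})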
def call_zip(self, tx, *args, **kwargs):
if kwargs:

View File

@ -77,13 +77,18 @@ class ConstDictVariable(VariableTracker):
args: "List[VariableTracker]",
kwargs: "Dict[str, VariableTracker]",
) -> "VariableTracker":
from . import ConstantVariable, TupleVariable
from . import (
ConstantVariable,
ListIteratorVariable,
ListVariable,
TupleVariable,
)
val = self.items
if name == "__getitem__":
assert len(args) == 1
return self.getitem_const(args[0])
elif name == "items":
assert not (args or kwargs)
return TupleVariable(
@ -112,10 +117,12 @@ class ConstDictVariable(VariableTracker):
],
mutable_local=MutableLocal(),
)
elif name == "values":
assert not (args or kwargs)
return TupleVariable(list(val.values()))
elif name == "copy":
assert not (args or kwargs)
return self.modifed(self.items.copy(), mutable_local=MutableLocal())
elif name == "__len__":
assert not (args or kwargs)
return ConstantVariable.create(len(self.items))
@ -139,10 +146,9 @@ class ConstDictVariable(VariableTracker):
)
elif (
name in ("pop", "get")
and args
and len(args) == 2
and ConstDictVariable.is_valid_key(args[0])
and ConstDictVariable.get_key(args[0]) not in self.items
and len(args) == 2
):
# missing item, return the default value
return args[1]
@ -158,12 +164,34 @@ class ConstDictVariable(VariableTracker):
return result
elif (
name == "update"
and args
and len(args) == 1
and isinstance(args[0], ConstDictVariable)
and self.mutable_local
):
newval = dict(val)
newval.update(args[0].items)
newval.update(kwargs) # all keys in kwargs are valid (`str`s)
result = self.modifed(newval)
return tx.replace_all(self, result)
elif (
name == "update"
and len(args) == 1
and isinstance(
args[0],
(
ListVariable,
TupleVariable,
ListIteratorVariable,
),
)
and self.mutable_local
):
newval = dict(val)
for x in args[0].unpack_var_sequence(tx):
k, v = x.unpack_var_sequence(tx)
assert ConstDictVariable.is_valid_key(k)
newval[ConstDictVariable.get_key(k)] = v
newval.update(kwargs) # all keys in kwargs are valid (`str`s)
result = self.modifed(newval)
return tx.replace_all(self, result)
elif (

View File

@ -972,6 +972,24 @@ class SkipFilesVariable(VariableTracker):
msg += f"', {self.reason}'" if self.reason else ""
unimplemented(msg)
def call_method(
self,
tx,
name,
args: "List[VariableTracker]",
kwargs: "Dict[str, VariableTracker]",
) -> "VariableTracker":
if (
self.value in {collections.OrderedDict, collections.defaultdict}
and name == "fromkeys"
):
from .builtin import BuiltinVariable
return BuiltinVariable.call_custom_dict_fromkeys(
tx, self.value, *args, **kwargs
)
return super().call_method(tx, name, args, kwargs)
class TypingVariable(VariableTracker):
def __init__(self, value, **kwargs):

View File

@ -1147,7 +1147,7 @@ def aot_compile(
constraints,
disable_constraint_solver=disable_constraint_solver
)
flat_example_inputs = pytree.arg_tree_leaves(*args, **kwargs or {})
flat_example_inputs = pytree.arg_tree_leaves(*args, **(kwargs or {}))
with torch.no_grad():
so_path = torch._inductor.aot_compile(gm, flat_example_inputs, options) # type: ignore[arg-type]
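A small sketch of what pytree.arg_tree_leaves does with mixed positional and keyword arguments (illustrative inputs; the flat leaf list is shown as a comment):

import torch.utils._pytree as pytree

leaves = pytree.arg_tree_leaves(1, [2, 3], x={"a": 4})
print(leaves)  # [1, 2, 3, 4]: positional and keyword leaves in one flat list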

View File

@ -27,7 +27,7 @@ from ..codecache import code_hash, get_path, PyCodeCache
from ..dependencies import MemoryDep, StarDep
from ..ir import IRNode, ReductionHint, TritonTemplateBuffer
from ..optimize_indexing import indexing_dtype_strength_reduction
from ..scheduler import BaseScheduling
from ..scheduler import BaseScheduling, WhyNoFuse
from ..triton_heuristics import AutotuneHint
from ..utils import (
do_bench,
@ -2226,12 +2226,13 @@ class TritonScheduling(BaseScheduling):
_, (numel1, rnumel1) = node1.group
_, (numel2, rnumel2) = node2.group
why = WhyNoFuse(node1, node2)
if node1.is_reduction() and node2.is_reduction():
reduction_can_fuse = numel1 == numel2 and rnumel1 == rnumel2
if not reduction_can_fuse:
fusion_log.debug(
"cannot fuse (triton:1): numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)",
why(
"numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)",
numel1,
numel2,
rnumel1,
@ -2241,8 +2242,8 @@ class TritonScheduling(BaseScheduling):
if not node1.is_reduction() and not node2.is_reduction():
if not (numel1 == numel2 and rnumel1 == rnumel2):
fusion_log.debug(
"cannot fuse (triton:2): numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)",
why(
"numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)",
numel1,
numel2,
rnumel1,
@ -2255,10 +2256,7 @@ class TritonScheduling(BaseScheduling):
# Fusion for CUDATemplates are not supported.
is_triton_template = isinstance(node1.node, TritonTemplateBuffer)
if not is_triton_template:
fusion_log.debug(
"cannot fuse (triton:3): is not TritonTemplateBuffer %s",
node1,
)
why("node1 is not TritonTemplateBuffer")
return is_triton_template
# check for a bad combined tiling
@ -2277,13 +2275,13 @@ class TritonScheduling(BaseScheduling):
elif len(tiling2) > 2:
cond = tiling2 == tiling3
if not cond:
fusion_log.debug(
"cannot fuse (triton:4): tiling mismatch (%s, %s, %s)",
why(
"tiling mismatch (%s, %s, %s)",
tiling1,
tiling2,
tiling3,
)
return cond
return False
return True
@ -2294,9 +2292,7 @@ class TritonScheduling(BaseScheduling):
TritonKernel.is_compatible((numel2, rnumel2), n.get_ranges())
for n in node1.get_nodes()
):
fusion_log.debug(
"cannot fuse (triton:5): nodes numel/rnumel incompatibility"
)
why("nodes numel/rnumel incompatibility")
return False
if (
config.triton.tiling_prevents_reduction_fusion
@ -2309,12 +2305,12 @@ class TritonScheduling(BaseScheduling):
(numel2, rnumel2, 1),
)
if not is_reduction_tiling_valid:
fusion_log.debug(
"cannot fuse (triton:6): invalid tiling for reduction"
)
why("invalid tiling for reduction")
return is_reduction_tiling_valid
return True
if numel1 != numel2:
why("nodes numel incompatibility")
return numel1 == numel2
assert node1.is_reduction() and not node2.is_reduction()

View File

@ -497,6 +497,9 @@ class WrapperCodeGen(CodeGen):
def generate_end(self, result):
return
def generate_fallback_kernel(self, fallback_kernel, args):
self.generate_extern_kernel_alloc(fallback_kernel, args)
def generate_extern_kernel_alloc(self, extern_kernel, args):
ending = self.ending
if config.memory_planning and "view_as_complex" in str(extern_kernel.kernel):
@ -879,10 +882,8 @@ class WrapperCodeGen(CodeGen):
signature: List[Union[TensorArg, SizeArg]] = []
constants = {}
for key, arg in kwargs.items():
if (
key in kernel.__annotations__
and "constexpr" in kernel.__annotations__[key]
):
idx = kernel.arg_names.index(key)
if idx in kernel.constexprs:
constants[key] = arg
continue
if isinstance(arg, (ir.Buffer, ir.ReinterpretView)):
@ -1732,11 +1733,28 @@ class CppWrapperCodeGen(WrapperCodeGen):
shim_fn = f"aoti_torch_{kernel_suffix}"
self.writeline(f"AOTI_TORCH_ERROR_CODE_CHECK({shim_fn}({', '.join(args)}));")
def generate_c_shim_extern_kernel_alloc_call(self, extern_kernel, args):
def generate_c_shim_extern_kernel_alloc(self, extern_kernel, args):
# registered output buffer name
name = extern_kernel.name
output_handle_name = f"{name}_handle"
self.writeline(f"AtenTensorHandle {output_handle_name};")
output_arg = f"&{output_handle_name}"
self.generate_c_shim_extern_kernel_call(
extern_kernel.codegen_kernel_name(), args + [output_arg]
)
self.writeline(f"RAIIAtenTensorHandle {name}({output_handle_name});")
def generate_extern_kernel_alloc(self, extern_kernel, args):
if V.graph.aot_mode and config.aot_inductor.abi_compatible:
self.generate_c_shim_extern_kernel_alloc(extern_kernel, args)
else:
super().generate_extern_kernel_alloc(extern_kernel, args)
def generate_c_shim_fallback_kernel(self, fallback_kernel, args):
output_args = []
output_raii_handles = []
output_name_base = extern_kernel.get_name()
for idx, output in enumerate(extern_kernel.outputs):
output_name_base = fallback_kernel.get_name()
for idx, output in enumerate(fallback_kernel.outputs):
if isinstance(output, ir.MultiOutput):
name = f"{output.get_name()}"
output_handle_name = f"{name}_handle"
@ -1759,19 +1777,19 @@ class CppWrapperCodeGen(WrapperCodeGen):
raise NotImplementedError(f"unsupported type of {output=}")
args = args + output_args
assert (
extern_kernel.abi_compatible_kernel is not None
), f"abi_compatible_kernel is None for {extern_kernel.kernel=}"
fallback_kernel.abi_compatible_kernel is not None
), f"abi_compatible_kernel is None for {fallback_kernel.kernel=}"
self.generate_c_shim_extern_kernel_call(
extern_kernel.abi_compatible_kernel, args
fallback_kernel.abi_compatible_kernel, args
)
for raii_handle in output_raii_handles:
self.writeline(raii_handle)
def generate_extern_kernel_alloc(self, extern_kernel, args):
def generate_fallback_kernel(self, fallback_kernel, args):
if V.graph.aot_mode and config.aot_inductor.abi_compatible:
self.generate_c_shim_extern_kernel_alloc_call(extern_kernel, args)
self.generate_c_shim_fallback_kernel(fallback_kernel, args)
else:
super().generate_extern_kernel_alloc(extern_kernel, args)
super().generate_fallback_kernel(fallback_kernel, args)
def generate_extern_kernel_out(self, output_view, codegen_reference, args, kernel):
if output_view:

View File

@ -117,7 +117,7 @@ class BatchPointwiseOpsFusionFactory(BatchFusion):
self.op = op
@register_fusion("batch_linear", pre_grad=False)
@register_fusion("batch_linear_post_grad", pre_grad=False)
class PostGradBatchLinearFusion(BatchFusion):
"""
Fuse ops in a batch way in post grad (aten level).
@ -172,20 +172,18 @@ class PostGradBatchLinearFusion(BatchFusion):
fused_inputs = decompose_stack(graph, batch_inputs)
fused_weights = decompose_stack(graph, batch_weights)
fused_bmm = graph.call_function(
torch.ops.aten.bmm,
aten.bmm,
args=(fused_inputs, fused_weights),
)
for i, original_mm in enumerate(batch_nodes):
has_bias = False
with graph.inserting_after(fused_bmm):
new_mm = graph.call_function(
torch.ops.aten.select, args=((fused_bmm, 0, i))
)
new_mm = graph.call_function(aten.select, args=((fused_bmm, 0, i)))
if batch_biases[i]:
has_bias = True
new_bias_add = graph.call_function(
torch.ops.aten.add, args=((batch_biases[i], new_mm))
aten.add, args=((batch_biases[i], new_mm))
)
new_mm_cont = new_bias_add if has_bias else new_mm
original_mm.replace_all_uses_with(new_mm_cont)
@ -763,7 +761,25 @@ def generate_fusion_from_config(config_options: Dict[str, Any], pre_grad=True):
def group_batch_fusion_passes(graph: torch.fx.Graph, pre_grad=True):
print_graph(graph, "Before group_batch fusion in pre grad pass.")
fusions: List[GroupBatchFusionBase] = []
# we keep all current pre-grad fusions to preserve the current
# implementation; will remove this later
# TODO: deprecate batch_fusion and group_fusion flags
if config.batch_fusion:
config.pre_grad_fusion_options = {
"batch_linear": {},
"batch_linear_lhs": {},
"batch_layernorm": {},
"batch_tanh": {},
"batch_relu": {},
"batch_sigmoid": {},
}
# config.post_grad_fusion_options = {
# "batch_linear_post_grad": {},
# }
if config.group_fusion:
config.post_grad_fusion_options = {
"group_linear": {"require_fbgemm": True},
}
if pre_grad:
fusions += generate_fusion_from_config(
config.pre_grad_fusion_options, pre_grad=True

View File
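Regarding the batch_fusion/group_fusion mapping above, a hedged usage sketch: the boolean flag is assumed to be rewritten into pre_grad_fusion_options when compilation runs, and whether a given fusion actually fires still depends on the graph.

import torch

with torch._inductor.config.patch(batch_fusion=True):
    model = torch.nn.Sequential(
        torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 8)
    )
    compiled = torch.compile(model)
    compiled(torch.randn(4, 8))  # pre-grad batch fusions are considered during lowering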

@ -3787,38 +3787,12 @@ class RandomSeeds(ExternKernelOut):
class ExternKernelAlloc(ExternKernel):
# Generate abi-compatible kernel names for shim kernels.
# Each individual shim kernel may have its own versioning rule.
# However, we don't expect we would end up with too many of such rules.
def _get_abi_compatible_kernel(self):
if not V.graph.cpp_wrapper:
return self.kernel
def sdpa_ver_fn():
# For sdpa, we need the v2 version only if any optional
# kwarg is missing.
if any(
self.get_kwargs_value(arg_name) is None
for arg_name in self.ordered_kwargs_for_cpp_kernel
):
return f"{self.cpp_kernel}_v2"
else:
return self.cpp_kernel
kernel_to_ver = {"at::_scaled_dot_product_flash_attention": sdpa_ver_fn}
if (ver_fn := kernel_to_ver.get(self.cpp_kernel, None)) is not None:
return ver_fn()
return self.cpp_kernel
def codegen_kernel_name(self):
return self.cpp_kernel if V.graph.cpp_wrapper else self.kernel
def codegen(self, wrapper):
self.codegen_comment(wrapper)
args = [*self.codegen_args(), *self.codegen_kwargs()]
# Now we setup abi_compatible_kernel after self.kernel
# and kwargs are adjusted appropriately.
self.abi_compatible_kernel = self._get_abi_compatible_kernel()
V.graph.wrapper_code.generate_extern_kernel_alloc(self, args)
if isinstance(self.layout, Layout):
self.codegen_size_asserts(wrapper)
@ -3839,7 +3813,6 @@ class ExternKernelAlloc(ExternKernel):
self.name = V.graph.register_buffer(self)
self.kernel = kernel
self.cpp_kernel = cpp_kernel
self.abi_compatible_kernel = None
self.ordered_kwargs_for_cpp_kernel = ordered_kwargs_for_cpp_kernel
def should_allocate(self):
@ -4294,6 +4267,7 @@ class FallbackKernel(ExternKernelAlloc):
# output through the abi-compatible interface.
self.outputs: Sequence[Any] = []
self.use_runtime_dispatch = False
self.abi_compatible_kernel = None
assert isinstance(
kernel,
@ -4351,6 +4325,29 @@ class FallbackKernel(ExternKernelAlloc):
), f"expected the index {pos} to be smaller than len(self.args_default_value): {len(self.args_default_value)}"
return self.args_default_value[pos]["value"]
# Generate abi-compatible kernel names for shim kernels.
# Each individual shim kernel may have its own versioning rule.
# However, we don't expect we would end up with too many of such rules.
def _get_abi_compatible_kernel(self):
if not V.graph.cpp_wrapper:
return self.kernel
def sdpa_ver_fn():
# For sdpa, we need the v2 version only if any optional
# kwarg is missing.
if any(
self.get_kwargs_value(arg_name) is None
for arg_name in self.ordered_kwargs_for_cpp_kernel
):
return f"{self.cpp_kernel}_v2"
else:
return self.cpp_kernel
kernel_to_ver = {"at::_scaled_dot_product_flash_attention": sdpa_ver_fn}
if (ver_fn := kernel_to_ver.get(self.cpp_kernel, None)) is not None:
return ver_fn()
return self.cpp_kernel
def codegen_args(self):
@dataclasses.dataclass
class Shim:
@ -4361,6 +4358,9 @@ class FallbackKernel(ExternKernelAlloc):
tensor_args = [Shim(x.codegen_reference()) for x in self.inputs]
args, kwargs = self.unflatten_args(tensor_args, self.constant_args)
# Now we setup abi_compatible_kernel after self.kernel
# and kwargs are adjusted appropriately.
self.abi_compatible_kernel = self._get_abi_compatible_kernel()
if V.graph.cpp_wrapper and isinstance(self.op_overload, torch._ops.OpOverload):
args = [
@ -4579,7 +4579,11 @@ class FallbackKernel(ExternKernelAlloc):
self.outputs,
)
else:
super().codegen(wrapper)
self.codegen_comment(wrapper)
args = [*self.codegen_args(), *self.codegen_kwargs()]
V.graph.wrapper_code.generate_fallback_kernel(self, args)
if isinstance(self.layout, Layout):
self.codegen_size_asserts(wrapper)
@staticmethod
def tensor_to_layout(output: torch.Tensor):

View File

@ -919,7 +919,15 @@ def register_replacement(
device=args[i].device,
requires_grad=grad,
)
specific_graph = trace_fn(search_fn, args)
try:
specific_graph = trace_fn(search_fn, args)
except RuntimeError as e:
log.info(
"Replacement pattern %s failed to apply due to shape mismatch: %s",
search_fn.__name__,
e,
)
return False
specific_pattern = fx_to_pattern(
specific_graph,
argnames=argnames,

View File

@ -7,7 +7,18 @@ import math
import os
import pprint
import textwrap
from typing import Counter, DefaultDict, Dict, List, Optional, Sequence, Set, Union
from typing import (
Any,
Counter,
DefaultDict,
Dict,
List,
Optional,
Sequence,
Set,
Tuple,
Union,
)
import sympy
@ -41,6 +52,28 @@ log = logging.getLogger(__name__)
fusion_log = torch._logging.getArtifactLogger(__name__, "fusion")
class WhyNoFuse:
# TODO when we drop support for Python < 3.10, we can use
# @dataclass(slots=True) instead of manually specifying __slots__.
__slots__ = ["node1", "node2", "reason", "args"]
reason: str
args: Tuple[Any, ...]
def __init__(self, node1: "BaseSchedulerNode", node2: "BaseSchedulerNode"):
self.node1 = node1
self.node2 = node2
def __call__(self, reason, *args):
self.reason = reason
self.args = args
fusion_log.debug(self)
def __str__(self):
return f"cannot fuse {self.node1.get_name()} with {self.node2.get_name()}: " + (
self.reason % self.args
)
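These reasons are emitted on the "fusion" logging artifact (the same artifact name used by getArtifactLogger above), so one assumed way to surface them while debugging is:

import torch

torch._logging.set_logs(fusion=True)  # or set TORCH_LOGS="fusion" in the environment

@torch.compile
def f(x):
    return (x.sin() + 1).sum()

f(torch.randn(8, 8))  # any "cannot fuse ..." decisions show up in the log output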
def pformat(obj):
if isinstance(obj, set):
# pformat has trouble with sets of sympy exprs
@ -930,12 +963,11 @@ class ForeachKernelSchedulerNode(FusedSchedulerNode):
@classmethod
def can_fuse(cls, producer, consumer):
why = WhyNoFuse(producer, consumer)
if producer.is_foreach() and consumer.is_foreach():
foreach_match = len(producer.snodes) == len(consumer.snodes)
if not foreach_match:
fusion_log.debug(
"cannot fuse (foreach:1): foreach do not have same length"
)
why("foreach do not have same length")
return foreach_match and all(
producer.scheduler.can_fuse(l, r)
for l, r in zip(producer.snodes, consumer.snodes)
@ -945,9 +977,7 @@ class ForeachKernelSchedulerNode(FusedSchedulerNode):
if consumer_subnode is not None:
return consumer.scheduler.can_fuse(producer, consumer_subnode)
fusion_log.debug(
"cannot fuse (foreach:2): candidate producer is not dep of any foreach consumer"
)
why("candidate producer is not dep of any foreach consumer")
return False
elif producer.is_foreach():
@ -955,9 +985,7 @@ class ForeachKernelSchedulerNode(FusedSchedulerNode):
if producer_subnode is not None:
return producer.scheduler.can_fuse(producer_subnode, consumer)
fusion_log.debug(
"cannot fuse (foreach:3): candidate consumer has no dep in any foreach producer"
)
why("candidate consumer has no dep in any foreach producer")
return False
raise AssertionError(
@ -1785,9 +1813,7 @@ class Scheduler:
combined_ancestors = (node1.ancestors | node2.ancestors) - combined_names
cycle = any(found_path(self.name_to_fused_node[n]) for n in combined_ancestors)
if cycle:
fusion_log.debug(
"cannot fuse (cycle): will create cycle - %s %s", node1, node2
)
WhyNoFuse(node1, node2)("will create cycle")
return cycle
def can_fusion_increase_peak_memory(
@ -1825,24 +1851,27 @@ class Scheduler:
if node1 is node2:
return False
why = WhyNoFuse(node1, node2)
if (
isinstance(node1, (ExternKernelSchedulerNode, NopKernelSchedulerNode))
and not node1.is_template()
):
fusion_log.debug("cannot fuse (1): node1 %s is extern or nop", node1)
why("node1 is extern or nop")
return False
if (
isinstance(node2, (ExternKernelSchedulerNode, NopKernelSchedulerNode))
and not node2.is_template()
):
fusion_log.debug("cannot fuse (2): node2 %s is extern or nop", node2)
why("node2 is extern or nop")
return False
if node1.is_foreach() or node2.is_foreach():
return ForeachKernelSchedulerNode.can_fuse(node1, node2)
if node2.get_names() & node1.ancestors:
fusion_log.debug("cannot fuse (3): node1 must go before node2")
why("node1 must go before node2")
return False
if (
@ -1865,24 +1894,20 @@ class Scheduler:
return False
if node2.is_template():
fusion_log.debug("cannot fuse (4): templates can only fuse epilogues")
why("templates can only fuse epilogues")
return False
if node1.is_template() and (
node2.has_aliasing_or_mutation()
or node2.is_reduction()
or not config.epilogue_fusion
):
fusion_log.debug("cannot fuse (5): template epilogue not satisfied")
why("template epilogue not satisfied")
return False
device = node1.get_device()
device2 = node2.get_device()
if device != device2:
fusion_log.debug(
"cannot fuse (6): device mismatch (node1: %s, node2: %s)",
device,
device2,
)
why("device mismatch (%s vs %s)", device, device2)
return False
del device2
@ -1890,7 +1915,7 @@ class Scheduler:
if no_shared_data and (
not config.aggressive_fusion or node1.is_reduction() or node2.is_reduction()
):
fusion_log.debug("cannot fuse (7): no shared data")
why("no shared data")
return False # heuristic not needed for correctness
if (
@ -1898,7 +1923,7 @@ class Scheduler:
and not node2.is_foreach()
and len(node1.get_nodes()) + len(node2.get_nodes()) > config.max_fusion_size
):
fusion_log.debug("cannot fuse (8): exceeds max fusion")
why("exceeds max fusion")
return False # heuristic not needed for correctness
if node1.get_names() & node2.ancestors:
@ -1908,7 +1933,7 @@ class Scheduler:
return self.get_backend(device).can_fuse_vertical(node1, node2)
else: # nodes don't depend on each other, but may have common reads
if self.can_fusion_increase_peak_memory(node1, node2):
fusion_log.debug("cannot fuse (9): will increase peak memory")
why("will increase peak memory")
return False
return self.get_backend(device).can_fuse_horizontal(node1, node2)
@ -1922,6 +1947,7 @@ class Scheduler:
"""
node1_names = node1.get_names()
computed_deps = set()
why = WhyNoFuse(node1, node2)
for rd in node2.unmet_dependencies:
for cd in node1.read_writes.writes:
@ -1946,13 +1972,11 @@ class Scheduler:
# Examples here include:
# - MemoryDep("foo", x) != MemoryDep("foo", x + 1)
# - MemoryDep("foo", x) != StarDep("foo")
fusion_log.debug("cannot fuse (vert:1): memory deps did not match")
why("memory deps did not match")
return False
for name in remaining_deps:
if node1_names & self.name_to_fused_node[name].ancestors:
fusion_log.debug(
"cannot fuse (vert:2): intermediate nodes between node1 & node2"
)
why("intermediate nodes between node1 & node2")
return False
return True

View File

@ -1110,16 +1110,17 @@ def blue_text(msg):
@functools.lru_cache(None)
def get_device_tflops(dtype):
from triton.testing import get_max_simd_tflops, get_max_tensorcore_tflops
from triton.testing import get_max_simd_tflops, get_max_tensorcore_tflops, nvsmi
assert dtype in (torch.float16, torch.bfloat16, torch.float32)
cur_sm_clock = nvsmi(["clocks.current.sm"])[0]
if dtype in (torch.float16, torch.bfloat16):
return get_max_tensorcore_tflops(dtype)
return get_max_tensorcore_tflops(dtype, cur_sm_clock)
if torch.backends.cuda.matmul.allow_tf32:
return get_max_tensorcore_tflops(torch.float32)
return get_max_tensorcore_tflops(torch.float32, cur_sm_clock)
else:
return get_max_simd_tflops(torch.float32)
return get_max_simd_tflops(torch.float32, cur_sm_clock)
@functools.lru_cache(None)

View File

@ -17,7 +17,7 @@ ArrayLikeOrScalar = typing.Union[ArrayLike, Scalar]
DTypeLike = typing.TypeVar("DTypeLike")
AxisLike = typing.TypeVar("AxisLike")
NDArray = typing.TypeVar("NDarray")
NDArray = typing.TypeVar("NDArray")
CastingModes = typing.TypeVar("CastingModes")
KeepDims = typing.TypeVar("KeepDims")

View File

@ -7,7 +7,7 @@ import types
from typing import Any, Callable, Dict, List, Type, Union
import torch._C
import torch.utils._pytree as pytree
from torch import _utils_internal
from torch._functorch.pyfunctorch import dispatch_functorch
@ -369,7 +369,7 @@ class HigherOrderOperator(OperatorBase):
def _to_flat_tuple(args, kwargs):
return torch.utils._pytree.arg_tree_leaves(*args, **kwargs)
return pytree.arg_tree_leaves(*args, **kwargs)
def _compute_keyset(args, kwargs, non_fallthrough_keys):
@ -506,7 +506,7 @@ class OpOverload(OperatorBase):
)
def __call__(self, *args, **kwargs):
return self._op(*args, **kwargs or {})
return self._op(*args, **(kwargs or {}))
def __hash__(self):
return hash(self._op)
@ -601,9 +601,7 @@ class OpOverload(OperatorBase):
with temporarily_pop_mode(curr_stack) as curr_mode:
assert hasattr(curr_mode, "__torch_dispatch__")
overload_types = []
args_flattened, _ = torch.utils._pytree.tree_flatten(
(args, kwargs.values())
)
args_flattened = pytree.arg_tree_leaves(*args, **kwargs)
for a in args_flattened:
# TODO: need to double check the semantics of the "types" argument to torch_dispatch.
# It's generated in PyInterpreter.cpp, but seems to be generated in two places,
@ -750,7 +748,7 @@ class OpOverloadPacket:
# is still callable from JIT
# We save the function ptr as the `op` attribute on
# OpOverloadPacket to access it here.
return self._op(*args, **kwargs or {})
return self._op(*args, **(kwargs or {}))
# TODO: use this to make a __dir__
def overloads(self):

View File

@ -38,6 +38,7 @@ void AdaptiveLogSoftmaxWithLossImpl::reset() {
.size() == options.cutoffs().size(),
"cutoffs should be a sequence of unique, positive integers sorted in an increasing order, ",
"where each value is between 1 and n_classes-1");
TORCH_CHECK(options.div_value() != 0, "div_value should not be equal to 0");
cutoffs = options.cutoffs();
cutoffs.push_back(options.n_classes());
@ -53,8 +54,8 @@ void AdaptiveLogSoftmaxWithLossImpl::reset() {
tail = this->register_module("tail", ModuleList());
for (const auto i : c10::irange(n_clusters)) {
int64_t hsz = options.in_features() /
static_cast<int64_t>(std::pow(options.div_value(), (i + 1)));
int64_t hsz = static_cast<int64_t>(std::floor(
options.in_features() / std::pow(options.div_value(), (i + 1))));
int64_t osz = cutoffs[i + 1] - cutoffs[i];
Sequential projection(

View File
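For the AdaptiveLogSoftmaxWithLoss change above, the cluster hidden size is now the floor of a floating-point quotient rather than an integer division by a truncated divisor. A small sketch of the difference for a non-integer div_value (illustrative numbers):

import math

in_features, div_value = 16, 1.5
for i in range(3):
    old_hsz = in_features // int(div_value ** (i + 1))        # previous behavior: truncate the divisor first
    new_hsz = math.floor(in_features / div_value ** (i + 1))  # new behavior: floor the quotient
    print(i, old_hsz, new_hsz)  # 0: 16 vs 10, 1: 8 vs 7, 2: 5 vs 4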

@ -233,6 +233,23 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch__scaled_mm(
AtenTensorHandle* ret0,
AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_convolution(
AtenTensorHandle input,
AtenTensorHandle weight,
AtenTensorHandle bias, // optional argument
int64_t* stride_ptr,
int64_t stride_size,
int64_t* padding_ptr,
int64_t padding_size,
int64_t* dilation_ptr,
int64_t dilation_size,
int transposed,
int64_t* output_padding_ptr,
int64_t output_padding_size,
int64_t groups,
AtenTensorHandle* ret // returns new reference
);
// This function will create a new uninitialized tensor object
// and its pointer is returned through *ret.
AOTI_TORCH_EXPORT AOTITorchError

View File

@ -359,6 +359,47 @@ AOTITorchError aoti_torch__scaled_dot_product_flash_attention(
ret8);
}
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_convolution(
AtenTensorHandle input,
AtenTensorHandle weight,
AtenTensorHandle bias, // optional argument
int64_t* stride_ptr,
int64_t stride_size,
int64_t* padding_ptr,
int64_t padding_size,
int64_t* dilation_ptr,
int64_t dilation_size,
int transposed,
int64_t* output_padding_ptr,
int64_t output_padding_size,
int64_t groups,
AtenTensorHandle* out // returns new reference
) {
AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
at::Tensor* input_tensor = tensor_handle_to_tensor_pointer(input);
at::Tensor* weight_tensor = tensor_handle_to_tensor_pointer(weight);
at::Tensor* bias_tensor = tensor_handle_to_tensor_pointer(bias);
auto optional_bias = pointer_to_optional(bias_tensor);
c10::IntArrayRef stride(stride_ptr, stride_size);
c10::IntArrayRef padding(padding_ptr, padding_size);
c10::IntArrayRef dilation(dilation_ptr, dilation_size);
c10::IntArrayRef output_padding(output_padding_ptr, output_padding_size);
at::Tensor out_tensor = at::convolution(
*input_tensor,
*weight_tensor,
optional_bias,
stride,
padding,
dilation,
static_cast<bool>(transposed),
output_padding,
groups);
at::Tensor* out_tensor_ptr = new at::Tensor(std::move(out_tensor));
*out = tensor_pointer_to_tensor_handle(out_tensor_ptr);
});
}
AOTITorchError aoti_torch_new_uninitialized_tensor(AtenTensorHandle* ret) {
AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
at::Tensor* out_tensor = new at::Tensor();

View File

@ -1,167 +0,0 @@
#include <ATen/native/quantized/PackedParams.h>
#include <ATen/native/quantized/cpu/QuantUtils.h>
#include <torch/library.h>
#include <torch/torch.h>
namespace {
using namespace torch::autograd;
using namespace at;
// This class is a custom gradient function that enables a quantized tensor to
// pass the input gradient back to the previous layers. This function can be used
// when the user is adapting mixed precision for training after quantization.
// From the torch layer we have no access to the linear_dynamic operator, which
// needs to be accessed via the redispatching mechanism. TO-DO: currently we
// support per-tensor quantization only; will expand to per-channel later on.
class PackedLinearWeightDynamicBackward
: public Function<PackedLinearWeightDynamicBackward> {
public:
static torch::Tensor forward(
AutogradContext* ctx,
at::Tensor input,
const c10::intrusive_ptr<LinearPackedParamsBase>& packed_weight,
bool reduce_range) {
static auto op =
at::Dispatcher::singleton()
.findSchemaOrThrow("quantized::linear_dynamic", "")
.typed<at::Tensor(
at::Tensor,
c10::intrusive_ptr<
LinearPackedParamsBase,
c10::detail::intrusive_target_default_null_type<
LinearPackedParamsBase>> const&,
bool)>();
// Calculate statistics for quantization of input Tensor
float x_min = 0;
float x_max = 0;
if (input.numel() > 0) {
auto input_contig = input.contiguous();
x_min = input_contig.min().item<float>();
x_max = input_contig.max().item<float>();
}
auto output = op.redispatch(
DispatchKeySet({DispatchKey::CPU}),
std::move(input),
packed_weight,
reduce_range);
auto q_params = quant_utils::ChooseQuantizationParams(
/*min=*/x_min,
/*max=*/x_max,
/*qmin=*/0,
/*qmax=*/255);
ctx->saved_data["weight"] = packed_weight;
// q_params.scale : shape [1] (per-tensor)
ctx->saved_data["input_scale"] = q_params.scale;
return output;
}
static tensor_list backward(AutogradContext* ctx, tensor_list grad_outputs) {
if (grad_outputs.empty()) {
return {torch::Tensor(), torch::Tensor(), torch::Tensor()};
}
auto packed_weight =
ctx->saved_data["weight"].toCustomClass<LinearPackedParamsBase>();
auto unpacked_parameters = packed_weight->unpack();
auto original_weight = std::get<0>(unpacked_parameters);
auto input_scale = ctx->saved_data["input_scale"].toDouble();
// Gradient for post-scaling
// Let us rewrite this layer by separating the matmul from the output
// scaling: y = (x * s1) @ w * s2 + b So you now back-propagate through four
// operations: + b, * s2, @ W, and * s1. The steps are: start with the
// gradient from the top, aka the adjoint, which is grad_outputs[0].
// gradient for + b: this is a no-op.
// gradient for * s2: scale by s2. That's the affine/per-channel scale baked
// into W. gradient for @ W: matmul with W.t. gradient for * s1: scale by
// s1.
auto grad_output0 = grad_outputs[0];
const auto qtype = original_weight.qscheme();
if (qtype == at::kPerTensorAffine) {
grad_output0 *= original_weight.q_scale();
original_weight = at::permute(original_weight, {1, 0});
} else if (qtype == at::kPerChannelAffine) {
// Per Channel quantizer does not support transpose.
// Manual transpose is necessary
original_weight = original_weight.dequantize();
// kwanghoon(TODO): This is going to be a long-term solution applicable to
// every model. One issue with quantizing a gradient: we can't get a good
// enough gradient to improve model accuracy once the model becomes complicated.
// For now we disable this and will come back when we figure out a better solution.
#if 0
// Enable Kernel backend for quantized backpropagation matrix
// multiplication
original_weight = at::permute(original_weight, {1, 0});
// Take advantage of QNNPACK for matrix multiplication
// Per channel scales & zero point computation
// Sources :
// https://github.com/pytorch/pytorch/blob/master/torch/ao/quantization/observer.py#L350-L353
auto [amin, amax] = at::aminmax(original_weight, /*dim* = */ 1);
// QInt8 type signed quantization
auto qmax = 127;
auto qmin = -128;
// Clamp with some epsilon number, so that value does not go below zero
auto epsilon = 1e-9;
auto new_scales = (amax - amin) / float(qmax - qmin);
new_scales = at::clamp(new_scales, epsilon);
auto new_zero_point =
qmin - at::round(amin / new_scales).toType(c10::kInt);
new_zero_point = at::clamp(new_zero_point, qmin, qmax);
// TO-DO (BUGBUG)
// Backend kernel is designed for inference, tightly coded for output
// channel. For mathematical correctness, we should enable to run kernel
// with input channel axis after transpose. As workaround, we are simply
// either exploring per tensor quantization or per channel quantization
// with axis = 0
original_weight = at::quantize_per_channel(
original_weight,
new_scales,
new_zero_point,
/*axis = 1 for transpose, but we are forcing it to non-transposed case
due to above issue*/
0,
c10::kQInt8);
#endif
} else {
TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme.");
}
#if 1
// Pure FP32 computation, useful for debugging purpose
auto dLdX1 = torch::matmul(grad_output0, original_weight);
#else
// Take advantage of QNNPACK for matrix multiplication
static auto op = at::Dispatcher::singleton()
.findSchemaOrThrow("quantized::linear_prepack", "")
.typed<c10::intrusive_ptr<LinearPackedParamsBase>(
at::Tensor, c10::optional<at::Tensor>)>();
auto prepacked_weight = op.call(original_weight, nullopt);
auto dLdX1 =
prepacked_weight->apply_dynamic(grad_output0.toType(c10::kFloat));
#endif
auto input_grad0 = dLdX1 * input_scale;
return {input_grad0, torch::Tensor(), torch::Tensor()};
}
};
at::Tensor packed_linear_weight_grad(
c10::DispatchKeySet ks,
at::Tensor input,
const c10::intrusive_ptr<LinearPackedParamsBase>& packed_weight,
bool reduce_range) {
return PackedLinearWeightDynamicBackward::apply(
std::move(input), packed_weight, reduce_range);
}
} // namespace
namespace at {
namespace native {
namespace {
TORCH_LIBRARY_IMPL(quantized, Autograd, m) {
m.impl(
TORCH_SELECTIVE_NAME("quantized::linear_dynamic"),
TORCH_FN(packed_linear_weight_grad));
}
} // namespace
} // namespace native
} // namespace at

View File

@ -73,7 +73,7 @@ class DiagTensorBelow(WrapperTensor):
# For everything else, call the handler:
fn = cls.handled_ops.get(func.__name__, None)
if fn:
return fn(*args, **kwargs or {})
return fn(*args, **(kwargs or {}))
else:
# Note that here, because we don't need to provide the autograd formulas
# we can have a default "fallback" that creates a plain Tensor based

View File

@ -159,7 +159,7 @@ def generate_cct_and_mode(autograd_view_consistency=True):
@classmethod
def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
all_args = pytree.arg_tree_leaves(*args, **kwargs or {})
all_args = pytree.arg_tree_leaves(*args, **(kwargs or {}))
modes = tuple(e.mode for e in all_args if isinstance(e, CompositeCompliantTensor))
if not all_same_mode(modes):
raise RuntimeError("Multiple CompositeCompliantTensorModes NYI")