Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-28 18:54:57 +08:00.

Compare commits (136 commits): yguo/patch ... xmfan/sing
| SHA1 |
|---|
| 3d732de4c8 |
| 19a27343a0 |
| cabc560762 |
| 663565c84a |
| 95f317d9f8 |
| 643da4854f |
| 995b125cdd |
| 7c8c82cd64 |
| 698f6f9fae |
| 2fb9416e6f |
| d91be786cb |
| ef6b16ea9d |
| 05e6f15966 |
| 6eb795c9e8 |
| 5ed1e23e3a |
| fd8ae1aa04 |
| c0ee62573a |
| b5c3bb6185 |
| 76ce194b8e |
| 0295aabf60 |
| 2190ca7f47 |
| f4e4cfcb91 |
| a0c7d96028 |
| 784f64bb05 |
| 6a6de0e09d |
| a8ce4d1846 |
| af1072ffb6 |
| d6bb1d7f0a |
| 36c461af95 |
| 7ce4974e50 |
| 654f2666d9 |
| 51748a5d1a |
| 71d2827eeb |
| e7bf490c43 |
| cffe7183f1 |
| b61a556427 |
| 3395da7f7c |
| e5da9df421 |
| 4986f0f52e |
| c74b59fc1f |
| 86ae672b6a |
| 533b884870 |
| a2c3a2c5c4 |
| c615b8c174 |
| fe100c3c5b |
| ba214ab56c |
| 8a5265cb37 |
| 8b818ab58f |
| ac88a6c00d |
| 4b35139a46 |
| fdb1305ace |
| 87e6e2924e |
| be0df96b50 |
| af31640391 |
| d068141c3b |
| 2565951f8a |
| fb1f7f6a09 |
| 6971b77510 |
| 863ac20659 |
| 83bb921a5a |
| 382fbcc1e4 |
| 574371d828 |
| ead970c8d0 |
| 5116b27792 |
| 6beba8dcce |
| f9b8121350 |
| 9da250aada |
| 6a72aaadae |
| 452315c84f |
| a000c7e6d2 |
| db4ce78d46 |
| 76ad19a549 |
| 8f6b9403c1 |
| 77aa602871 |
| 7185ca8348 |
| 5f5b44f6bf |
| 0d56b7e665 |
| 0b0da81021 |
| de1cb0f351 |
| fea718f062 |
| f79b352f5a |
| 93316cfe94 |
| 16e202a38e |
| 1e94c7aaa4 |
| 3986c3e4a6 |
| a88d7d4268 |
| 004d65aeb0 |
| 48203bec63 |
| f63db6255f |
| fb55bac3de |
| 41ae15faa3 |
| 24738768a8 |
| 394676759d |
| 8bea08e5bc |
| e758d8b4d1 |
| 279c7f262e |
| 4f3c070b25 |
| 5a3a50c791 |
| 9fee408daa |
| 5220d402b5 |
| e6c86952c6 |
| 8cbf7d0d6e |
| ed83b0b70b |
| 303ad1916f |
| 77dbd28535 |
| e9b3ff0570 |
| 81eb2a78ad |
| 655b061ef0 |
| 454fbd5bbe |
| 2c3680ce38 |
| 465930ee81 |
| 4ece056791 |
| bd370c138a |
| 5006932cbc |
| f16d30137c |
| 953f7834cc |
| 757d7f28d1 |
| 959d79f85f |
| babb2dc2af |
| 525ca80f53 |
| 5d547d82e6 |
| bae049b439 |
| ca397d82a6 |
| c9a15d980f |
| c8433c2c6c |
| a21a123fd5 |
| 7622e29a37 |
| 3f35664ee8 |
| 63e8ad49b8 |
| 75db0fd8a0 |
| eb892cd768 |
| 1b047d5d7a |
| 166419b9c1 |
| 74682e8595 |
| d9b3d76b85 |
| 302f56a1f2 |
@@ -39,7 +39,7 @@ def build_ArmComputeLibrary() -> None:
             "clone",
             "https://github.com/ARM-software/ComputeLibrary.git",
             "-b",
-            "v24.09",
+            "v25.02",
             "--depth",
             "1",
             "--shallow-submodules",
@@ -329,7 +329,7 @@ def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None
         ]
     )
     host.run_cmd(
-        f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v24.09 {git_clone_flags}"
+        f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}"
    )

    host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")
@@ -1,6 +1,6 @@
 set -euo pipefail

-readonly version=v24.04
+readonly version=v25.02
 readonly src_host=https://github.com/ARM-software
 readonly src_repo=ComputeLibrary

@@ -66,7 +66,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then

   # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
   if [[ $(uname -m) == "aarch64" ]]; then
-    conda_install "openblas==0.3.28=*openmp*"
+    conda_install "openblas==0.3.29=*openmp*"
   else
     conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
   fi
@@ -4,7 +4,7 @@
 set -ex

 cd /
-git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.28 --depth 1 --shallow-submodules
+git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.29 --depth 1 --shallow-submodules


 OPENBLAS_BUILD_FLAGS="
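The hunks above move the aarch64 builds' pinned dependencies forward: Arm Compute Library goes from v24.09 (v24.04 in the shell script) to v25.02, and OpenBLAS goes from v0.3.28 to v0.3.29, in every place the tag is fetched. A minimal sketch of the shallow, tag-pinned clone pattern these scripts rely on; the `git ls-remote` pre-flight check is an illustrative addition, not part of the diff:

```bash
#!/usr/bin/env bash
set -euo pipefail

repo=https://github.com/ARM-software/ComputeLibrary.git
tag=v25.02  # pinned version taken from the hunks above

# Illustrative pre-flight check: fail early if the tag does not exist upstream.
if ! git ls-remote --exit-code --tags "$repo" "refs/tags/${tag}" >/dev/null; then
  echo "tag ${tag} not found in ${repo}" >&2
  exit 1
fi

# Same shallow, tag-pinned clone shape used by the build scripts.
git clone "$repo" -b "$tag" --depth 1 --shallow-submodules
```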
@@ -54,7 +54,7 @@ cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
 TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
 case ${CUDA_VERSION} in
     12.8)
-        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0;10.0;12.0+PTX" #Ripping out 5.0 and 6.0 due to ld error
+        TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX" #Ripping out 5.0 and 6.0 due to ld error
         EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
         ;;
     12.6)
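For CUDA 12.8 the case statement above now replaces the default arch list instead of appending to it, so sm_50 and sm_60 are dropped while sm_90, sm_100 and sm_120+PTX are added. A simplified, standalone model of that selection (not the full build script, which sets other flags as well):

```bash
#!/usr/bin/env bash
# Standalone model of the arch-list selection shown above.
CUDA_VERSION="${1:-12.8}"

TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
case ${CUDA_VERSION} in
    12.8)
        # Starts at sm_70; 5.0 and 6.0 are dropped due to the ld error noted in the diff.
        TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX"
        ;;
    *)
        ;;
esac

echo "CUDA ${CUDA_VERSION} -> TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}"
```

Running it with `12.8` prints the pruned list; any other version keeps the default.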
@@ -191,7 +191,7 @@ fi

 # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
 # memory to build and will OOM
-if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
+if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]] && [ -z "$MAX_JOBS_OVERRIDE" ]; then
   echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
   echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
   export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
@@ -377,8 +377,10 @@ else
   # This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization
   # is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has
   # 16 CPUs
-  MAX_JOBS=$(nproc --ignore=4)
-  export MAX_JOBS
+  if [ -z "$MAX_JOBS_OVERRIDE" ]; then
+    MAX_JOBS=$(nproc --ignore=4)
+    export MAX_JOBS
+  fi

   # NB: Install outside of source directory (at the same level as the root
   # pytorch folder) so that it doesn't get cleaned away prior to docker push.
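Both build.sh hunks make the memory-saving MAX_JOBS heuristics defer to an externally supplied MAX_JOBS_OVERRIDE. The same precedence can be written with shell parameter expansion; this is an illustrative equivalent, while the script itself keeps the explicit guards shown above:

```bash
#!/usr/bin/env bash
set -euo pipefail

# Use the override when set and non-empty, otherwise fall back to the heuristic.
MAX_JOBS="${MAX_JOBS_OVERRIDE:-$(nproc --ignore=4)}"
export MAX_JOBS
echo "building with MAX_JOBS=${MAX_JOBS}"
```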
@@ -490,6 +490,13 @@ else
   DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
 fi

+test_cachebench() {
+  TEST_REPORTS_DIR=$(pwd)/test/test-reports
+  mkdir -p "$TEST_REPORTS_DIR"
+
+  $TASKSET python "benchmarks/dynamo/cachebench.py" --output "$TEST_REPORTS_DIR/cachebench.json"
+}
+
 test_perf_for_dashboard() {
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
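The new test_cachebench helper writes a single JSON report under test/test-reports and is driven by the cachebench test config added later in test.sh. A hedged sketch of an equivalent local run, assuming a torchbench checkout at ./torchbench (the path is illustrative, not taken from the diff):

```bash
#!/usr/bin/env bash
set -euo pipefail

# Assumes ./torchbench is a benchmark-suite checkout with the models used by the
# CI config (nanogpt, BERT_pytorch, resnet50) already installed.
mkdir -p test/test-reports
PYTHONPATH="$(pwd)/torchbench" \
  python benchmarks/dynamo/cachebench.py --output test/test-reports/cachebench.json
```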
@@ -518,6 +525,8 @@ test_perf_for_dashboard() {
     test_inductor_set_cpu_affinity
   elif [[ "${TEST_CONFIG}" == *cuda_a10g* ]]; then
     device=cuda_a10g
+  elif [[ "${TEST_CONFIG}" == *h100* ]]; then
+    device=cuda_h100
   elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
     device=rocm
   fi
@@ -1507,6 +1516,11 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
   install_torchvision
   id=$((SHARD_NUMBER-1))
   test_dynamo_benchmark timm_models "$id"
+elif [[ "${TEST_CONFIG}" == cachebench ]]; then
+  install_torchaudio cuda
+  install_torchvision
+  checkout_install_torchbench nanogpt BERT_pytorch resnet50
+  PYTHONPATH=$(pwd)/torchbench test_cachebench
 elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
   if [[ "${TEST_CONFIG}" == *cpu* ]]; then
     install_torchaudio cpu
.flake8 (1 line changed)
@@ -38,6 +38,7 @@ per-file-ignores =
     torchgen/api/types/__init__.py: F401,F403
     torchgen/executorch/api/types/__init__.py: F401,F403
     test/dynamo/test_higher_order_ops.py: B950
+    test/dynamo/test_graph_break_messages.py: B950
     torch/testing/_internal/dynamo_test_failures.py: B950
     # TOR901 is only for test, we want to ignore it for everything else.
     # It's not easy to configure this without affecting other per-file-ignores,
.github/actionlint.yaml (vendored, 1 line changed)
@@ -10,7 +10,6 @@ self-hosted-runner:
     - linux.9xlarge.ephemeral
-    - am2.linux.9xlarge.ephemeral
     - linux.12xlarge
     - linux.12xlarge.ephemeral
     - linux.24xlarge
     - linux.24xlarge.ephemeral
     - linux.arm64.2xlarge
.github/ci_commit_pins/audio.txt (vendored, 2 lines changed)
@@ -1 +1 @@
-f084f34bbb743fada85f66b0ed8041387565e69c
+c670ad81fda266b6598aeeef434583eb98197ae8
@@ -246,14 +246,8 @@ def generate_libtorch_matrix(
         if os == "linux":
             arches += CUDA_ARCHES
             arches += ROCM_ARCHES
-            # skip CUDA 12.8 builds for libtorch
-            if "12.8" in arches:
-                arches.remove("12.8")
         elif os == "windows":
             arches += CUDA_ARCHES
-            # skip CUDA 12.8 builds on Windows
-            if "12.8" in arches:
-                arches.remove("12.8")
     if libtorch_variants is None:
         libtorch_variants = [
             "shared-with-deps",
@@ -318,9 +312,6 @@ def generate_wheels_matrix(
             arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
         elif os == "windows":
             arches += CUDA_ARCHES + XPU_ARCHES
-            # skip CUDA 12.8 builds on Windows until available
-            if "12.8" in arches:
-                arches.remove("12.8")
         elif os == "linux-aarch64":
             # Separate new if as the CPU type is different and
             # uses different build/test scripts
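Dropping these skips is what lets CUDA 12.8 flow into the libtorch and wheel build matrices, which in turn produces the cu128 jobs in the generated-*.yml workflows further down. A hedged sketch of regenerating those checked-in files after a matrix change; the generate_ci_workflows.py entry point is assumed from the repository's .github/scripts directory and may be invoked differently in practice:

```bash
#!/usr/bin/env bash
set -euo pipefail

cd "$(git rev-parse --show-toplevel)"
# Assumed regeneration entry point; see .github/scripts/ for the authoritative tooling.
python3 .github/scripts/generate_ci_workflows.py

# The regenerated workflows should now include the cu128 build/test/upload jobs.
git status --short .github/workflows/ | grep generated- || true
```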
.github/scripts/trymerge.py (vendored, 50 lines changed)
@@ -1507,6 +1507,36 @@ def checks_to_markdown_bullets(
     ]


+def post_starting_merge_comment(
+    repo: GitRepo,
+    pr: GitHubPR,
+    explainer: TryMergeExplainer,
+    dry_run: bool,
+    ignore_current_checks_info: Optional[
+        list[tuple[str, Optional[str], Optional[int]]]
+    ] = None,
+) -> None:
+    """Post the initial merge starting message on the PR. Also post a short
+    message on all PRs in the stack."""
+    gh_post_pr_comment(
+        pr.org,
+        pr.project,
+        pr.pr_num,
+        explainer.get_merge_message(ignore_current_checks_info),
+        dry_run=dry_run,
+    )
+    if pr.is_ghstack_pr():
+        for additional_prs, _ in get_ghstack_prs(repo, pr):
+            if additional_prs.pr_num != pr.pr_num:
+                gh_post_pr_comment(
+                    additional_prs.org,
+                    additional_prs.project,
+                    additional_prs.pr_num,
+                    f"Starting merge as part of PR stack under #{pr.pr_num}",
+                    dry_run=dry_run,
+                )
+
+
 def manually_close_merged_pr(
     pr: GitHubPR,
     additional_merged_prs: list[GitHubPR],
@@ -2130,13 +2160,7 @@ def merge(
     check_for_sev(pr.org, pr.project, skip_mandatory_checks)

     if skip_mandatory_checks:
-        gh_post_pr_comment(
-            pr.org,
-            pr.project,
-            pr.pr_num,
-            explainer.get_merge_message(),
-            dry_run=dry_run,
-        )
+        post_starting_merge_comment(repo, pr, explainer, dry_run)
         return pr.merge_into(
             repo,
             dry_run=dry_run,
@@ -2159,12 +2183,12 @@
         )
         ignore_current_checks_info = failing

-        gh_post_pr_comment(
-            pr.org,
-            pr.project,
-            pr.pr_num,
-            explainer.get_merge_message(ignore_current_checks_info),
-            dry_run=dry_run,
+        post_starting_merge_comment(
+            repo,
+            pr,
+            explainer,
+            dry_run,
+            ignore_current_checks_info=ignore_current_checks_info,
         )

     start_time = time.time()
.github/templates/common.yml.j2 (vendored, 1 line changed)
@@ -4,6 +4,7 @@
 {%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%}

 {%- set timeout_minutes = 240 -%}
+{%- set timeout_minutes_windows_binary = 300 -%}

 {%- macro concurrency(build_environment) -%}
 concurrency:
@@ -111,7 +111,10 @@ jobs:
       ALPINE_IMAGE: "docker.io/s390x/alpine"
 {%- elif config["gpu_arch_type"] == "rocm" %}
       runs_on: linux.rocm.gpu
-{%- elif config["gpu_arch_type"] == "cuda" %}
+{%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] == "12.8" %}
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
+{%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] != "12.8"%}
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runs_on: linux.4xlarge.nvidia.gpu
 {%- else %}
@@ -71,7 +71,7 @@ jobs:
 {%- else %}
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
 {%- endif %}
-    timeout-minutes: !{{ common.timeout_minutes }}
+    timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
     !{{ upload.binary_env(config, True) }}
 {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
@@ -110,7 +110,7 @@ jobs:
 {%- else %}
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
 {%- endif %}
-    timeout-minutes: !{{ common.timeout_minutes }}
+    timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
     !{{ upload.binary_env(config, True) }}
     steps:
       !{{ common.setup_ec2_windows() }}
.github/workflows/_binary-build-linux.yml (vendored, 2 lines changed)
@@ -18,7 +18,7 @@ on:
         description: prefix for runner label
       runs_on:
         required: false
-        default: linux.12xlarge.ephemeral
+        default: linux.12xlarge.memory.ephemeral
         type: string
         description: Hardware to run this "build" job on, linux.12xlarge or linux.arm64.2xlarge.
       timeout-minutes:
.github/workflows/_linux-build.yml (vendored, 15 lines changed)
@@ -76,6 +76,11 @@ on:
         required: false
         type: boolean
         default: false
+      max-jobs:
+        description: |
+          Overwrite the number of jobs to use for the build
+        required: false
+        type: string

   secrets:
     HUGGING_FACE_HUB_TOKEN:
@@ -211,6 +216,7 @@ jobs:
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
+         MAX_JOBS_OVERRIDE: ${{ inputs.max-jobs }}
        run: |
          START_TIME=$(date +%s)
          if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
@@ -230,6 +236,12 @@ jobs:
            DOCKER_SHELL_CMD=
          fi

+         if [[ ${MAX_JOBS_OVERRIDE} == "" ]]; then
+           MAX_JOBS="$(nproc --ignore=2)"
+         else
+           MAX_JOBS="${MAX_JOBS_OVERRIDE}"
+         fi
+
          # Leaving 1GB for the runner and other things
          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
          # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
@@ -241,7 +253,8 @@ jobs:
          # shellcheck disable=SC2086
          container_name=$(docker run \
            -e BUILD_ENVIRONMENT \
-           -e MAX_JOBS="$(nproc --ignore=2)" \
+           -e MAX_JOBS=${MAX_JOBS} \
+           -e MAX_JOBS_OVERRIDE \
            -e AWS_DEFAULT_REGION \
            -e PR_NUMBER \
            -e SHA1 \
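Taken together, the _linux-build.yml hunks let callers pass a max-jobs input that reaches the build container as MAX_JOBS, while the raw override is also exported for build.sh to inspect. A standalone sketch of the same host-side selection and forwarding; the image and command are placeholders rather than the workflow's real build container:

```bash
#!/usr/bin/env bash
set -euo pipefail

MAX_JOBS_OVERRIDE="${MAX_JOBS_OVERRIDE:-}"  # e.g. exported from the max-jobs workflow input

# Same precedence as the workflow: an explicit override wins, else the nproc heuristic.
if [[ ${MAX_JOBS_OVERRIDE} == "" ]]; then
  MAX_JOBS="$(nproc --ignore=2)"
else
  MAX_JOBS="${MAX_JOBS_OVERRIDE}"
fi

# Placeholder image and command; the real job runs the PyTorch build image here.
docker run --rm \
  -e MAX_JOBS="${MAX_JOBS}" \
  -e MAX_JOBS_OVERRIDE \
  ubuntu:22.04 bash -c 'echo "container sees MAX_JOBS=${MAX_JOBS}"'
```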
.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml (generated, vendored, 65 lines changed)
@@ -301,6 +301,71 @@ jobs:
       github-token: ${{ secrets.GITHUB_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml

+  libtorch-cuda12_8-shared-with-deps-cxx11-abi-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.8-main
+      LIBTORCH_VARIANT: shared-with-deps
+      DESIRED_DEVTOOLSET: cxx11-abi
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: libtorch-cuda12_8-shared-with-deps-cxx11-abi
+      build_environment: linux-binary-libtorch-cxx11-abi
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  libtorch-cuda12_8-shared-with-deps-cxx11-abi-test: # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - libtorch-cuda12_8-shared-with-deps-cxx11-abi-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.8-main
+      LIBTORCH_VARIANT: shared-with-deps
+      DESIRED_DEVTOOLSET: cxx11-abi
+      build_name: libtorch-cuda12_8-shared-with-deps-cxx11-abi
+      build_environment: linux-binary-libtorch-cxx11-abi
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  libtorch-cuda12_8-shared-with-deps-cxx11-abi-upload: # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: libtorch-cuda12_8-shared-with-deps-cxx11-abi-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.8-main
+      LIBTORCH_VARIANT: shared-with-deps
+      DESIRED_DEVTOOLSET: cxx11-abi
+      build_name: libtorch-cuda12_8-shared-with-deps-cxx11-abi
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
   libtorch-rocm6_2_4-shared-with-deps-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
.github/workflows/generated-linux-binary-manywheel-main.yml (generated, vendored, 2 lines changed)
@@ -223,6 +223,6 @@ jobs:
       build_name: manywheel-py3_9-cuda12_8
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
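The runs_on changes here and in the nightly manywheel workflow move the CUDA 12.8 smoke tests onto g4dn (sm_70-capable) runners, making cu128 wheels a regularly exercised configuration. A hedged sketch of installing such a nightly wheel locally; the index URL follows the usual nightly layout and is an assumption, not something stated in this diff:

```bash
#!/usr/bin/env bash
set -euo pipefail

# Assumed nightly index for CUDA 12.8 wheels; adjust if the index name differs.
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128

python -c "import torch; print(torch.__version__, torch.version.cuda)"
```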
.github/workflows/generated-linux-binary-manywheel-nightly.yml (generated, vendored, 12 lines changed)
@@ -424,7 +424,7 @@ jobs:
       build_name: manywheel-py3_9-cuda12_8
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_9-cuda12_8-upload: # Uploading
@@ -1122,7 +1122,7 @@ jobs:
       build_name: manywheel-py3_10-cuda12_8
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cuda12_8-upload: # Uploading
@@ -1885,7 +1885,7 @@ jobs:
       build_name: manywheel-py3_11-cuda12_8
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cuda12_8-upload: # Uploading
@@ -2583,7 +2583,7 @@ jobs:
       build_name: manywheel-py3_12-cuda12_8
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cuda12_8-upload: # Uploading
@@ -3281,7 +3281,7 @@ jobs:
       build_name: manywheel-py3_13-cuda12_8
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cuda12_8-upload: # Uploading
@@ -3979,7 +3979,7 @@ jobs:
       build_name: manywheel-py3_13t-cuda12_8
       build_environment: linux-binary-manywheel
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13t-cuda12_8-upload: # Uploading
.github/workflows/generated-windows-binary-libtorch-debug-main.yml (generated, vendored, 4 lines changed)
@@ -37,7 +37,7 @@ jobs:
     if: ${{ github.repository_owner == 'pytorch' }}
     needs: get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -151,7 +151,7 @@ jobs:
       - libtorch-cpu-shared-with-deps-debug-build
       - get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml (generated, vendored, 265 lines changed)
@@ -44,7 +44,7 @@ jobs:
     if: ${{ github.repository_owner == 'pytorch' }}
     needs: get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -158,7 +158,7 @@ jobs:
       - libtorch-cpu-shared-with-deps-debug-build
       - get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -290,7 +290,7 @@ jobs:
     if: ${{ github.repository_owner == 'pytorch' }}
     needs: get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -405,7 +405,7 @@ jobs:
       - libtorch-cuda11_8-shared-with-deps-debug-build
       - get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -539,7 +539,7 @@ jobs:
     if: ${{ github.repository_owner == 'pytorch' }}
     needs: get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -654,7 +654,7 @@ jobs:
       - libtorch-cuda12_4-shared-with-deps-debug-build
       - get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -788,7 +788,7 @@ jobs:
     if: ${{ github.repository_owner == 'pytorch' }}
     needs: get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -903,7 +903,7 @@ jobs:
       - libtorch-cuda12_6-shared-with-deps-debug-build
       - get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -1033,3 +1033,252 @@ jobs:
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
+  libtorch-cuda12_8-shared-with-deps-debug-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
+    timeout-minutes: 300
+    env:
+      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      SKIP_ALL_TESTS: 1
+      LIBTORCH_CONFIG: debug
+      LIBTORCH_VARIANT: shared-with-deps
+      # This is a dummy value for libtorch to work correctly with our batch scripts
+      # without this value pip does not get installed for some reason
+      DESIRED_PYTHON: "3.9"
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+          echo "system info $(uname -a)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        continue-on-error: true
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
+      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
+      - name: Enable long paths on Windows
+        shell: powershell
+        run: |
+          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
+      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
+      # removed once Windows Defender is removed from the AMI
+      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
+        continue-on-error: true
+        shell: powershell
+        run: |
+          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
+          # Let's both exclude the path and disable Windows Defender completely just to be sure
+          # that it doesn't interfere
+          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      # They are also here because setting them at a workflow level doesn't give us access to the
+      # runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: Populate binary env
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+      - name: Build PyTorch binary
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
+      - uses: actions/upload-artifact@v4.4.0
+        if: always()
+        with:
+          name: libtorch-cuda12_8-shared-with-deps-debug
+          retention-days: 14
+          if-no-files-found: error
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+      - name: Wait until all sessions have drained
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
+  libtorch-cuda12_8-shared-with-deps-debug-test: # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - libtorch-cuda12_8-shared-with-deps-debug-build
+      - get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
+    timeout-minutes: 300
+    env:
+      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      SKIP_ALL_TESTS: 1
+      LIBTORCH_CONFIG: debug
+      LIBTORCH_VARIANT: shared-with-deps
+      # This is a dummy value for libtorch to work correctly with our batch scripts
+      # without this value pip does not get installed for some reason
+      DESIRED_PYTHON: "3.9"
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+          echo "system info $(uname -a)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        continue-on-error: true
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
+      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
+      - name: Enable long paths on Windows
+        shell: powershell
+        run: |
+          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
+      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
+      # removed once Windows Defender is removed from the AMI
+      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
+        continue-on-error: true
+        shell: powershell
+        run: |
+          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
+          # Let's both exclude the path and disable Windows Defender completely just to be sure
+          # that it doesn't interfere
+          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      # They are also here because setting them at a workflow level doesn't give us access to the
+      # runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: libtorch-cuda12_8-shared-with-deps-debug
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: Populate binary env
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+      - name: Test PyTorch binary
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
+      - name: Wait until all sessions have drained
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
+  libtorch-cuda12_8-shared-with-deps-debug-upload: # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: libtorch-cuda12_8-shared-with-deps-debug-test
+    with:
+      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      LIBTORCH_CONFIG: debug
+      LIBTORCH_VARIANT: shared-with-deps
+      # This is a dummy value for libtorch to work correctly with our batch scripts
+      # without this value pip does not get installed for some reason
+      DESIRED_PYTHON: "3.9"
+      build_name: libtorch-cuda12_8-shared-with-deps-debug
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
.github/workflows/generated-windows-binary-libtorch-release-main.yml (generated, vendored, 4 lines changed)
@@ -37,7 +37,7 @@ jobs:
     if: ${{ github.repository_owner == 'pytorch' }}
     needs: get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
-    timeout-minutes: 240
+    timeout-minutes: 300
    env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -151,7 +151,7 @@ jobs:
       - libtorch-cpu-shared-with-deps-release-build
       - get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
.github/workflows/generated-windows-binary-libtorch-release-nightly.yml (generated, vendored, 265 lines changed)
@@ -44,7 +44,7 @@ jobs:
     if: ${{ github.repository_owner == 'pytorch' }}
     needs: get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -158,7 +158,7 @@ jobs:
       - libtorch-cpu-shared-with-deps-release-build
       - get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -290,7 +290,7 @@ jobs:
     if: ${{ github.repository_owner == 'pytorch' }}
     needs: get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -405,7 +405,7 @@ jobs:
       - libtorch-cuda11_8-shared-with-deps-release-build
       - get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -539,7 +539,7 @@ jobs:
     if: ${{ github.repository_owner == 'pytorch' }}
     needs: get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -654,7 +654,7 @@ jobs:
       - libtorch-cuda12_4-shared-with-deps-release-build
       - get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -788,7 +788,7 @@ jobs:
     if: ${{ github.repository_owner == 'pytorch' }}
     needs: get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -903,7 +903,7 @@ jobs:
       - libtorch-cuda12_6-shared-with-deps-release-build
       - get-label-type
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
-    timeout-minutes: 240
+    timeout-minutes: 300
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -1033,3 +1033,252 @@ jobs:
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
+  libtorch-cuda12_8-shared-with-deps-release-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
+    timeout-minutes: 300
+    env:
+      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      SKIP_ALL_TESTS: 1
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      # This is a dummy value for libtorch to work correctly with our batch scripts
+      # without this value pip does not get installed for some reason
+      DESIRED_PYTHON: "3.9"
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+          echo "system info $(uname -a)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        continue-on-error: true
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
+      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
+      - name: Enable long paths on Windows
+        shell: powershell
+        run: |
+          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
+      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
+      # removed once Windows Defender is removed from the AMI
+      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
+        continue-on-error: true
+        shell: powershell
+        run: |
+          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
+          # Let's both exclude the path and disable Windows Defender completely just to be sure
+          # that it doesn't interfere
+          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      # They are also here because setting them at a workflow level doesn't give us access to the
+      # runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: Populate binary env
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+      - name: Build PyTorch binary
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
+      - uses: actions/upload-artifact@v4.4.0
+        if: always()
+        with:
+          name: libtorch-cuda12_8-shared-with-deps-release
+          retention-days: 14
+          if-no-files-found: error
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+      - name: Wait until all sessions have drained
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
+  libtorch-cuda12_8-shared-with-deps-release-test: # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - libtorch-cuda12_8-shared-with-deps-release-build
+      - get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
+    timeout-minutes: 300
+    env:
+      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      SKIP_ALL_TESTS: 1
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      # This is a dummy value for libtorch to work correctly with our batch scripts
+      # without this value pip does not get installed for some reason
+      DESIRED_PYTHON: "3.9"
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+          echo "system info $(uname -a)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        continue-on-error: true
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
+      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
+      - name: Enable long paths on Windows
+        shell: powershell
+        run: |
+          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
+      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
+      # removed once Windows Defender is removed from the AMI
+      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
+        continue-on-error: true
+        shell: powershell
+        run: |
+          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
+          # Let's both exclude the path and disable Windows Defender completely just to be sure
+          # that it doesn't interfere
+          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      # They are also here because setting them at a workflow level doesn't give us access to the
+      # runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: libtorch-cuda12_8-shared-with-deps-release
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: Populate binary env
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+      - name: Test PyTorch binary
+        shell: bash
+        run: |
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
+      - name: Wait until all sessions have drained
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        working-directory: pytorch
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
+  libtorch-cuda12_8-shared-with-deps-release-upload: # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: libtorch-cuda12_8-shared-with-deps-release-test
+    with:
+      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      # This is a dummy value for libtorch to work correctly with our batch scripts
+      # without this value pip does not get installed for some reason
+      DESIRED_PYTHON: "3.9"
+      build_name: libtorch-cuda12_8-shared-with-deps-release
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
.github/workflows/generated-windows-binary-wheel-nightly.yml (generated, vendored, 1548 lines changed; diff suppressed because it is too large)
.github/workflows/inductor-perf-test-nightly-h100.yml (vendored, new file, 155 lines)
@@ -0,0 +1,155 @@
+name: inductor-perf-nightly-h100
+
+on:
+  schedule:
+    - cron: 0 7 * * 1-6
+    - cron: 0 7 * * 0
+  # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
+  # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
+  workflow_dispatch:
+    inputs:
+      training:
+        description: Run training (on by default)?
+        required: false
+        type: boolean
+        default: true
+      inference:
+        description: Run inference (on by default)?
+        required: false
+        type: boolean
+        default: true
+      default:
+        description: Run inductor_default?
+        required: false
+        type: boolean
+        default: false
+      dynamic:
+        description: Run inductor_dynamic_shapes?
+        required: false
+        type: boolean
+        default: false
+      cppwrapper:
+        description: Run inductor_cpp_wrapper?
+        required: false
+        type: boolean
+        default: false
+      cudagraphs:
+        description: Run inductor_cudagraphs?
+        required: false
+        type: boolean
+        default: true
+      freezing_cudagraphs:
+        description: Run inductor_cudagraphs with freezing for inference?
+        required: false
+        type: boolean
+        default: false
+      aotinductor:
+        description: Run aot_inductor for inference?
+        required: false
+        type: boolean
+        default: false
+      maxautotune:
+        description: Run inductor_max_autotune?
+        required: false
+        type: boolean
+        default: false
+      benchmark_configs:
+        description: The list of configs used the benchmark
+        required: false
+        type: string
+        default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions: read-all
+
+jobs:
+  get-label-type:
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+
+  # NB: Keep this in sync with trunk.yml
+  build:
+    name: cuda12.4-py3.10-gcc9-sm90
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm90
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
+      cuda-arch-list: '9.0'
+      test-matrix: |
+        { include: [
+          { config: "inductor_huggingface_perf_cuda_h100", shard: 1, num_shards: 5, runner: "linux.aws.h100" },
+          { config: "inductor_huggingface_perf_cuda_h100", shard: 2, num_shards: 5, runner: "linux.aws.h100" },
+          { config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 5, runner: "linux.aws.h100" },
+          { config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 5, runner: "linux.aws.h100" },
+          { config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 5, runner: "linux.aws.h100" },
+          { config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_timm_perf_cuda_h100", shard: 4, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_timm_perf_cuda_h100", shard: 5, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_timm_perf_cuda_h100", shard: 6, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_torchbench_perf_cuda_h100", shard: 1, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_torchbench_perf_cuda_h100", shard: 2, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_torchbench_perf_cuda_h100", shard: 3, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_torchbench_perf_cuda_h100", shard: 4, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_torchbench_perf_cuda_h100", shard: 5, num_shards: 6, runner: "linux.aws.h100" },
+          { config: "inductor_torchbench_perf_cuda_h100", shard: 6, num_shards: 6, runner: "linux.aws.h100" },
+        ]}
+      selected-test-configs: ${{ inputs.benchmark_configs }}
+    secrets: inherit
+
+  test-nightly:
+    name: cuda12.4-py3.10-gcc9-sm90
+    uses: ./.github/workflows/_linux-test.yml
+    needs: build
+    if: github.event.schedule == '0 7 * * 1-6'
+    with:
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm90
+      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
+      docker-image: ${{ needs.build.outputs.docker-image }}
+      test-matrix: ${{ needs.build.outputs.test-matrix }}
+      timeout-minutes: 720
+      # disable monitor in perf tests for more investigation
+      disable-monitor: true
+    secrets: inherit
+
+  test-weekly:
+    name: cuda12.4-py3.10-gcc9-sm90
+    uses: ./.github/workflows/_linux-test.yml
+    needs: build
+    if: github.event.schedule == '0 7 * * 0'
+    with:
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm90
+      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
+      docker-image: ${{ needs.build.outputs.docker-image }}
+      test-matrix: ${{ needs.build.outputs.test-matrix }}
+      timeout-minutes: 1440
+      # disable monitor in perf tests for more investigation
+      disable-monitor: true
+    secrets: inherit
+
+  test:
+    name: cuda12.4-py3.10-gcc9-sm90
+    uses: ./.github/workflows/_linux-test.yml
+    needs: build
+    if: github.event_name == 'workflow_dispatch'
+    with:
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm90
+      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
|
||||
docker-image: ${{ needs.build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.build.outputs.test-matrix }}
|
||||
timeout-minutes: 720
|
||||
# disable monitor in perf tests for more investigation
|
||||
disable-monitor: true
|
||||
secrets: inherit
|
||||
@ -57,7 +57,7 @@ on:
description: The list of configs used the benchmark
required: false
type: string
default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf
default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@ -105,6 +105,7 @@ jobs:
{ config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
secrets: inherit
30
.github/workflows/inductor-periodic.yml
vendored
@ -77,21 +77,21 @@ jobs:
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c2" },
|
||||
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c1" },
|
||||
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2.c2" },
|
||||
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c1" },
|
||||
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c2" },
|
||||
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c1" },
|
||||
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c2" },
|
||||
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2.c1" },
|
||||
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c2" },
|
||||
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c1" },
|
||||
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c2" },
|
||||
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c1" },
|
||||
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2.c2" },
|
||||
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c1" },
|
||||
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2.c2" },
|
||||
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
20
.github/workflows/pull.yml
vendored
@ -449,6 +449,26 @@ jobs:
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
unstable-linux-focal-cuda12_4-py3_10-gcc9-sm89-build-xfail:
|
||||
# A version of the build that sets a larger number of jobs for a build. May
|
||||
# OOM
|
||||
name: unstable-linux-focal-cuda12.4-py3.10-gcc9-sm89-xfail
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm89
|
||||
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
|
||||
cuda-arch-list: 8.9
|
||||
max-jobs: 4
|
||||
# Doesn't actually run tests, but need this in order to prevent the build
|
||||
# from being skipped
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-focal-cuda12_4-py3_10-gcc9-sm89-test:
|
||||
name: linux-focal-cuda12.4-py3.10-gcc9-sm89
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
|
||||
@ -2,7 +2,7 @@ name: Upload torch dynamo performance stats

on:
workflow_run:
workflows: [inductor-A100-perf-nightly, inductor-perf-nightly-A10g, inductor-perf-nightly-aarch64, inductor-perf-nightly-x86, perf-nightly-macos, inductor-perf-nightly-rocm]
workflows: [inductor-A100-perf-nightly, inductor-perf-nightly-A10g, inductor-perf-nightly-aarch64, inductor-perf-nightly-x86, perf-nightly-macos, inductor-perf-nightly-rocm, inductor-perf-nightly-h100]
types:
- completed
@ -330,7 +330,7 @@ at::BlasBackend Context::blasPreferredBackend() {
if (blas_preferred_backend == at::BlasBackend::Cublaslt) {
static const bool hipblaslt_unsupported = []() {
static const std::vector<std::string> archs = {
"gfx90a", "gfx940", "gfx941", "gfx942",
"gfx90a", "gfx942",
#if ROCM_VERSION >= 60300
"gfx1100", "gfx1101"
#endif
@ -63,10 +63,12 @@ DLDataType getDLDataType(const Tensor& t) {
|
||||
case ScalarType::BFloat16:
|
||||
dtype.code = DLDataTypeCode::kDLBfloat;
|
||||
break;
|
||||
// TODO(#146647): use macro here instead of spelling out each shell dtype
|
||||
case ScalarType::Float8_e5m2:
|
||||
case ScalarType::Float8_e5m2fnuz:
|
||||
case ScalarType::Float8_e4m3fn:
|
||||
case ScalarType::Float8_e4m3fnuz:
|
||||
case ScalarType::Float8_e8m0fnu:
|
||||
TORCH_CHECK(false, "float8 types are not supported by dlpack");
|
||||
break;
|
||||
case ScalarType::QInt8:
|
||||
|
||||
@ -87,7 +87,7 @@
|
||||
|
||||
#define AT_FLOAT8_TYPES \
|
||||
c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \
|
||||
c10::kFloat8_e4m3fnuz
|
||||
c10::kFloat8_e4m3fnuz, c10::kFloat8_e8m0fnu
|
||||
|
||||
#define AT_INTEGRAL_TYPES \
|
||||
c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort
|
||||
|
||||
@ -13,6 +13,9 @@ inline void FunctionSchema::checkArg(
|
||||
// Fast-path for the common case
|
||||
return;
|
||||
}
|
||||
if (value.isGenericDict() && value.toGenericDict().empty()) {
|
||||
return;
|
||||
}
|
||||
if (!value.type<T>()->isSubtypeOf(*argument.type())) {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
|
||||
@ -197,7 +197,7 @@ public:
|
||||
return vector;
|
||||
}
|
||||
// Workaround for https: //gcc.gnu.org/bugzilla/show_bug.cgi?id=117001
|
||||
#if __GNUC__ <= 12 && defined(__ARM_FEATURE_SVE)
|
||||
#if __GNUC__ <= 12 && !defined(__clang__) && defined(__ARM_FEATURE_SVE)
|
||||
static Vectorized<T> __attribute__ ((optimize("-fno-tree-loop-vectorize"))) blendv(const Vectorized<T>& a,
|
||||
#else
|
||||
static Vectorized<T> blendv(const Vectorized<T>& a,
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
namespace at::vec {
|
||||
// See Note [CPU_CAPABILITY namespace]
|
||||
@ -46,5 +47,105 @@ static inline float half2float_scalar(uint16_t val) {
|
||||
|
||||
#endif
|
||||
|
||||
// Transpose a [2, 32] matrix to [32, 2]
|
||||
// Note: the output leading dimension should be 2,
|
||||
// that is, the output must be contiguous
|
||||
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 2>>
|
||||
static inline void transpose_pad_2x32_block(
|
||||
const scalar_t* src,
|
||||
scalar_t* dst,
|
||||
int64_t ld_src,
|
||||
int krem = 2,
|
||||
int nrem = 32) {
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
__m512i r0, r1;
|
||||
__m512i d0, d1;
|
||||
// load
|
||||
if (nrem < 32) {
|
||||
__mmask32 mask_krem_v = (1LL << nrem) - 1;
|
||||
r0 = _mm512_maskz_loadu_epi16(mask_krem_v, src);
|
||||
// if krem is not 2, pad with zeros
|
||||
if (krem == 2) {
|
||||
r1 = _mm512_maskz_loadu_epi16(mask_krem_v, src + ld_src);
|
||||
} else {
|
||||
r1 = _mm512_setzero_si512();
|
||||
}
|
||||
} else {
|
||||
r0 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src));
|
||||
if (krem == 2) {
|
||||
r1 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + ld_src));
|
||||
} else {
|
||||
r1 = _mm512_setzero_si512();
|
||||
}
|
||||
}
|
||||
// transpose
|
||||
d0 = _mm512_unpacklo_epi16(r0, r1);
|
||||
d1 = _mm512_unpackhi_epi16(r0, r1);
|
||||
r0 = _mm512_shuffle_i32x4(d0, d1, 0x88);
|
||||
r1 = _mm512_shuffle_i32x4(d0, d1, 0xdd);
|
||||
d0 = _mm512_shuffle_i32x4(r0, r1, 0x88);
|
||||
d1 = _mm512_shuffle_i32x4(r0, r1, 0xdd);
|
||||
|
||||
// store
|
||||
if (nrem < 16) {
|
||||
__mmask32 mask_rem_v = (1LL << (nrem * 2)) - 1;
|
||||
_mm512_mask_storeu_epi16(dst, mask_rem_v, d0);
|
||||
} else if (nrem == 16) {
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0);
|
||||
} else if (nrem < 32) {
|
||||
__mmask32 mask_rem_v = (1LL << (nrem * 2 - 32)) - 1;
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0);
|
||||
_mm512_mask_storeu_epi16(
|
||||
reinterpret_cast<__m512i*>(dst + 32), mask_rem_v, d1);
|
||||
} else {
|
||||
// normal store
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0);
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 32), d1);
|
||||
}
|
||||
#else
|
||||
TORCH_CHECK(false, "transpose_pad_2x32_block is only supported when avx512 is supported")
|
||||
#endif
|
||||
}
|
||||
|
||||
// To use AMX to accelerate GEMM,
|
||||
// reorder the memory format [K, N] -> [K/2, N, 2]
|
||||
// Note: If K % 2 != 0, pad K implicitly
|
||||
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 2>>
|
||||
static inline void pack_vnni2(
|
||||
const scalar_t* src,
|
||||
scalar_t* dst,
|
||||
int64_t ld_src,
|
||||
int64_t K,
|
||||
int64_t N) {
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
int64_t bk = 0;
|
||||
int64_t _K = K / 2 * 2;
|
||||
int64_t _N = N / 32 * 32;
|
||||
for (; bk < _K; bk += 2) {
|
||||
int64_t bn = 0;
|
||||
for (; bn < _N; bn += 32) {
|
||||
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src);
|
||||
}
|
||||
int64_t nrem = N - bn;
|
||||
if (nrem > 0) {
|
||||
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 2, nrem);
|
||||
}
|
||||
}
|
||||
if (K % 2 == 1) {
|
||||
int64_t bn = 0;
|
||||
for (; bn < _N; bn += 32) {
|
||||
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1);
|
||||
}
|
||||
int64_t nrem = N - bn;
|
||||
if (nrem > 0) {
|
||||
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1, nrem);
|
||||
}
|
||||
}
|
||||
#else
|
||||
TORCH_CHECK(false, "pack_vnni2 is only supported when avx512 is supported")
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
|
||||
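The new at::vec::pack_vnni2 helper above is written against AVX-512 intrinsics; as a reading aid, here is a minimal scalar sketch of the same [K, N] -> [K/2, N, 2] VNNI-style reorder, including the implicit zero padding when K is odd. The function name pack_vnni2_ref and the plain-template setup are illustrative, not part of the patch.

#include <cstdint>

// Scalar reference for the VNNI2 repack: dst has shape [ceil(K/2), N, 2],
// stored contiguously, so element (k, n) of src lands at
// dst[(k / 2) * N * 2 + n * 2 + (k % 2)]. Rows past K are zero-filled.
template <typename T>
void pack_vnni2_ref(const T* src, T* dst, int64_t ld_src, int64_t K, int64_t N) {
  const int64_t K2 = (K + 1) / 2;  // number of packed row pairs
  for (int64_t kp = 0; kp < K2; ++kp) {
    for (int64_t n = 0; n < N; ++n) {
      for (int64_t r = 0; r < 2; ++r) {
        const int64_t k = 2 * kp + r;
        dst[kp * N * 2 + n * 2 + r] =
            (k < K) ? src[k * ld_src + n] : T(0);  // implicit zero padding
      }
    }
  }
}

The AVX-512 path above produces the same layout 32 columns at a time, using masked loads and stores to handle the column tail.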
@ -25,4 +25,10 @@ unpack(at::PhiloxCudaState arg) {
|
||||
}
|
||||
}
|
||||
|
||||
// Adapted from TE
|
||||
// extract seed and offset from PhiloxCudaState
|
||||
__global__ void unpack_cudnn(at::PhiloxCudaState arg, int64_t* seed_ptr, int64_t* offset_ptr);
|
||||
|
||||
void unpack_cudnn_wrapper(at::PhiloxCudaState arg, int64_t* seed_ptr, int64_t* offset_ptr, cudaStream_t stream);
|
||||
|
||||
} // namespace at::cuda::philox
|
||||
|
||||
@ -227,15 +227,10 @@ TuningResultsValidator::TuningResultsValidator() {
|
||||
}
|
||||
// rocblas
|
||||
{
|
||||
#define STRINGIFY(s) #s
|
||||
#define XSTRINGIFY(s) STRINGIFY(s)
|
||||
std::string rocblas_version = c10::str(
|
||||
XSTRINGIFY(ROCBLAS_VERSION_MAJOR), ".",
|
||||
XSTRINGIFY(ROCBLAS_VERSION_MINOR), ".",
|
||||
XSTRINGIFY(ROCBLAS_VERSION_PATCH), "-",
|
||||
XSTRINGIFY(ROCBLAS_VERSION_TWEAK));
|
||||
#undef XSTRINGIFY
|
||||
#undef STRINGIFY
|
||||
size_t rocblas_version_size;
|
||||
rocblas_get_version_string_size(&rocblas_version_size);
|
||||
std::string rocblas_version(rocblas_version_size - 1, '\0');
|
||||
rocblas_get_version_string(rocblas_version.data(), rocblas_version_size);
|
||||
RegisterValidator(
|
||||
"ROCBLAS_VERSION",
|
||||
[rocblas_version]() { return rocblas_version; },
|
||||
|
||||
@ -288,20 +288,23 @@ class TunableOp {
}

// for warmup does user set max duration, max iters, or both?
// warmup is allowed to be skipped by setting either iterations or duration to 0
// warmup is skipped by default, i.e. warmup_iter = 0
// warmup will be set to the non-zero value of max_warmup_duration
// or max_warmup_iter
// if both are non-zero, we take the smaller of the two.
double max_warmup_duration = ctx->GetMaxWarmupDurationMs();
int max_warmup_iter = ctx->GetMaxWarmupIterations();
int warmup_iter = 1; // default
if (max_warmup_duration >= 0) {
int warmup_iter = 0; // default
if (max_warmup_duration > 0) {
int duration_iters = max_warmup_duration / approx_duration;
if (max_warmup_iter >= 0) {
if (max_warmup_iter > 0) {
warmup_iter = std::min(max_warmup_iter, duration_iters);
}
else {
warmup_iter = duration_iters;
}
}
else if (max_warmup_iter >= 0) {
else if (max_warmup_iter > 0) {
warmup_iter = max_warmup_iter;
}
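The comments in this hunk describe how the tuning warmup count is chosen; a small standalone sketch of that selection rule (assuming the same semantics as above: 0 means a limit is unset, and when both limits are set the smaller resulting iteration count wins) is:

#include <algorithm>

// Pick the warmup iteration count from an optional duration budget (ms) and an
// optional iteration cap. Zero means "not set"; if both are set, take the smaller.
int select_warmup_iters(double max_warmup_duration_ms,
                        int max_warmup_iter,
                        double approx_iter_duration_ms) {
  int warmup_iter = 0;  // warmup skipped by default
  if (max_warmup_duration_ms > 0) {
    int duration_iters =
        static_cast<int>(max_warmup_duration_ms / approx_iter_duration_ms);
    warmup_iter = (max_warmup_iter > 0)
        ? std::min(max_warmup_iter, duration_iters)
        : duration_iters;
  } else if (max_warmup_iter > 0) {
    warmup_iter = max_warmup_iter;
  }
  return warmup_iter;
}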
@ -121,6 +121,16 @@ struct ConvolutionDescriptor
|
||||
}
|
||||
};
|
||||
|
||||
struct DropoutDescriptor
|
||||
: public Descriptor<miopenDropoutDescriptor,
|
||||
&miopenCreateDropoutDescriptor,
|
||||
&miopenDestroyDropoutDescriptor>
|
||||
{
|
||||
void set(miopenHandle_t handle, float dropout, void* states, size_t stateSizeInBytes,
|
||||
unsigned long long seed, bool use_mask, bool state_evo, miopenRNGType_t rng_mode) {
|
||||
MIOPEN_CHECK(miopenSetDropoutDescriptor(mut_desc(), handle, dropout, states, stateSizeInBytes, seed, use_mask, state_evo, rng_mode));
|
||||
}
|
||||
};
|
||||
|
||||
struct RNNDescriptor
|
||||
: public Descriptor<miopenRNNDescriptor,
|
||||
@ -128,9 +138,14 @@ struct RNNDescriptor
|
||||
&miopenDestroyRNNDescriptor>
|
||||
{
|
||||
void set(int64_t hidden_size, int64_t num_layers, miopenRNNInputMode_t input_mode, miopenRNNDirectionMode_t direction, miopenRNNMode_t rnn_mode,
|
||||
miopenRNNBiasMode_t bias_mode, miopenRNNAlgo_t algorithm, miopenDataType_t datatype) {
|
||||
miopenRNNBiasMode_t bias_mode, miopenRNNAlgo_t algorithm, miopenDataType_t datatype) {
|
||||
MIOPEN_CHECK(miopenSetRNNDescriptor(mut_desc(), hidden_size, num_layers, input_mode, direction, rnn_mode, bias_mode, algorithm, datatype));
|
||||
}
|
||||
|
||||
void setWithDropout(DropoutDescriptor& dropout_desc, int64_t hidden_size, int64_t num_layers, miopenRNNInputMode_t input_mode, miopenRNNDirectionMode_t direction,
|
||||
miopenRNNMode_t rnn_mode, miopenRNNBiasMode_t bias_mode, miopenRNNAlgo_t algorithm, miopenDataType_t datatype) {
|
||||
MIOPEN_CHECK(miopenSetRNNDescriptor_V2(mut_desc(), hidden_size, num_layers, dropout_desc.mut_desc(), input_mode, direction, rnn_mode, bias_mode, algorithm, datatype));
|
||||
}
|
||||
};
|
||||
|
||||
union Constant
|
||||
|
||||
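The DropoutDescriptor added here reuses the existing Descriptor<handle, &create, &destroy> RAII idiom. A generic sketch of that pattern, with hypothetical create/destroy functions standing in for the MIOpen API (none of the names below are real MIOpen symbols):

#include <stdexcept>

// Generic RAII wrapper around a C-style handle with paired create/destroy
// functions, mirroring the Descriptor<T, &create, &destroy> idiom used above.
template <typename HandleT, int (*createFn)(HandleT*), int (*destroyFn)(HandleT)>
class DescriptorWrapper {
 public:
  DescriptorWrapper() {
    if (createFn(&handle_) != 0) {
      throw std::runtime_error("descriptor creation failed");
    }
  }
  ~DescriptorWrapper() { destroyFn(handle_); }
  DescriptorWrapper(const DescriptorWrapper&) = delete;
  DescriptorWrapper& operator=(const DescriptorWrapper&) = delete;
  HandleT mut_desc() { return handle_; }  // handle passed to set()-style calls
 private:
  HandleT handle_{};
};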
@ -7,11 +7,6 @@
|
||||
#include <ATen/Config.h>
|
||||
|
||||
#include <ATen/native/mkldnn/Matmul.h>
|
||||
#include <ATen/native/mkldnn/Linear.h>
|
||||
#include <ATen/native/Resize.h>
|
||||
#if !defined(__s390x__) && !defined(__powerpc__)
|
||||
#include <cpuinfo.h>
|
||||
#endif
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/CPUFunctions.h>
|
||||
@ -29,12 +24,6 @@
|
||||
#include <ATen/ops/mv_native.h>
|
||||
#include <ATen/ops/scalar_tensor_native.h>
|
||||
#include <ATen/ops/vdot_native.h>
|
||||
#include <ATen/ops/_scaled_mm_native.h>
|
||||
#include <ATen/ops/mul.h>
|
||||
#include <ATen/ops/matmul.h>
|
||||
#endif
|
||||
#if AT_MKLDNN_ENABLED()
|
||||
#include <ideep.hpp>
|
||||
#endif
|
||||
|
||||
namespace at::meta {
|
||||
@ -233,79 +222,4 @@ Tensor vdot(const Tensor &self, const Tensor &other){
|
||||
|
||||
}
|
||||
|
||||
static Tensor&
|
||||
_scaled_mm_out_cpu_emulated(const Tensor& mat1, const Tensor& mat2,
|
||||
const Tensor& scale_a,
|
||||
const Tensor& scale_b,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
const std::optional<at::Tensor>& scale_result,
|
||||
std::optional<c10::ScalarType> out_dtype,
|
||||
bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix");
|
||||
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix");
|
||||
TORCH_CHECK(
|
||||
mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
|
||||
mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
|
||||
|
||||
TORCH_INTERNAL_ASSERT((scale_a.numel() == 1 && scale_b.numel() == 1), "Now _scaled_mm only supports per-tensor scaling for CPU backend.");
|
||||
TORCH_CHECK(!bias || bias->numel() == mat2.sizes()[1], "Bias must be size ", mat2.sizes()[1],
|
||||
" but got ", bias->numel());
|
||||
|
||||
// Check types
|
||||
TORCH_CHECK(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type");
|
||||
TORCH_CHECK(isFloat8Type(mat1.scalar_type()), "Expected mat1 to be Float8 matrix got ", mat1.scalar_type());
|
||||
TORCH_CHECK(isFloat8Type(mat2.scalar_type()), "Expected mat2 to be Float8 matrix got ", mat2.scalar_type());
|
||||
|
||||
auto mat1_c = mat1.contiguous();
|
||||
auto mat2_c = mat2.contiguous();
|
||||
IntArrayRef mat1_sizes = mat1_c.sizes();
|
||||
IntArrayRef mat2_sizes = mat2_c.sizes();
|
||||
at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
|
||||
|
||||
float input_scale = scale_a.item<float>();
|
||||
float weight_scale = scale_b.item<float>();
|
||||
auto fp32_mat1 = at::mul(mat1.to(kFloat), input_scale);
|
||||
auto fp32_mat2 = at::mul(mat2_c.to(kFloat), weight_scale);
|
||||
auto out_tmp = at::matmul(fp32_mat1, fp32_mat2);
|
||||
if (bias) {
|
||||
out_tmp.add_(bias.value());
|
||||
}
|
||||
out_tmp = out_tmp.to(out.scalar_type());
|
||||
out.copy_(out_tmp);
|
||||
return out;
|
||||
}
|
||||
|
||||
Tensor&
|
||||
_scaled_mm_out_cpu(const Tensor& mat1, const Tensor& mat2,
|
||||
const Tensor& scale_a,
|
||||
const Tensor& scale_b,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
const std::optional<at::Tensor>& scale_result,
|
||||
std::optional<c10::ScalarType> out_dtype,
|
||||
bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
#if AT_MKLDNN_ENABLED() && !(IDEEP_VERSION_MAJOR <= 2 || (IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR < 5))
|
||||
if (at::globalContext().userEnabledMkldnn() && cpuinfo_has_x86_amx_int8()) {
|
||||
return mkldnn_scaled_mm(mat1, mat2, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
return _scaled_mm_out_cpu_emulated(mat1, mat2, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
|
||||
}
|
||||
}
|
||||
|
||||
Tensor
|
||||
_scaled_mm_cpu(const Tensor& mat_a, const Tensor& mat_b,
|
||||
const Tensor& scale_a,
|
||||
const Tensor& scale_b,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
const std::optional<at::Tensor>& scale_result,
|
||||
std::optional<c10::ScalarType> out_dtype,
|
||||
bool use_fast_accum) {
|
||||
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
|
||||
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
|
||||
return _scaled_mm_out_cpu(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
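The emulated CPU path above dequantizes both FP8 operands with their per-tensor scales, multiplies in float, and adds the bias. A minimal scalar sketch of that arithmetic, assuming the inputs have already been upcast element-wise to float (the FP8 decoding itself is omitted, and scaled_mm_ref is an illustrative name):

#include <cstdint>
#include <vector>

// out[m][n] = sum_k (a[m][k] * scale_a) * (b[k][n] * scale_b) + bias[n]
// Reference for per-tensor-scaled matmul emulation; inputs are plain float
// arrays standing in for FP8 tensors that were converted element-wise.
std::vector<float> scaled_mm_ref(const std::vector<float>& a,    // M x K, row-major
                                 const std::vector<float>& b,    // K x N, row-major
                                 const std::vector<float>& bias, // N entries, may be empty
                                 float scale_a, float scale_b,
                                 int64_t M, int64_t K, int64_t N) {
  std::vector<float> out(M * N, 0.f);
  for (int64_t m = 0; m < M; ++m) {
    for (int64_t n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int64_t k = 0; k < K; ++k) {
        acc += (a[m * K + k] * scale_a) * (b[k * N + n] * scale_b);
      }
      out[m * N + n] = acc + (bias.empty() ? 0.f : bias[n]);
    }
  }
  return out;
}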
@ -59,8 +59,8 @@ bool copy_transpose_valid(const Tensor& self, const Tensor& src) {
|
||||
#if !defined(C10_MOBILE)
|
||||
#define _AT_DISPATCH_CP_TYPES(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_V2( \
|
||||
TYPE, NAME, AT_WRAP(__VA_ARGS__), kComplexHalf, kHalf, kBool, kBFloat16, kFloat8_e5m2, \
|
||||
kFloat8_e4m3fn, kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
|
||||
TYPE, NAME, AT_WRAP(__VA_ARGS__), kComplexHalf, kHalf, kBool, kBFloat16, \
|
||||
AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
|
||||
#else
|
||||
#define _AT_DISPATCH_CP_TYPES(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \
|
||||
|
||||
@ -365,9 +365,13 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu_template(
|
||||
for (const auto i : c10::irange(2, ndim)) {
|
||||
reduce_dims[i - 1] = i;
|
||||
}
|
||||
|
||||
auto sum = at::sum(grad_out_, /*dim=*/reduce_dims);
|
||||
auto sum_a = sum.accessor<scalar_t, 1>();
|
||||
// Using float data type for Half sum to avoid overflow
|
||||
// since the representation range of Half is small.
|
||||
auto sum = grad_out_.scalar_type() == kHalf
|
||||
? at::sum(grad_out_.to(ScalarType::Float), /*dim=*/reduce_dims)
|
||||
: at::sum(grad_out_, /*dim=*/reduce_dims);
|
||||
using sum_t = std::conditional_t<std::is_same_v<scalar_t, at::Half>, float, scalar_t>;
|
||||
auto sum_a = sum.accessor<sum_t, 1>();
|
||||
|
||||
auto reduce_iter = TensorIteratorConfig()
|
||||
.add_const_input(input)
|
||||
|
||||
@ -460,7 +460,8 @@ Tensor isinf(const Tensor& self) {
|
||||
|
||||
Tensor isfinite(const Tensor& self) {
|
||||
// Note: Integral tensor values are always finite
|
||||
if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true)) {
|
||||
if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true) ||
|
||||
self.scalar_type() == kFloat8_e8m0fnu) {
|
||||
return at::ones_like(self, at::kBool, at::MemoryFormat::Preserve);
|
||||
}
|
||||
|
||||
|
||||
@ -204,12 +204,12 @@ static void reduced_float_copy_kernel(TensorIteratorBase &iter, bool requires_ne
|
||||
#define _AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_V2(TYPE, NAME, AT_WRAP(__VA_ARGS__), \
|
||||
kComplexHalf, kHalf, kBool, \
|
||||
kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, \
|
||||
kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
|
||||
kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), \
|
||||
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
|
||||
#define _AT_DISPATCH_ALL_TYPES_NO_CF(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_V2(TYPE, NAME, AT_WRAP(__VA_ARGS__), \
|
||||
kBool, kHalf, kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, \
|
||||
kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
|
||||
kBool, kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), \
|
||||
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
|
||||
#else
|
||||
#define _AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \
|
||||
|
||||
@ -51,6 +51,9 @@ void fill_kernel(TensorIterator& iter, const Scalar& value_scalar) {
|
||||
fill_non_native_type<at::Float8_e4m3fnuz>(iter, value_scalar);
|
||||
} else if (iter.dtype() == ScalarType::Float8_e5m2fnuz) {
|
||||
fill_non_native_type<at::Float8_e5m2fnuz>(iter, value_scalar);
|
||||
} else if (iter.dtype() == ScalarType::Float8_e8m0fnu) {
|
||||
// TODO(#146647): use macro here instead of spelling out each float8 dtype
|
||||
fill_non_native_type<at::Float8_e8m0fnu>(iter, value_scalar);
|
||||
} else {
|
||||
AT_DISPATCH_V2(
|
||||
iter.dtype(), "fill_cpu", AT_WRAP([&]() {
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/cpu/vec/vec.h>
|
||||
#include <ATen/cpu/vec/vec_half.h>
|
||||
#include <ATen/cpu/vec/functional.h>
|
||||
#include <ATen/native/CPUBlas.h>
|
||||
#include <ATen/native/cpu/utils.h>
|
||||
@ -293,103 +294,6 @@ inline void pad_remain_row_col_zero(
|
||||
}
|
||||
|
||||
|
||||
// Transpose a [2, 32] matrix to [32, 2]
|
||||
// Note: the output leading dimension should be 2,
|
||||
// that is, the output must be contiguous
|
||||
static inline void transpose_pad_2x32_block(
|
||||
const uint16_t* src,
|
||||
uint16_t* dst,
|
||||
int64_t ld_src,
|
||||
int krem = 2,
|
||||
int nrem = 32) {
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
__m512i r0, r1;
|
||||
__m512i d0, d1;
|
||||
// load
|
||||
if (nrem < 32) {
|
||||
__mmask32 mask_krem_v = (1LL << nrem) - 1;
|
||||
r0 = _mm512_maskz_loadu_epi16(mask_krem_v, src);
|
||||
// if krem is not 2, pad with zeros
|
||||
if (krem == 2) {
|
||||
r1 = _mm512_maskz_loadu_epi16(mask_krem_v, src + ld_src);
|
||||
} else {
|
||||
r1 = _mm512_setzero_si512();
|
||||
}
|
||||
} else {
|
||||
r0 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src));
|
||||
if (krem == 2) {
|
||||
r1 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + ld_src));
|
||||
} else {
|
||||
r1 = _mm512_setzero_si512();
|
||||
}
|
||||
}
|
||||
// transpose
|
||||
d0 = _mm512_unpacklo_epi16(r0, r1);
|
||||
d1 = _mm512_unpackhi_epi16(r0, r1);
|
||||
r0 = _mm512_shuffle_i32x4(d0, d1, 0x88);
|
||||
r1 = _mm512_shuffle_i32x4(d0, d1, 0xdd);
|
||||
d0 = _mm512_shuffle_i32x4(r0, r1, 0x88);
|
||||
d1 = _mm512_shuffle_i32x4(r0, r1, 0xdd);
|
||||
|
||||
// store
|
||||
if (nrem < 16) {
|
||||
__mmask32 mask_rem_v = (1LL << (nrem * 2)) - 1;
|
||||
_mm512_mask_storeu_epi16(dst, mask_rem_v, d0);
|
||||
} else if (nrem == 16) {
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0);
|
||||
} else if (nrem < 32) {
|
||||
__mmask32 mask_rem_v = (1LL << (nrem * 2 - 32)) - 1;
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0);
|
||||
_mm512_mask_storeu_epi16(
|
||||
reinterpret_cast<__m512i*>(dst + 32), mask_rem_v, d1);
|
||||
} else {
|
||||
// normal store
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0);
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 32), d1);
|
||||
}
|
||||
#else
|
||||
TORCH_CHECK(false, "transpose_pad_2x32_block is only supported when avx512 is supported")
|
||||
#endif
|
||||
}
|
||||
|
||||
// To use AMX to accelerate GEMM,
|
||||
// reorder the memory format [K, N] -> [K/2, N, 2]
|
||||
// Note: If K % 2 != 0, pad K implicitly
|
||||
static inline void pack_vnni2(
|
||||
const uint16_t* src,
|
||||
uint16_t* dst,
|
||||
int64_t ld_src,
|
||||
int64_t K,
|
||||
int64_t N) {
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
int64_t bk = 0;
|
||||
int64_t _K = K / 2 * 2;
|
||||
int64_t _N = N / 32 * 32;
|
||||
for (; bk < _K; bk += 2) {
|
||||
int64_t bn = 0;
|
||||
for (; bn < _N; bn += 32) {
|
||||
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src);
|
||||
}
|
||||
int64_t nrem = N - bn;
|
||||
if (nrem > 0) {
|
||||
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 2, nrem);
|
||||
}
|
||||
}
|
||||
if (K % 2 == 1) {
|
||||
int64_t bn = 0;
|
||||
for (; bn < _N; bn += 32) {
|
||||
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1);
|
||||
}
|
||||
int64_t nrem = N - bn;
|
||||
if (nrem > 0) {
|
||||
transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1, nrem);
|
||||
}
|
||||
}
|
||||
#else
|
||||
TORCH_CHECK(false, "pack_vnni2 is only supported when avx512 is supported")
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename mask_t, int64_t q_split_size, int64_t kv_split_size, bool with_pack=false>
|
||||
void cpu_flash_attention(
|
||||
const Tensor& output,
|
||||
@ -576,7 +480,7 @@ void cpu_flash_attention(
|
||||
/* ld_dst */ kvBlockSize);
|
||||
|
||||
// Pack [headSize, kvBlockSize]
|
||||
pack_vnni2(
|
||||
at::vec::pack_vnni2(
|
||||
/* src */ reinterpret_cast<const uint16_t*>(transpose_ptr),
|
||||
/* dst */ reinterpret_cast<uint16_t*>(key_reorder_ptr + i * num_head * eheadSize * kvSize +
|
||||
j * eheadSize * kvSize + n * eheadSize),
|
||||
@ -585,7 +489,7 @@ void cpu_flash_attention(
|
||||
/* N */ kvBlockSize);
|
||||
|
||||
// Pack [kvBlockSize, headSize]
|
||||
pack_vnni2(
|
||||
at::vec::pack_vnni2(
|
||||
/* src */ reinterpret_cast<const uint16_t*>(v_data + i * vStrideB + j * vStrideH + n * vStrideN),
|
||||
/* dst */ reinterpret_cast<uint16_t*>(value_reorder_ptr +
|
||||
i * num_head * kv_padding_size * headSize +
|
||||
|
||||
@ -184,7 +184,13 @@ void index_put_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef
|
||||
}
|
||||
}),
|
||||
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
|
||||
AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
// AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
|
||||
// should not be supported here, then reenable AT_FLOAT8_DTYPES
|
||||
kFloat8_e4m3fn,
|
||||
kFloat8_e5m2,
|
||||
kFloat8_e4m3fnuz,
|
||||
kFloat8_e5m2fnuz,
|
||||
kComplexHalf,
|
||||
kHalf,
|
||||
kBool,
|
||||
|
||||
@ -191,7 +191,7 @@ static bool isSupportedHipLtROCmArch(int index) {
|
||||
hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index);
|
||||
std::string device_arch = prop->gcnArchName;
|
||||
static const std::vector<std::string> archs = {
|
||||
"gfx90a", "gfx940", "gfx941", "gfx942",
|
||||
"gfx90a", "gfx942",
|
||||
#if ROCM_VERSION >= 60300
|
||||
"gfx1100", "gfx1101"
|
||||
#endif
|
||||
@ -862,7 +862,7 @@ static bool _scaled_mm_allowed_device() {
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
#ifdef USE_ROCM
|
||||
std::string device_arch = dprops->gcnArchName;
|
||||
static const std::vector<std::string> archs = {"gfx940", "gfx941", "gfx942"};
|
||||
static const std::vector<std::string> archs = {"gfx942"};
|
||||
for (std::string arch : archs) {
|
||||
size_t substring = device_arch.find(arch);
|
||||
if (substring != std::string::npos) {
|
||||
@ -879,7 +879,7 @@ static bool _scaled_mm_allowed_device() {
|
||||
static bool _scaled_mm_is_fnuz() {
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
std::string device_arch = dprops->gcnArchName;
|
||||
static const std::vector<std::string> archs = {"gfx940", "gfx941", "gfx942"};
|
||||
static const std::vector<std::string> archs = {"gfx942"};
|
||||
for (std::string arch : archs) {
|
||||
size_t substring = device_arch.find(arch);
|
||||
if (substring != std::string::npos) {
|
||||
|
||||
@ -144,6 +144,28 @@ void float8_copy_kernel_cuda(TensorIteratorBase &iter) {
|
||||
gpu_kernel(iter, [] GPU_LAMBDA(Float8_e5m2fnuz x) { return x; });
|
||||
break;
|
||||
}
|
||||
} else if (dtype == kFloat8_e8m0fnu) {
|
||||
// TODO(#146647): clean this up, too much copy-pasta
|
||||
switch (other_dtype) {
|
||||
case kFloat:
|
||||
gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) {
|
||||
return Float8_e8m0fnu(value);
|
||||
});
|
||||
break;
|
||||
case kHalf:
|
||||
gpu_kernel_nocast(iter, [] GPU_LAMBDA(Half value) {
|
||||
return Float8_e8m0fnu(value);
|
||||
});
|
||||
break;
|
||||
case kBFloat16:
|
||||
gpu_kernel_nocast(iter, [] GPU_LAMBDA(BFloat16 value) {
|
||||
return Float8_e8m0fnu(value);
|
||||
});
|
||||
break;
|
||||
default:
|
||||
gpu_kernel(iter, [] GPU_LAMBDA(Float8_e8m0fnu x) { return x; });
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
TORCH_CHECK(false, "This supposed ot be called only for Float8 types");
|
||||
}
|
||||
@ -157,7 +179,7 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) {
|
||||
AT_DISPATCH_QINT_TYPES(dtype, "copy_", [&] {
|
||||
gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) { return x; });
|
||||
});
|
||||
} else if (dtype == kFloat8_e5m2 || dtype == kFloat8_e4m3fn || dtype == kFloat8_e5m2fnuz || dtype == kFloat8_e4m3fnuz) {
|
||||
} else if (isFloat8Type(dtype)) {
|
||||
float8_copy_kernel_cuda(iter);
|
||||
} else if (iter.dtype(1) == kFloat && (dtype == kBFloat16 || dtype == kHalf)) {
|
||||
if (dtype == kBFloat16) {
|
||||
|
||||
@ -582,7 +582,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}),
|
||||
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
|
||||
AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
// AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
|
||||
// should not be supported here, then reenable AT_FLOAT8_DTYPES
|
||||
kFloat8_e4m3fn,
|
||||
kFloat8_e5m2,
|
||||
kFloat8_e4m3fnuz,
|
||||
kFloat8_e5m2fnuz,
|
||||
kComplexHalf,
|
||||
kHalf,
|
||||
kBool,
|
||||
@ -606,7 +612,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}),
|
||||
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
|
||||
AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
// AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
|
||||
// should not be supported here, then reenable AT_FLOAT8_DTYPES
|
||||
kFloat8_e4m3fn,
|
||||
kFloat8_e5m2,
|
||||
kFloat8_e4m3fnuz,
|
||||
kFloat8_e5m2fnuz,
|
||||
kComplexHalf,
|
||||
kHalf,
|
||||
kBool,
|
||||
@ -630,7 +642,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}),
|
||||
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
|
||||
AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
// AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
|
||||
// should not be supported here, then reenable AT_FLOAT8_DTYPES
|
||||
kFloat8_e4m3fn,
|
||||
kFloat8_e5m2,
|
||||
kFloat8_e4m3fnuz,
|
||||
kFloat8_e5m2fnuz,
|
||||
kComplexHalf,
|
||||
kHalf,
|
||||
kBool,
|
||||
@ -652,7 +670,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}),
|
||||
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
|
||||
AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
// AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
|
||||
// should not be supported here, then reenable AT_FLOAT8_DTYPES
|
||||
kFloat8_e4m3fn,
|
||||
kFloat8_e5m2,
|
||||
kFloat8_e4m3fnuz,
|
||||
kFloat8_e5m2fnuz,
|
||||
kComplexHalf,
|
||||
kHalf,
|
||||
kBool,
|
||||
@ -677,7 +701,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}),
|
||||
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
|
||||
AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
// AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
|
||||
// should not be supported here, then reenable AT_FLOAT8_DTYPES
|
||||
kFloat8_e4m3fn,
|
||||
kFloat8_e5m2,
|
||||
kFloat8_e4m3fnuz,
|
||||
kFloat8_e5m2fnuz,
|
||||
kComplexHalf,
|
||||
kHalf,
|
||||
kBool,
|
||||
|
||||
@ -11,7 +11,7 @@
|
||||
#include <hip/hip_bf16.h>
|
||||
|
||||
__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) {
|
||||
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
|
||||
#if (defined(__gfx942__)) && \
|
||||
__has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
|
||||
typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
|
||||
static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw));
|
||||
@ -39,7 +39,7 @@ __device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* addre
|
||||
}
|
||||
|
||||
__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) {
|
||||
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
|
||||
#if (defined(__gfx942__)) && \
|
||||
__has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16)
|
||||
// The api expects an ext_vector_type of half
|
||||
typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162;
|
||||
|
||||
@ -982,7 +982,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
|
||||
auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
|
||||
smem_reduction_sz) / sizeof(scalar_t);
|
||||
|
||||
bool can_use_smem = (size_t) dim_size < max_elements_per_smem;
|
||||
bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
|
||||
can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
|
||||
can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
|
||||
can_use_smem &= !(dim_size % ILP);
|
||||
@ -1061,7 +1061,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
|
||||
auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
|
||||
smem_reduction_sz) / sizeof(scalar_t);
|
||||
|
||||
bool can_use_smem = (size_t) dim_size < max_elements_per_smem;
|
||||
bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
|
||||
can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
|
||||
can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
|
||||
can_use_smem &= !(dim_size % ILP);
|
||||
@ -1125,10 +1125,10 @@ void dispatch_host_softmax_backward(int64_t dim_size, dim3 grid, Tensor &grad, T
|
||||
size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
|
||||
auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
|
||||
smem_reduction_sz) / sizeof(output_t);
|
||||
bool can_use_smem = dim_size < max_elements_per_smem;
|
||||
can_use_smem &= (!(reinterpret_cast<const uintptr_t>(gI.const_data_ptr<input_t>()) % ALIGN_BYTES));
|
||||
can_use_smem &= (!(reinterpret_cast<const uintptr_t>(output.const_data_ptr<output_t>()) % ALIGN_BYTES));
|
||||
can_use_smem &= !(reinterpret_cast<const uintptr_t>(grad.const_data_ptr<output_t>()) % ALIGN_BYTES);
|
||||
bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
|
||||
can_use_smem &= (!(reinterpret_cast<uintptr_t>(gI.const_data_ptr<input_t>()) % ALIGN_BYTES));
|
||||
can_use_smem &= (!(reinterpret_cast<uintptr_t>(output.const_data_ptr<output_t>()) % ALIGN_BYTES));
|
||||
can_use_smem &= !(reinterpret_cast<uintptr_t>(grad.const_data_ptr<output_t>()) % ALIGN_BYTES);
|
||||
can_use_smem &= !(dim_size % ILP);
|
||||
// This should not be needed on current generation GPUs because the size of shared memory is so low.
|
||||
// But we add this check to be defensive and future-proof just in case shared memory size goes up
|
||||
|
||||
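The softmax changes above repeat the same shared-memory eligibility test in several places: the row must fit in the per-block budget left after the reduction scratch area, the pointers must be ALIGN_BYTES-aligned, and the dimension must be a multiple of ILP. A small sketch of that check, with illustrative parameter names (not the kernel's actual signature):

#include <cstddef>
#include <cstdint>

// Returns true when a softmax row of dim_size elements can be staged in shared
// memory: it fits in the budget remaining after the reduction scratch area,
// the in/out pointers are suitably aligned, and dim_size is ILP-divisible.
bool can_use_shared_memory(size_t dim_size, size_t smem_per_block_bytes,
                           size_t smem_reduction_bytes, size_t elem_size,
                           const void* input, const void* output,
                           size_t align_bytes, size_t ilp) {
  const size_t max_elems =
      (smem_per_block_bytes - smem_reduction_bytes) / elem_size;
  bool ok = dim_size < max_elems;
  ok &= reinterpret_cast<uintptr_t>(input) % align_bytes == 0;
  ok &= reinterpret_cast<uintptr_t>(output) % align_bytes == 0;
  ok &= dim_size % ilp == 0;
  return ok;
}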
@ -34,6 +34,10 @@ void topk_out_with_sort(
|
||||
// TODO: remove this when CUDA <11.6 is no longer supported
|
||||
bool disable_sort_for_topk();
|
||||
bool should_use_sort(const Tensor& self, int64_t dim) {
|
||||
#if defined(USE_ROCM)
|
||||
if (self.dtype() == kBool) return false; // Bool sort not supported in ROCm: https://github.com/pytorch/pytorch/issues/139972
|
||||
return (self.numel() >= 10000 && self.numel() == self.size(dim)); // based on the experiments in https://github.com/pytorch/pytorch/pull/146387
|
||||
#else
|
||||
if (disable_sort_for_topk()) return false;
|
||||
// This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/68632
|
||||
if (self.dim() == 0) return false;
|
||||
@ -42,6 +46,7 @@ bool should_use_sort(const Tensor& self, int64_t dim) {
|
||||
if (slice_size == 0) return false;
|
||||
int64_t num_slices = self.numel() / slice_size;
|
||||
return num_slices <= 10 && slice_size >= 100000;
|
||||
#endif
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(topk_out_cuda)
|
||||
|
||||
@ -596,17 +596,8 @@ int get_items_per_thread(uint64_t num_slices, uint64_t slice_size) {
|
||||
constexpr int REGS_PER_BLOCK = REGS_PER_THREAD * BLOCK_THREADS;
|
||||
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
|
||||
int mpc = prop->multiProcessorCount;
|
||||
#if defined(USE_ROCM)
|
||||
int regs_per_mp = prop->regsPerBlock;
|
||||
int max_blocks_per_mp = 32;
|
||||
#else
|
||||
int regs_per_mp = prop->regsPerMultiprocessor;
|
||||
#if !defined(USE_ROCM)
|
||||
int max_blocks_per_mp = prop->maxBlocksPerMultiProcessor;
|
||||
#else
|
||||
int max_blocks_per_mp = 32;
|
||||
#endif
|
||||
#endif
|
||||
int blocks_per_mp = std::min(regs_per_mp / REGS_PER_BLOCK, max_blocks_per_mp);
|
||||
int64_t items_per_thread = at::ceil_div((int64_t)(slice_size * num_slices), (int64_t)(mpc * blocks_per_mp * BLOCK_THREADS));
|
||||
items_per_thread = std::max(MIN_ITEMS_PER_THREAD, std::min((int)items_per_thread, MAX_ITEMS_PER_THREAD)); // clamp to (4, 64)
|
||||
|
||||
@ -137,7 +137,7 @@ using VecT = T __attribute__((ext_vector_type(Rank)));
|
||||
static bool isCDNA2orLater(int index) {
|
||||
hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index);
|
||||
std::string device_arch = prop->gcnArchName;
|
||||
static const std::vector<std::string> archs = {"gfx90a", "gfx940", "gfx941", "gfx942"};
|
||||
static const std::vector<std::string> archs = {"gfx90a", "gfx942"};
|
||||
for (std::string arch : archs) {
|
||||
size_t substring = device_arch.find(arch);
|
||||
if (substring != std::string::npos) {
|
||||
@ -151,7 +151,7 @@ static bool isCDNA2orLater(int index) {
|
||||
constexpr int32_t kWarpSize = 32;
|
||||
#endif
|
||||
|
||||
#if defined (__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#if defined (__gfx90a__) || defined(__gfx942__)
|
||||
#define CDNA2_OR_LATER 1
|
||||
#else
|
||||
#define CDNA2_OR_LATER 0
|
||||
|
||||
@ -228,6 +228,10 @@ template <> inline std::string typeName<at::Float8_e5m2fnuz>() {
|
||||
template <> inline std::string typeName<at::Float8_e4m3fnuz>() {
|
||||
return "at::Float8_e4m3fnuz";
|
||||
}
|
||||
template <> inline std::string typeName<at::Float8_e8m0fnu>() {
|
||||
// TODO(#146647): Can the code here be made generic for any scalartype?
|
||||
return "at::Float8_e8m0fnu";
|
||||
}
|
||||
|
||||
#define TYPE_NAME_CASE(ctype, scalartype) \
|
||||
case ScalarType::scalartype: return typeName<ctype>();
|
||||
|
||||
@ -31,6 +31,33 @@ void run_cudnn_SDP_fprop(
|
||||
false, "PyTorch was not compiled with cuDNN Flash Attention enabled!");
|
||||
}
|
||||
|
||||
void run_cudnn_SDP_fprop_nestedtensor(
|
||||
int64_t b,
|
||||
int64_t h_q,
|
||||
int64_t h_k,
|
||||
int64_t h_v,
|
||||
int64_t s_q,
|
||||
int64_t s_kv,
|
||||
int64_t d_qk,
|
||||
int64_t d_v,
|
||||
float scaling_factor,
|
||||
bool return_softmaxstats,
|
||||
bool is_causal,
|
||||
double dropout_probability,
|
||||
const Tensor& cum_seqlen_q,
|
||||
const Tensor& cum_seqlen_kv,
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
Tensor& softmaxstats,
|
||||
Tensor& o,
|
||||
Tensor& dropoutseed,
|
||||
Tensor& dropoutoffset) {
|
||||
TORCH_CHECK(
|
||||
false, "PyTorch was not compiled with cuDNN Flash Attention enabled!");
|
||||
}
|
||||
|
||||
void run_cudnn_SDP_bprop(
|
||||
int64_t b,
|
||||
int64_t h,
|
||||
@ -461,16 +488,6 @@ auto build_graph_and_tensors(
|
||||
.set_stride(attn_bias.value().strides().vec()));
|
||||
scaled_dot_product_flash_attention_options.set_bias(bias.value());
|
||||
}
|
||||
auto seq_q = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Seq_q")
|
||||
.set_dim({b, 1, 1, 1})
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_data_type(fe::DataType_t::INT32));
|
||||
auto seq_kv = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Seq_kv")
|
||||
.set_dim({b, 1, 1, 1})
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_data_type(fe::DataType_t::INT32));
|
||||
|
||||
auto [O, Stats] =
|
||||
mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options);
|
||||
@ -500,6 +517,201 @@ auto build_graph_and_tensors(
|
||||
std::move(Stats));
|
||||
}
|
||||
|
||||
auto build_graph_and_tensors_nestedtensor(
|
||||
int64_t b,
|
||||
int64_t h_q,
|
||||
int64_t h_k,
|
||||
int64_t h_v,
|
||||
int64_t s_q,
|
||||
int64_t s_kv,
|
||||
int64_t d_qk,
|
||||
int64_t d_v,
|
||||
float scaling_factor,
|
||||
bool return_softmaxstats,
|
||||
bool is_causal,
|
||||
double dropout_probability,
|
||||
const Tensor& cum_seqlen_q,
|
||||
const Tensor& cum_seqlen_kv,
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
Tensor& softmaxstats,
|
||||
Tensor& o,
|
||||
Tensor& dropoutseed,
|
||||
Tensor& dropoutoffset,
|
||||
cudnnHandle_t& handle) {
|
||||
auto dtype = fe::DataType_t::HALF;
|
||||
if (q.scalar_type() == kBFloat16) {
|
||||
dtype = fe::DataType_t::BFLOAT16;
|
||||
}
|
||||
auto mha_graph = std::make_shared<fe::graph::Graph>();
|
||||
// We're baking in float accumulation and scale types
|
||||
// in theory the graph may support other types, but they
|
||||
// have not been tested
|
||||
mha_graph->set_io_data_type(dtype)
|
||||
.set_intermediate_data_type(fe::DataType_t::FLOAT)
|
||||
.set_compute_data_type(fe::DataType_t::FLOAT);
|
||||
auto attn_scale =
|
||||
mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Attn_scale")
|
||||
.set_dim({1, 1, 1, 1})
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_is_pass_by_value(true)
|
||||
.set_data_type(fe::DataType_t::FLOAT));
|
||||
auto seed = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Seed")
|
||||
.set_dim({1, 1, 1, 1})
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_data_type(fe::DataType_t::INT32));
|
||||
auto offset = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Offset")
|
||||
.set_dim({1, 1, 1, 1})
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_data_type(fe::DataType_t::INT32));
|
||||
auto SEQ_LEN_Q = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Seq_q")
|
||||
.set_dim({b, 1, 1, 1})
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_data_type(fe::DataType_t::INT32));
|
||||
auto SEQ_LEN_KV =
|
||||
mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Seq_kv")
|
||||
.set_dim({b, 1, 1, 1})
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_data_type(fe::DataType_t::INT32));
|
||||
|
||||
auto scaled_dot_product_flash_attention_options =
|
||||
fe::graph::SDPA_attributes()
|
||||
.set_name("CUDNN_SDPA_NESTEDTENSOR")
|
||||
.set_is_inference(return_softmaxstats == false)
|
||||
.set_causal_mask(is_causal)
|
||||
.set_attn_scale(attn_scale)
|
||||
.set_dropout(dropout_probability, seed, offset)
|
||||
.set_seq_len_q(SEQ_LEN_Q)
|
||||
.set_seq_len_kv(SEQ_LEN_KV)
|
||||
.set_padding_mask(true);
|
||||
// We hardcode BSHD to cuDNN even though the underlying layout is THD
|
||||
auto q_strides = q.strides();
|
||||
auto k_strides = k.strides();
|
||||
auto v_strides = v.strides();
|
||||
constexpr int strideidx0 = 1;
|
||||
constexpr int strideidx1 = 0;
|
||||
constexpr int strideidx2 = 2;
|
||||
auto Q = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Q")
|
||||
.set_dim({b, h_q, s_q, d_qk})
|
||||
.set_stride(
|
||||
{INT_MAX,
|
||||
q_strides[strideidx0],
|
||||
q_strides[strideidx1],
|
||||
q_strides[strideidx2]}));
|
||||
auto K = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("K")
|
||||
.set_dim({b, h_k, s_kv, d_qk})
|
||||
.set_stride(
|
||||
{INT_MAX,
|
||||
k_strides[strideidx0],
|
||||
k_strides[strideidx1],
|
||||
k_strides[strideidx2]}));
|
||||
auto V = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("V")
|
||||
.set_dim({b, h_v, s_kv, d_v})
|
||||
.set_stride(
|
||||
{INT_MAX,
|
||||
v_strides[strideidx0],
|
||||
v_strides[strideidx1],
|
||||
v_strides[strideidx2]}));
|
||||
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
|
||||
if (attn_bias.has_value()) {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
|
||||
bias =
|
||||
mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("bias")
|
||||
.set_dim(attn_bias.value().sizes().vec())
|
||||
.set_stride(attn_bias.value().strides().vec()));
|
||||
scaled_dot_product_flash_attention_options.set_bias(bias.value());
|
||||
}
|
||||
auto RAG_Q_OFF = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("cum_seq_q")
|
||||
.set_dim({b + 1, 1, 1, 1})
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_data_type(fe::DataType_t::INT32));
|
||||
  auto RAG_K_OFF = mha_graph->tensor(fe::graph::Tensor_attributes()
                                         .set_name("cum_seq_k")
                                         .set_dim({b + 1, 1, 1, 1})
                                         .set_stride({1, 1, 1, 1})
                                         .set_data_type(fe::DataType_t::INT32));
  auto RAG_V_OFF = mha_graph->tensor(fe::graph::Tensor_attributes()
                                         .set_name("cum_seq_v")
                                         .set_dim({b + 1, 1, 1, 1})
                                         .set_stride({1, 1, 1, 1})
                                         .set_data_type(fe::DataType_t::INT32));
  auto RAG_O_OFF = mha_graph->tensor(fe::graph::Tensor_attributes()
                                         .set_name("cum_seq_o")
                                         .set_dim({b + 1, 1, 1, 1})
                                         .set_stride({1, 1, 1, 1})
                                         .set_data_type(fe::DataType_t::INT32));
  // auto RAG_STATS_OFF = mha_graph->tensor(fe::graph::Tensor_attributes()
  //                                            .set_name("cum_seq_stats")
  //                                            .set_dim({b + 1, 1, 1, 1})
  //                                            .set_stride({1, 1, 1, 1})
  //                                            .set_data_type(fe::DataType_t::INT32));
  auto RAG_STATS_OFF = nullptr;
  Q->set_ragged_offset(RAG_Q_OFF);
  K->set_ragged_offset(RAG_K_OFF);
  V->set_ragged_offset(RAG_V_OFF);
  auto [O, Stats] =
      mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options);
  auto o_strides = o.strides();
  O->set_output(true)
      .set_dim({b, h_q, s_q, d_v})
      .set_stride(
          {INT_MAX,
           o_strides[strideidx0],
           o_strides[strideidx1],
           o_strides[strideidx2]});

  O->set_ragged_offset(RAG_O_OFF);
  if (Stats) {
    TORCH_CHECK(
        false,
        "cuDNN SDPA Nested Tensor does not yet handle backwards/logsumexp computation");
    // TODO(eqy): fix when stats (backward) support is added
    Stats->set_output(true)
        .set_data_type(fe::DataType_t::FLOAT)
        .set_dim({b, h_q, s_q, 1})
        .set_stride({h_q * s_q * d_v, d_v, s_q * d_v, 1});
    Stats->set_ragged_offset(RAG_STATS_OFF);
  }
  AT_CUDNN_FRONTEND_CHECK(mha_graph->validate());
  AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle));
  AT_CUDNN_FRONTEND_CHECK(
      mha_graph->create_execution_plans({fe::HeurMode_t::A}));
  AT_CUDNN_FRONTEND_CHECK(mha_graph->check_support(handle));
  AT_CUDNN_FRONTEND_CHECK(mha_graph->build_plans(handle));
  return std::make_tuple(
      std::move(mha_graph),
      std::move(Q),
      std::move(K),
      std::move(V),
      std::move(bias),
      std::move(attn_scale),
      std::move(seed),
      std::move(offset),
      std::move(O),
      std::move(Stats),
      std::move(RAG_Q_OFF),
      std::move(RAG_K_OFF),
      std::move(RAG_V_OFF),
      std::move(RAG_O_OFF),
      std::move(RAG_STATS_OFF),
      std::move(SEQ_LEN_Q),
      std::move(SEQ_LEN_KV));
}

auto build_graph_and_tensors_backward(
    int64_t b,
    int64_t h,

@@ -737,6 +949,119 @@ void run_cudnn_SDP_fprop(
  mhagraphcache.update(key, graph_and_tensors_values);
}

void run_cudnn_SDP_fprop_nestedtensor(
    int64_t b,
    int64_t h_q,
    int64_t h_k,
    int64_t h_v,
    int64_t s_q,
    int64_t s_kv,
    int64_t d_qk,
    int64_t d_v,
    float scaling_factor,
    bool return_softmaxstats,
    bool is_causal,
    double dropout_probability,
    const Tensor& cum_seqlen_q,
    const Tensor& cum_seqlen_kv,
    const Tensor& q,
    const Tensor& k,
    const Tensor& v,
    const std::optional<Tensor>& attn_bias,
    Tensor& softmaxstats,
    Tensor& o,
    Tensor& dropoutseed,
    Tensor& dropoutoffset) {
  cudnnHandle_t handle = getCudnnHandle();
  // do nothing if we got 0-element tensors
  if (!q.numel() || !k.numel() || !v.numel()) {
    return;
  }

  if (!o.defined()) {
    o = at::empty({q.size(0), h_q, d_v}, q.options());
  }

  if (return_softmaxstats && !softmaxstats.defined()) {
    softmaxstats = at::empty({q.size(0), h_q, 1}, q.options().dtype(kFloat));
  }
  auto
      [mha_graph, Q, K, V, bias, attn_scale, seed, offset, O, Stats,
       RAG_Q_OFF, RAG_K_OFF, RAG_V_OFF, RAG_O_OFF, RAG_STATS_OFF,
       SEQ_LEN_Q, SEQ_LEN_KV] =
          build_graph_and_tensors_nestedtensor(
              b, h_q, h_k, h_v, s_q, s_kv, d_qk, d_v,
              scaling_factor, return_softmaxstats, is_causal, dropout_probability,
              cum_seqlen_q, cum_seqlen_kv, q, k, v, attn_bias,
              softmaxstats, o, dropoutseed, dropoutoffset, handle);
  auto seqlen_q = at::diff(cum_seqlen_q, 1, 0);
  auto seqlen_kv = at::diff(cum_seqlen_kv, 1, 0);
  auto rag_q_off = cum_seqlen_q.mul(h_q * d_qk);
  auto rag_k_off = cum_seqlen_kv.mul(h_k * d_qk);
  auto rag_v_off = cum_seqlen_kv.mul(h_v * d_v);
  auto rag_stats_off = cum_seqlen_q.mul(h_q);
  std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
      variant_pack = {
          {Q, q.data_ptr()},
          {K, k.data_ptr()},
          {V, v.data_ptr()},
          {attn_scale, &scaling_factor},
          {seed, dropoutseed.data_ptr()},
          {offset, dropoutoffset.data_ptr()},
          {O, o.data_ptr()},
          {RAG_Q_OFF, rag_q_off.data_ptr()},
          {RAG_O_OFF, rag_q_off.data_ptr()},
          {RAG_K_OFF, rag_k_off.data_ptr()},
          {RAG_V_OFF, rag_v_off.data_ptr()},
          {SEQ_LEN_Q, seqlen_q.data_ptr()},
          {SEQ_LEN_KV, seqlen_kv.data_ptr()}};
  if (return_softmaxstats) {
    variant_pack[Stats] = softmaxstats.data_ptr();
    variant_pack[RAG_STATS_OFF] = cum_seqlen_q.data_ptr();
  }
  if (attn_bias.has_value()) {
    TORCH_CHECK(false, "bias not supported with nestedtensor");
  }
  auto workspace_size = mha_graph->get_workspace_size();
  auto workspace_ptr =
      c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
  TORCH_CHECK(
      mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good());
}
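
// [Editor's sketch, not part of the change above] A minimal CPU illustration of the
// ragged-offset bookkeeping used in the variant pack: offsets are simply the
// cumulative sequence lengths scaled by num_heads * head_dim. Names here are
// hypothetical; this is not the ATen code path.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int32_t> cumulative_lengths(const std::vector<int32_t>& seqlens) {
  std::vector<int32_t> cum(seqlens.size() + 1, 0);  // cum[0] == 0
  for (size_t i = 0; i < seqlens.size(); ++i) {
    cum[i + 1] = cum[i] + seqlens[i];
  }
  return cum;
}

std::vector<int64_t> ragged_offsets(const std::vector<int32_t>& cum_seqlen,
                                    int64_t num_heads, int64_t head_dim) {
  std::vector<int64_t> off(cum_seqlen.size());
  for (size_t i = 0; i < cum_seqlen.size(); ++i) {
    // element offset of batch i in the packed buffer
    off[i] = static_cast<int64_t>(cum_seqlen[i]) * num_heads * head_dim;
  }
  return off;
}

int main() {
  std::vector<int32_t> seqlens = {3, 5, 2};    // hypothetical batch of 3 sequences
  auto cum = cumulative_lengths(seqlens);       // {0, 3, 8, 10}
  auto q_off = ragged_offsets(cum, /*h_q=*/8, /*d_qk=*/64);
  for (auto v : q_off) std::cout << v << ' ';   // 0 1536 4096 5120
  std::cout << '\n';
}
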

void run_cudnn_SDP_bprop(
    int64_t b,
    int64_t h,

@@ -23,6 +23,30 @@ void run_cudnn_SDP_fprop(
    Tensor& dropoutseed,
    Tensor& dropoutoffset);

void run_cudnn_SDP_fprop_nestedtensor(
    int64_t b,
    int64_t h_q,
    int64_t h_k,
    int64_t h_v,
    int64_t max_s_q,
    int64_t max_s_kv,
    int64_t d_k,
    int64_t d_v,
    float scaling_factor,
    bool isTraining,
    bool is_causal,
    double dropout_probability,
    const Tensor& cum_seqlen_q,
    const Tensor& cum_seqlen_kv,
    const Tensor& q,
    const Tensor& k,
    const Tensor& v,
    const std::optional<Tensor>& attn_bias,
    Tensor& softmaxstats,
    Tensor& o,
    Tensor& dropoutseed,
    Tensor& dropoutoffset);

void run_cudnn_SDP_bprop(
    int64_t b,
    int64_t h,

@@ -619,7 +619,7 @@ Workspace chooseAlgorithm(
  try {
    return Workspace(workspace_size);
  } catch (const std::exception& e) {
    hipGetLastError(); // clear OOM error
    std::ignore = hipGetLastError(); // clear OOM error

    // switch to default algorithm and record it in the cache to prevent
    // further OOM errors
@@ -640,7 +640,7 @@ Workspace chooseSolution(const ConvolutionArgs& args, uint64_t* solution_id)
    *solution_id = solution.solution_id;
    return Workspace(solution.workspace_size);
  } catch (const std::exception& e) {
    hipGetLastError(); // clear OOM error
    std::ignore = hipGetLastError(); // clear OOM error

    // switch to default algorithm
    solution = search::getSolution(args, true);

@@ -57,6 +57,10 @@ namespace at::native {

#include <ATen/TensorUtils.h>

#include <c10/hip/HIPCachingAllocator.h>

#include <rocrand/rocrand_xorwow.h>

#include <functional>
#include <iterator>
#include <sstream>
@@ -68,10 +72,36 @@ namespace at::native {

namespace at { namespace native {

// Workspace copied from Conv_miopen.cpp but is put here inside an anonymous namespace
// to avoid duplicate symbols and to avoid the need to expose it as a public struct.

namespace {

struct Workspace {
  Workspace(size_t size) : size(size), data(NULL) {
    data = c10::hip::HIPCachingAllocator::raw_alloc(size);
  }
  Workspace(const Workspace&) = delete;
  Workspace(Workspace&&) = default;
  Workspace& operator=(Workspace&&) = default;
  ~Workspace() {
    if (data) {
      c10::hip::HIPCachingAllocator::raw_delete(data);
    }
  }

  size_t size;
  void* data;
};

} // anonymous

//RNNDescriptor.
struct RNNDescriptorParams {
  int64_t hidden_size;
  int64_t num_layers;
  double dropout_rate;
  uint64_t dropout_seed;
  miopenRNNDirectionMode_t direction;
  miopenRNNMode_t rnn_mode;
  miopenDataType_t datatype;
@@ -114,6 +144,16 @@ struct RNNDescriptorParams {
    }
  }

  void set_dropout(double dropout_rate, uint64_t dropout_seed = 0) {
    this->dropout_rate = dropout_rate;
    if (dropout_seed == 0) {
      // rand() returns 32-bit values so we combine two of them
      // (cast before shifting so the high word is well defined)
      this->dropout_seed = (static_cast<uint64_t>(rand()) << 32) | static_cast<uint64_t>(rand());
    } else {
      this->dropout_seed = dropout_seed;
    }
  }

  void set(int64_t mode, int64_t hidden_size, int64_t num_layers, bool bidirectional, miopenDataType_t datatype, miopenRNNBiasMode_t bias_mode) {
    this->set_mode(mode);
    this->hidden_size = hidden_size;
@@ -128,6 +168,12 @@ struct RNNDescriptorParams {
    rnn_desc.set(hidden_size, num_layers, input_mode, direction, rnn_mode, bias_mode, algo, datatype);
    return rnn_desc;
  }

  RNNDescriptor descriptorWithDropout(DropoutDescriptor& dropout_desc) const {
    RNNDescriptor rnn_desc;
    rnn_desc.setWithDropout(dropout_desc, hidden_size, num_layers, input_mode, direction, rnn_mode, bias_mode, algo, datatype);
    return rnn_desc;
  }
};

//TensorDescriptor list.
@@ -204,6 +250,8 @@ struct RNNParams {

struct RNNDescriptors {
  RNNDescriptor rnn_desc;
  DropoutDescriptor dropout_desc;
  std::unique_ptr<Workspace> dropout_states;
  std::vector<TensorDescriptor> x_descs;
  std::vector<TensorDescriptor> y_descs;
  TensorDescriptor hx_desc;
@@ -212,7 +260,25 @@ struct RNNDescriptors {
  TensorDescriptor cy_desc;

  RNNDescriptors(const RNNParams& fn, miopenHandle_t handle, Tensor x, Tensor y, Tensor hx, Tensor cx) {
    rnn_desc = fn.rnn.descriptor();
    if (fn.rnn.dropout_rate == 0.0) {
      rnn_desc = fn.rnn.descriptor();
    } else {
      size_t statesSizeInBytes = 0;
      miopenDropoutGetStatesSize(handle, &statesSizeInBytes);
      size_t states_size = statesSizeInBytes / sizeof(rocrand_state_xorwow);

      dropout_states = std::unique_ptr<Workspace>(new Workspace(states_size * sizeof(rocrand_state_xorwow)));
      dropout_desc.set(handle,
                       fn.rnn.dropout_rate,
                       dropout_states->data,
                       dropout_states->size,
                       fn.rnn.dropout_seed,
                       false,
                       false,
                       miopenRNGType_t::MIOPEN_RNG_PSEUDO_XORWOW);
      rnn_desc = fn.rnn.descriptorWithDropout(dropout_desc);
    }

    x_descs = fn.tensors.descriptors(x);
    y_descs = fn.tensors.descriptors(y);
    hx_desc.set(hx, 5);
@@ -492,7 +558,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> miopen_rnn(
  auto handle = getMiopenHandle();
  miopenRNNAlgo_t algo = miopenRNNdefault;
  fn.rnn.set_algo(algo);

  fn.rnn.set_dropout(fn_dropout);
  RNNDescriptors descs(fn, handle, x, y, hx, cx);

  FilterDescriptor w_desc;
@@ -551,7 +617,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> miopen_rnn(
  }

  return std::make_tuple(output, hy, cy, reserve, weight_buf);

}

std::tuple<Tensor, Tensor, Tensor, Tensor> miopen_rnn_backward_input(
@@ -626,6 +691,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> miopen_rnn_backward_input(

  miopenRNNAlgo_t algo = miopenRNNdefault;
  fn.rnn.set_algo(algo);
  fn.rnn.set_dropout(fn_dropout);
  RNNDescriptors descs(fn, handle, x, y, hx, cx);

  FilterDescriptor w_desc;
@@ -720,6 +786,7 @@ std::vector<Tensor> miopen_rnn_backward_weight(

  miopenRNNAlgo_t algo = miopenRNNdefault;
  fn.rnn.set_algo(algo);
  fn.rnn.set_dropout(fn_dropout);
  RNNDescriptors descs(fn, handle, x, y, hx, cx);

  FilterDescriptor w_desc;
@@ -909,6 +976,6 @@ REGISTER_CUDA_DISPATCH(lstm_miopen_stub, &lstm_miopen)
REGISTER_CUDA_DISPATCH(lstm_packed_miopen_stub, &lstm_packed_miopen)

} // anonymous namespace
}} //namespace native.
}} // namespace native

#endif

@@ -4,7 +4,6 @@
#include <ATen/core/Tensor.h>
#include <torch/library.h>
#include <ATen/native/mkldnn/Linear.h>
#include <ATen/native/Resize.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@@ -47,20 +46,9 @@ std::tuple<Tensor, Tensor, Tensor> mkldnn_linear_backward(
  TORCH_CHECK(false, "mkldnn_linear_backward: ATen not compiled with MKLDNN support");
}

Tensor&
mkldnn_scaled_mm(const Tensor& mat1, const Tensor& mat2,
    const Tensor& scale_a,
    const Tensor& scale_b,
    const std::optional<at::Tensor>& bias,
    const std::optional<at::Tensor>& scale_result,
    std::optional<c10::ScalarType> out_dtype,
    bool use_fast_accum,
    Tensor& out) {
  TORCH_INTERNAL_ASSERT(false, "mkldnn_scaled_mm: ATen not compiled with MKLDNN support");
}

} // namespace at::native

#else // AT_MKLDNN_ENABLED

#include <ATen/native/mkldnn/MKLDNNCommon.h>
@@ -459,119 +447,6 @@ TORCH_LIBRARY_IMPL(mkldnn, MkldnnCPU, m) {
      TORCH_FN(mkldnn_linear_pointwise_binary));
}

Tensor&
mkldnn_scaled_mm(const Tensor& mat1, const Tensor& mat2,
    const Tensor& scale_a,
    const Tensor& scale_b,
    const std::optional<at::Tensor>& bias,
    const std::optional<at::Tensor>& scale_result,
    std::optional<c10::ScalarType> out_dtype,
    bool use_fast_accum,
    Tensor& out) {
  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix");
  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix");
  TORCH_CHECK(
      mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
      mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");

  TORCH_INTERNAL_ASSERT((scale_a.numel() == 1 && scale_b.numel() == 1), "Now _scaled_mm only supports per-tensor scaling for CPU backend.");
  TORCH_CHECK(!bias || bias->numel() == mat2.sizes()[1], "Bias must be size ", mat2.sizes()[1],
      " but got ", bias->numel());

  // Check types
  TORCH_CHECK(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type");
  TORCH_CHECK(isFloat8Type(mat1.scalar_type()), "Expected mat1 to be Float8 matrix got ", mat1.scalar_type());
  TORCH_CHECK(isFloat8Type(mat2.scalar_type()), "Expected mat2 to be Float8 matrix got ", mat2.scalar_type());
  // TODO: This check that mat1 and mat2 must have the same data type will be removed after oneDNN v3.6.
  TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "Expected mat1 and mat2 to have the same data type");

  // Validation checks have passed, so resize the output to the actual size
  auto mat1_c = mat1.contiguous();
  auto mat2_c = mat2.contiguous();
  IntArrayRef mat1_sizes = mat1_c.sizes();
  IntArrayRef mat2_sizes = mat2_c.sizes();
  at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});

  float input_scale = scale_a.item<float>();
  float weight_scale = scale_b.item<float>();
  auto src = at::native::itensor_view_from_dense(mat1_c);
  auto weight_t = at::native::itensor_view_from_dense(mat2_c);
  bool with_bias = bias.has_value();
  int64_t K = mat1_sizes[1], M = mat1_sizes[0], N = mat2_sizes[1];

  std::vector<int64_t> src_dims = {M, K};
  std::vector<int64_t> weight_dims = {K, N};
  std::vector<int64_t> dst_dims = {M, N};

  ideep::tensor dst = at::native::itensor_view_from_dense(out);
  auto src_desc = ideep::tensor::desc(
      src_dims, get_mkldnn_dtype(mat1.scalar_type()), ideep::format_tag::any);
  auto weights_desc = ideep::tensor::desc(
      weight_dims, get_mkldnn_dtype(mat2.scalar_type()), ideep::format_tag::any);
  auto dst_desc = ideep::tensor::desc(
      dst_dims, get_mkldnn_dtype(out.scalar_type()), ideep::format_tag::any);
  ideep::tensor onednn_bias;
  if (with_bias) {
    auto bias_value = bias.value();
    if (bias_value.dim() == 1) {
      auto b_reshape = bias_value.reshape({1, bias_value.size(0)});
      onednn_bias = at::native::itensor_view_from_dense(b_reshape);
    } else {
      onednn_bias = at::native::itensor_view_from_dense(bias_value);
    }
  }
  auto bias_desc = ideep::tensor::desc();
  if (with_bias) {
    bias_desc = ideep::tensor::desc(onednn_bias.get_dims(),
        get_mkldnn_dtype(bias.value().scalar_type()),
        ideep::format_tag::any);
  }
  auto op_attr = ideep::attr_t();
  if (input_scale != 1.0f) {
    op_attr.set_scales_mask(DNNL_ARG_SRC, 0);
  }
  if (weight_scale != 1.0f) {
    op_attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
  }

  op_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
  auto engine = ideep::engine::cpu_engine();
  dnnl::matmul::primitive_desc primitive_desc = with_bias
      ? dnnl::matmul::primitive_desc(
            engine, src_desc, weights_desc, bias_desc, dst_desc, op_attr)
      : dnnl::matmul::primitive_desc(
            engine, src_desc, weights_desc, dst_desc, op_attr);
  auto primitive = dnnl::matmul(primitive_desc);

  // Prepare args and execute primitive
  ideep::tensor scratchpad(primitive_desc.scratchpad_desc());
  ideep::exec_args args;
  args.insert({DNNL_ARG_SRC, src});
  args.insert({DNNL_ARG_WEIGHTS, weight_t});
  args.insert({DNNL_ARG_DST, dst});
  args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
  if (with_bias) {
    args.insert({DNNL_ARG_BIAS, onednn_bias});
  }
  ideep::tensor src_scales_t = ideep::tensor(ideep::scale_t(1, input_scale));
  ideep::tensor wei_scales_t = ideep::tensor(ideep::scale_t(1, weight_scale));

  if (input_scale != 1.0f) {
    args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scales_t});
  }
  args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_scales_t});

  primitive.execute(ideep::stream::default_stream(), args);
  return out;
}
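
// [Editor's sketch, not part of the change above] A plain-C++ reference of the
// per-tensor scaling the oneDNN matmul is configured for:
// out = (scale_a * mat1) x (scale_b * mat2). Inputs are treated as already
// dequantized floats purely for illustration; the real kernel consumes FP8 storage.
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> scaled_mm_ref(const std::vector<float>& mat1,  // M x K, row-major
                                 const std::vector<float>& mat2,  // K x N, row-major
                                 size_t M, size_t K, size_t N,
                                 float scale_a, float scale_b) {
  std::vector<float> out(M * N, 0.f);
  for (size_t m = 0; m < M; ++m) {
    for (size_t n = 0; n < N; ++n) {
      float acc = 0.f;
      for (size_t k = 0; k < K; ++k) {
        acc += mat1[m * K + k] * mat2[k * N + n];
      }
      out[m * N + n] = scale_a * scale_b * acc;  // per-tensor scales factor out of the sum
    }
  }
  return out;
}

int main() {
  std::vector<float> a = {1, 2, 3, 4};  // 2x2
  std::vector<float> b = {1, 0, 0, 1};  // 2x2 identity
  auto c = scaled_mm_ref(a, b, 2, 2, 2, 0.5f, 2.0f);
  for (float v : c) std::cout << v << ' ';  // 1 2 3 4 (0.5 * 2 == 1, so scales cancel)
  std::cout << '\n';
}
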

} // namespace at

#endif // AT_MKLDNN_ENABLED

@@ -35,15 +35,3 @@ C10_API Tensor mkl_linear(
} // namespace at

#endif // AT_MKLDNN_ENABLED()

namespace at::native {
Tensor&
mkldnn_scaled_mm(const Tensor& mat1, const Tensor& mat2,
    const Tensor& scale_a,
    const Tensor& scale_b,
    const std::optional<at::Tensor>& bias,
    const std::optional<at::Tensor>& scale_result,
    std::optional<c10::ScalarType> out_dtype,
    bool use_fast_accum,
    Tensor& out);
} // namespace at::native

@@ -54,15 +54,9 @@ ideep::tensor::data_type get_mkldnn_dtype(ScalarType type) {
    case ScalarType::Byte:
      return ideep::tensor::data_type::u8;
    case ScalarType::BFloat16:
#if !(IDEEP_VERSION_MAJOR <= 2 || (IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR < 5))
      return ideep::tensor::data_type::bf16;
    case ScalarType::Half:
      return ideep::tensor::data_type::f16;
    case ScalarType::Float8_e4m3fn:
      return ideep::tensor::data_type::f8_e4m3;
    case ScalarType::Float8_e5m2:
      return ideep::tensor::data_type::f8_e5m2;
#endif
    default:
      TORCH_CHECK(false, "get_mkldnn_dtype: unsupported data type");
  }
@@ -167,26 +161,8 @@ ideep::tensor itensor_view_from_dense(const Tensor& tensor, bool from_const_data
            const_cast<void*>(tensor.const_data_ptr()) :
            tensor.data_ptr()};
  }
#if !(IDEEP_VERSION_MAJOR <= 2 || (IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR < 5))
  else if (tensor.scalar_type() == ScalarType::Float8_e4m3fn) {
    return {{tensor.sizes().vec(),
             ideep::tensor::data_type::f8_e4m3,
             tensor.strides().vec()},
            from_const_data_ptr ?
                const_cast<void*>(tensor.const_data_ptr()) :
                tensor.data_ptr()};
  }
  else if (tensor.scalar_type() == ScalarType::Float8_e5m2) {
    return {{tensor.sizes().vec(),
             ideep::tensor::data_type::f8_e5m2,
             tensor.strides().vec()},
            from_const_data_ptr ?
                const_cast<void*>(tensor.const_data_ptr()) :
                tensor.data_ptr()};
  }
#endif
  else {
    TORCH_CHECK(false, "itensor_view_from_dense expects float/bfloat16/half/int8/fp8 tensor input");
    TORCH_CHECK(false, "itensor_view_from_dense expects float/bfloat16/half/int8 tensor input");
  }
}

@@ -177,7 +177,7 @@ class Attr {
      float sum_q_scale = 1.f,
      int64_t zp = 0) {
    ops_params_.push_back(
        PostOpParam(/*scale_sum*/ sum_scale * sum_q_scale, kind_t::sum));
        PostOpParam(/*scale_sum*/ sum_scale * sum_q_scale, zp, kind_t::sum));
    return *this;
  }

@@ -261,10 +261,7 @@ class Attr {
    return *this;
  }

  dnnl::post_ops extract_post_ops(
      const at::Tensor& dst,
      bool is_quantized = false,
      bool int8_output = false) {
  dnnl::post_ops extract_post_ops(const at::Tensor& dst) {
    // this function is used to extract post-op params from ops_params_
    // and put them into onednn post ops
    for (size_t i = 0; i < ops_params_.size(); ++i) {
@@ -303,11 +300,6 @@ class Attr {
      }
    }

    // if output is quantized, then append the eltwise linear to adjust the
    // output scale/zero_point
    if (is_quantized && int8_output) {
      dnnl_post_ops_.append_eltwise(kind_with_linear, q_scale_, q_zero_point_);
    }
    return dnnl_post_ops_;
  }

@@ -410,6 +402,7 @@ static inline void construct_attr_by_post_op(
    double binary_alpha,
    double input1_scale,
    int64_t input1_zero_point,
    std::optional<at::Tensor> accum,
    const std::string_view& unary_post_op,
    const torch::List<std::optional<at::Scalar>>& unary_post_op_args,
    const std::string_view& unary_post_op_algorithm,
@@ -418,11 +411,46 @@ static inline void construct_attr_by_post_op(
      (binary_post_op == "none" && unary_post_op == "none"); // no post-ops
  bool is_unary_post_op_only =
      (binary_post_op == "none" && unary_post_op != "none"); // e.g., conv + relu
  bool is_valid_binary_combination =
      (binary_post_op == "add" || binary_post_op == "sum") &&
      (unary_post_op == "none" || unary_post_op == "relu");
  TORCH_INTERNAL_ASSERT(
      is_unary_post_op_only || is_none_post_op,
      "Currently, the quantization backend for Intel GPU only supports convolution or convolution with a unary post operation like ReLU");
  construct_attr_for_unary(
      unary_post_op, unary_post_op_args, unary_post_op_algorithm, attr);
      is_unary_post_op_only || is_none_post_op || is_valid_binary_combination,
      "Please provide a valid combination of unary and binary post operators");

  if (binary_post_op == "none") {
    construct_attr_for_unary(
        unary_post_op, unary_post_op_args, unary_post_op_algorithm, attr);
  } else if (binary_post_op == "sum") {
    if (unary_post_op == "none") {
      if (input1_zero_point != 0)
        attr = attr.append_post_eltwise(
            /*scale*/ 1.f,
            /*alpha*/ 1.f,
            -input1_zero_point * input1_scale,
            attr.kind_with_linear);
      attr = attr.append_post_sum(1, input1_scale, /*input1_zero_point*/ 0);
    } else if (unary_post_op == "relu") {
      if (input1_zero_point != 0)
        attr = attr.append_post_eltwise(
            /*scale*/ 1.f,
            /*alpha*/ 1.f,
            -input1_zero_point * input1_scale,
            attr.kind_with_linear);
      attr = attr.append_post_sum(1, input1_scale, /*input1_zero_point*/ 0);
      attr = attr.append_post_eltwise(
          /* scale */ 1.f,
          /* alpha */ 0.f,
          /* beta */ 0.f,
          attr.kind_with_relu);
    }
  } else if (binary_post_op == "add") {
    TORCH_CHECK(accum.has_value());
    attr = attr.append_post_binary(attr.kind_with_binary_add, accum.value());
    if (unary_post_op == "relu") {
      attr = attr.append_post_eltwise(1.f, 0.f, 0.f, attr.kind_with_relu);
    }
  }
}

} // namespace at::native::onednn

@@ -11,14 +11,19 @@

namespace at::native::onednn {

static std::tuple<dnnl::memory::desc, dnnl::memory::desc, dnnl::memory::desc>
static std::tuple<
    dnnl::memory::desc,
    dnnl::memory::desc,
    dnnl::memory::desc,
    dnnl::memory::desc>
qconv_get_md(
    const at::Tensor& src,
    const at::Tensor& wgh,
    std::optional<at::Tensor> bias,
    const at::Tensor& dst,
    int64_t groups) {
  // create dnnl::memory desc from the src/wgh/dst tensors
  dnnl::memory::desc src_usr_md, wgh_usr_md, dst_usr_md;
  dnnl::memory::desc src_usr_md, wgh_usr_md, dst_usr_md, bias_usr_md;
  auto ndim = src.ndimension();
  bool src_is_cl =
      (src.suggest_memory_format() == at::MemoryFormat::ChannelsLast) ||
@@ -44,7 +49,14 @@ qconv_get_md(
  auto fmt_wgh = conv_weight_fmt(ndim, groups != 1, wgh_is_cl);
  wgh_usr_md = dnnl::memory::desc(wgh_tz, wei_data_t, fmt_wgh);

  return {src_usr_md, wgh_usr_md, dst_usr_md};
  if (bias.has_value()) {
    bias_usr_md = dnnl::memory::desc(
        bias.value().sizes().vec(),
        dnnl::memory::data_type::f32,
        dnnl::memory::format_tag::x);
  }

  return {src_usr_md, wgh_usr_md, bias_usr_md, dst_usr_md};
}

at::Tensor quantized_convolution(
@@ -76,14 +88,12 @@ at::Tensor quantized_convolution(
      Attr(/*q_scale=*/1.0 / inv_output_scale, /*zp=*/output_zero_point);

  auto ndim = act.ndimension();
  if (bias.has_value()) {
    attr = attr.append_bias(bias.value(), ndim - 2);
  }
  construct_attr_by_post_op(
      binary_attr.has_value() ? binary_attr.value() : "none",
      binary_alpha.has_value() ? binary_alpha.value().to<double>() : 1.0,
      accum_scale,
      accum_zero_point,
      accum,
      unary_attr.has_value() ? unary_attr.value() : "none",
      unary_scalars,
      unary_algorithm.has_value() ? unary_algorithm.value() : "",
@@ -110,10 +120,7 @@ at::Tensor quantized_convolution(
  dnnl::memory::dims _dilation = compatible_dilation(dilation);
  dnnl::post_ops po;
  // extract post ops
  po = attr.extract_post_ops(
      output,
      /*is_quantized*/ true,
      output.scalar_type() == at::kByte || output.scalar_type() == at::kChar);
  po = attr.extract_post_ops(output);
  int mask_ac = 0, mask_weight;
  // [Note: Per-channel quantization mask setting]
  // Per-channel quantization is on the weight output channel mostly, mask_weight=
@@ -127,10 +134,11 @@ at::Tensor quantized_convolution(
  dnnl::primitive_attr pattr;

  bool src_need_zp = (act_scale != 0);
  bool dst_need_zp = (output_zero_point != 0);

  // create usr_md for tensors, and md for the conv primitive
  auto [src_md, weight_md, output_md] =
      qconv_get_md(act, weight, output, groups);
  auto [src_md, weight_md, bias_md, output_md] =
      qconv_get_md(act, weight, bias, output, groups);

  // get tensor md
  auto ic = act.size(1);
@@ -139,11 +147,14 @@ at::Tensor quantized_convolution(
      compatible_weight_dims(ndim, groups, oc, ic, weight.sizes());

  pattr.set_scales_mask(DNNL_ARG_SRC, mask_ac);
  pattr.set_scales_mask(DNNL_ARG_DST, mask_ac);
  pattr.set_scales_mask(DNNL_ARG_WEIGHTS, mask_weight);
  pattr.set_post_ops(po);

  if (src_need_zp)
    pattr.set_zero_points_mask(DNNL_ARG_SRC, mask_ac);
  if (dst_need_zp)
    pattr.set_zero_points_mask(DNNL_ARG_DST, mask_ac);
  pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user);

  // create primitive
@@ -153,7 +164,7 @@ at::Tensor quantized_convolution(
      dnnl::algorithm::convolution_direct,
      src_md,
      weight_md,
      dnnl::memory::desc(),
      bias.has_value() ? bias_md : dnnl::memory::desc(),
      output_md,
      _stride,
      _dilation,
@@ -164,11 +175,14 @@ at::Tensor quantized_convolution(
  dnnl::convolution_forward conv_forward =
      dnnl::convolution_forward(conv_fwd_pd);

  dnnl::memory src_m, weight_m, output_m;
  dnnl::memory src_m, weight_m, output_m, bias_m;

  src_m = make_onednn_memory(src_md, engine, act.data_ptr());
  output_m = make_onednn_memory(output_md, engine, output.data_ptr());
  weight_m = make_onednn_memory(weight_md, engine, weight.data_ptr());
  if (bias.has_value()) {
    bias_m = make_onednn_memory(bias_md, engine, bias.value().data_ptr());
  }

  std::unordered_map<int, dnnl::memory> args;
  if (attr.with_binary())
@@ -176,6 +190,9 @@ at::Tensor quantized_convolution(
  args.insert({DNNL_ARG_SRC, src_m});
  args.insert({DNNL_ARG_WEIGHTS, weight_m});
  args.insert({DNNL_ARG_DST, output_m});
  if (bias.has_value()) {
    args.insert({DNNL_ARG_BIAS, bias_m});
  }

  dnnl::memory src_sc_m, src_zp_m;
  Tensor src_sc_tensor, src_zp_tensor;
@@ -188,7 +205,17 @@ at::Tensor quantized_convolution(
    args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, src_zp_m});
  }

  // No need to set the dst scale; it is fused into the post-op via linear
  dnnl::memory dst_sc_m, dst_zp_m;
  Tensor dst_sc_tensor, dst_zp_tensor;
  dst_sc_m = dnnl_memory_from_host_scalar(
      static_cast<float>(inv_output_scale), dst_sc_tensor, engine);
  dst_zp_m = dnnl_memory_from_host_scalar(
      static_cast<int32_t>(output_zero_point), dst_zp_tensor, engine);
  args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_sc_m});
  if (dst_need_zp) {
    args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, dst_zp_m});
  }

  size_t scratchpad_size = conv_fwd_pd.scratchpad_desc().get_size();
  Tensor scratchpad_tensor = at::empty(
      {static_cast<int64_t>(scratchpad_size)},

@@ -118,6 +118,7 @@ void quantized_matmul(
      binary_alpha,
      input_scale,
      input_zero_point,
      other,
      unary_post_op,
      unary_post_op_args,
      unary_post_op_algorithm,
@@ -210,11 +211,9 @@ void quantized_matmul(
  std::unordered_map<int, dnnl::memory> args;

  dnnl::post_ops po;
  po = attr.extract_post_ops(
      dst,
      true,
      dst.scalar_type() == at::kByte || dst.scalar_type() == at::kChar);
  po = attr.extract_post_ops(dst);
  bool m1_need_zp = (input_zero_point != 0);
  bool dst_need_zp = (output_zero_point != 0);
  bool wgh_is_per_channel = weight_scales.numel() > 1;

  dnnl::matmul matmul_p;
@@ -242,6 +241,10 @@ void quantized_matmul(
  if (m1_need_zp) {
    pattr.set_zero_points_mask(DNNL_ARG_SRC, mask_ac);
  }
  pattr.set_scales_mask(DNNL_ARG_DST, mask_ac);
  if (dst_need_zp) {
    pattr.set_zero_points_mask(DNNL_ARG_DST, mask_ac);
  }

  if (with_bias) {
    b_md = dnnl::memory::desc(bias_dims, bias_dt, bias_strides);
@@ -309,6 +312,17 @@ void quantized_matmul(
    args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, m1_zp_m});
  }

  dnnl::memory dst_sc_m, dst_zp_m;
  Tensor dst_sc_tensor, dst_zp_tensor;
  dst_sc_m = dnnl_memory_from_host_scalar(
      static_cast<float>(output_scale), dst_sc_tensor, engine);
  dst_zp_m = dnnl_memory_from_host_scalar(
      static_cast<int32_t>(output_zero_point), dst_zp_tensor, engine);
  args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_sc_m});
  if (dst_need_zp) {
    args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, dst_zp_m});
  }

  auto qmatmul_event = dnnl::sycl_interop::execute(matmul_p, stream, args);

  if (!dst.is_same(result))

@@ -54,7 +54,7 @@ class QConvoneDNNXPU final {
    TORCH_CHECK(
        attr == "none" || attr == "relu" || attr == "hardtanh" ||
            attr == "hardswish" || attr == "swish",
        "We support quantized convolution without post-ops, or combined with ReLU, Hardtanh, or Hardswish. However, encountered unsupported post operation: ",
        "We support quantized convolution without post-ops, or combined with ReLU, Hardtanh, GELU, Swish, or Hardswish. However, encountered unsupported post operation: ",
        attr,
        ".");
  }
@@ -76,7 +76,7 @@ class QConvoneDNNXPU final {
        dilation.vec());

    Tensor output = at::empty(
        dst_tz, device(c10::kXPU).dtype(output_dtype).memory_format(mfmt));
        dst_tz, act.options().dtype(output_dtype).memory_format(mfmt));

    return quantized_convolution(
        act,
@@ -104,6 +104,95 @@ class QConvoneDNNXPU final {
        /*unary_scalars*/ scalars,
        /*unary_algorithm*/ algorithm);
  }

  static at::Tensor run_pointwise_binary(
      at::Tensor act,
      double act_scale,
      int64_t act_zero_point,
      at::Tensor weight,
      at::Tensor weight_scales,
      at::Tensor weight_zero_points,
      at::Tensor accum,
      std::optional<at::Tensor> bias,
      torch::List<int64_t> stride,
      torch::List<int64_t> padding,
      torch::List<int64_t> dilation,
      int64_t groups,
      double output_scale,
      int64_t output_zero_point,
      std::optional<c10::ScalarType> output_dtype,
      double accum_scale,
      int64_t accum_zero_point,
      std::string_view binary_attr,
      std::optional<at::Scalar> alpha,
      std::optional<std::string_view> unary_attr,
      torch::List<std::optional<at::Scalar>> unary_scalars,
      std::optional<std::string_view> unary_algorithm) {
    TORCH_CHECK(
        act.dim() == 4 && binary_attr == "sum" &&
            (!unary_attr.has_value() ||
             (unary_attr.has_value() &&
              (unary_attr.value() == "none" || unary_attr.value() == "relu"))),
        "post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ",
        binary_attr,
        " unary_post_op: ",
        unary_attr.has_value() ? unary_attr.value() : "none",
        ".");

    bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
    auto mfmt = is_channels_last_suggested
        ? get_cl_tag_by_ndim(act.ndimension())
        : at::MemoryFormat::Contiguous;
    Tensor input_ = act.contiguous(mfmt);
    Tensor weight_ = weight.contiguous(mfmt);

    auto dst_tz = conv_dst_size(
        input_.ndimension(),
        input_.sizes(),
        weight_.sizes(),
        padding.vec(),
        padding.vec(),
        stride.vec(),
        dilation.vec());

    bool has_accum_postop_sum = binary_attr == "sum";
    Tensor output = has_accum_postop_sum
        ? accum
        : at::empty(
              dst_tz, act.options().dtype(output_dtype).memory_format(mfmt));

    output = quantized_convolution(
        act,
        act_scale,
        act_zero_point,
        weight,
        weight_scales,
        weight_zero_points,
        bias,
        stride,
        padding,
        dilation,
        /*transposed*/ false,
        groups,
        output,
        output_scale,
        output_zero_point,
        /*accum*/ accum,
        /*accum_scale*/ accum_scale,
        /*accum_zero_point*/ accum_zero_point,
        /*output_dtype*/ output_dtype,
        /*binary_attr*/ binary_attr,
        /*binary_alpha*/ alpha,
        /*unary_attr*/ unary_attr,
        /*unary_scalars*/ unary_scalars,
        /*unary_algorithm*/ unary_algorithm);

    if (!has_accum_postop_sum) {
      return output;
    } else {
      return accum;
    }
  }
};
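
// [Editor's sketch, not part of the change above] conv_dst_size is presumably the
// standard convolution output-shape computation; here is a standalone version of
// that formula for one spatial dimension (illustration only, not the ATen helper).
#include <cstdint>
#include <iostream>

int64_t conv_out_dim(int64_t in, int64_t kernel, int64_t pad_front,
                     int64_t pad_back, int64_t stride, int64_t dilation) {
  return (in + pad_front + pad_back - dilation * (kernel - 1) - 1) / stride + 1;
}

int main() {
  // 56-wide input, 3x3 kernel, padding 1, stride 1, dilation 1 -> 56
  std::cout << conv_out_dim(56, 3, 1, 1, 1, 1) << '\n';  // 56
  // stride 2 roughly halves the spatial size -> 28
  std::cout << conv_out_dim(56, 3, 1, 1, 2, 1) << '\n';  // 28
}
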

TORCH_LIBRARY_IMPL(onednn, XPU, m) {
@@ -119,6 +208,9 @@ TORCH_LIBRARY_IMPL(onednn, XPU, m) {
  m.impl(
      TORCH_SELECTIVE_NAME("onednn::qconv3d_pointwise"),
      QConvoneDNNXPU::run_pointwise);
  m.impl(
      TORCH_SELECTIVE_NAME("onednn::qconv2d_pointwise.binary"),
      QConvoneDNNXPU::run_pointwise_binary);
}

} // namespace at::native::xpu

@@ -125,6 +125,132 @@ Tensor q_linear_pointwise_tensor(
  return qout;
}

Tensor q_linear_pointwise_binary(
    Tensor act,
    double act_scale,
    int64_t act_zero_point,
    Tensor weight,
    Tensor weight_scales,
    Tensor weight_zero_points,
    std::optional<at::Tensor> other,
    std::optional<Tensor> bias,
    double output_scale,
    int64_t output_zero_point,
    std::optional<c10::ScalarType> output_dtype,
    double other_scale,
    int64_t other_zero_point,
    c10::string_view binary_post_op,
    double binary_alpha,
    c10::string_view unary_post_op,
    torch::List<std::optional<at::Scalar>> unary_post_op_args,
    c10::string_view unary_post_op_algorithm) {
  TORCH_CHECK(
      act.device() == weight.device() &&
          act.device() == weight_scales.device() &&
          act.device() == weight_zero_points.device(),
      "qlinear xpu: input tensors (act, weight, weight scales, weight zero-points) should be on the same device");
  Tensor b_raw = bias.has_value() ? bias.value() : at::Tensor();

  const int64_t dim = act.dim();
  int64_t K = act.size(dim - 1);
  int64_t M = act.numel() / K;
  // [M, K] x [K, N]
  int64_t N = weight.size(1);

  std::vector<int64_t> src_dims = {M, K};
  std::vector<int64_t> dst_dims = {M, N};
  auto out_dtype =
      output_dtype.has_value() ? output_dtype.value() : act.scalar_type();
  Tensor qout = at::empty(dst_dims, act.options().dtype(out_dtype));

  quantized_matmul(
      act.contiguous(),
      act_scale,
      act_zero_point,
      weight.contiguous(),
      weight_scales,
      weight_zero_points,
      b_raw,
      qout,
      output_scale,
      output_zero_point,
      output_dtype,
      /*other*/ other,
      /*other scale*/ other_scale,
      /*other zp*/ other_zero_point,
      /*binary post op*/ binary_post_op,
      /*binary alpha*/ binary_alpha,
      unary_post_op,
      unary_post_op_args,
      unary_post_op_algorithm,
      /*m2_trans*/ true);

  return qout;
}

Tensor q_linear_pointwise_binary_tensor(
    Tensor act,
    Tensor act_scale,
    Tensor act_zero_point,
    Tensor weight,
    Tensor weight_scales,
    Tensor weight_zero_points,
    std::optional<at::Tensor> other,
    std::optional<Tensor> bias,
    double output_scale,
    int64_t output_zero_point,
    std::optional<c10::ScalarType> output_dtype,
    double other_scale,
    int64_t other_zero_point,
    c10::string_view binary_post_op,
    double binary_alpha,
    c10::string_view unary_post_op,
    torch::List<std::optional<at::Scalar>> unary_post_op_args,
    c10::string_view unary_post_op_algorithm) {
  TORCH_CHECK(
      act.device() == weight.device() &&
          act.device() == weight_scales.device() &&
          act.device() == weight_zero_points.device(),
      "qlinear xpu: input tensors (act, weight, weight scales, weight zero-points) should be on the same device");
  Tensor b_raw = bias.has_value() ? bias.value() : at::Tensor();

  const int64_t dim = act.dim();
  int64_t K = act.size(dim - 1);
  int64_t M = act.numel() / K;
  // [M, K] x [K, N]
  int64_t N = weight.size(1);

  std::vector<int64_t> src_dims = {M, K};
  std::vector<int64_t> dst_dims = {M, N};
  auto out_dtype =
      output_dtype.has_value() ? output_dtype.value() : act.scalar_type();
  Tensor qout = at::empty(dst_dims, act.options().dtype(out_dtype));

  quantized_matmul(
      act.contiguous(),
      act_scale.item().toDouble(),
      act_zero_point.item().toLong(),
      weight.contiguous(),
      weight_scales,
      weight_zero_points,
      b_raw,
      qout,
      output_scale,
      output_zero_point,
      output_dtype,
      /*other*/ other,
      /*other scale*/ other_scale,
      /*other zp*/ other_zero_point,
      /*binary post op*/ binary_post_op,
      /*binary alpha*/ binary_alpha,
      unary_post_op,
      unary_post_op_args,
      unary_post_op_algorithm,
      /*m2_trans*/ true);

  return qout;
}

at::Tensor q_linear_prepack_onednn(
    at::Tensor weight,
    std::optional<torch::List<int64_t>> input_shape) {
@@ -142,6 +268,12 @@ TORCH_LIBRARY_IMPL(onednn, XPU, m) {
  m.impl(
      TORCH_SELECTIVE_NAME("onednn::qlinear_prepack"),
      TORCH_FN(q_linear_prepack_onednn));
  m.impl(
      TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary"),
      TORCH_FN(q_linear_pointwise_binary));
  m.impl(
      TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor"),
      TORCH_FN(q_linear_pointwise_binary_tensor));
}

} // namespace at::native::xpu
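
// [Editor's sketch, not part of the change above] The "sum" binary post-op handling
// earlier in this section appends an eltwise linear with beta = -zp * scale before
// the sum so the accumulation tensor's zero point is compensated. A single-element
// check of that identity (illustration only):
#include <cassert>
#include <cmath>
#include <cstdint>

float fused_sum_post_op(float conv_out, int32_t other_q, float scale, int32_t zp) {
  float shifted = conv_out + (-zp * scale);  // eltwise linear: alpha = 1, beta = -zp * scale
  return shifted + scale * other_q;          // sum post-op with the scale, zero point folded out
}

float reference(float conv_out, int32_t other_q, float scale, int32_t zp) {
  return conv_out + scale * (other_q - zp);  // dequantized accumulation
}

int main() {
  float a = fused_sum_post_op(1.5f, 130, 0.05f, 128);
  float b = reference(1.5f, 130, 0.05f, 128);
  assert(std::fabs(a - b) < 1e-6f);  // both evaluate to 1.6 up to rounding
  return 0;
}
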

@@ -7066,13 +7066,11 @@
- func: _scaled_mm(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False) -> Tensor
  variants: function
  dispatch:
    CPU: _scaled_mm_cpu
    CUDA: _scaled_mm_cuda

- func: _scaled_mm.out(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
    CPU: _scaled_mm_out_cpu
    CUDA: _scaled_mm_out_cuda

# NOTE [ Sparse: autograd and API ]
@@ -14904,6 +14902,7 @@
- func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
  dispatch:
    CUDA: _scaled_dot_product_cudnn_attention_cuda
    NestedTensorCUDA: _scaled_dot_product_cudnn_attention_nestedtensor_cuda
  tags: nondeterministic_seeded

- func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor attn_bias, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, *, float? scale=None) -> (Tensor, Tensor, Tensor)
@@ -14936,6 +14935,11 @@
  dispatch:
    CUDA: _efficient_attention_backward

- func: _cudnn_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
  dispatch:
    CUDA: _cudnn_attention_forward
  tags: nondeterministic_seeded

- func: _triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor
  variants: function
  dispatch:

@@ -1,4 +1,5 @@
#include <ATen/ATen.h>
#pragma once
#include <ATen/core/Tensor.h>

namespace at::native::preprocessing {

@@ -18,6 +18,8 @@
#include <ATen/native/transformers/cuda/sdp_utils.h>

#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/cuda/CUDAGraphsUtils.cuh>

namespace at::native {
namespace {
@@ -320,6 +322,33 @@ _scaled_dot_product_efficient_attention_nestedtensor_cuda(
  return std::make_tuple(std::move(attention), std::move(log_sumexp), std::move(seed), std::move(offset));
}

std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Tensor, Tensor>
_scaled_dot_product_cudnn_attention_nestedtensor_cuda(
    const Tensor& query,
    const Tensor& key,
    const Tensor& value,
    const std::optional<Tensor>& attn_bias,
    bool compute_logsumexp,
    double dropout_p,
    bool is_causal,
    bool return_debug_mask,
    std::optional<double> scale) {
  auto [
      query_buffer_reshaped,
      key_buffer_reshaped,
      value_buffer_reshaped,
      cumulative_sequence_length_q,
      cumulative_sequence_length_kv,
      max_seqlen_batch_q,
      max_seqlen_batch_kv,
      output_shape] = preprocessing::sdpa_nested_preprocessing(query, key, value);
  auto [attention, log_sumexp, ignore1, ignore2, ignore3, ignore4, cudnn_seed, cudnn_offset, ignore5] = at::_cudnn_attention_forward(query_buffer_reshaped, key_buffer_reshaped, value_buffer_reshaped, attn_bias, cumulative_sequence_length_q, cumulative_sequence_length_kv, max_seqlen_batch_q, max_seqlen_batch_kv, compute_logsumexp, dropout_p, is_causal, return_debug_mask, scale);

  attention = wrap_buffer(attention.view(-1), output_shape).transpose(1, 2);
  return std::make_tuple(std::move(attention), std::move(log_sumexp), cumulative_sequence_length_q, cumulative_sequence_length_kv, max_seqlen_batch_q, max_seqlen_batch_kv, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
}
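
// [Editor's sketch, not part of the change above] A conceptual illustration of what a
// nested-tensor preprocessing step hands to the varlen attention call: one packed
// buffer plus cumulative sequence lengths and the max sequence length. Names are
// hypothetical; this is not the sdpa_nested_preprocessing implementation.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

struct Packed {
  std::vector<float> buffer;        // total_tokens * dim values, sequences back to back
  std::vector<int32_t> cum_seqlen;  // size batch + 1, cum_seqlen[0] == 0
  int64_t max_seqlen = 0;
};

Packed pack(const std::vector<std::vector<float>>& seqs, int64_t dim) {
  Packed p;
  p.cum_seqlen.push_back(0);
  for (const auto& s : seqs) {
    int64_t len = static_cast<int64_t>(s.size()) / dim;
    p.max_seqlen = std::max(p.max_seqlen, len);
    p.buffer.insert(p.buffer.end(), s.begin(), s.end());
    p.cum_seqlen.push_back(p.cum_seqlen.back() + static_cast<int32_t>(len));
  }
  return p;
}

int main() {
  // two sequences of lengths 2 and 3, feature dim 1
  Packed p = pack({{0.1f, 0.2f}, {0.3f, 0.4f, 0.5f}}, /*dim=*/1);
  for (auto c : p.cum_seqlen) std::cout << c << ' ';  // 0 2 5
  std::cout << "| max " << p.max_seqlen << '\n';      // | max 3
}
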

std::tuple<at::Tensor, at::Tensor, at::Tensor> _scaled_dot_product_flash_attention_backward_nested(
    const at::Tensor& grad_out_,
    const at::Tensor& query,

@@ -464,6 +464,7 @@ def define_qnnpack(third_party, labels = []):
        apple_sdks = (IOS, MACOSX, APPLETVOS),
        compiler_flags = [
            "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION",
            "-Wno-unused-command-line-argument",
        ],
        fbobjc_preprocessor_flags = [
            "-DQNNP_PRIVATE=",

@@ -26,6 +26,8 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_cudnn_attention_forward.h>
#include <ATen/ops/_cudnn_attention_forward_native.h>
#include <ATen/ops/_efficient_attention_forward.h>
#include <ATen/ops/_efficient_attention_forward_native.h>
#include <ATen/ops/_fill_mem_eff_dropout_mask_native.h>
@@ -63,6 +65,7 @@

#include <ATen/native/transformers/attention.h>
#include <ATen/native/nested/NestedTensorUtils.h>
#include <ATen/native/nested/NestedTensorTransformerUtils.h>
#include <ATen/native/nested/NestedTensorTransformerFunctions.h>
#include <ATen/native/transformers/cuda/sdp_utils.h>
#include <ATen/native/transformers/sdp_utils_cpp.h>
@@ -87,6 +90,25 @@

namespace at {

namespace cuda::philox {

__global__ void unpack_cudnn(at::PhiloxCudaState arg, int64_t* seed_ptr, int64_t* offset_ptr) {
  if (arg.captured_) {
    *seed_ptr = static_cast<int64_t>(*arg.seed_.ptr);
    *offset_ptr = static_cast<int64_t>(
        *(arg.offset_.ptr) + static_cast<int64_t>(arg.offset_intragraph_));
  } else {
    *seed_ptr = static_cast<int64_t>(arg.seed_.val);
    *offset_ptr = static_cast<int64_t>(arg.offset_.val);
  }
}

void unpack_cudnn_wrapper(at::PhiloxCudaState arg, int64_t* seed_ptr, int64_t* offset_ptr, cudaStream_t stream) {
  at::cuda::philox::unpack_cudnn<<<1, 1, 0, stream>>>(arg, seed_ptr, offset_ptr);
}

} // namespace cuda::philox
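
// [Editor's sketch, not part of the change above] Host-side mirror of the seed/offset
// selection performed by the kernel: under CUDA graph capture the generator hands out
// device pointers plus an intragraph offset, so the values must be read on-device;
// otherwise plain values are used. Plain C++ for illustration only.
#include <cstdint>
#include <iostream>

struct PhiloxStateSketch {
  bool captured = false;
  // captured == true: these would be device pointers in the real struct
  const uint64_t* seed_ptr = nullptr;
  const uint64_t* offset_ptr = nullptr;
  uint64_t offset_intragraph = 0;
  // captured == false:
  uint64_t seed_val = 0;
  uint64_t offset_val = 0;
};

void unpack(const PhiloxStateSketch& s, int64_t& seed, int64_t& offset) {
  if (s.captured) {
    seed = static_cast<int64_t>(*s.seed_ptr);
    offset = static_cast<int64_t>(*s.offset_ptr + s.offset_intragraph);
  } else {
    seed = static_cast<int64_t>(s.seed_val);
    offset = static_cast<int64_t>(s.offset_val);
  }
}

int main() {
  PhiloxStateSketch s;
  s.seed_val = 42;
  s.offset_val = 16;
  int64_t seed = 0, offset = 0;
  unpack(s, seed, offset);
  std::cout << seed << ' ' << offset << '\n';  // 42 16
}
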
|
||||
|
||||
namespace native {
|
||||
|
||||
namespace {
|
||||
@ -732,16 +754,177 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Ten
|
||||
return std::make_tuple(attention, logsumexp, Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_k, philox_seed, philox_offset, debug_attn_mask);
|
||||
}
|
||||
|
||||
// Adapted from TE
|
||||
// extract seed and offset from PhiloxCudaState
|
||||
__global__ void unpack_cudnn(at::PhiloxCudaState arg, int64_t* seed_ptr, int64_t* offset_ptr) {
|
||||
if (arg.captured_) {
|
||||
*seed_ptr = static_cast<int64_t>(*arg.seed_.ptr);
|
||||
*offset_ptr = static_cast<int64_t>(
|
||||
*(arg.offset_.ptr) + static_cast<int64_t>(arg.offset_intragraph_));
|
||||
std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Tensor, Tensor> _cudnn_attention_forward(
|
||||
const Tensor& query,
|
||||
const Tensor& key,
|
||||
const Tensor& value,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
const std::optional<Tensor>& cumulative_sequence_length_q,
|
||||
const std::optional<Tensor>& cumulative_sequence_length_kv,
|
||||
long max_seqlen_batch_q,
|
||||
long max_seqlen_batch_kv,
|
||||
bool compute_logsumexp,
|
||||
double dropout_p,
|
||||
bool is_causal,
|
||||
bool return_debug_mask,
|
||||
std::optional<double> scale) {
|
||||
// TODO(eqy): debug mask support
|
||||
// Query (Batch x Num_heads x Q_seq_len x Dim_per_head)
|
||||
// Key (Batch x Num_heads x KV_seq_len x Dim_per_head)
|
||||
// Value (Batch x Num_heads x KV_seq_len x Dim_per_head)
|
||||
const bool is_nested = cumulative_sequence_length_q.has_value();
|
||||
if (!is_nested) {
|
||||
const int64_t batch_size = query.size(0);
|
||||
const int64_t num_heads = query.size(1);
|
||||
const int64_t head_dim_qk = query.size(3);
|
||||
const int64_t head_dim_v = value.size(3);
|
||||
auto attn_bias_ = attn_bias;
|
||||
if (attn_bias_.has_value()) {
|
||||
const auto bias_dim = attn_bias_.value().dim();
|
||||
if (bias_dim == 2) {
|
||||
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_kv});
|
||||
} else if (bias_dim == 3) {
|
||||
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_kv});
|
||||
} else {
|
||||
TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
|
||||
attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_kv});
|
||||
}
|
||||
}
|
||||
|
||||
Tensor attention, log_sumexp;
|
||||
at::Tensor cudnn_seed, cudnn_offset;
|
||||
cudnn_seed = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
|
||||
cudnn_offset = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
|
||||
|
||||
const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
|
||||
|
||||
// See Note [Seed and Offset Device] in _efficient_attention_forward
|
||||
at::PhiloxCudaState philox_state;
|
||||
const bool in_capture_stream =
|
||||
at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None;
|
||||
if (use_dropout) {
|
||||
// Device
|
||||
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
|
||||
std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
|
||||
|
||||
// See Note [Acquire lock when using random generators]
|
||||
std::lock_guard<std::mutex> lock(gen->mutex_);
|
||||
// if using dropout, we produce 1 random number for each element of the
|
||||
// attention tensor
|
||||
// TODO(eqy): should state be advanced per thread (local) amount or per call/launch (global) amount
|
||||
philox_state = gen->philox_cuda_state(batch_size * num_heads * max_seqlen_batch_q * max_seqlen_batch_kv);
|
||||
at::cuda::philox::unpack_cudnn_wrapper(
|
||||
philox_state, static_cast<int64_t*>(cudnn_seed.data_ptr()), static_cast<int64_t*>(cudnn_offset.data_ptr()), at::cuda::getCurrentCUDAStream());
|
||||
}
|
||||
|
||||
const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float();
|
||||
Tensor debugmask;
|
||||
|
||||
run_cudnn_SDP_fprop(batch_size/*int64_t b*/,
|
||||
num_heads/*int64_t h*/,
|
||||
max_seqlen_batch_q/*int64_t s_q*/,
|
||||
max_seqlen_batch_kv/*int64_t s_kv*/,
|
||||
head_dim_qk/*int64_t d_qk*/,
|
||||
head_dim_v/*int64_t d_v*/,
|
||||
softmax_scale/*float scaling_factor*/,
|
||||
compute_logsumexp/* bool */,
|
||||
is_causal/* bool */,
|
||||
dropout_p/*double dropout_probability*/,
|
||||
query/* Tensor q*/,
|
||||
key/* Tensor k*/,
|
||||
value/* Tensor v*/,
|
||||
attn_bias_ /* std::optional<Tensor> */,
|
||||
log_sumexp/*Tensor softmaxstats*/,
|
||||
attention/*Tensor o*/,
|
||||
cudnn_seed/*Tensor dropoutseed*/,
|
||||
cudnn_offset/*Tensor dropoutoffset*/);
|
||||
|
||||
// TODO(eqy): support debug_attn_mask
|
||||
return std::make_tuple(std::move(attention), std::move(log_sumexp), Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_kv, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
|
||||
} else {
|
||||
*seed_ptr = static_cast<int64_t>(arg.seed_.val);
|
||||
*offset_ptr = static_cast<int64_t>(arg.offset_.val);
|
||||
//auto [
|
||||
// query_buffer_reshaped,
|
||||
// key_buffer_reshaped,
|
||||
// value_buffer_reshaped,
|
||||
// cumulative_sequence_length_q,
|
||||
// cumulative_sequence_length_kv,
|
||||
// max_seqlen_batch_q,
|
||||
// max_seqlen_batch_kv,
|
||||
// output_shape] = preprocessing::sdpa_nested_preprocessing(query, key, value);
|
||||
// C10_LOG_API_USAGE_ONCE("torch.sdpa.flash_attention_cudnn");
|
||||
// TODO(eqy): debug mask support
|
||||
// BHSD ...
|
||||
const int64_t batch_size = cumulative_sequence_length_q.value().size(0) - 1;
|
||||
const int64_t num_heads_q = query.size(-2);
|
||||
const int64_t num_heads_k = key.size(-2);
|
||||
const int64_t num_heads_v = value.size(-2);
|
||||
const int64_t head_dim_qk = query.size(-1);
|
||||
const int64_t head_dim_v = value.size(-1);
|
||||
auto attn_bias_ = attn_bias;
|
||||
if (attn_bias_.has_value()) {
|
||||
const auto bias_dim = attn_bias_.value().dim();
|
||||
if (bias_dim == 2) {
|
||||
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_kv});
|
||||
} else if (bias_dim == 3) {
|
||||
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_kv});
|
||||
} else {
|
||||
attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_kv});
|
||||
TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
|
||||
}
|
||||
}
|
||||
|
||||
Tensor attention, log_sumexp;
|
||||
|
||||
at::Tensor cudnn_seed, cudnn_offset;
|
||||
cudnn_seed = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
|
||||
cudnn_offset = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
|
||||
|
||||
const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
|
||||
|
||||
// See Note [Seed and Offset Device] in _efficient_attention_forward
|
||||
at::PhiloxCudaState philox_state;
|
||||
const bool in_capture_stream =
|
||||
at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None;
|
||||
if (use_dropout) {
|
||||
// Device
|
||||
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
|
||||
std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
|
||||
|
||||
// See Note [Acquire lock when using random generators]
|
||||
std::lock_guard<std::mutex> lock(gen->mutex_);
|
||||
// if using dropout, we produce 1 random number for each element of the
|
||||
// attention tensor
|
||||
// TODO(eqy): should state be advanced per thread (local) amount or per call/launch (global) amount
|
||||
philox_state = gen->philox_cuda_state(batch_size * num_heads_q * max_seqlen_batch_q * max_seqlen_batch_kv);
|
||||
at::cuda::philox::unpack_cudnn_wrapper(philox_state, static_cast<int64_t*>(cudnn_seed.data_ptr()), static_cast<int64_t*>(cudnn_offset.data_ptr()), at::cuda::getCurrentCUDAStream());
|
||||
}
|
||||
|
||||
const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked();
|
||||
|
||||
run_cudnn_SDP_fprop_nestedtensor(batch_size/*int64_t b*/,
|
||||
num_heads_q/*int64_t h*/,
|
||||
num_heads_k,
|
||||
num_heads_v,
|
||||
max_seqlen_batch_q/*int64_t s_q*/,
|
||||
max_seqlen_batch_kv/*int64_t s_kv*/,
|
||||
head_dim_qk/*int64_t d_qk*/,
|
||||
head_dim_v/*int64_t d_v*/,
|
||||
softmax_scale/*float scaling_factor*/,
|
||||
compute_logsumexp/* bool */,
|
||||
is_causal/* bool */,
|
||||
dropout_p/*double dropout_probability*/,
|
||||
cumulative_sequence_length_q.value(),
|
||||
cumulative_sequence_length_kv.value(),
|
||||
query/* Tensor q*/,
|
||||
key/* Tensor k*/,
|
||||
value/* Tensor v*/,
|
||||
attn_bias_ /* std::optional<Tensor> */,
|
||||
log_sumexp/*Tensor softmaxstats*/,
|
||||
attention/*Tensor o*/,
|
||||
cudnn_seed/*Tensor dropoutseed*/,
|
||||
cudnn_offset/*Tensor dropoutoffset*/);
|
||||
//attention = wrap_buffer(attention.view(-1), output_shape).transpose(1, 2);
|
||||
return std::make_tuple(std::move(attention), std::move(log_sumexp), cumulative_sequence_length_q.value(), cumulative_sequence_length_kv.value(), max_seqlen_batch_q, max_seqlen_batch_kv, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -757,84 +940,88 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Ten
    std::optional<double> scale) {
  // Used for tracking usage statistics
  C10_LOG_API_USAGE_ONCE("torch.sdpa.flash_attention_cudnn");
  // TODO(eqy): debug mask support
  // Query (Batch x Num_heads x Q_seq_len x Dim_per_head)
  // Key (Batch x Num_heads x KV_seq_len x Dim_per_head)
  // Value (Batch x Num_heads x KV_seq_len x Dim_per_head)
  const int64_t batch_size = query.size(0);
  const int64_t num_heads = query.size(1);
  const int64_t max_seqlen_batch_q = query.size(2);
  const int64_t head_dim_qk = query.size(3);
  const int64_t head_dim_v = value.size(3);
  const int64_t max_seqlen_batch_k = key.size(2);
  const int64_t max_seqlen_batch_v = value.size(2);
  TORCH_CHECK(
      max_seqlen_batch_k == max_seqlen_batch_v,
      "Key and Value must have the same sequence length");
  auto attn_bias_ = attn_bias;
  if (attn_bias_.has_value()) {
    const auto bias_dim = attn_bias_.value().dim();
    if (bias_dim == 2) {
      attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
    } else if (bias_dim == 3) {
      attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
    } else {
      TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
      attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k});
    }
  }

  Tensor attention, log_sumexp;
  return at::_cudnn_attention_forward(query, key, value, attn_bias, std::nullopt, std::nullopt, max_seqlen_batch_q, max_seqlen_batch_k, compute_logsumexp, dropout_p, is_causal, return_debug_mask, scale);
  //// TODO(eqy): debug mask support
  //// Query (Batch x Num_heads x Q_seq_len x Dim_per_head)
  //// Key (Batch x Num_heads x KV_seq_len x Dim_per_head)
  //// Value (Batch x Num_heads x KV_seq_len x Dim_per_head)
  //const int64_t batch_size = query.size(0);
  //const int64_t num_heads = query.size(1);
  //const int64_t max_seqlen_batch_q = query.size(2);
  //const int64_t head_dim_qk = query.size(3);
  //const int64_t head_dim_v = value.size(3);
  //const int64_t max_seqlen_batch_k = key.size(2);
  //const int64_t max_seqlen_batch_v = value.size(2);
  //TORCH_CHECK(
  //    max_seqlen_batch_k == max_seqlen_batch_v,
  //    "Key and Value must have the same sequence length");
  //auto attn_bias_ = attn_bias;
  //if (attn_bias_.has_value()) {
  //  const auto bias_dim = attn_bias_.value().dim();
  //  if (bias_dim == 2) {
  //    attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
  //  } else if (bias_dim == 3) {
  //    attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
  //  } else {
  //    TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
  //    attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k});
  //  }
  //}

  at::Tensor cudnn_seed, cudnn_offset;
  cudnn_seed = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
  cudnn_offset = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
  //Tensor attention, log_sumexp;

  const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
  //at::Tensor cudnn_seed, cudnn_offset;
  //cudnn_seed = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
  //cudnn_offset = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));

  // See Note [Seed and Offset Device] in _efficient_attention_forward
  at::PhiloxCudaState philox_state;
  const bool in_capture_stream =
      at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None;
  if (use_dropout) {
    // Device
    auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
        std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
    //const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;

    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(gen->mutex_);
    // if using dropout, we produce 1 random number for each element of the
    // attention tensor
    // TODO(eqy): should state be advanced per thread (local) amount or per call/launch (global) amount
    philox_state = gen->philox_cuda_state(batch_size * num_heads * max_seqlen_batch_q * max_seqlen_batch_k);
    unpack_cudnn<<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
        philox_state, static_cast<int64_t*>(cudnn_seed.data_ptr()), static_cast<int64_t*>(cudnn_offset.data_ptr()));
  }
  //// See Note [Seed and Offset Device] in _efficient_attention_forward
  //at::PhiloxCudaState philox_state;
  //const bool in_capture_stream =
  //    at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None;
  //if (use_dropout) {
  //  // Device
  //  auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
  //      std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());

  const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float();
  Tensor debugmask;
  //  // See Note [Acquire lock when using random generators]
  //  std::lock_guard<std::mutex> lock(gen->mutex_);
  //  // if using dropout, we produce 1 random number for each element of the
  //  // attention tensor
  //  // TODO(eqy): should state be advanced per thread (local) amount or per call/launch (global) amount
  //  philox_state = gen->philox_cuda_state(batch_size * num_heads * max_seqlen_batch_q * max_seqlen_batch_k);
  //  at::cuda::philox::unpack_cudnn_wrapper(
  //      philox_state, static_cast<int64_t*>(cudnn_seed.data_ptr()), static_cast<int64_t*>(cudnn_offset.data_ptr()), at::cuda::getCurrentCUDAStream());
  //}

  run_cudnn_SDP_fprop(batch_size/*int64_t b*/,
                      num_heads/*int64_t h*/,
                      max_seqlen_batch_q/*int64_t s_q*/,
                      max_seqlen_batch_k/*int64_t s_kv*/,
                      head_dim_qk/*int64_t d_qk*/,
                      head_dim_v/*int64_t d_v*/,
                      softmax_scale/*float scaling_factor*/,
                      compute_logsumexp/* bool */,
                      is_causal/* bool */,
                      dropout_p/*double dropout_probability*/,
                      query/* Tensor q*/,
                      key/* Tensor k*/,
                      value/* Tensor v*/,
                      attn_bias_ /* std::optional<Tensor> */,
                      log_sumexp/*Tensor softmaxstats*/,
                      attention/*Tensor o*/,
                      cudnn_seed/*Tensor dropoutseed*/,
                      cudnn_offset/*Tensor dropoutoffset*/);
  //const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float();
  //Tensor debugmask;

  // TODO(eqy): support debug_attn_mask
  return std::make_tuple(std::move(attention), std::move(log_sumexp), Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_k, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
  //run_cudnn_SDP_fprop(batch_size/*int64_t b*/,
  //                    num_heads/*int64_t h*/,
  //                    max_seqlen_batch_q/*int64_t s_q*/,
  //                    max_seqlen_batch_k/*int64_t s_kv*/,
  //                    head_dim_qk/*int64_t d_qk*/,
  //                    head_dim_v/*int64_t d_v*/,
  //                    softmax_scale/*float scaling_factor*/,
  //                    compute_logsumexp/* bool */,
  //                    is_causal/* bool */,
  //                    dropout_p/*double dropout_probability*/,
  //                    query/* Tensor q*/,
  //                    key/* Tensor k*/,
  //                    value/* Tensor v*/,
  //                    attn_bias_ /* std::optional<Tensor> */,
  //                    log_sumexp/*Tensor softmaxstats*/,
  //                    attention/*Tensor o*/,
  //                    cudnn_seed/*Tensor dropoutseed*/,
  //                    cudnn_offset/*Tensor dropoutoffset*/);

  //// TODO(eqy): support debug_attn_mask
  //return std::make_tuple(std::move(attention), std::move(log_sumexp), Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_k, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
}

std::tuple<Tensor, Tensor, Tensor, Tensor> _scaled_dot_product_efficient_attention_cuda(

@@ -505,10 +505,23 @@ bool check_cudnn_hardware_support(sdp_params const& params, bool debug) {
}

bool check_for_nested_inputs(sdp_params const& params, bool debug) {
  // Check that the input is nested
  if (has_for_nested_inputs(params)) {
  static const bool enable_cudnn_nested = c10::utils::check_env("TORCH_CUDNN_SDPA_NESTED_TENSOR_ENABLED") == true;
  if (has_for_nested_inputs(params) && !enable_cudnn_nested) {
    if (debug) {
      TORCH_WARN("CuDNN currently does not support nested inputs.");
      TORCH_WARN("Experimental cuDNN SDPA nested tensor support is not enabled.");
    }
    return false;
  } else if (params.query.requires_grad() || params.key.requires_grad() || params.value.requires_grad()) {
    if (debug) {
      TORCH_WARN("Experimental cuDNN SDPA nested tensor support does not support backward.");
    }
  }

  const auto dprop = at::cuda::getCurrentDeviceProperties();
  // Check that the input is nested
  if (dprop->major != 9 && has_for_nested_inputs(params)) {
    if (debug) {
      TORCH_WARN("CuDNN SDPA supports nested tensors on SM 9.0.");
    }
    return false;
  }
@@ -574,7 +587,6 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
          check_runtime_disabled_cudnn,
          check_for_nested_inputs,
          check_nonzero_sequence_lengths_dense,
          check_last_dim_stride_equals_1_dense<true /*ignore_singleton_dim>*/>,
          check_all_tensors_on_device,
          check_tensor_shapes,
          check_cudnn_tensor_shapes,
@@ -588,6 +600,18 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
      return false;
    }
  }
  constexpr auto dense_constraints =
      c10::array_of<bool (*)(sdp_params const&, bool)>(
          check_last_dim_stride_equals_1_dense<true /*ignore_singleton_dim=*/>
      );

  if (has_only_dense_inputs(params)) {
    for (auto& constraint : dense_constraints) {
      if (!constraint(params, debug)) {
        return false;
      }
    }
  }
  return true;
}

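The checks above gate an experimental nested-tensor path for cuDNN SDPA behind the `TORCH_CUDNN_SDPA_NESTED_TENSOR_ENABLED` environment variable, restricted to SM 9.0 and to inputs that do not require grad. The snippet below is a minimal sketch of how one might opt in; the tensor shapes, jagged-layout recipe, and half-precision choice are illustrative assumptions and not taken from this diff, and unsupported setups may simply fall back to another backend or raise.

```python
# Hypothetical opt-in sketch for the experimental cuDNN nested-tensor SDPA path.
import os
os.environ["TORCH_CUDNN_SDPA_NESTED_TENSOR_ENABLED"] = "1"  # must be set before the kernel check runs

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# Ragged batch: one (seq_i, heads, head_dim) tensor per element (shapes assumed for illustration).
seqs = [torch.randn(s, 8, 64, device="cuda", dtype=torch.half) for s in (12, 7, 20)]
q = torch.nested.nested_tensor(seqs, layout=torch.jagged).transpose(1, 2)  # (B, h, j, d)
k, v = q.clone(), q.clone()

with sdpa_kernel(SDPBackend.CUDNN_ATTENTION):
    out = F.scaled_dot_product_attention(q, k, v)  # no requires_grad: backward is not supported here
```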
172 benchmarks/dynamo/cachebench.py Normal file
@@ -0,0 +1,172 @@
import argparse
import dataclasses
import json
import logging
import os
import subprocess
import sys
import tempfile

from torch._inductor.utils import fresh_inductor_cache


logger: logging.Logger = logging.getLogger(__name__)

TIMEOUT: int = 2000

MODELS: list[str] = ["nanogpt", "BERT_pytorch", "resnet50"]


@dataclasses.dataclass
class RunResult:
    model: str
    mode: str  # inference or training
    dynamic: bool
    device: str  # cuda or cpu
    cold_compile_s: float
    warm_compile_s: float
    speedup: float


def get_compile_time(file: tempfile._TemporaryFileWrapper) -> float:
    lines = file.readlines()
    # Decode from byte string, remove new lines, parse csv
    lines = [line.decode("utf-8").strip().split(",") for line in lines]
    compilation_time_idx = lines[0].index("compilation_latency")
    compilation_time = lines[1][compilation_time_idx]
    return float(compilation_time)


def _run_torchbench_from_args(model: str, args: list[str]) -> tuple[float, float]:
    with fresh_inductor_cache():
        env = os.environ.copy()
        with tempfile.NamedTemporaryFile(suffix=".csv") as file:
            args.append("--output=" + file.name)
            logger.info(f"Performing cold-start run for {model}")  # noqa: G004
            subprocess.check_call(args, timeout=TIMEOUT, env=env)
            cold_compile_time = get_compile_time(file)

        args.pop()
        with tempfile.NamedTemporaryFile(suffix=".csv") as file:
            args.append("--output=" + file.name)
            logger.info(f"Performing warm-start run for {model}")  # noqa: G004
            subprocess.check_call(args, timeout=TIMEOUT, env=env)
            warm_compile_time = get_compile_time(file)

        return cold_compile_time, warm_compile_time


def _run_torchbench_model(results: list[RunResult], model: str, device: str) -> None:
    cur_file = os.path.abspath(__file__)
    torchbench_file = os.path.join(os.path.dirname(cur_file), "torchbench.py")
    assert os.path.exists(
        torchbench_file
    ), f"Torchbench does not exist at {torchbench_file}"

    base_args = [
        sys.executable,
        torchbench_file,
        f"--only={model}",
        "--repeat=1",
        "--performance",
        "--backend=inductor",
        f"--device={device}",
    ]
    for mode, mode_args in [
        ("inference", ["--inference", "--bfloat16"]),
        ("training", ["--training", "--amp"]),
    ]:
        for dynamic, dynamic_args in [
            (False, []),
            (True, ["--dynamic-shapes", "--dynamic-batch-only"]),
        ]:
            args = list(base_args)
            args.extend(mode_args)
            args.extend(dynamic_args)

            logger.info(f"Command: {args}")  # noqa: G004
            try:
                cold_compile_t, warm_compile_t = _run_torchbench_from_args(model, args)
                results.append(
                    RunResult(
                        model,
                        mode,
                        dynamic,
                        device,
                        cold_compile_t,
                        warm_compile_t,
                        cold_compile_t / warm_compile_t,
                    )
                )
            except Exception as e:
                print(e)
                return None


def _write_results_to_json(results: list[RunResult], output_filename: str) -> None:
    records = []
    for result in results:
        for metric_name, value in [
            ("cold_compile_time(s)", result.cold_compile_s),
            ("warm_compile_time(s)", result.warm_compile_s),
            ("speedup", result.speedup),
        ]:
            records.append(
                {
                    "benchmark": {
                        "name": "cache_benchmarks",
                        "mode": result.mode,
                        "extra_info": {
                            "is_dynamic": result.dynamic,
                            "device": result.device,
                        },
                    },
                    "model": {
                        "name": result.model,
                        "backend": "inductor",
                    },
                    "metric": {
                        "name": metric_name,
                        "type": "OSS model",
                        "benchmark_values": [value],
                    },
                }
            )
    with open(output_filename, "w") as f:
        json.dump(records, f)


def parse_cmd_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Run a TorchBench ServiceLab benchmark."
    )
    parser.add_argument(
        "-m",
        "--model",
        help="Name of the model to run",
    )
    parser.add_argument("-d", "--device", default="cuda", help="cpu or cuda")
    parser.add_argument(
        "--output",
        required=True,
        help="The output filename (json)",
    )
    args, _ = parser.parse_known_args()
    return args


def main() -> None:
    args = parse_cmd_args()

    results: list[RunResult] = []

    if args.model is not None:
        _run_torchbench_model(results, args.model, args.device)
    else:
        for model in MODELS:
            _run_torchbench_model(results, model, args.device)
    _write_results_to_json(results, args.output)


if __name__ == "__main__":
    main()

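For context, the new script benchmarks cold versus warm compile time per model and writes the JSON records shown above. A plausible invocation, sketched below, mirrors its argparse interface; the working directory and output path are assumptions, not part of the diff.

```python
# Hypothetical driver for the new cachebench script; run from the repo root.
import subprocess
import sys

subprocess.check_call([
    sys.executable, "benchmarks/dynamo/cachebench.py",
    "--model", "nanogpt",              # any entry from MODELS; omit to benchmark all of them
    "--device", "cuda",
    "--output", "cachebench_results.json",
])
```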
@@ -38,7 +38,7 @@ update_hint_regression,compile_time_instruction_count,1787000000,0.02

sum_floordiv_regression,compile_time_instruction_count,1055000000,0.015
sum_floordiv_regression,compile_time_instruction_count,1076000000,0.015

@@ -841,6 +841,7 @@ libtorch_python_core_sources = [
    "torch/csrc/dynamo/cpp_shim.cpp",
    "torch/csrc/dynamo/cpython_defs.c",
    "torch/csrc/dynamo/eval_frame.c",
    "torch/csrc/dynamo/eval_frame_cpp.cpp",
    "torch/csrc/dynamo/extra_state.cpp",
    "torch/csrc/dynamo/framelocals_mapping.cpp",
    "torch/csrc/dynamo/guards.cpp",

@@ -49,16 +49,9 @@ class C10_API Scalar {
#define DEFINE_IMPLICIT_CTOR(type, name) \
  Scalar(type vv) : Scalar(vv, true) {}

  AT_FORALL_SCALAR_TYPES_AND7(
      Half,
      BFloat16,
      Float8_e5m2,
      Float8_e4m3fn,
      Float8_e5m2fnuz,
      Float8_e4m3fnuz,
      ComplexHalf,
      DEFINE_IMPLICIT_CTOR)
  AT_FORALL_SCALAR_TYPES_AND3(Half, BFloat16, ComplexHalf, DEFINE_IMPLICIT_CTOR)
  AT_FORALL_COMPLEX_TYPES(DEFINE_IMPLICIT_CTOR)
  AT_FORALL_FLOAT8_TYPES(DEFINE_IMPLICIT_CTOR)

  // Helper constructors to allow Scalar creation from long and long long types
  // As std::is_same_v<long, long long> is false(except Android), one needs to

@@ -222,6 +222,9 @@ std::pair<std::string, std::string> getDtypeNames(c10::ScalarType scalarType) {
      return std::make_pair("float8_e5m2fnuz", "");
    case c10::ScalarType::Float8_e4m3fnuz:
      return std::make_pair("float8_e4m3fnuz", "");
    case c10::ScalarType::Float8_e8m0fnu:
      // TODO(#146647): macroify all of this
      return std::make_pair("float8_e8m0fnu", "");
    default:
      throw std::runtime_error("Unimplemented scalar type");
  }

@@ -7,6 +7,7 @@
#include <c10/util/Float8_e4m3fnuz.h>
#include <c10/util/Float8_e5m2.h>
#include <c10/util/Float8_e5m2fnuz.h>
#include <c10/util/Float8_e8m0fnu.h>
#include <c10/util/Half.h>
#include <c10/util/bits.h>
#include <c10/util/complex.h>
@@ -102,7 +103,8 @@ struct dummy_int1_7_t {};
  _(c10::dummy_int1_7_t<4>, Int4) /* 40 */                 \
  _(c10::dummy_int1_7_t<5>, Int5) /* 41 */                 \
  _(c10::dummy_int1_7_t<6>, Int6) /* 42 */                 \
  _(c10::dummy_int1_7_t<7>, Int7) /* 43 */
  _(c10::dummy_int1_7_t<7>, Int7) /* 43 */                 \
  _(c10::Float8_e8m0fnu, Float8_e8m0fnu) /* 44 */

// If you want to support ComplexHalf for real, add ComplexHalf
// into this macro (and change the name). But beware: convert()
@@ -146,7 +148,8 @@ struct dummy_int1_7_t {};
  _(at::Float8_e5m2, Float8_e5m2)            \
  _(at::Float8_e4m3fn, Float8_e4m3fn)        \
  _(at::Float8_e5m2fnuz, Float8_e5m2fnuz)    \
  _(at::Float8_e4m3fnuz, Float8_e4m3fnuz)
  _(at::Float8_e4m3fnuz, Float8_e4m3fnuz)    \
  _(at::Float8_e8m0fnu, Float8_e8m0fnu)

enum class ScalarType : int8_t {
#define DEFINE_ST_ENUM_VAL_(_1, n) n,
@@ -317,6 +320,13 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
  _(c10::quint4x2, QUInt4x2) \
  _(c10::quint2x4, QUInt2x4)

#define AT_FORALL_FLOAT8_TYPES(_)          \
  _(at::Float8_e5m2, Float8_e5m2)          \
  _(at::Float8_e4m3fn, Float8_e4m3fn)      \
  _(at::Float8_e5m2fnuz, Float8_e5m2fnuz)  \
  _(at::Float8_e4m3fnuz, Float8_e4m3fnuz)  \
  _(at::Float8_e8m0fnu, Float8_e8m0fnu)

#define AT_FORALL_COMPLEX_TYPES(_)     \
  _(c10::complex<float>, ComplexFloat) \
  _(c10::complex<double>, ComplexDouble)
@@ -372,7 +382,8 @@ inline bool isIntegralType(ScalarType t) {

inline bool isFloat8Type(ScalarType t) {
  return t == ScalarType::Float8_e5m2 || t == ScalarType::Float8_e5m2fnuz ||
      t == ScalarType::Float8_e4m3fn || t == ScalarType::Float8_e4m3fnuz;
      t == ScalarType::Float8_e4m3fn || t == ScalarType::Float8_e4m3fnuz ||
      t == ScalarType::Float8_e8m0fnu;
}

inline bool isReducedFloatingType(ScalarType t) {
@@ -446,6 +457,10 @@ inline bool isSignedType(ScalarType t) {
    return std::numeric_limits<                           \
        ::c10::impl::ScalarTypeToCPPTypeT<ScalarType::name>>::is_signed;

  // TODO(#146647): If we expect to have numeric_limits for everything,
  // let's just have a big macro for the whole thing.
  // If we're hardcoding it, let's just use the macro and a "true"/"false"
  // below?
  switch (t) {
    case ScalarType::QInt8:
    case ScalarType::QUInt8:
@@ -467,6 +482,7 @@ inline bool isSignedType(ScalarType t) {
    CASE_ISSIGNED(Float8_e5m2fnuz);
    CASE_ISSIGNED(Float8_e4m3fn);
    CASE_ISSIGNED(Float8_e4m3fnuz);
    CASE_ISSIGNED(Float8_e8m0fnu);
    CASE_ISSIGNED(Byte);
    CASE_ISSIGNED(Char);
    CASE_ISSIGNED(Short);

112 c10/util/Float8_e8m0fnu-inl.h Normal file
@@ -0,0 +1,112 @@
#pragma once

#include <c10/macros/Macros.h>
#include <c10/util/floating_point_utils.h>
#include <cstring>
#include <limits>

// TODO(#146647): Can we remove the below warning?
C10_CLANG_DIAGNOSTIC_PUSH()
#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion")
C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
#endif

namespace c10 {

/// Constructors

inline C10_HOST_DEVICE Float8_e8m0fnu::Float8_e8m0fnu(float value)
    : x(detail::fp8e8m0fnu_from_fp32_value(value)) {}

/// Implicit conversions

inline C10_HOST_DEVICE Float8_e8m0fnu::operator float() const {
  // TODO(#146647): maybe rewrite without control flow

  // if exponent is zero, need to special case to return 2^-127 instead of zero
  if (x == 0) {
    return c10::detail::fp32_from_bits(0x00400000);
  }

  // if exponent is NaN, need to special case to return properly encoded NaN
  if (isnan()) {
    return c10::detail::fp32_from_bits(0x7f800001);
  }

  // leave sign at 0, set the exponent bits, leave stored mantissa at 0
  uint32_t res = x << 23;

  return c10::detail::fp32_from_bits(res);
}

/// Special values helper

inline C10_HOST_DEVICE bool Float8_e8m0fnu::isnan() const {
  return x == 0b11111111;
}

/// NOTE: we do not define comparisons directly and instead rely on the implicit
/// conversion from c10::Float8_e8m0fnu to float.

} // namespace c10

namespace std {

template <>
class numeric_limits<c10::Float8_e8m0fnu> {
 public:
  static constexpr bool is_specialized = true;
  static constexpr bool is_signed = false;
  static constexpr bool is_integer = false;
  static constexpr bool is_exact = false;
  static constexpr bool has_infinity = false;
  static constexpr bool has_quiet_NaN = true;
  static constexpr bool has_signaling_NaN = false;
  static constexpr auto has_denorm = false;
  static constexpr auto has_denorm_loss = false;
  static constexpr auto round_style = numeric_limits<float>::round_style;
  static constexpr bool is_iec559 = false;
  static constexpr bool is_bounded = true;
  static constexpr bool is_modulo = false;
  static constexpr int digits = 1;
  static constexpr int digits10 = 0;
  static constexpr int max_digits10 = 1; // just a 2!
  static constexpr int radix = 2;
  static constexpr int min_exponent = -126;
  static constexpr int min_exponent10 = -38;
  static constexpr int max_exponent = 128;
  static constexpr int max_exponent10 = 38;
  static constexpr auto traps = numeric_limits<float>::traps;
  static constexpr auto tinyness_before = false;

  static constexpr c10::Float8_e8m0fnu min() {
    // 2^-127
    return c10::Float8_e8m0fnu(0b00000000, c10::Float8_e8m0fnu::from_bits());
  }
  static constexpr c10::Float8_e8m0fnu lowest() {
    // 2^-127
    return c10::Float8_e8m0fnu(0b00000000, c10::Float8_e8m0fnu::from_bits());
  }
  static constexpr c10::Float8_e8m0fnu max() {
    // 254 biased, which is 127 unbiased, so 2^127
    return c10::Float8_e8m0fnu(0b11111110, c10::Float8_e8m0fnu::from_bits());
  }
  static constexpr c10::Float8_e8m0fnu epsilon() {
    // according to https://en.cppreference.com/w/cpp/types/numeric_limits, this
    // is "the difference between 1.0 and the next representable value of the
    // given floating-point type". The next representable value is 2.0, so the
    // difference is 1.0 which is 2^0. 0 unbiased is 127 biased.
    return c10::Float8_e8m0fnu(0b01111111, c10::Float8_e8m0fnu::from_bits());
  }
  static constexpr c10::Float8_e8m0fnu round_error() {
    // 0.5 in float, which is 2^-1, and -1 + 127 = 126
    return c10::Float8_e8m0fnu(0b01111110, c10::Float8_e8m0fnu::from_bits());
  }
  static constexpr c10::Float8_e8m0fnu quiet_NaN() {
    return c10::Float8_e8m0fnu(0b11111111, c10::Float8_e8m0fnu::from_bits());
  }
};

} // namespace std

C10_CLANG_DIAGNOSTIC_POP()

12 c10/util/Float8_e8m0fnu.cpp Normal file
@@ -0,0 +1,12 @@
#include <c10/macros/Macros.h>
#include <c10/util/Float8_e8m0fnu.h>

namespace c10 {

// TODO(#146647): Can we have these in a single shared cpp file
// built with macro to remove the need for a new cpp file?
static_assert(
    std::is_standard_layout_v<Float8_e8m0fnu>,
    "c10::Float8_e8m0fnu must be standard layout.");

} // namespace c10

120 c10/util/Float8_e8m0fnu.h Normal file
@@ -0,0 +1,120 @@
#pragma once

/// Defines the Float8_e8m0fnu type (8-bit floating-point) including
/// conversions to standard C types
/// Binary configuration :
/// eeeeeeee
/// no sign bits
/// 8 exponent bits
/// no mantissa bits
///
/// This is the E8M0 dtype from the OCP MX format spec
/// (https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf,
/// Section 5.4.1)

#include <c10/macros/Export.h>
#include <c10/macros/Macros.h>
#include <c10/util/floating_point_utils.h>
#include <type_traits>

// TODO(#146647): do we need to special case OPENCL?
#if defined(__cplusplus)
#include <cstdint>
#elif !defined(__OPENCL_VERSION__)
#include <math.h>
#include <stdint.h>
#endif

#include <iosfwd>
#include <ostream>

namespace c10 {

namespace detail {

/*
 * Convert a 32-bit floating-point number in IEEE single-precision format to a
 * 8-bit floating-point number in fp8 e8m0fnu format, in bit representation.
 */
inline C10_HOST_DEVICE uint8_t fp8e8m0fnu_from_fp32_value(float f) {
  // TODO(#146647): maybe rewrite without control flow

  uint32_t f_bits = c10::detail::fp32_to_bits(f);

  // extract the exponent
  uint32_t exponent = (f_bits >> 23) & 0b11111111;

  // special case float32 NaN and +-inf to map to e8m0 nan
  if (exponent == 0b11111111) {
    return exponent;
  }

  // next, we use guard, round, sticky bits and the LSB to implement round to
  // nearest, with ties to even

  // guard bit - bit 23, or 22 zero-indexed
  uint8_t g = (f_bits & 0x400000) > 0;
  // round bit - bit 22, or 21 zero-indexed
  uint8_t r = (f_bits & 0x200000) > 0;
  // sticky bit - bits 21 to 1, or 20 to 0 zero-indexed
  uint8_t s = (f_bits & 0x1FFFFF) > 0;
  // in casting to e8m0, LSB is the implied mantissa bit. It equals to 0 if the
  // original float32 is denormal, and to 1 if the original float32 is normal.
  uint8_t lsb = exponent > 0;

  // implement the RNE logic
  bool round_up = false;

  // if g == 0, round down (no-op)
  if (g == 1) {
    if ((r == 1) || (s == 1)) {
      // round up
      round_up = true;
    } else {
      if (lsb == 1) {
        // round up
        round_up = true;
      }
      // if lsb == 0, round down (no-op)
    }
  }

  if (round_up) {
    // adjust exponent
    // note that if exponent was 255 we would have already returned earlier, so
    // we know we can add one safely without running out of bounds
    exponent++;
  }

  return exponent;
}

} // namespace detail

struct alignas(1) Float8_e8m0fnu {
  uint8_t x;

  struct from_bits_t {};
  C10_HOST_DEVICE static constexpr from_bits_t from_bits() {
    return from_bits_t();
  }

  Float8_e8m0fnu() = default;

  constexpr C10_HOST_DEVICE Float8_e8m0fnu(uint8_t bits, from_bits_t)
      : x(bits) {}
  inline C10_HOST_DEVICE Float8_e8m0fnu(float value);
  inline C10_HOST_DEVICE operator float() const;
  inline C10_HOST_DEVICE bool isnan() const;
};

C10_API inline std::ostream& operator<<(
    std::ostream& out,
    const Float8_e8m0fnu& value) {
  out << (float)value;
  return out;
}

} // namespace c10

#include <c10/util/Float8_e8m0fnu-inl.h> // IWYU pragma: keep

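To make the guard/round/sticky logic in `fp8e8m0fnu_from_fp32_value` concrete, the sketch below mirrors the same bit manipulation in plain Python. It is an illustration of the rounding rule only, not part of the diff.

```python
# Python mirror of the E8M0 round-to-nearest-even conversion shown above.
import struct

def fp8_e8m0_from_fp32(f: float) -> int:
    bits = struct.unpack("<I", struct.pack("<f", f))[0]   # raw IEEE-754 float32 bits
    exponent = (bits >> 23) & 0xFF
    if exponent == 0xFF:                                  # NaN / +-inf map to the E8M0 NaN encoding
        return exponent
    g = (bits & 0x400000) > 0                             # guard bit (mantissa MSB)
    r = (bits & 0x200000) > 0                             # round bit
    s = (bits & 0x1FFFFF) > 0                             # sticky bits
    lsb = exponent > 0                                    # implied mantissa bit of the E8M0 result
    if g and (r or s or lsb):                             # same RNE decision as the C++ branch
        exponent += 1
    return exponent

assert fp8_e8m0_from_fp32(1.0) == 127                     # 2^0 -> unbiased 0 -> biased 127
assert fp8_e8m0_from_fp32(1.5) == 128                     # ties/above-half round up to 2^1
```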
@@ -5,6 +5,7 @@
#include <c10/util/Float8_e4m3fnuz.h>
#include <c10/util/Float8_e5m2.h>
#include <c10/util/Float8_e5m2fnuz.h>
#include <c10/util/Float8_e8m0fnu.h>
#include <c10/util/Half.h>
#include <c10/util/complex.h>
#include <c10/util/overflows.h>
@@ -151,6 +152,19 @@ struct static_cast_with_inter_type<
  }
};

// TODO(#146647): Can we make all these template specialization happen
// based off our apply macros?
template <>
struct static_cast_with_inter_type<
    c10::complex<c10::Half>,
    c10::Float8_e8m0fnu> {
  C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex<
      c10::Half>
  apply(c10::Float8_e8m0fnu src) {
    return static_cast<c10::complex<c10::Half>>(c10::complex<float>{src});
  }
};

template <>
struct static_cast_with_inter_type<c10::complex<c10::Half>, c10::Half> {
  C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex<

@@ -58,7 +58,7 @@ IF(NOT MKLDNN_FOUND)
      -DDNNL_CPU_RUNTIME=THREADPOOL
      -DDNNL_BUILD_TESTS=OFF
      -DDNNL_BUILD_EXAMPLES=OFF
      -DONEDNN_BUILD_GRAPH=OFF
      -DONEDNN_BUILD_GRAPH=ON
      -DDNNL_LIBRARY_TYPE=STATIC
      -DDNNL_DPCPP_HOST_COMPILER=${DNNL_HOST_COMPILER} # Use global cxx compiler as host compiler
      -G ${CMAKE_GENERATOR} # Align Generator to Torch

118 hack.py Normal file
@@ -0,0 +1,118 @@

import torch
from torch._inductor.ir import NoneAsConstantBuffer
import torch.nn as nn
import torch.nn.functional as F
import depyf
depyf.install()


def fn(loss):
    gm = None
    args = None

    def noop(_gm):
        nonlocal gm
        gm = _gm
        def _noop(*_args, **_kwargs):
            assert not _kwargs
            nonlocal args
            args = _args
            return []
        return _noop

    with torch._dynamo.compiled_autograd._enable(noop):
        loss.backward()

    return gm, args


result = torch._dynamo.compiled_autograd.Op("FunctionalCompiledAutograd", fn, is_custom_function=False)
setattr(torch._dynamo.compiled_autograd.ops, "FunctionalCompiledAutograd", torch._dynamo.allow_in_graph(result))


x = torch.randn(64, 3)
t = torch.randn(64, 1)

model = nn.Linear(3, 1)

torch._dynamo.config.compiled_autograd = True
torch._dynamo.config.do_not_emit_runtime_asserts = True

@torch.compile(backend="eager")
def train(model, x, t):
    y = model(x)
    loss = F.mse_loss(y, t)
    gm, args = torch._dynamo.compiled_autograd.ops.FunctionalCompiledAutograd(loss)
    gm(*args)
    return ()

# with torch._dynamo.compiled_autograd._enable(noop):
train(model, x, t)

for p in model.parameters():
    assert p.grad is not None


"""
|
||||
# this kinda works, but not ideal
|
||||
===== __compiled_fn_1 =====
|
||||
/home/xmfan/core/a/pytorch/torch/fx/_lazy_graph_module.py class GraphModule(torch.nn.Module):
|
||||
def forward(self, L_model_parameters_weight_: "f32[1, 3][3, 1]cpu", L_model_parameters_bias_: "f32[1][1]cpu", L_x_: "f32[64, 3][3, 1]cpu", L_t_: "f32[64, 1][1, 1]cpu"):
|
||||
l_model_parameters_weight_ = L_model_parameters_weight_
|
||||
l_model_parameters_bias_ = L_model_parameters_bias_
|
||||
l_x_ = L_x_
|
||||
l_t_ = L_t_
|
||||
|
||||
# File: /home/xmfan/core/a/pytorch/hack.py:44 in train, code: y = model(x)
|
||||
y: "f32[64, 1][1, 1]cpu" = torch._C._nn.linear(l_x_, l_model_parameters_weight_, l_model_parameters_bias_); l_x_ = l_model_parameters_weight_ = l_model_parameters_bias_ = None
|
||||
|
||||
# File: /home/xmfan/core/a/pytorch/hack.py:45 in train, code: loss = F.mse_loss(y, t)
|
||||
loss: "f32[][]cpu" = torch.nn.functional.mse_loss(y, l_t_); y = l_t_ = None
|
||||
|
||||
# File: /home/xmfan/core/a/pytorch/hack.py:46 in train, code: gm, args = torch._dynamo.compiled_autograd.ops.FunctionalCompiledAutograd(loss)
|
||||
functional_compiled_autograd = torch__dynamo_compiled_autograd_ops_FunctionalCompiledAutograd(loss); loss = None
|
||||
getitem = functional_compiled_autograd[1]; functional_compiled_autograd = None
|
||||
getitem_1 = getitem[0]; getitem = None
|
||||
getitem_8: "f32[][]cpu" = getitem_1[0]
|
||||
getitem_9: "f32[64, 1][1, 1]cpu" = getitem_1[1]
|
||||
getitem_10: "f32[64, 1][1, 1]cpu" = getitem_1[2]
|
||||
getitem_11: "f32[64, 3][3, 1]cpu" = getitem_1[3]; getitem_1 = None
|
||||
|
||||
# File: <eval_with_key>.0:11 in forward, code: validate_outputs = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem], [((None, None, device(type='cpu'), 6, 0, None), [], True)]); getitem = None
|
||||
validate_outputs = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem_8], [((None, None, device(type='cpu'), 6, 0, None), [], True)]); getitem_8 = None
|
||||
getitem_15: "f32[][]cpu" = validate_outputs[0]; validate_outputs = None
|
||||
|
||||
# File: <eval_with_key>.0:13 in forward, code: mse_loss_backward0 = torch__dynamo_compiled_autograd_ops_MseLossBackward0([getitem_6], [True, False], 1, getitem_1, getitem_2); getitem_6 = getitem_1 = getitem_2 = None
|
||||
mse_loss_backward0 = torch__dynamo_compiled_autograd_ops_MseLossBackward0([getitem_15], [True, False], 1, getitem_9, getitem_10); getitem_15 = getitem_9 = getitem_10 = None
|
||||
getitem_17: "f32[64, 1][1, 1]cpu" = mse_loss_backward0[0]; mse_loss_backward0 = None
|
||||
|
||||
# File: <eval_with_key>.0:16 in forward, code: validate_outputs_1 = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem_7, getitem_8], [((None, None, device(type='cpu'), 6, 0, None), [64, 1], True), None]); getitem_7 = getitem_8 = None
|
||||
validate_outputs_1 = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem_17, None], [((None, None, device(type='cpu'), 6, 0, None), [64, 1], True), None]); getitem_17 = None
|
||||
getitem_19: "f32[64, 1][1, 1]cpu" = validate_outputs_1[0]; validate_outputs_1 = None
|
||||
|
||||
# File: <eval_with_key>.0:18 in forward, code: addmm_backward0 = torch__dynamo_compiled_autograd_ops_AddmmBackward0([getitem_9], [True, False, True], 1, 1, getitem_3, 0, [64, 3], [], None, 0, [3, 1], [1, 3]); getitem_9 = getitem_3 = None
|
||||
addmm_backward0 = torch__dynamo_compiled_autograd_ops_AddmmBackward0([getitem_19], [True, False, True], 1, 1, getitem_11, 0, [64, 3], [], None, 0, [3, 1], [1, 3]); getitem_19 = getitem_11 = None
|
||||
getitem_22: "f32[64, 1][1, 1]cpu" = addmm_backward0[0]
|
||||
getitem_23: "f32[3, 1][1, 3]cpu" = addmm_backward0[2]; addmm_backward0 = None
|
||||
|
||||
# File: <eval_with_key>.0:22 in forward, code: validate_outputs_2 = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem_11, getitem_12, getitem_13], [((None, None, device(type='cpu'), 6, 0, None), [1], True), None, ((None, None, device(type='cpu'), 6, 0, None), [3, 1], True)]); getitem_11 = getitem_12 = getitem_13 = None
|
||||
validate_outputs_2 = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem_22, None, getitem_23], [((None, None, device(type='cpu'), 6, 0, None), [1], True), None, ((None, None, device(type='cpu'), 6, 0, None), [3, 1], True)]); getitem_22 = getitem_23 = None
|
||||
getitem_26: "f32[1][1]cpu" = validate_outputs_2[0]
|
||||
getitem_27: "f32[3, 1][1, 3]cpu" = validate_outputs_2[2]; validate_outputs_2 = None
|
||||
|
||||
# File: /home/xmfan/core/a/pytorch/torch/_dynamo/polyfills/__init__.py:80 in accumulate_grad, code: new_grad = torch.clone(new_grad)
|
||||
new_grad: "f32[1][1]cpu" = torch.clone(getitem_26); getitem_26 = new_grad = None
|
||||
|
||||
# File: <eval_with_key>.0:26 in forward, code: tbackward0 = torch__dynamo_compiled_autograd_ops_TBackward0([getitem_16], [True]); getitem_16 = None
|
||||
tbackward0 = torch__dynamo_compiled_autograd_ops_TBackward0([getitem_27], [True]); getitem_27 = None
|
||||
getitem_29: "f32[1, 3][3, 1]cpu" = tbackward0[0]; tbackward0 = None
|
||||
|
||||
# File: <eval_with_key>.0:28 in forward, code: validate_outputs_3 = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem_17], [((None, None, device(type='cpu'), 6, 0, None), [1, 3], True)]); getitem_17 = None
|
||||
validate_outputs_3 = torch__dynamo_compiled_autograd_ops_validate_outputs([getitem_29], [((None, None, device(type='cpu'), 6, 0, None), [1, 3], True)]); getitem_29 = None
|
||||
getitem_31: "f32[1, 3][3, 1]cpu" = validate_outputs_3[0]; validate_outputs_3 = None
|
||||
|
||||
# File: /home/xmfan/core/a/pytorch/torch/_dynamo/polyfills/__init__.py:80 in accumulate_grad, code: new_grad = torch.clone(new_grad)
|
||||
new_grad_1: "f32[1, 3][3, 1]cpu" = torch.clone(getitem_31); getitem_31 = new_grad_1 = None
|
||||
return ()
|
||||
"""
|
||||
@@ -1,7 +1,6 @@
# Owner(s): ["oncall: distributed"]

import sys
from typing import List

import torch
import torch.distributed as dist
@@ -119,7 +118,7 @@ def _find_name_param_mappings(module: torch.nn.Module, prefix: str):


def _discover_ddp_ignored_params(module: torch.nn.Module, prefix: str):
    ddp_ignore_parameters: List[str] = []
    ddp_ignore_parameters: list[str] = []
    if isinstance(module, FSDP2):
        ddp_ignore_parameters = [name for name, _ in module.named_parameters(prefix)]
    else:

@@ -1089,8 +1089,7 @@ class TestHSDPWithCustomHook(FSDPTestMultiThread):
        torch.nn.init.constant_(model.in_proj.weight, 1.0 * rank_group)
        torch.nn.init.constant_(model.out_proj.weight, 2.0 * rank_group)

        fully_shard(model, mesh=mesh)
        model = cast(FSDPModule, model)
        model = fully_shard(model, mesh=mesh)

        hook_called: bool = False

@@ -292,15 +292,7 @@ class TestFullyShard1DTrainingCore(FSDPTest):
        ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)

        def _shard_placement_fn(param: nn.Parameter) -> Optional[Shard]:
            largest_dim = -1
            largest_dim_size = -1
            for dim, dim_size in enumerate(param.shape):
                if dim_size > largest_dim_size:
                    largest_dim = dim
                    largest_dim_size = dim_size
            assert largest_dim >= 0, f"{param.shape}"
            assert largest_dim < param.ndim, f"{largest_dim=} {param.shape}"
            return Shard(largest_dim)
            return Shard(param.shape.index(max(param.shape)))

        shard_placement_fn = _shard_placement_fn if use_shard_placement_fn else None
        fully_shard(model, shard_placement_fn=shard_placement_fn)
@@ -769,13 +761,7 @@ class TestFullyShardShardPlacementFnMultiProcess(FSDPTest):
        ref_optim = torch.optim.AdamW(ref_model.parameters(), lr=1e-2)

        def shard_placement_fn(param: nn.Parameter) -> Optional[Shard]:
            largest_dim = -1
            largest_dim_size = -1
            for dim, dim_size in enumerate(param.shape):
                if dim_size > largest_dim_size:
                    largest_dim = dim
                    largest_dim_size = dim_size
            return Shard(largest_dim)
            return Shard(param.shape.index(max(param.shape)))

        for layer in model.layers:
            fully_shard(layer, shard_placement_fn=shard_placement_fn)

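Both hunks above collapse the manual largest-dimension search into `Shard(param.shape.index(max(param.shape)))`, which picks the first occurrence of the largest size, the same tie-breaking the old loop used. A minimal sketch of that placement rule outside the test harness follows; the module, mesh setup, and import paths are assumptions for illustration and require an already-initialized distributed job.

```python
# Hypothetical standalone use of the simplified largest-dim shard placement rule.
from typing import Optional

import torch.nn as nn
from torch.distributed.fsdp import fully_shard
from torch.distributed.tensor import Shard


def shard_placement_fn(param: nn.Parameter) -> Optional[Shard]:
    # shape.index(max(...)) returns the first largest dim, matching the old loop on ties
    return Shard(param.shape.index(max(param.shape)))


model = nn.Transformer()  # placeholder module; any nn.Module works
fully_shard(model, shard_placement_fn=shard_placement_fn)
```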
@@ -97,11 +97,7 @@ class ReplicateTest(MultiProcessInductorTestCase):
        device: Union[str, torch.device],
    ):
        self.create_pg(device)
        torch._dynamo.config.optimize_ddp = (
            "python_reducer_without_compiled_forward"
            if no_compile_forward
            else "python_reducer"
        )
        torch._dynamo.config.optimize_ddp = "python_reducer"
        torch.manual_seed(123)
        model = Net(checkpoint=checkpoint).to(device)
        input = torch.randn([1, DIM], device=device)

@@ -47,7 +47,12 @@ from torch.testing._internal.distributed._tensor.common_dtensor import (
    MultiProcessTestCase,
    with_comms,
)
from torch.testing._internal.distributed.common_state_dict import VerifyStateDictMixin
from torch.testing._internal.distributed.common_state_dict import (
    FusionEmbedding,
    FusionEmbeddingWithHook,
    FusionEmbeddingWithModifier,
    VerifyStateDictMixin,
)
from torch.utils._pytree import tree_all, tree_all_only


@@ -919,6 +924,20 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
            ),
        )

    @with_comms
    @skip_if_lt_x_gpu(2)
    def test_state_dict_with_hook_on_keys(self) -> None:
        with torch.device("meta"):
            metamodel = FusionEmbedding(4, 4, 4)
        with torch.device("cuda"):
            gpumodel = FusionEmbeddingWithHook(4, 4, 4)
        gpumodel_state_dict = get_model_state_dict(gpumodel)
        with self.assertRaisesRegex(RuntimeError, "Missing key"):
            set_model_state_dict(metamodel, gpumodel_state_dict)
        with torch.device("meta"):
            metamodel_modified = FusionEmbeddingWithModifier(4, 4, 4)
        set_model_state_dict(metamodel_modified, gpumodel_state_dict)


class TestNoComm(MultiProcessTestCase):
    def setUp(self) -> None:

@@ -170,7 +170,7 @@ class DistributedUtilTest(TestCase):
            server_port=pick_free_port,
            timeout=1,
        )
        with self.assertRaises(RuntimeError):
        with self.assertRaises(DistNetworkError):
            create_c10d_store(
                is_server=True, server_addr=server_addr, server_port=store1.port
            )

@@ -8,7 +8,7 @@ import sys
REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent

sys.path.insert(0, str(REPO_ROOT))
from tools.flight_recorder.components.types import COLLECTIVES, MatchState
from tools.flight_recorder.components.types import COLLECTIVES, MatchInfo, MatchState
from tools.flight_recorder.components.utils import match_one_event


@@ -50,14 +50,14 @@ class FlightRecorderEventTest(TestCase):
        )
        membership = {"0": {0, 1}}
        self.assertEqual(
            match_one_event(e1, e1, membership, "0"), MatchState.FULLY_MATCHED
            match_one_event(e1, e1, membership, "0").state, MatchState.FULLY_MATCHED
        )

        e2 = create_one_event(
            "all_gather", ("0", "default"), [[4, 4]], [[4, 4]], "scheduled", 1
        )
        self.assertEqual(
            match_one_event(e1, e2, membership, "0"),
            match_one_event(e1, e2, membership, "0").state,
            MatchState.COLLECTIVE_TYPE_MISMATCH,
        )

@@ -67,34 +67,39 @@ class FlightRecorderEventTest(TestCase):
        e4 = create_one_event(
            "all_to_all", ("0", "default"), [[4, 4]], [[4, 4]], "scheduled", 1
        )
        self.assertEqual(match_one_event(e3, e4, membership, "0"), MatchState.UNDECIDED)
        self.assertEqual(
            match_one_event(e3, e4, membership, "0").state, MatchState.UNDECIDED
        )

        e5 = create_one_event(
            "all_reduce", ("0", "default"), [[5, 4]], [[4, 4]], "scheduled", 1, 1
        )
        self.assertEqual(
            match_one_event(e1, e5, membership, "0"), MatchState.SIZE_OR_SYNTAX_MISMATCH
            match_one_event(e1, e5, membership, "0").state,
            MatchState.SIZE_OR_SYNTAX_MISMATCH,
        )

        e6 = create_one_event(
            "all_reduce", ("0", "default"), [[4, 4]], [[5, 4]], "scheduled", 1, 2
        )
        self.assertEqual(
            match_one_event(e1, e6, membership, "0"), MatchState.SIZE_OR_SYNTAX_MISMATCH
            match_one_event(e1, e6, membership, "0").state,
            MatchState.SIZE_OR_SYNTAX_MISMATCH,
        )

        e7 = create_one_event(
            "all_reduce", ("0", "default"), [[4, 4]], [[5, 4]], "scheduled", 2
        )
        self.assertEqual(
            match_one_event(e7, e7, membership, "0"), MatchState.SIZE_OR_SYNTAX_MISMATCH
            match_one_event(e7, e7, membership, "0").state,
            MatchState.SIZE_OR_SYNTAX_MISMATCH,
        )

        e9 = create_one_event(
            "all_reduce", ("0", "default"), [[4, 4]], [[4, 4]], "completed", 1
        )
        self.assertEqual(
            match_one_event(e1, e9, membership, "0"),
            match_one_event(e1, e9, membership, "0").state,
            MatchState.COLLECTIVE_STATE_MISMATCH,
        )

@@ -108,7 +113,7 @@ class FlightRecorderEventTest(TestCase):
            output_dtypes="float16",
        )
        self.assertEqual(
            match_one_event(e10, e9, membership, "0"),
            match_one_event(e10, e9, membership, "0").state,
            MatchState.COLLECTIVE_DTYPE_MISMATCH,
        )

@@ -128,9 +133,19 @@ class FlightRecorderEventTest(TestCase):
            collective, ("0", "default"), input_sizes, output_sizes, "scheduled", 1
        )
        membership = {"0": {0, 1}}
        result = match_one_event(event, event, membership, "0")
        result = match_one_event(event, event, membership, "0").state
        self.assertEqual(result, expectedState)


class FlightMatchInfoTest(TestCase):
    def test_match_info(self):
        m1 = MatchInfo(MatchState.FULLY_MATCHED, "rank 0")
        m2 = MatchInfo(MatchState.FULLY_MATCHED, "rank 1")
        self.assertEqual(m1.state, MatchState.FULLY_MATCHED)
        self.assertEqual(m1.state, m2.state)
        self.assertEqual(str(m1), "Error type: FULLY_MATCHED, rank 0")
        self.assertEqual(str(m2), "Error type: FULLY_MATCHED, rank 1")


if __name__ == "__main__":
    run_tests()

@@ -3,7 +3,7 @@ import gc
import threading
import unittest
from datetime import timedelta
from typing import List, Optional
from typing import Optional

import torch
import torch.distributed as dist
@@ -576,24 +576,24 @@ class ProcessGroupDummy(dist.ProcessGroup):
        self.waits = 0
        self.dels = 0

    def broadcast(self, tensor_list: List[torch.Tensor], opts: object) -> dist.Work:
    def broadcast(self, tensor_list: list[torch.Tensor], opts: object) -> dist.Work:
        return _DummyWork(self)

    def allgather_into_tensor_coalesced(
        self,
        output_lists: List[torch.Tensor],
        input_list: List[torch.Tensor],
        output_lists: list[torch.Tensor],
        input_list: list[torch.Tensor],
        opts: object,
    ) -> dist.Work:
        return _DummyWork(self)

    def allreduce(self, tensors: List[torch.Tensor], opts: object) -> dist.Work:
    def allreduce(self, tensors: list[torch.Tensor], opts: object) -> dist.Work:
        return _DummyWork(self)

    def reduce_scatter_tensor_coalesced(
        self,
        outputTensors: List[torch.Tensor],
        inputTensors: List[torch.Tensor],
        outputTensors: list[torch.Tensor],
        inputTensors: list[torch.Tensor],
        opts: object,
    ) -> dist.Work:
        return _DummyWork(self)

@@ -289,7 +289,7 @@ class TCPStoreTest(TestCase, StoreTestBase):
        port = common.find_free_port()

        err_msg_reg = f"^The server socket has failed to listen on any local .*{port}"
        with self.assertRaisesRegex(RuntimeError, err_msg_reg):
        with self.assertRaisesRegex(dist.DistNetworkError, err_msg_reg):
            # Use noqa to silence flake8.
            # Need to store in an unused variable here to ensure the first
            # object is not destroyed before the second object is created.
@@ -521,6 +521,38 @@ class TCPStoreTest(TestCase, StoreTestBase):
        with self.assertRaisesRegex(ValueError, "TCPStore world size cannot be 0"):
            dist.TCPStore("localhost", 0, world_size=0, is_master=False)

    def test_agent_store(self) -> None:
        store = self._create_store()

        with self.assertRaisesRegex(
            dist.DistNetworkError,
            "The server socket has failed to listen on any local network address",
        ):
            dist.TCPStore(
                host_name="localhost",
                port=store.port,
                world_size=1,
                is_master=True,
                use_libuv=self._use_libuv,
            )

        USE_AGENT_STORE = "TORCHELASTIC_USE_AGENT_STORE"
        MASTER_PORT = "MASTER_PORT"

        os.environ[USE_AGENT_STORE] = "1"
        os.environ[MASTER_PORT] = str(store.port)
        second_server = dist.TCPStore(
            host_name="localhost",
            port=store.port,
            world_size=1,
            is_master=True,
            use_libuv=self._use_libuv,
        )
        del os.environ[USE_AGENT_STORE]
        del os.environ[MASTER_PORT]

        self.assertEqual(second_server.port, store.port)


class LibUvTCPStoreTest(TCPStoreTest):
    _use_libuv = True

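The new `test_agent_store` exercises the agent-store path: when `TORCHELASTIC_USE_AGENT_STORE=1` and `MASTER_PORT` point at an already-running TCPStore server, constructing a second "master" store on that port attaches to the existing server instead of failing to bind. The sketch below restates that behavior outside the test class; the port choice is an assumption, and real use would normally go through torchelastic rather than setting these variables by hand.

```python
# Hedged sketch of agent-store reuse, mirroring test_agent_store above.
import os
import torch.distributed as dist

PORT = 29512  # assumed free port for illustration

first = dist.TCPStore("localhost", PORT, world_size=1, is_master=True)

os.environ["TORCHELASTIC_USE_AGENT_STORE"] = "1"
os.environ["MASTER_PORT"] = str(first.port)
# With the env vars set, this does not try to bind again; it reuses the running server.
second = dist.TCPStore("localhost", first.port, world_size=1, is_master=True)

assert second.port == first.port
```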
@@ -1251,7 +1251,7 @@ Non-primal fwd outputs from model w/o backward hook: {mod_no_hook_fwd_outputs_no
        x = torch.randn(4, 4).to(device)
        opt_fn = torch.compile(fn, fullgraph=True)
        with self.assertRaisesRegex(
            torch._dynamo.exc.Unsupported, "skip function graph_break in file"
            torch._dynamo.exc.Unsupported, "User-inserted graph break"
        ):
            opt_fn(x)

@@ -27,7 +27,7 @@ from torch._inductor.utils import fresh_inductor_cache
from torch._subclasses import FakeTensorMode
from torch.compiler._cache import CacheArtifactManager
from torch.fx.experimental.symbolic_shapes import ShapeEnv
from torch.testing._internal.common_cuda import SM80OrLater
from torch.testing._internal.common_cuda import SM80OrLater, TEST_MULTIGPU
from torch.testing._internal.common_device_type import largeTensorTest
from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
@@ -693,6 +693,36 @@ class AOTAutogradCacheTests(InductorTestCase):
        self.assertNotEqual(res1, res3)
        self.assertEqual(res1, res3.sub(torch.ones(2, 2)))

    @inductor_config.patch("fx_graph_cache", True)
    @inductor_config.patch("fx_graph_remote_cache", False)
    @functorch_config.patch({"enable_autograd_cache": True})
    @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
    def test_constant_tensor_device_guards(self):
        """
        Usually, when there are example inputs, the device index of the inputs
        is sufficient to make sure we don't cache hit with the results from different
        cuda devices.
        When the input has no arguments, we still need to have the cuda
        device index in the cache key.
        """

        @torch.compile
        def f():
            y = torch.tensor([5], device="cuda")
            return (y,)

        with torch.cuda._DeviceGuard(0):
            torch.cuda.set_device(0)
            result = f()
            self.assertEqual(result[0].device, torch.device("cuda:0"))

        self._clear_dynamo_and_codecache()

        with torch.cuda._DeviceGuard(1):
            torch.cuda.set_device(1)
            result = f()
            self.assertEqual(result[0].device, torch.device("cuda:1"))


@inductor_config.patch("fx_graph_cache", True)
class AOTAutogradCachePicklerTests(torch._dynamo.test_case.TestCase):

@@ -271,7 +271,10 @@ class AutogradFunctionTests(torch._dynamo.test_case.TestCase):
        model = CustomFuncBwdPrintModule()
        opt_model = torch.compile(model, backend="eager", fullgraph=True)
        x = torch.randn(2, 2, dtype=torch.double, requires_grad=True)
        with self.assertRaisesRegex(torch._dynamo.exc.Unsupported, "builtin: print"):
        with self.assertRaisesRegex(
            torch._dynamo.exc.Unsupported,
            "Dynamo does not know how to trace builtin operator `print`",
        ):
            opt_model(x)

    def test_stride_in_bwd(self):
