Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-28 02:04:53 +08:00)

Compare commits: cslpull88...mlazos/tf- (158 commits)
Commits (abbreviated SHA1):

ac3dabf652, 54ab06fc07, 32542724be, dfbb990dc4, 194d46e91c, 9094fb5c7c, ec6b49eed9, 042f2f7746,
fd494dd426, 8334cb2fb9, e72ed4717e, 3bebc09be9, a2db22e6bb, eac5e12548, 18479c5f70, f7c0c06692,
b53d97c7be, 6c1da66407, d7c97e7245, be9f4ffe88, 692faa9bc6, 32f3af72b7, ebab5c85c4, 3d734d837b,
c92227c41a, e6a0221fc6, a6b9d444fb, d42b0c8f22, 941d094dd1, b1a934741e, 0c661f3e1a, 2c7e314803,
ead4407f57, 2f5b40c099, 993b5647ab, 2ab26806f1, b1612569f6, dc0e818738, 06e414d7fe, a681260caf,
95e976a63f, 306ac44eaa, a7643baceb, a4030e37be, 22e1fb6faa, 2a4890e315, 3ce433aef2, 7f2d20e687,
32fd29c1ea, 5eebd9315a, a15aabc975, b143426db3, 13ba0a2e5c, 8520ce5f78, 196748d491, 177e4f4218,
3988b3468b, 04118d8617, 24482e5c68, c0ec599f27, 7074de43c0, 771dcce11d, de74aafff4, ad29a2c0dc,
3a9e33dca8, a086882d72, 84ae6b7d6b, 60a097a071, 13bae39e22, 4ef6c05f65, d6b9bd3e60, d0591f4658,
b5dea061c8, 041960a1ce, 67c7924ea1, 217ba7b2ab, 758d515d98, 60d98b4cfb, 590a3e9f8a, 764ee6e3f9,
67f98a99a4, e020a8755a, 7ffb3b201c, f946bf88c4, 66da3b3b2a, 41e653456e, e40a0a9359, c05a7adb36,
5f57be7571, 29d72c1100, 3b1a334c0f, 07689a38bf, 06a7dc21c1, d9a18173fa, d8543e3162, ad01fc194d,
e162414963, 9e5a797771, b46a1b9e2d, 9688014820, 8f6e73f068, 1e57ef08fa, 614b86d602, 0b96dfb736,
62b221d5cc, 66dd4577b1, cc28634172, c83cdf068b, 28ccfba248, b2386bdca1, bdfc8d9f96, 70779dded8,
ea231300d1, 8f66995459, 144fde4fd2, 43f4947d44, 65e1c34061, 830247c355, 4262755b5a, 3825607144,
3c8f71ff93, fc890b55b5, 058a69d91a, 6c5920d515, 116fd474da, a5d70cf545, 7fe819d917, f63571060c,
38fead8f7c, 24a223c49d, e4920a1364, bc5ecf83d7, e55c0f59e5, a4cf9653ee, 9c0b03020b, 034717a029,
9c38b00999, 8efe547046, 82d00acfee, 098431a29d, be660ea2d3, 52c7c89ea4, 1efd341d15, a096f2899d,
dbeb8a1691, b1f72e2984, bb3c2408f4, 2c99f17a32, 0043dcd79e, 2e2fb668fa, 9d24f945ba, ecbd715363,
58f2477a26, 43dcb4bb61, 50d1e37079, b99ef1a02e, 8a5c8e5db9, c7328dff7f
@@ -108,10 +108,10 @@ ENV CMAKE_C_COMPILER cc
 ENV CMAKE_CXX_COMPILER c++
 COPY ./common/install_triton.sh install_triton.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
+COPY ci_commit_pins/triton.txt triton.txt
 COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
-RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
+RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt

 # Install AOTriton (Early fail)
 COPY ./aotriton_version.txt aotriton_version.txt
@@ -1 +0,0 @@
-21eae954efa5bf584da70324b640288c3ee7aede
@@ -1 +1 @@
-1b2f15840e0d70eec50d84c7a0575cb835524def
+cc981feba10a3f4c2e46f3fe368e8fcf5f5643df
@@ -1 +1 @@
-dedb7bdf339a3546896d4820366ca562c586bfa0
+757b6a61e7df814ba806f498f8bb3160f84b120c
@@ -12,10 +12,7 @@ conda_reinstall() {
   as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $*
 }

-if [ -n "${ROCM_VERSION}" ]; then
-  TRITON_REPO="https://github.com/openai/triton"
-  TRITON_TEXT_FILE="triton-rocm"
-elif [ -n "${XPU_VERSION}" ]; then
+if [ -n "${XPU_VERSION}" ]; then
   TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
   TRITON_TEXT_FILE="triton-xpu"
 else
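The net effect of this hunk is that ROCm no longer selects its own Triton repository or pin file; only XPU still overrides the default. A minimal Python sketch of the selection after the change, assuming a default repo/pin, since the script's else branch lies outside the hunk:

```python
# Sketch of the Triton source selection after this hunk. The "default" repo and
# pin file below are assumptions; the else branch is not shown in the diff.
def select_triton_source(rocm_version: str = "", xpu_version: str = "") -> tuple[str, str]:
    # rocm_version is accepted but deliberately ignored: ROCm no longer has its own branch.
    if xpu_version:
        return ("https://github.com/intel/intel-xpu-backend-for-triton", "triton-xpu")
    return ("https://github.com/openai/triton", "triton")  # assumed default, shared with CUDA


print(select_triton_source(rocm_version="6.1"))    # now the same source as CUDA builds
print(select_triton_source(xpu_version="2025.0"))  # still the Intel XPU backend
```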
@@ -30,9 +30,14 @@ dill==0.3.7
 #Pinned versions: 0.3.7
 #test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py

-expecttest==0.1.6
+expecttest==0.2.1
 #Description: method for writing tests where test framework auto populates
 # the expected output based on previous runs
 #Pinned versions: 0.2.1
 #test that import:

+fbscribelogger==0.1.6
+#Description: write to scribe from authenticated jobs on CI
+#Pinned versions: 0.1.6
+#test that import:
+
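The expecttest entry is only a version bump, but since the requirements comment describes how the framework works, a minimal usage sketch may help. It assumes the standalone expecttest package's TestCase/assertExpectedInline API and the EXPECTTEST_ACCEPT environment variable, none of which appear in this diff:

```python
# Minimal sketch of the expecttest workflow described above (assumed API).
import expecttest


class TestFormatting(expecttest.TestCase):
    def test_repr(self) -> None:
        value = {"a": [1, 2, 3]}
        # With EXPECTTEST_ACCEPT=1, the framework rewrites the second argument
        # in this source file to match the actual output of the first argument.
        self.assertExpectedInline(repr(value), """{'a': [1, 2, 3]}""")


# Typical invocation (assumption):
#   EXPECTTEST_ACCEPT=1 python -m pytest test_formatting.py   # populate/refresh expectations
#   python -m pytest test_formatting.py                       # normal run against the inline strings
```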
@@ -100,10 +100,10 @@ ARG TRITON
 # try to reach out to S3, which docker build runners don't have access
 COPY ./common/install_triton.sh install_triton.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
+COPY ci_commit_pins/triton.txt triton.txt
 COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
-RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
+RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt

 # Install AOTriton
 COPY ./aotriton_version.txt aotriton_version.txt
@@ -596,6 +596,9 @@ test_single_dynamo_benchmark() {

 test_inductor_micro_benchmark() {
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
+  if [[ "${TEST_CONFIG}" == *cpu* ]]; then
+    test_inductor_set_cpu_affinity
+  fi
   python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
 }
@@ -119,6 +119,11 @@ fi
 # Test the package
 /builder/check_binary.sh

+if [[ "\$GPU_ARCH_TYPE" != *s390x* && "\$GPU_ARCH_TYPE" != *xpu* && "\$GPU_ARCH_TYPE" != *rocm* && "$PACKAGE_TYPE" != libtorch ]]; then
+  # Exclude s390, xpu, rocm and libtorch builds from smoke testing
+  python /builder/test/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled
+fi
+
 # Clean temp files
 cd /builder && git clean -ffdx
@@ -90,7 +90,7 @@ fi
 if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then
   TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
   if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
-    TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt)
+    TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
     TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
   fi
   if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
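A small Python sketch of how that requirement string is assembled for dev/nightly ROCm wheels after the change; the function name and example values are hypothetical, only the string shapes come from the bash above:

```python
# Hypothetical restatement of the requirement-string assembly for ROCm dev wheels.
from pathlib import Path


def rocm_triton_requirement(pytorch_root: str, triton_version: str,
                            constraint: str, is_dev: bool) -> str:
    requirement = f"pytorch-triton-rocm=={triton_version}; {constraint}"
    if is_dev:
        # Reads the shared pin now that triton-rocm.txt is gone.
        pin = Path(pytorch_root, ".ci/docker/ci_commit_pins/triton.txt").read_text().strip()
        shorthash = pin[:10]  # same effect as `cut -c1-10`
        requirement = f"pytorch-triton-rocm=={triton_version}+{shorthash}; {constraint}"
    return requirement


# Example with made-up inputs:
# rocm_triton_requirement("/pytorch", "3.0.0", "platform_system == 'Linux'", True)
# -> "pytorch-triton-rocm==3.0.0+0123456789; platform_system == 'Linux'"
```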
.github/pytorch-probot.yml (1 change, vendored)

@@ -9,6 +9,7 @@ ciflow_push_tags:
 - ciflow/inductor-rocm
 - ciflow/inductor-perf-compare
 - ciflow/inductor-micro-benchmark
+- ciflow/inductor-micro-benchmark-cpu-x86
 - ciflow/inductor-cu124
 - ciflow/linux-aarch64
 - ciflow/mps
@@ -1,6 +1,7 @@
 boto3==1.19.12
 hypothesis==6.56.4
-expecttest==0.1.6
+expecttest==0.2.1
+fbscribelogger==0.1.6
 librosa>=0.6.2
 mpmath==1.3.0
 networkx==2.8.7
.github/scripts/build_triton_wheel.py (4 changes, vendored)

@@ -15,9 +15,7 @@ REPO_DIR = SCRIPT_DIR.parent.parent

 def read_triton_pin(device: str = "cuda") -> str:
     triton_file = "triton.txt"
-    if device == "rocm":
-        triton_file = "triton-rocm.txt"
-    elif device == "xpu":
+    if device == "xpu":
         triton_file = "triton-xpu.txt"
     with open(REPO_DIR / ".ci" / "docker" / "ci_commit_pins" / triton_file) as f:
         return f.read().strip()
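A short, hypothetical call-site illustration of read_triton_pin after this hunk (it assumes the pin files exist in the checkout):

```python
# Illustrative only: behaviour of read_triton_pin after the change above.
pin_cuda = read_triton_pin()         # reads triton.txt
pin_rocm = read_triton_pin("rocm")   # also reads triton.txt now; triton-rocm.txt is gone
pin_xpu = read_triton_pin("xpu")     # still reads triton-xpu.txt
assert pin_cuda == pin_rocm          # CUDA and ROCm build against the same Triton commit
```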
.github/scripts/generate_binary_build_matrix.py (47 changes, vendored)

@@ -325,6 +325,7 @@ def generate_wheels_matrix(
     os: str,
     arches: Optional[List[str]] = None,
     python_versions: Optional[List[str]] = None,
+    use_split_build: bool = False,
 ) -> List[Dict[str, str]]:
     package_type = "wheel"
     if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":

@@ -371,7 +372,17 @@
             ) and python_version == "3.13":
                 continue

+            if use_split_build and (
+                arch_version not in ["12.4", "12.1", "11.8", "cpu"] or os != "linux"
+            ):
+                raise RuntimeError(
+                    "Split build is only supported on linux with cuda 12.4, 12.1, 11.8, and cpu.\n"
+                    f"Currently attempting to build on arch version {arch_version} and os {os}.\n"
+                    "Please modify the matrix generation to exclude this combination."
+                )
+
             # 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
             if (
                 arch_version in ["12.4", "12.1", "11.8"]
                 and os == "linux"

@@ -385,6 +396,7 @@ def generate_wheels_matrix(
                     "desired_cuda": translate_desired_cuda(
                         gpu_arch_type, gpu_arch_version
                     ),
+                    "use_split_build": "True" if use_split_build else "False",
                     "devtoolset": (
                         "cxx11-abi" if arch_version == "cuda-aarch64" else ""
                     ),

@@ -400,7 +412,8 @@ def generate_wheels_matrix(
                     ),
                 }
             )
-            if arch_version != "cuda-aarch64":
+            # Special build building to use on Colab. PyThon 3.10 for 12.1 CUDA
+            if python_version == "3.10" and arch_version == "12.1":
                 ret.append(
                     {
                         "python_version": python_version,

@@ -409,40 +422,16 @@ def generate_wheels_matrix(
                         "desired_cuda": translate_desired_cuda(
                             gpu_arch_type, gpu_arch_version
                         ),
-                        "use_split_build": "True",
+                        "use_split_build": "True" if use_split_build else "False",
                         "devtoolset": "",
                         "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
                         "package_type": package_type,
-                        "pytorch_extra_install_requirements": (
-                            PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]  # fmt: skip
-                            if os != "linux-aarch64"
-                            else ""
-                        ),
-                        "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-split".replace(  # noqa: B950
+                        "pytorch_extra_install_requirements": "",
+                        "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace(  # noqa: B950
                             ".", "_"
                         ),
                     }
                 )
-                # Special build building to use on Colab. PyThon 3.10 for 12.1 CUDA
-                if python_version == "3.10" and arch_version == "12.1":
-                    ret.append(
-                        {
-                            "python_version": python_version,
-                            "gpu_arch_type": gpu_arch_type,
-                            "gpu_arch_version": gpu_arch_version,
-                            "desired_cuda": translate_desired_cuda(
-                                gpu_arch_type, gpu_arch_version
-                            ),
-                            "use_split_build": "False",
-                            "devtoolset": "",
-                            "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
-                            "package_type": package_type,
-                            "pytorch_extra_install_requirements": "",
-                            "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace(  # noqa: B950
-                                ".", "_"
-                            ),
-                        }
-                    )
             else:
                 ret.append(
                     {

@@ -452,6 +441,7 @@ def generate_wheels_matrix(
                     "desired_cuda": translate_desired_cuda(
                         gpu_arch_type, gpu_arch_version
                     ),
+                    "use_split_build": "True" if use_split_build else "False",
                     "devtoolset": (
                         "cxx11-abi" if arch_version == "cpu-cxx11-abi" else ""
                     ),

@@ -467,6 +457,7 @@ def generate_wheels_matrix(
                     ),
                 }
             )
+
     return ret
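The new guard is the part of this file most likely to affect callers. The sketch below restates the same condition as a standalone function (it does not import the real script) so the allowed combinations are explicit:

```python
# Standalone restatement of the split-build guard added above, for illustration only.
ALLOWED_SPLIT_ARCHES = {"12.4", "12.1", "11.8", "cpu"}


def check_split_build(arch_version: str, os: str, use_split_build: bool) -> None:
    if use_split_build and (arch_version not in ALLOWED_SPLIT_ARCHES or os != "linux"):
        raise RuntimeError(
            "Split build is only supported on linux with cuda 12.4, 12.1, 11.8, and cpu.\n"
            f"Currently attempting to build on arch version {arch_version} and os {os}.\n"
            "Please modify the matrix generation to exclude this combination."
        )


check_split_build("12.4", "linux", True)     # accepted
check_split_build("cpu", "linux", True)      # accepted
# check_split_build("12.4", "windows", True) # would raise RuntimeError (wrong OS)
# check_split_build("rocm6.1", "linux", True) # would raise RuntimeError (unsupported arch)
```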
.github/scripts/generate_ci_workflows.py (35 changes, vendored)

@@ -61,6 +61,7 @@ class BinaryBuildWorkflow:
     # Mainly for macos
     cross_compile_arm64: bool = False
     macos_runner: str = "macos-14-xlarge"
+    use_split_build: bool = False

     def __post_init__(self) -> None:
         if self.abi_version:

@@ -75,6 +76,11 @@ class BinaryBuildWorkflow:
             GITHUB_DIR
             / f"workflows/generated-{self.build_environment}-{self.branches}.yml"
         )
+        if self.use_split_build:
+            output_file_path = (
+                GITHUB_DIR
+                / f"workflows/generated-{self.build_environment}-{self.branches}-split.yml"
+            )
         with open(output_file_path, "w") as output_file:
             GENERATED = "generated"  # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file
             output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"])

@@ -110,6 +116,20 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
             isolated_workflow=True,
         ),
     ),
+    BinaryBuildWorkflow(
+        os=OperatingSystem.LINUX,
+        package_type="manywheel",
+        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+            OperatingSystem.LINUX,
+            use_split_build=True,
+            arches=["11.8", "12.1", "12.4", "cpu"],
+        ),
+        ciflow_config=CIFlowConfig(
+            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
+            isolated_workflow=True,
+        ),
+        use_split_build=True,
+    ),
     BinaryBuildWorkflow(
         os=OperatingSystem.LINUX,
         package_type="conda",

@@ -162,6 +182,21 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
         ),
         branches="main",
     ),
+    BinaryBuildWorkflow(
+        os=OperatingSystem.LINUX,
+        package_type="manywheel",
+        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+            OperatingSystem.LINUX,
+            arches=["11.8", "12.1", "12.4"],
+            python_versions=["3.9"],
+            use_split_build=True,
+        ),
+        ciflow_config=CIFlowConfig(
+            labels={LABEL_CIFLOW_PERIODIC},
+        ),
+        branches="main",
+        use_split_build=True,
+    ),
     BinaryBuildWorkflow(
         os=OperatingSystem.LINUX,
         package_type="libtorch",
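One consequence worth noting: with use_split_build set, __post_init__ writes the workflow to a "-split" suffixed file, which is why generated-linux-binary-manywheel-main-split.yml and the nightly "-split" variant appear further down in this comparison. A minimal sketch of the path selection (the build_environment value is an assumption chosen to match those files):

```python
# Minimal sketch of the output-path logic added in __post_init__ above.
build_environment = "linux-binary-manywheel"   # assumed value for illustration
branches = "main"
use_split_build = True

suffix = "-split" if use_split_build else ""
output_file_path = f".github/workflows/generated-{build_environment}-{branches}{suffix}.yml"
print(output_file_path)
# .github/workflows/generated-linux-binary-manywheel-main-split.yml
```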
.github/templates/upload.yml.j2 (2 changes, vendored)

@@ -45,7 +45,7 @@
 {%- if is_windows %}
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.8"
+      DESIRED_PYTHON: "3.9"
 {%- endif %}

 {%- else %}
.github/workflows/build-triton-wheel.yml (8 changes, vendored)

@@ -13,7 +13,6 @@ on:
       - .github/scripts/build_triton_wheel.py
       - .github/ci_commit_pins/triton.txt
       - .ci/docker/ci_commit_pins/triton.txt
-      - .ci/docker/ci_commit_pins/triton-rocm.txt
       - .ci/docker/ci_commit_pins/triton-xpu.txt
   pull_request:
     paths:

@@ -21,7 +20,6 @@ on:
       - .github/scripts/build_triton_wheel.py
       - .github/ci_commit_pins/triton.txt
       - .ci/docker/ci_commit_pins/triton.txt
-      - .ci/docker/ci_commit_pins/triton-rocm.txt
       - .ci/docker/ci_commit_pins/triton-xpu.txt

 concurrency:

@@ -31,7 +29,7 @@ concurrency:
 jobs:
   build-wheel:
     name: "Build Triton Wheel"
-    runs-on: [self-hosted, linux.2xlarge]
+    runs-on: [self-hosted, linux.4xlarge]
     strategy:
       fail-fast: false
       matrix:

@@ -120,7 +118,7 @@ jobs:
         fi
         docker exec -t "${container_name}" chown -R 1000.1000 /artifacts

-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4.4.0
        with:
          name: pytorch-triton-wheel-${{ matrix.py_vers }}-${{ matrix.device }}
          if-no-files-found: error

@@ -253,7 +251,7 @@ jobs:
        docker exec -t "${container_name}" python /pytorch/.github/scripts/build_triton_wheel.py --build-conda --py-version="${PY_VERS}" $RELEASE
        docker exec -t "${container_name}" chown -R 1000.1000 /artifacts

-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4.4.0
        with:
          name: pytorch-triton-conda-${{ matrix.py_vers }}
          if-no-files-found: error
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (20 changes, generated, vendored)

Twenty one-line hunks, one per aarch64 job definition (the build, test and upload jobs for manywheel-py3_9 through manywheel-py3_12 on cpu-aarch64 and cuda-aarch64), each inserting the same job parameter alongside DOCKER_IMAGE and DESIRED_PYTHON:

+      use_split_build: False

Hunks: @@ -58,6 +58,7 @@, @@ -81,6 +82,7 @@, @@ -103,6 +105,7 @@, @@ -125,6 +128,7 @@,
@@ -149,6 +153,7 @@, @@ -170,6 +175,7 @@, @@ -193,6 +199,7 @@, @@ -215,6 +222,7 @@,
@@ -237,6 +245,7 @@, @@ -261,6 +270,7 @@, @@ -282,6 +292,7 @@, @@ -305,6 +316,7 @@,
@@ -327,6 +339,7 @@, @@ -349,6 +362,7 @@, @@ -373,6 +387,7 @@, @@ -394,6 +409,7 @@,
@@ -417,6 +433,7 @@, @@ -439,6 +456,7 @@, @@ -461,6 +479,7 @@, @@ -485,6 +504,7 @@.
.github/workflows/generated-linux-binary-manywheel-main-split.yml (new file, 182 lines, generated, vendored)

# @generated DO NOT EDIT MANUALLY

# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: linux-binary-manywheel


on:
  push:
    branches:
      - main
    tags:
      - 'ciflow/periodic/*'
  workflow_dispatch:

env:
  # Needed for conda builds
  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
  ANACONDA_USER: pytorch
  AWS_DEFAULT_REGION: us-east-1
  BINARY_ENV_FILE: /tmp/env
  BUILD_ENVIRONMENT: linux-binary-manywheel
  BUILDER_ROOT: /builder
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  PR_NUMBER: ${{ github.event.pull_request.number }}
  PYTORCH_FINAL_PACKAGE_DIR: /artifacts
  PYTORCH_ROOT: /pytorch
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  SKIP_ALL_TESTS: 0
concurrency:
  group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

jobs:
  get-label-type:
    name: get-label-type
    uses: ./.github/workflows/_runner-determinator.yml
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}
  manywheel-py3_9-cuda11_8-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
    needs: get-label-type
    with:
      PYTORCH_ROOT: /pytorch
      BUILDER_ROOT: /builder
      PACKAGE_TYPE: manywheel
      # TODO: This is a legacy variable that we eventually want to get rid of in
      # favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu118
      GPU_ARCH_VERSION: 11.8
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
      use_split_build: True
      DESIRED_PYTHON: "3.9"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda11_8
      build_environment: linux-binary-manywheel
      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda11_8-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
      - manywheel-py3_9-cuda11_8-build
      - get-label-type
    uses: ./.github/workflows/_binary-test-linux.yml
    with:
      PYTORCH_ROOT: /pytorch
      BUILDER_ROOT: /builder
      PACKAGE_TYPE: manywheel
      # TODO: This is a legacy variable that we eventually want to get rid of in
      # favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu118
      GPU_ARCH_VERSION: 11.8
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
      use_split_build: True
      DESIRED_PYTHON: "3.9"
      build_name: manywheel-py3_9-cuda11_8
      build_environment: linux-binary-manywheel
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.4xlarge.nvidia.gpu
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}

The file defines two more build/test pairs of exactly the same shape, manywheel-py3_9-cuda12_1-build/-test and manywheel-py3_9-cuda12_4-build/-test, differing from the pair above only in the job names referenced in needs:/build_name: and in these values:

- cuda12_1: DESIRED_CUDA: cu121, GPU_ARCH_VERSION: 12.1, DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main, build_name: manywheel-py3_9-cuda12_1, PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
- cuda12_4: DESIRED_CUDA: cu124, GPU_ARCH_VERSION: 12.4, DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main, build_name: manywheel-py3_9-cuda12_4, PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
.github/workflows/generated-linux-binary-manywheel-main.yml (147 changes, generated, vendored)

Two kinds of changes:

1. The remaining non-split build and test jobs each gain an explicit parameter. Hunks @@ -54,6 +54,7 @@ and @@ -77,6 +78,7 @@ (manywheel-py3_9-cuda11_8), @@ -146,6 +101,7 @@ and @@ -169,6 +125,7 @@ (manywheel-py3_9-cuda12_1), and @@ -238,6 +148,7 @@ and @@ -261,6 +172,7 @@ (manywheel-py3_9-cuda12_4) each insert:

+      use_split_build: False

2. The split-build job pairs are removed from this workflow. Hunks @@ -85,53 +87,6 @@, @@ -177,53 +134,6 @@ and @@ -268,50 +180,3 @@ delete manywheel-py3_9-cuda11_8-split-build/-test, manywheel-py3_9-cuda12_1-split-build/-test and manywheel-py3_9-cuda12_4-split-build/-test. The deleted job definitions match the ones now generated into generated-linux-binary-manywheel-main-split.yml above (use_split_build: True, same Docker images and PYTORCH_EXTRA_INSTALL_REQUIREMENTS pins), except that here their job names and build_name values carried a "-split" suffix, e.g. build_name: manywheel-py3_9-cuda11_8-split.
.github/workflows/generated-linux-binary-manywheel-nightly-split.yml (new file, 1516 lines, generated, vendored): diff suppressed because it is too large.

.github/workflows/generated-linux-binary-manywheel-nightly.yml (1179 changes, generated, vendored): diff suppressed because it is too large.
.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml (15 changes, generated, vendored)

Fifteen one-line hunks, one per cpu-s390x job definition (the build, test and upload jobs for manywheel-py3_9 through manywheel-py3_13), each inserting:

+      use_split_build: False

Hunks: @@ -58,6 +58,7 @@, @@ -81,6 +82,7 @@, @@ -103,6 +105,7 @@, @@ -124,6 +127,7 @@,
@@ -147,6 +151,7 @@, @@ -169,6 +174,7 @@, @@ -190,6 +196,7 @@, @@ -213,6 +220,7 @@,
@@ -235,6 +243,7 @@, @@ -256,6 +265,7 @@, @@ -279,6 +289,7 @@, @@ -301,6 +312,7 @@,
@@ -322,6 +334,7 @@, @@ -345,6 +358,7 @@, @@ -367,6 +381,7 @@.
.github/workflows/generated-macos-arm64-binary-libtorch-cxx11-abi-nightly.yml (2 changes, generated, vendored)

@@ -49,7 +49,7 @@ jobs:
       DESIRED_DEVTOOLSET: cxx11-abi
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.8"
+      DESIRED_PYTHON: "3.9"
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
.github/workflows/generated-windows-binary-libtorch-debug-main.yml (4 changes, generated, vendored)

Two identical hunks (@@ -51,7 +51,7 @@ and @@ -169,7 +169,7 @@), one per job:

       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.8"
+      DESIRED_PYTHON: "3.9"
     steps:
       - name: Display EC2 information
         shell: bash
.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml (24 changes, generated, vendored)

Twelve hunks of the form @@ -N,7 +N,7 @@ at lines 58, 176, 290, 316, 435, 550, 576, 695, 810, 836, 955 and 1070, covering the build, test and upload jobs for the libtorch-cpu, libtorch-cuda11_8, libtorch-cuda12_1 and libtorch-cuda12_4 shared-with-deps debug configurations. Each makes the same substitution under the "dummy value for libtorch" comment:

-      DESIRED_PYTHON: "3.8"
+      DESIRED_PYTHON: "3.9"
.github/workflows/generated-windows-binary-libtorch-release-main.yml (4 changes, generated, vendored)

Two identical hunks (@@ -51,7 +51,7 @@ and @@ -169,7 +169,7 @@), one per job, with the same substitution as the debug-main workflow above:

-      DESIRED_PYTHON: "3.8"
+      DESIRED_PYTHON: "3.9"
.github/workflows/generated-windows-binary-libtorch-release-nightly.yml (generated, vendored, 24 lines changed)
@@ -58,7 +58,7 @@ jobs:
 LIBTORCH_VARIANT: shared-with-deps
 # This is a dummy value for libtorch to work correctly with our batch scripts
 # without this value pip does not get installed for some reason
-DESIRED_PYTHON: "3.8"
+DESIRED_PYTHON: "3.9"
 steps:
 - name: Display EC2 information
   shell: bash
@@ -176,7 +176,7 @@ jobs:
 LIBTORCH_VARIANT: shared-with-deps
 # This is a dummy value for libtorch to work correctly with our batch scripts
 # without this value pip does not get installed for some reason
-DESIRED_PYTHON: "3.8"
+DESIRED_PYTHON: "3.9"
 steps:
 - name: Display EC2 information
   shell: bash
@@ -290,7 +290,7 @@ jobs:
 LIBTORCH_VARIANT: shared-with-deps
 # This is a dummy value for libtorch to work correctly with our batch scripts
 # without this value pip does not get installed for some reason
-DESIRED_PYTHON: "3.8"
+DESIRED_PYTHON: "3.9"
 build_name: libtorch-cpu-shared-with-deps-release
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -316,7 +316,7 @@ jobs:
 LIBTORCH_VARIANT: shared-with-deps
 # This is a dummy value for libtorch to work correctly with our batch scripts
 # without this value pip does not get installed for some reason
-DESIRED_PYTHON: "3.8"
+DESIRED_PYTHON: "3.9"
 steps:
 - name: Display EC2 information
   shell: bash
@@ -435,7 +435,7 @@ jobs:
 LIBTORCH_VARIANT: shared-with-deps
 # This is a dummy value for libtorch to work correctly with our batch scripts
 # without this value pip does not get installed for some reason
-DESIRED_PYTHON: "3.8"
+DESIRED_PYTHON: "3.9"
 steps:
 - name: Display EC2 information
   shell: bash
@@ -550,7 +550,7 @@ jobs:
 LIBTORCH_VARIANT: shared-with-deps
 # This is a dummy value for libtorch to work correctly with our batch scripts
 # without this value pip does not get installed for some reason
-DESIRED_PYTHON: "3.8"
+DESIRED_PYTHON: "3.9"
 build_name: libtorch-cuda11_8-shared-with-deps-release
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -576,7 +576,7 @@ jobs:
 LIBTORCH_VARIANT: shared-with-deps
 # This is a dummy value for libtorch to work correctly with our batch scripts
 # without this value pip does not get installed for some reason
-DESIRED_PYTHON: "3.8"
+DESIRED_PYTHON: "3.9"
 steps:
 - name: Display EC2 information
   shell: bash
@@ -695,7 +695,7 @@ jobs:
 LIBTORCH_VARIANT: shared-with-deps
 # This is a dummy value for libtorch to work correctly with our batch scripts
 # without this value pip does not get installed for some reason
-DESIRED_PYTHON: "3.8"
+DESIRED_PYTHON: "3.9"
 steps:
 - name: Display EC2 information
   shell: bash
@@ -810,7 +810,7 @@ jobs:
 LIBTORCH_VARIANT: shared-with-deps
 # This is a dummy value for libtorch to work correctly with our batch scripts
 # without this value pip does not get installed for some reason
-DESIRED_PYTHON: "3.8"
+DESIRED_PYTHON: "3.9"
 build_name: libtorch-cuda12_1-shared-with-deps-release
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -836,7 +836,7 @@ jobs:
 LIBTORCH_VARIANT: shared-with-deps
 # This is a dummy value for libtorch to work correctly with our batch scripts
 # without this value pip does not get installed for some reason
-DESIRED_PYTHON: "3.8"
+DESIRED_PYTHON: "3.9"
 steps:
 - name: Display EC2 information
   shell: bash
@@ -955,7 +955,7 @@ jobs:
 LIBTORCH_VARIANT: shared-with-deps
 # This is a dummy value for libtorch to work correctly with our batch scripts
 # without this value pip does not get installed for some reason
-DESIRED_PYTHON: "3.8"
+DESIRED_PYTHON: "3.9"
 steps:
 - name: Display EC2 information
   shell: bash
@@ -1070,7 +1070,7 @@ jobs:
 LIBTORCH_VARIANT: shared-with-deps
 # This is a dummy value for libtorch to work correctly with our batch scripts
 # without this value pip does not get installed for some reason
-DESIRED_PYTHON: "3.8"
+DESIRED_PYTHON: "3.9"
 build_name: libtorch-cuda12_4-shared-with-deps-release
 secrets:
   github-token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/inductor-micro-benchmark-x86.yml (vendored, new file, 40 lines)
@@ -0,0 +1,40 @@
name: inductor-micro-benchmark-x86

on:
  schedule:
    - cron: 0 7 * * *
  push:
    tags:
      - ciflow/inductor-micro-benchmark-cpu-x86/*
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

permissions: read-all

jobs:
  linux-jammy-cpu-py3_9-gcc11-inductor-build:
    name: linux-jammy-cpu-py3.9-gcc11-inductor
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-jammy-py3.9-gcc11
      docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
      # Use metal host for benchmark jobs
      test-matrix: |
        { include: [
          { config: "inductor-micro-benchmark-cpu-x86", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
        ]}

  linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test:
    name: linux-jammy-cpu-py3.9-gcc11-inductor
    uses: ./.github/workflows/_linux-test.yml
    needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
    with:
      build-environment: linux-jammy-py3.9-gcc11
      docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
      use-gha: anything-non-empty-to-use-gha
      timeout-minutes: 720
.github/workflows/lint.yml (vendored, 2 lines changed)
@@ -223,7 +223,7 @@ jobs:
   cache: pip
 - name: Install dependencies
   run: |
-    pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.1.* numpy==1.24.*
+    pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.2.* fbscribelogger==0.1.* numpy==1.24.*
     pip install torch --pre --index-url https://download.pytorch.org/whl/nightly/cpu/
 - name: Run run_test.py (nonretryable)
   run: |
.github/workflows/upload-test-stats.yml (vendored, 2 lines changed)
@@ -2,7 +2,7 @@ name: Upload test stats

on:
  workflow_run:
-    workflows: [pull, trunk, periodic, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, inductor-micro-benchmark, inductor-cu124, inductor-rocm]
+    workflows: [pull, trunk, periodic, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, inductor-micro-benchmark, inductor-micro-benchmark-x86, inductor-cu124, inductor-rocm]
    types:
      - completed
@@ -138,7 +138,7 @@ init_command = [
     '--dry-run={{DRYRUN}}',
     'numpy==1.24.3 ; python_version == "3.8"',
     'numpy==1.26.0 ; python_version >= "3.9"',
-    'expecttest==0.1.6',
+    'expecttest==0.2.1',
     'mypy==1.10.0',
     'sympy==1.12.1 ; python_version == "3.8"',
     'sympy==1.13.0 ; python_version >= "3.9"',
@@ -332,6 +332,7 @@ intern_build_aten_ops(
         "@fbgemm",
         "@mkl",
+        "@sleef",
         "@mkl_dnn//:mkl-dnn",
     ],
 )
@@ -57,7 +57,6 @@ nn/qat/ @jerryzh168
 # Docker
 /.ci/docker/ @jeffdaily
 /.ci/docker/ci_commit_pins/triton.txt @desertfire @Chillee @eellison @shunting314 @bertmaher @jeffdaily @jataylo @jithunnair-amd @pruthvistony
-/.ci/docker/ci_commit_pins/triton-rocm.txt @jeffdaily @jataylo @jithunnair-amd @pruthvistony
 /.ci/docker/ci_commit_pins/triton-xpu.txt @EikanWang @gujinghui

 # Github Actions
@@ -50,6 +50,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:

 | PyTorch version | Python | Stable CUDA | Experimental CUDA | Stable ROCm |
 | --- | --- | --- | --- | --- |
+| 2.5 | >=3.9, <=3.12, (3.13 experimental) | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70 | None | ROCm 6.2 |
 | 2.4 | >=3.8, <=3.12 | CUDA 11.8, CUDA 12.1, CUDNN 9.1.0.70 | CUDA 12.4, CUDNN 9.1.0.70 | ROCm 6.1 |
 | 2.3 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 6.0 |
 | 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.7 |
@@ -707,7 +707,12 @@ bool are_all_mutations_under_no_grad_or_inference_mode(const Tensor& functional_
 }

 bool isFunctionalTensor(const at::Tensor& tensor) {
-  return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize);
+  return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize);
 }

+bool isBaseTensor(const at::Tensor& tensor) {
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isFunctionalTensor(tensor));
+  return unsafeGetFunctionalWrapper(tensor)->isBaseTensor();
+}
+
 bool isFunctionalTensor(const std::optional<Tensor>& t) {
@@ -165,6 +165,12 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
     was_storage_changed_ = true;
   }

+  // A FunctionalTensor is considered a base if its not a view of another
+  // tensor.
+  bool isBaseTensor() const {
+    return view_metas_.empty();
+  }
+
   c10::SymInt get_storage_size(bool before) {
     return functional_storage_impl()->get_storage_size(before);
   }
@@ -290,6 +296,8 @@ TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(
   return functional_impl;
 }

+TORCH_API bool isBaseTensor(const at::Tensor& tensor);
+
 TORCH_API bool isFunctionalTensor(const at::Tensor& tensor);
 TORCH_API bool isFunctionalTensor(const std::optional<Tensor>& t);
 TORCH_API bool isFunctionalTensor(
@@ -69,7 +69,7 @@ thread_local std::array<at::ScalarType, at::COMPILE_TIME_MAX_DEVICE_TYPES>
     at::ScalarType::Undefined, // Vulkan
     at::ScalarType::Undefined, // Metal
     at::kHalf, // XPU
-    at::ScalarType::Undefined, // MPS
+    at::kHalf, // MPS
     at::ScalarType::Undefined, // Meta (tensors with no data)
     at::kBFloat16, // HPU / HABANA
     at::ScalarType::Undefined, // SX-Aurora / NEC
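With the MPS slot in the default-dtype table switched from Undefined to kHalf, autocast on an MPS device gets float16 as its lower-precision type. A minimal sketch of what this enables from Python, assuming a build that contains the AutocastMPS changes and a machine where MPS is available:

import torch

if torch.backends.mps.is_available():
    device = "mps"
    model = torch.nn.Linear(64, 64).to(device)
    x = torch.randn(8, 64, device=device)
    # dtype is omitted on purpose: MPS autocast now defaults to float16
    with torch.autocast(device_type=device):
        y = model(x)
    print(y.dtype)  # expected: torch.float16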
@@ -206,6 +206,118 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
       TORCH_FN((&at::autocast::binary_cross_entropy_banned)));
 }

+TORCH_LIBRARY_IMPL(_, AutocastMPS, m) {
+  m.fallback(torch::CppFunction::makeFallthrough());
+}
+
+TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) {
+  // lower_precision_fp
+  KERNEL_MPS2(_convolution, deprecated, lower_precision_fp)
+  KERNEL_MPS(_convolution, lower_precision_fp)
+  KERNEL_MPS(conv1d, lower_precision_fp)
+  KERNEL_MPS(conv2d, lower_precision_fp)
+  KERNEL_MPS(conv_tbc, lower_precision_fp)
+  KERNEL_MPS(conv_transpose1d, lower_precision_fp)
+  KERNEL_MPS2(conv_transpose2d, input, lower_precision_fp)
+  KERNEL_MPS(convolution, lower_precision_fp)
+  KERNEL_MPS(_mps_convolution, lower_precision_fp)
+  KERNEL_MPS(prelu, lower_precision_fp)
+  KERNEL_MPS(addmm, lower_precision_fp)
+  KERNEL_MPS(addmv, lower_precision_fp)
+  KERNEL_MPS(addr, lower_precision_fp)
+  KERNEL_MPS(matmul, lower_precision_fp)
+  KERNEL_MPS(einsum, lower_precision_fp)
+  KERNEL_MPS(mm, lower_precision_fp)
+  KERNEL_MPS(mv, lower_precision_fp)
+  KERNEL_MPS(linear, lower_precision_fp)
+  KERNEL_MPS(addbmm, lower_precision_fp)
+  KERNEL_MPS(baddbmm, lower_precision_fp)
+  KERNEL_MPS(bmm, lower_precision_fp)
+  KERNEL_MPS(chain_matmul, lower_precision_fp)
+  KERNEL_MPS(linalg_multi_dot, lower_precision_fp)
+  KERNEL_MPS(lstm_cell, lower_precision_fp)
+
+  // fp32
+  KERNEL_MPS(acos, fp32)
+  KERNEL_MPS(asin, fp32)
+  KERNEL_MPS(cosh, fp32)
+  KERNEL_MPS(erfinv, fp32)
+  KERNEL_MPS(exp, fp32)
+  KERNEL_MPS(expm1, fp32)
+  KERNEL_MPS(log, fp32)
+  KERNEL_MPS(log10, fp32)
+  KERNEL_MPS(log2, fp32)
+  KERNEL_MPS(log1p, fp32)
+  KERNEL_MPS(reciprocal, fp32)
+  KERNEL_MPS(rsqrt, fp32)
+  KERNEL_MPS(sinh, fp32)
+  KERNEL_MPS(tan, fp32)
+  KERNEL_MPS2(pow, Tensor_Scalar, fp32)
+  KERNEL_MPS2(pow, Tensor_Tensor, fp32)
+  KERNEL_MPS2(pow, Scalar, fp32)
+  KERNEL_MPS(softplus, fp32)
+  KERNEL_MPS(layer_norm, fp32)
+  KERNEL_MPS(native_layer_norm, fp32)
+  KERNEL_MPS(group_norm, fp32)
+  KERNEL_MPS2(frobenius_norm, dim, fp32)
+  KERNEL_MPS(nuclear_norm, fp32)
+  KERNEL_MPS2(nuclear_norm, dim, fp32)
+  KERNEL_MPS(batch_norm, fp32)
+  KERNEL_MPS(cosine_similarity, fp32)
+  KERNEL_MPS(poisson_nll_loss, fp32)
+  KERNEL_MPS(cosine_embedding_loss, fp32)
+  KERNEL_MPS(nll_loss, fp32)
+  KERNEL_MPS(nll_loss2d, fp32)
+  KERNEL_MPS(hinge_embedding_loss, fp32)
+  KERNEL_MPS(kl_div, fp32)
+  KERNEL_MPS(l1_loss, fp32)
+  KERNEL_MPS(smooth_l1_loss, fp32)
+  KERNEL_MPS(huber_loss, fp32)
+  KERNEL_MPS(mse_loss, fp32)
+  KERNEL_MPS(margin_ranking_loss, fp32)
+  KERNEL_MPS(multilabel_margin_loss, fp32)
+  KERNEL_MPS(soft_margin_loss, fp32)
+  KERNEL_MPS(triplet_margin_loss, fp32)
+  KERNEL_MPS(multi_margin_loss, fp32)
+  KERNEL_MPS(binary_cross_entropy_with_logits, fp32)
+  KERNEL_MPS(dist, fp32)
+  KERNEL_MPS(pdist, fp32)
+  KERNEL_MPS(cdist, fp32)
+  KERNEL_MPS(renorm, fp32)
+  KERNEL_MPS(logsumexp, fp32)
+
+  // fp32_set_opt_dtype
+  KERNEL_MPS(prod, fp32)
+  KERNEL_MPS2(prod, dim_int, fp32)
+  KERNEL_MPS2(prod, dim_Dimname, fp32)
+  KERNEL_MPS2(softmax, int, fp32)
+  KERNEL_MPS2(softmax, Dimname, fp32)
+  KERNEL_MPS2(log_softmax, int, fp32)
+  KERNEL_MPS2(log_softmax, Dimname, fp32)
+  KERNEL_MPS(cumprod, fp32)
+  KERNEL_MPS2(cumprod, dimname, fp32)
+  KERNEL_MPS(cumsum, fp32)
+  KERNEL_MPS2(cumsum, dimname, fp32)
+  KERNEL_MPS(linalg_vector_norm, fp32)
+  KERNEL_MPS(linalg_matrix_norm, fp32)
+  KERNEL_MPS2(linalg_matrix_norm, str_ord, fp32)
+  KERNEL_MPS(sum, fp32)
+  KERNEL_MPS2(sum, dim_IntList, fp32)
+  KERNEL_MPS2(sum, dim_DimnameList, fp32)
+  //
+  // promote
+  KERNEL_MPS(addcdiv, promote)
+  KERNEL_MPS(addcmul, promote)
+  KERNEL_MPS(atan2, promote)
+  KERNEL_MPS(bilinear, promote)
+  KERNEL_MPS(cross, promote)
+  KERNEL_MPS(dot, promote)
+  KERNEL_MPS(grid_sampler, promote)
+  KERNEL_MPS(index_put, promote)
+  KERNEL_MPS(tensordot, promote)
+  KERNEL_MPS(scatter_add, promote)
+}
+
 TORCH_LIBRARY_IMPL(_, AutocastCPU, m) {
   m.fallback(torch::CppFunction::makeFallthrough());
 }
@@ -145,6 +145,8 @@ inline bool is_autocast_eligible(
       return tensor.is_xla() && tensor.is_floating_point();
     case c10::DeviceType::PrivateUse1:
       return tensor.is_privateuseone() && tensor.is_floating_point();
+    case c10::DeviceType::MPS:
+      return tensor.is_mps() && tensor.is_floating_point();
     default:
       return false;
   }
@@ -168,6 +170,8 @@ inline DispatchKey get_autocast_dispatch_key_from_device_type(
       return DispatchKey::AutocastXLA;
     case c10::DeviceType::PrivateUse1:
       return DispatchKey::AutocastPrivateUse1;
+    case c10::DeviceType::MPS:
+      return DispatchKey::AutocastMPS;
     default:
       throw std::runtime_error(
           "unknown device type for autocast in get_autocast_dispatch_key_from_device_type");
@@ -178,7 +182,7 @@ inline bool is_autocast_available(c10::DeviceType device_type) {
   if (device_type == at::kCPU || device_type == at::kCUDA ||
       device_type == at::kXPU || device_type == at::kIPU ||
       device_type == at::kHPU || device_type == at::kXLA ||
-      device_type == at::kPrivateUse1) {
+      device_type == at::kPrivateUse1 || device_type == at::kMPS) {
     return true;
   } else {
     return false;
|
||||
REDISPATCH_SIGNATURE, \
|
||||
POLICY)
|
||||
|
||||
// KERNEL_MPS registration for AutocastMPS
|
||||
#define KERNEL_MPS(OP, POLICY) \
|
||||
m.impl( \
|
||||
TORCH_SELECTIVE_NAME("aten::" #OP), \
|
||||
&WrapFunction< \
|
||||
CastPolicy::POLICY, \
|
||||
DeviceType::MPS, \
|
||||
decltype(ATEN_FN(OP)), \
|
||||
decltype(ATEN_FN(OP)), \
|
||||
&ATEN_FN(OP)>::type::call);
|
||||
|
||||
#define KERNEL_MPS2(OP, OVERLOAD, POLICY) \
|
||||
m.impl( \
|
||||
TORCH_SELECTIVE_NAME("aten::" #OP "." #OVERLOAD), \
|
||||
&WrapFunction< \
|
||||
CastPolicy::POLICY, \
|
||||
DeviceType::MPS, \
|
||||
decltype(ATEN_FN2(OP, OVERLOAD)), \
|
||||
decltype(ATEN_FN2(OP, OVERLOAD)), \
|
||||
&ATEN_FN2(OP, OVERLOAD)>::type::call);
|
||||
|
||||
// Op lists for different policies.
|
||||
// To make sure other backends can reuse the policy op list.
|
||||
#define AT_FORALL_LOWER_PRECISION_FP(_) \
|
||||
|
||||
@@ -228,6 +228,7 @@ namespace c10 {
   _(aten, is_autocast_cpu_enabled) \
   _(aten, is_autocast_xla_enabled) \
   _(aten, get_autocast_dtype) \
+  _(aten, is_autocast_mps_enabled) \
   FORALL_ATEN_BASE_SYMBOLS(_) \
   _(onnx, Add) \
   _(onnx, Concat) \
@@ -9,7 +9,7 @@
 #endif

 namespace at::cpu {
-bool is_cpu_support_avx2() {
+bool is_avx2_supported() {
 #if !defined(__s390x__) && !defined(__powerpc__)
   return cpuinfo_initialize() && cpuinfo_has_x86_avx2();
 #else
@@ -17,7 +17,7 @@ bool is_cpu_support_avx2() {
 #endif
 }

-bool is_cpu_support_avx512() {
+bool is_avx512_supported() {
 #if !defined(__s390x__) && !defined(__powerpc__)
   return cpuinfo_initialize() && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512vl() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq();
 #else
@@ -25,7 +25,7 @@ bool is_cpu_support_avx512() {
 #endif
 }

-bool is_cpu_support_avx512_vnni() {
+bool is_avx512_vnni_supported() {
 #if !defined(__s390x__) && !defined(__powerpc__)
   return cpuinfo_initialize() && cpuinfo_has_x86_avx512vnni();
 #else
@@ -33,7 +33,15 @@ bool is_cpu_support_avx512_vnni() {
 #endif
 }

-bool is_cpu_support_amx_tile() {
+bool is_avx512_bf16_supported() {
+#if !defined(__s390x__) && !defined(__powerpc__)
+  return cpuinfo_initialize() && cpuinfo_has_x86_avx512bf16();
+#else
+  return false;
+#endif
+}
+
+bool is_amx_tile_supported() {
 #if !defined(__s390x__) && !defined(__powerpc__)
   return cpuinfo_initialize() && cpuinfo_has_x86_amx_tile();
 #else
@@ -42,7 +50,7 @@ bool is_cpu_support_amx_tile() {
 }

 bool init_amx() {
-  if (!is_cpu_support_amx_tile()) {
+  if (!is_amx_tile_supported()) {
     return false;
   }
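These renamed helpers are internal C++ probes that back PyTorch's CPU kernel selection; the aggregated result is what most Python callers care about. A hedged sketch of the usual user-level query (torch.backends.cpu.get_cpu_capability reports the vectorization level the current build will actually use):

import torch

# Typically prints "DEFAULT", "AVX2", or "AVX512" depending on the CPU and build.
print(torch.backends.cpu.get_cpu_capability())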
@@ -6,14 +6,17 @@

 namespace at::cpu {

-TORCH_API bool is_cpu_support_avx2();
-TORCH_API bool is_cpu_support_avx512();
+TORCH_API bool is_avx2_supported();
+TORCH_API bool is_avx512_supported();

 // Detect if CPU support Vector Neural Network Instruction.
-TORCH_API bool is_cpu_support_avx512_vnni();
+TORCH_API bool is_avx512_vnni_supported();
+
+// Detect if CPU supports AVX512_BF16 ISA
+TORCH_API bool is_avx512_bf16_supported();

 // Detect if CPU support Advanced Matrix Extension.
-TORCH_API bool is_cpu_support_amx_tile();
+TORCH_API bool is_amx_tile_supported();

 // Enable the system to use AMX instructions.
 TORCH_API bool init_amx();
@@ -636,6 +636,21 @@ inline void transpose_mxn<float, 8, 8>(
   _mm256_storeu_ps(&dst[7 * ld_dst], th);
 }

+template<>
+inline void transpose_mxn<float, 16, 16>(
+    const float* src,
+    int64_t ld_src,
+    float* dst,
+    int64_t ld_dst) {
+  transpose_mxn<float, 8, 8>(
+      src , ld_src, dst, ld_dst);
+  transpose_mxn<float, 8, 8>(
+      src + 8, ld_src, dst + 8 * ld_dst, ld_dst);
+  transpose_mxn<float, 8, 8>(
+      src + 8 * ld_src, ld_src, dst + 8, ld_dst);
+  transpose_mxn<float, 8, 8>(
+      src + 8 * ld_src + 8, ld_src, dst + 8 * ld_dst + 8, ld_dst);
+}
 #endif

 }} // namespace at::vec::CPU_CAPABILITY
@@ -582,8 +582,7 @@ Vectorized<float> inline fmsub(const Vectorized<float>& a, const Vectorized<floa
 // https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L230-L304
 // kernel for transposing mxn where m, n <= 16
 // M + (M + 1) / 2 * 2 + (M + 3) / 4 * 4 + (M + 7) / 8 * 8 + 2 * N instructions
-template <>
-inline void transpose_mxn<float>(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) {
+inline void transpose_mxn_16x16(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) {
   TORCH_CHECK(M <= 16 && N <= 16, "transpose_mxn<float> expects M, N <= 16.");
   // load from src to registers
   __m512 input[16];
@@ -667,8 +666,39 @@ inline void transpose_mxn<float>(const float* src, int64_t ld_src, float* dst, i
   }
 }

+template<>
+inline void transpose_mxn<float>(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) {
+  int64_t i = 0;
+  for (; i < M / 16 * 16; i += 16) {
+    int64_t j = 0;
+    for (; j < N / 16 * 16; j += 16) {
+      transpose_mxn_16x16(
+          src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, 16, 16);
+    }
+    // handle remainder j
+    int nrem = N - j;
+    if (nrem > 0) {
+      transpose_mxn_16x16(
+          src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, 16, nrem);
+    }
+  }
+  // handle remainder i
+  int mrem = M - i;
+  if (mrem > 0) {
+    int j = 0;
+    for (; j < N / 16 * 16; j += 16) {
+      transpose_mxn_16x16(
+          src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, mrem, 16);
+    }
+    // handle remainder j
+    int nrem = N - j;
+    transpose_mxn_16x16(
+        src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, mrem, nrem);
+  }
+}
+
 template <typename T, int M, int N,
-  typename std::enable_if_t<std::is_same<T, float>::value && M <= 16 && N <= 16, int> = 0>
+  typename std::enable_if_t<std::is_same<T, float>::value, int> = 0>
 inline void transpose_mxn(const float* src, int64_t ld_src, float* dst, int64_t ld_dst) {
   transpose_mxn<float>(src, ld_src, dst, ld_dst, M, N);
 }
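The new transpose_mxn<float> specialization lifts the old M, N <= 16 limit by tiling the matrix into 16x16 blocks handled by the fixed-size kernel, with partial blocks along the right and bottom edges. A plain-Python sketch of the same tiling idea (illustration only, not the AVX-512 code):

def blocked_transpose(src, M, N, block=16):
    """Transpose an M x N list-of-lists tile by tile, remainders last."""
    dst = [[None] * M for _ in range(N)]
    for i in range(0, M, block):
        for j in range(0, N, block):
            # each (i, j) tile is at most block x block, smaller at the edges
            for bi in range(i, min(i + block, M)):
                for bj in range(j, min(j + block, N)):
                    dst[bj][bi] = src[bi][bj]
    return dst

# 20 x 35 exercises both full 16x16 tiles and edge remainders.
mat = [[r * 35 + c for c in range(35)] for r in range(20)]
assert blocked_transpose(mat, 20, 35)[3][17] == mat[17][3]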
@@ -23,6 +23,9 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchVmapMode, m) {
   OP_DECOMPOSE(dropout_);
   OP_DECOMPOSE(feature_alpha_dropout_);
   OP_DECOMPOSE(feature_dropout_);
+  OP_DECOMPOSE(dropout);
+  OP_DECOMPOSE(_scaled_dot_product_attention_math);
+  OP_DECOMPOSE(scaled_dot_product_attention);
 }

 static void unsupportedData(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
@@ -235,7 +238,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   OP_DECOMPOSE(relu6_);
   OP_DECOMPOSE(prelu);
   OP_DECOMPOSE2(softmax, int);
-  OP_DECOMPOSE(scaled_dot_product_attention);
   OP_DECOMPOSE(special_gammainc);
   OP_DECOMPOSE(special_gammaincc);
   OP_DECOMPOSE(special_logit);
@@ -261,7 +263,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   OP_DECOMPOSE(special_xlogy);
   OP_DECOMPOSE2(special_xlogy, other_scalar);
   OP_DECOMPOSE2(special_xlogy, self_scalar);
-  OP_DECOMPOSE(_scaled_dot_product_attention_math);

   m.impl("split.sizes", native::split_symint);
@@ -386,6 +387,11 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   OP_DECOMPOSE2(to, dtype);
   OP_DECOMPOSE2(to, dtype_layout);
   OP_DECOMPOSE2(to, other);
+
+  // Random ops that are also registered here
+  OP_DECOMPOSE(dropout);
+  OP_DECOMPOSE(_scaled_dot_product_attention_math);
+  OP_DECOMPOSE(scaled_dot_product_attention);
 }

 } // namespace at::functorch
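Moving dropout and scaled_dot_product_attention into the vmap-mode decomposition list means torch.func.vmap can batch attention by decomposing it rather than needing a dedicated rule. A hedged sketch of the user-visible behavior, assuming a build with these registrations:

import torch
import torch.nn.functional as F
from torch.func import vmap

# Four independent attention problems stacked on a leading dim; vmap maps over it.
q = torch.randn(4, 2, 8, 16)
k = torch.randn(4, 2, 8, 16)
v = torch.randn(4, 2, 8, 16)
out = vmap(F.scaled_dot_product_attention)(q, k, v)
print(out.shape)  # torch.Size([4, 2, 8, 16])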
@@ -496,6 +496,11 @@ _scaled_dot_product_flash_attention_batch_rule(
     bool return_debug_mask,
     c10::optional<double> scale
   ) {
+  if (dropout_p > 0) {
+    auto maybe_layer = maybeCurrentDynamicLayer();
+    RandomnessType randomness = maybe_layer->randomness();
+    check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
+  }
   auto batch_size = get_bdim_size3(query, query_bdim, key, key_bdim, value, value_bdim);
   auto query_ = moveBatchDimToFront(query, query_bdim);
   auto key_ = moveBatchDimToFront(key, key_bdim);
@@ -540,6 +545,11 @@ fourOutputs _scaled_dot_product_efficient_attention_batch_rule(
     bool is_causal,
     c10::optional<double> scale
   ) {
+  if (dropout_p > 0) {
+    auto maybe_layer = maybeCurrentDynamicLayer();
+    RandomnessType randomness = maybe_layer->randomness();
+    check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
+  }
   auto batch_size = get_bdim_size3(query, query_bdim, key, key_bdim, value, value_bdim);
   auto query_ = moveBatchDimToFront(query, query_bdim);
   auto key_ = moveBatchDimToFront(key, key_bdim);
@@ -577,6 +587,11 @@ _scaled_dot_product_cudnn_attention_batch_rule(
     bool return_debug_mask,
     c10::optional<double> scale
   ) {
+  if (dropout_p > 0) {
+    auto maybe_layer = maybeCurrentDynamicLayer();
+    RandomnessType randomness = maybe_layer->randomness();
+    check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
+  }
   auto batch_size = get_bdim_size3(query, query_bdim, key, key_bdim, value, value_bdim);
   auto query_ = moveBatchDimToFront(query, query_bdim);
   auto key_ = moveBatchDimToFront(key, key_bdim);
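The added guard only runs check_randomness when dropout_p > 0, so attention with dropout under vmap now requires an explicit randomness mode while dropout-free attention is unaffected. A hedged sketch of how that surfaces in Python, assuming a build with this batch-rule change:

import torch
import torch.nn.functional as F
from torch.func import vmap

q = torch.randn(4, 2, 8, 16)
k = torch.randn(4, 2, 8, 16)
v = torch.randn(4, 2, 8, 16)

def attn_with_dropout(q, k, v):
    return F.scaled_dot_product_attention(q, k, v, dropout_p=0.5)

# With dropout active, the batch rule validates the vmap randomness mode;
# "different" asks for independent dropout draws per batched element
# (the default "error" mode would raise instead).
out = vmap(attn_with_dropout, randomness="different")(q, k, v)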
@@ -41,6 +41,17 @@ extern "C" void zaxpy_(int *n, void *a, const void *x, int *incx, void *y, int *
 #include <fbgemm/FbgemmI64.h>
 #endif // USE_FBGEMM

+#if AT_MKLDNN_ENABLED()
+#include <oneapi/dnnl/dnnl_version.h>
+#endif // oneDNN
+
+#define ONEDNN_UKERNEL_ENABLED (DNNL_VERSION_MAJOR >=3 && DNNL_VERSION_MINOR >=5)
+
+#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
+#include <oneapi/dnnl/dnnl_ukernel.hpp>
+#include <oneapi/dnnl/dnnl.hpp>
+#endif // oneDNN BRGEMM
+
 namespace at::native::cpublas {
 namespace internal {
@ -822,4 +833,366 @@ void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<fl
|
||||
n, x, incx, y, incy);
|
||||
}
|
||||
|
||||
} // namespace at::native::cpublas
|
||||
// oneDNN BRGEMM
|
||||
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
|
||||
struct BrgemmKey {
|
||||
int64_t M;
|
||||
int64_t N;
|
||||
int64_t K;
|
||||
int64_t batch_size;
|
||||
int64_t lda;
|
||||
int64_t ldb;
|
||||
int64_t ldc;
|
||||
ScalarType dt_a;
|
||||
ScalarType dt_b;
|
||||
ScalarType dt_c;
|
||||
float alpha;
|
||||
float beta;
|
||||
BrgemmKey(
|
||||
int64_t M,
|
||||
int64_t N,
|
||||
int64_t K,
|
||||
int64_t batch_size,
|
||||
int64_t lda,
|
||||
int64_t ldb,
|
||||
int64_t ldc,
|
||||
ScalarType dt_a,
|
||||
ScalarType dt_b,
|
||||
ScalarType dt_c,
|
||||
float alpha,
|
||||
float beta)
|
||||
: M(M),
|
||||
N(N),
|
||||
K(K),
|
||||
batch_size(batch_size),
|
||||
lda(lda),
|
||||
ldb(ldb),
|
||||
ldc(ldc),
|
||||
dt_a(dt_a),
|
||||
dt_b(dt_b),
|
||||
dt_c(dt_c),
|
||||
alpha(alpha),
|
||||
beta(beta) {}
|
||||
bool operator==(const BrgemmKey& other) const {
|
||||
return M == other.M && N == other.N && K == other.K &&
|
||||
batch_size == other.batch_size && lda == other.lda &&
|
||||
ldb == other.ldb && ldc == other.ldc && dt_a == other.dt_a &&
|
||||
dt_b == other.dt_b && dt_c == other.dt_c && alpha == other.alpha &&
|
||||
beta == other.beta;
|
||||
}
|
||||
};
|
||||
|
||||
struct PackKey {
|
||||
int64_t K;
|
||||
int64_t N;
|
||||
int64_t ld_in;
|
||||
int64_t ld_out;
|
||||
ScalarType dt_in;
|
||||
ScalarType dt_out;
|
||||
PackKey(
|
||||
int64_t K,
|
||||
int64_t N,
|
||||
int64_t ld_in,
|
||||
int64_t ld_out,
|
||||
ScalarType dt_in,
|
||||
ScalarType dt_out)
|
||||
: K(K),
|
||||
N(N),
|
||||
ld_in(ld_in),
|
||||
ld_out(ld_out),
|
||||
dt_in(dt_in),
|
||||
dt_out(dt_out) {}
|
||||
bool operator==(const PackKey& other) const {
|
||||
return N == other.N && K == other.K && ld_in == other.ld_in &&
|
||||
ld_out == other.ld_out && dt_in == other.dt_in &&
|
||||
dt_out == other.dt_out;
|
||||
}
|
||||
};
|
||||
|
||||
inline dnnl::memory::data_type get_dnnl_dtype(ScalarType dtype) {
|
||||
if (dtype == ScalarType::Float) {
|
||||
return dnnl::memory::data_type::f32;
|
||||
} else if (dtype == ScalarType::BFloat16) {
|
||||
return dnnl::memory::data_type::bf16;
|
||||
} else if (dtype == ScalarType::Half) {
|
||||
return dnnl::memory::data_type::f16;
|
||||
} else if (dtype == ScalarType::Byte) {
|
||||
return dnnl::memory::data_type::u8;
|
||||
} else if (dtype == ScalarType::Char) {
|
||||
return dnnl::memory::data_type::s8;
|
||||
} else {
|
||||
TORCH_CHECK(false, "get_dnnl_dtype expects float/bfloat16/half/int8 tensor input");
|
||||
}
|
||||
}
|
||||
|
||||
template<typename key_t>
|
||||
struct UnsafeUkernelKeyHasher {
|
||||
std::size_t operator()(const key_t& key) const;
|
||||
};
|
||||
|
||||
template<>
|
||||
std::size_t UnsafeUkernelKeyHasher<BrgemmKey>::operator()(const BrgemmKey& key) const {
|
||||
// Use beta, M, N, and K to compute hash to reduce the overhead as
|
||||
// batch size, alpha, and data types are unlikely to change within the same kernel and
|
||||
// leading dimensions are likely to be related to M, K, N or use fixed values.
|
||||
std::size_t h = std::hash<float>()(key.beta + 1);
|
||||
h = std::hash<int64_t>()(key.M) ^ (h << 1);
|
||||
h = std::hash<int64_t>()(key.N) ^ (h << 1);
|
||||
h = std::hash<int64_t>()(key.K) ^ (h << 1);
|
||||
h = std::hash<int64_t>()(key.ldc) ^ (h << 1);
|
||||
return h;
|
||||
}
|
||||
|
||||
template<>
|
||||
std::size_t UnsafeUkernelKeyHasher<PackKey>::operator()(const PackKey& key) const {
|
||||
// Use K and N to compute hash to reduce the overhead as
|
||||
// data types are unlikely to change and
|
||||
// ld_in/ld_out is likely to be related to K, N or use fixed values
|
||||
std::size_t h = std::hash<int64_t>()(key.K);
|
||||
h = std::hash<int64_t>()(key.N) ^ (h << 1);
|
||||
return h;
|
||||
}
|
||||
|
||||
template <typename key_t, typename value_t>
|
||||
struct KernelCache {
|
||||
using kstore_t = std::unordered_map<key_t, std::shared_ptr<value_t>, UnsafeUkernelKeyHasher<key_t>>;
|
||||
static inline std::shared_ptr<value_t>&& fetch_or_create(
|
||||
const key_t& key,
|
||||
const std::function<std::shared_ptr<value_t>()>& callback) {
|
||||
auto&& search = get_store().find(key);
|
||||
if (search != get_store().end()) {
|
||||
return std::move(search->second);
|
||||
} else {
|
||||
get_store().insert({key, callback()});
|
||||
return std::move(get_store()[key]);
|
||||
}
|
||||
}
|
||||
|
||||
static inline kstore_t& get_store() {
|
||||
static thread_local kstore_t cache_kernels;
|
||||
return cache_kernels;
|
||||
}
|
||||
};
|
||||
|
||||
// Helper struct for convenient brgemm configuration
|
||||
struct GemmHelper {
|
||||
GemmHelper(
|
||||
int64_t M,
|
||||
int64_t N,
|
||||
int64_t K,
|
||||
int64_t bs,
|
||||
int64_t ld_a,
|
||||
int64_t ld_b,
|
||||
int64_t ld_c,
|
||||
ScalarType dt_a,
|
||||
ScalarType dt_b,
|
||||
ScalarType dt_c,
|
||||
const float alpha,
|
||||
const float beta) {
|
||||
// Create brgemm
|
||||
brg = dnnl::ukernel::brgemm(
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
bs,
|
||||
ld_a,
|
||||
ld_b,
|
||||
ld_c,
|
||||
get_dnnl_dtype(dt_a),
|
||||
get_dnnl_dtype(dt_b),
|
||||
get_dnnl_dtype(dt_c),
|
||||
alpha,
|
||||
beta);
|
||||
// Create a scratchpad buffer for the brgemm execution
|
||||
scratchpad = std::vector<uint8_t>(brg.get_scratchpad_size());
|
||||
// Prepare default vector of pairs of tensors A and B offsets for each batch.
|
||||
A_B_offsets.reserve(1);
|
||||
A_B_offsets[0] = std::make_pair(0, 0);
|
||||
}
|
||||
dnnl::ukernel::brgemm brg;
|
||||
std::vector<uint8_t> scratchpad;
|
||||
std::vector<std::pair<int64_t, int64_t>> A_B_offsets;
|
||||
};
|
||||
|
||||
struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
|
||||
// Fetch/create GemmHelper object and execute brgemm with batch size = 1
|
||||
template <typename scalar_t_a, typename scalar_t_b, typename scalar_t_c>
|
||||
static inline void call(
|
||||
int64_t M,
|
||||
int64_t N,
|
||||
int64_t K,
|
||||
int64_t ld_a,
|
||||
int64_t ld_b,
|
||||
int64_t ld_c,
|
||||
const float alpha,
|
||||
const float beta,
|
||||
const scalar_t_a* A,
|
||||
const scalar_t_b* B,
|
||||
scalar_t_c* C) {
|
||||
auto&& key = BrgemmKey(
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
int64_t(1),
|
||||
ld_a,
|
||||
ld_b,
|
||||
ld_c,
|
||||
c10::CppTypeToScalarType<scalar_t_a>::value,
|
||||
c10::CppTypeToScalarType<scalar_t_b>::value,
|
||||
c10::CppTypeToScalarType<scalar_t_c>::value,
|
||||
alpha,
|
||||
beta);
|
||||
// Fetch/create GemmHelper object
|
||||
auto&& value = fetch_or_create(key, [&]() {
|
||||
auto&& v = std::make_shared<GemmHelper>(
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
1,
|
||||
ld_a,
|
||||
ld_b,
|
||||
ld_c,
|
||||
c10::CppTypeToScalarType<scalar_t_a>::value,
|
||||
c10::CppTypeToScalarType<scalar_t_b>::value,
|
||||
c10::CppTypeToScalarType<scalar_t_c>::value,
|
||||
alpha,
|
||||
beta);
|
||||
(*v).brg.generate();
|
||||
return std::move(v);
|
||||
});
|
||||
if (get_current() != value) {
|
||||
dnnl::ukernel::brgemm::release_hw_context();
|
||||
((*value).brg).set_hw_context();
|
||||
get_current() = value;
|
||||
}
|
||||
((*value).brg)
|
||||
.execute(A, B, (*value).A_B_offsets, C, (*value).scratchpad.data());
|
||||
}
|
||||
|
||||
static inline std::shared_ptr<GemmHelper>& get_current() {
|
||||
static thread_local std::shared_ptr<GemmHelper> current;
|
||||
return current;
|
||||
}
|
||||
|
||||
static inline bool device_check(ScalarType dtype) {
|
||||
if (!at::globalContext().userEnabledMkldnn()) {
|
||||
return false;
|
||||
}
|
||||
if (dtype == ScalarType::Half) {
|
||||
static bool fp16_support = dnnl::get_effective_cpu_isa() >= dnnl::cpu_isa::avx512_core_fp16;
|
||||
return fp16_support;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
using pack_t = dnnl::ukernel::brgemm_pack_B;
|
||||
struct Pack : public KernelCache <PackKey, pack_t> {
|
||||
static inline void call(
|
||||
int64_t K,
|
||||
int64_t N,
|
||||
int64_t ld_in,
|
||||
int64_t ld_out,
|
||||
ScalarType dt_in,
|
||||
ScalarType dt_out,
|
||||
const void* in,
|
||||
void* out) {
|
||||
auto&& key = PackKey(K, N, ld_in, ld_out, dt_in, dt_out);
|
||||
auto&& pack = fetch_or_create(key, [&]() {
|
||||
auto&& p = std::make_shared<pack_t>(
|
||||
K, N, ld_in, ld_out, get_dnnl_dtype(dt_in), get_dnnl_dtype(dt_out));
|
||||
if (need_pack(dt_in)) {
|
||||
(*p).generate();
|
||||
}
|
||||
return std::move(p);
|
||||
});
|
||||
if (need_pack(dt_in)) {
|
||||
(*pack).execute(in, out);
|
||||
} else {
|
||||
TORCH_CHECK(false, "No need to pack");
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool need_pack(ScalarType dtype) {
|
||||
if (!at::globalContext().userEnabledMkldnn()) {
|
||||
return false;
|
||||
}
|
||||
if (dtype == ScalarType::Half) {
|
||||
static bool fp16_pack = dnnl::get_effective_cpu_isa() >= dnnl::cpu_isa::avx512_core_amx_fp16;
|
||||
return fp16_pack;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
void brgemm(
|
||||
int64_t M,
|
||||
int64_t N,
|
||||
int64_t K,
|
||||
int64_t ld_a,
|
||||
int64_t ld_b,
|
||||
int64_t ld_c,
|
||||
const float alpha,
|
||||
const float beta,
|
||||
const at::Half* A,
|
||||
const at::Half* B,
|
||||
float* C) {
|
||||
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
|
||||
if (Brgemm::device_check(ScalarType::Half)) {
|
||||
Brgemm::call<at::Half, at::Half, float>(
|
||||
M, N, K, ld_a, ld_b, ld_c, alpha, beta, A, B, C);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
TORCH_CHECK(false,
|
||||
"Half Brgemm is only supported on X64 when oneDNN ukernel is enabled and avx512_fp16 is supported");
|
||||
}
|
||||
|
||||
void brgemm(
|
||||
int64_t M,
|
||||
int64_t N,
|
||||
int64_t K,
|
||||
int64_t ld_a,
|
||||
int64_t ld_b,
|
||||
int64_t ld_c,
|
||||
const float alpha,
|
||||
const float beta,
|
||||
const at::BFloat16* A,
|
||||
const at::BFloat16* B,
|
||||
float* C) {
|
||||
TORCH_CHECK(false,
|
||||
"BFloat16 Brgemm is currently not supported");
|
||||
}
|
||||
|
||||
void brgemm_release() {
|
||||
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
|
||||
dnnl::ukernel::brgemm::release_hw_context();
|
||||
#endif
|
||||
}
|
||||
|
||||
void pack(
|
||||
int64_t K,
|
||||
int64_t N,
|
||||
int64_t ld_in,
|
||||
int64_t ld_out,
|
||||
ScalarType dt_in,
|
||||
ScalarType dt_out,
|
||||
const void* in,
|
||||
void* out) {
|
||||
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
|
||||
Pack::call(K, N, ld_in, ld_out, dt_in, dt_out, in, out);
|
||||
#else
|
||||
TORCH_CHECK(false, "pack is only supported on X64 with oneDNN ukernel enabled");
|
||||
#endif
|
||||
}
|
||||
|
||||
bool need_pack(ScalarType dt_in) {
|
||||
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
|
||||
return Pack::need_pack(dt_in);
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace at::native::cpublas
|
||||
|
||||
@@ -7,6 +7,7 @@
 #include <c10/core/ScalarType.h>
+#include <c10/core/Scalar.h>


 namespace at::native::cpublas {

 namespace internal {
@@ -186,4 +187,58 @@ void copy(int64_t n, const float *x, int64_t incx, float *y, int64_t incy);
 void copy(int64_t n, const c10::complex<double> *x, int64_t incx, c10::complex<double> *y, int64_t incy);
 void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<float> *y, int64_t incy);

-} // namespace at::native::cpublas
+// Batch-reduce GEMM
+// Operates by the following formula:
+// C = alpha * SUM(A[i] x B[i]) + beta * C, i = 0 to batch size
+// A Base pointer to a tensor A.
+// B Base pointer to a tensor B.
+// Byte offsets vector of pairs of tensors A and B offsets for
+//   each batch. The number of batches must coincide with the
+//   `batch_size` value passed at object construction stage.
+// C Pointer to a tensor C (accumulation buffer).
+// scratchpad Pointer to a scratchpad buffer.
+// Currently, only brgemm with batch size = 1 will be used
+TORCH_API void brgemm(
+    int64_t M, int64_t N, int64_t K,
+    int64_t ld_a, int64_t ld_b, int64_t ld_c,
+    const float alpha, const float beta,
+    const at::Half* A, const at::Half* B, float* C);
+
+TORCH_API void brgemm(
+    int64_t M, int64_t N, int64_t K,
+    int64_t ld_a, int64_t ld_b, int64_t ld_c,
+    const float alpha, const float beta,
+    const at::BFloat16* A, const at::BFloat16* B, float* C);
+
+// Release brgemm hardware context
+void brgemm_release();
+
+// Pack B matrix to get better performance if needed
+void pack(
+    int64_t K, int64_t N,
+    int64_t ld_in, int64_t ld_out,
+    ScalarType dt_in, ScalarType dt_out,
+    const void* in, void* out);
+
+// Whether pack is needed in the platform.
+bool need_pack(ScalarType dt_in);
+
+} // namespace at::native::cpublas
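The header documents batch-reduce GEMM as C = alpha * SUM(A[i] x B[i]) + beta * C over the batch. A NumPy reference of that contract, purely as an illustration of the math rather than the oneDNN ukernel path:

import numpy as np

def brgemm_reference(A_batch, B_batch, C, alpha=1.0, beta=0.0):
    """C = alpha * sum_i(A[i] @ B[i]) + beta * C  (batch-reduce GEMM)."""
    acc = np.zeros_like(C, dtype=np.float32)
    for A, B in zip(A_batch, B_batch):
        acc += A.astype(np.float32) @ B.astype(np.float32)
    return alpha * acc + beta * C

A = np.random.rand(3, 4, 5).astype(np.float16)  # batch of 3 A tiles
B = np.random.rand(3, 5, 6).astype(np.float16)  # batch of 3 B tiles
C = np.zeros((4, 6), dtype=np.float32)          # fp32 accumulation buffer
print(brgemm_reference(A, B, C).shape)          # (4, 6)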
@@ -144,7 +144,7 @@ static void col2im_out_cpu_template(

   output.resize_({batch_size, n_output_plane, output_height, output_width});

-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf,
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kBFloat16, kHalf, kBool,
       input.scalar_type(), "col2im_out_cpu", [&] {
     Tensor input_n = Tensor();
     Tensor output_n = Tensor();
|
||||
// cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest
|
||||
// that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how)
|
||||
#if !defined(C10_MOBILE)
|
||||
if (needs_64bit_indexing_no_split(input, weight)) {
|
||||
return false;
|
||||
}
|
||||
if (!detail::getCUDAHooks().compiledWithCuDNN()) {
|
||||
return false;
|
||||
}
|
||||
if (needs_64bit_indexing_no_split(input, weight)) {
|
||||
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
|
||||
if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
|
||||
TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
|
||||
" if the V8 API is not enabled or before cuDNN version 9.3+."
|
||||
" Consider upgrading cuDNN and/or enabling the V8 API for better efficiency.");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!input.is_cuda() || !cudnn_enabled) {
|
||||
return false;
|
||||
}
|
||||
|
||||
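The rewritten branch only rejects cuDNN for large, non-batch-splittable convolutions when the runtime cuDNN is older than 9.3 or the V8 API is disabled; the trigger is indexing that would overflow a 32-bit index. A rough Python sketch of that predicate (the 2**31 threshold and the element-count rule are an approximation of the C++ helper, not a copy of it):

import torch

INT32_MAX = 2**31 - 1

def needs_64bit_indexing(*tensors):
    # Rough stand-in for needs_64bit_indexing_no_split: any participating
    # tensor with more elements than a 32-bit index can address.
    return any(t.numel() > INT32_MAX for t in tensors)

x = torch.empty(1, 16, 1024, 1024)
w = torch.empty(32, 16, 3, 3)
print(needs_64bit_indexing(x, w))  # False for this small example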
@@ -94,7 +94,7 @@ static void im2col_out_cpu_template(

   output.resize_({batch_size, n_output_plane, output_length});

-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf,
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kBFloat16, kHalf, kBool,
       input.scalar_type(), "im2col_out_cpu", [&] {
     Tensor input_n;
     Tensor output_n;
@@ -19,6 +19,7 @@
 #include <ATen/native/ReduceOpsUtils.h>
 #include <ATen/native/Resize.h>
 #include <ATen/native/mkldnn/Matmul.h>
+#include <ATen/native/mkldnn/Utils.h>
 #include <c10/core/GradMode.h>
 #include <c10/util/accumulate.h>
 #include <c10/util/irange.h>
@@ -1358,13 +1359,8 @@ static inline int64_t get_mkldnn_matmul_min_dim() {
   static auto value = [&] {
     const int64_t default_min_dim = [&] {
       // Minimum dimension requirement for MKLDNN; derived based on experiments.
-      // By default, it's only enabled on Neoverse V1.
-#if !defined(__s390x__) && !defined(__powerpc__)
-      if (cpuinfo_initialize() && cpuinfo_get_uarchs_count() == 1 && cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v1) {
-        return 8;
-      }
-#endif
-      return 0;
+      //it's enabled on all Neoverse cpus.
+      return is_arm_neoverse() ? 8 : 0;
     }();
     const char* ptr = std::getenv("TORCH_MKLDNN_MATMUL_MIN_DIM");
     return ptr != nullptr ? std::atoi(ptr) : default_min_dim;
@@ -1377,13 +1373,8 @@ static inline int64_t get_mkldnn_matmul_min_size() {
   static auto value = [&] {
     const int64_t default_min_size = [&] {
       // Minimum size requirement for MKLDNN; derived based on experiments.
-      // By default, it's only enabled on Neoverse V1.
-#if !defined(__s390x__) && !defined(__powerpc__)
-      if (cpuinfo_initialize() && cpuinfo_get_uarchs_count() == 1 && cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v1) {
-        return 8 * 1024;
-      }
-#endif
-      return 0;
+      // it's enabled on all Neoverse cpus.
+      return is_arm_neoverse() ? 8 * 1024 : 0;
     }();
     const char* ptr = std::getenv("TORCH_MKLDNN_MATMUL_MIN_SIZE");
     return ptr != nullptr ? std::atoi(ptr) : default_min_size;
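Both heuristics still honor the environment overrides shown in the code, so the new Neoverse-wide defaults can be tuned without rebuilding. A small sketch; the variable names come straight from the diff, and setting them before importing torch is the safe pattern because the values are read once and cached in a static:

import os

# Override the MKLDNN matmul heuristics; values are read once on first use.
os.environ["TORCH_MKLDNN_MATMUL_MIN_DIM"] = "16"
os.environ["TORCH_MKLDNN_MATMUL_MIN_SIZE"] = "16384"

import torch  # import after setting the overrides

a = torch.randn(32, 32)
print((a @ a).shape)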
@@ -284,7 +284,7 @@ void resize_bytes_nocuda(const Storage& storage, const c10::SymInt& newsize) {
   } else if (device_type == at::kPrivateUse1) {
     at::detail::getPrivateUse1Hooks().resizePrivateUse1Bytes(
         storage, newsize.expect_int());
-  } else if (device_type == at::kXPU || device_type == at::kHPU) {
+  } else if (device_type == at::kXPU || device_type == at::kHPU || device_type == at::kMTIA) {
     ptrdiff_t size_bytes_i = newsize.expect_int();
     TORCH_CHECK(
         !c10::overflows<int64_t>(size_bytes_i),
@@ -102,7 +102,7 @@ void col2im_out_cuda_template(
   output.resize_({batch_size, n_output_plane, output_height, output_width});
   int64_t output_batch_stride = output.stride(0);

-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16,
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kBool,
       input.scalar_type(), "col2im_out_cuda", [&] {
     int64_t height_col = (output_height + 2 * pad_height -
                           (dilation_height * (kernel_height - 1) + 1)) /
@@ -103,7 +103,7 @@ static void im2col_out_cuda_template(
   output.resize_({batch_size, n_output_plane, output_length});

   // Launch kernel
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16,
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kBool,
       input.scalar_type(), "im2col_out_cuda", [&] {
     Tensor input_n;
     Tensor output_n;
|
||||
inline bool mkldnn_bf16_device_check_arm() {
|
||||
return cpuinfo_initialize() && cpuinfo_has_arm_bf16();
|
||||
}
|
||||
|
||||
inline bool is_arm_neoverse() {
|
||||
return (cpuinfo_initialize() && cpuinfo_get_uarchs_count() == 1 &&
|
||||
(cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v1 ||
|
||||
cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v2 ||
|
||||
cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_n1 ||
|
||||
cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_n2));
|
||||
}
|
||||
#else
|
||||
constexpr bool mkldnn_bf16_device_check_arm() {
|
||||
return false;
|
||||
}
|
||||
|
||||
constexpr bool is_arm_neoverse() {
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if AT_MKLDNN_ENABLED()
|
||||
|
||||
@@ -3400,9 +3400,9 @@

 - func: fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor

-- func: wrapped_linear_prepack(Tensor weight, Tensor weight_scale, Tensor weight_zero_point, Tensor bias) -> Tensor
+- func: _wrapped_linear_prepack(Tensor weight, Tensor weight_scale, Tensor weight_zero_point, Tensor bias) -> Tensor

-- func: wrapped_quantized_linear_prepacked(Tensor input, Tensor input_scale, Tensor input_zero_point, Tensor packed_weight, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor
+- func: _wrapped_quantized_linear_prepacked(Tensor input, Tensor input_scale, Tensor input_zero_point, Tensor packed_weight, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor

 - func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor
@@ -436,12 +436,12 @@ at::Tensor wrapped_quantized_linear_meta(
 #endif // USE_FBGEMM
 }

-at::Tensor wrapped_linear_prepack(const at::Tensor& weight,
+at::Tensor _wrapped_linear_prepack(const at::Tensor& weight,
     const at::Tensor& weight_scale,
     const at::Tensor& weight_zero_point,
     const at::Tensor& bias);

-at::Tensor wrapped_linear_prepack(const at::Tensor& weight,
+at::Tensor _wrapped_linear_prepack(const at::Tensor& weight,
     const at::Tensor& weight_scale,
     const at::Tensor& weight_zero_point,
     const at::Tensor& bias) {
@@ -474,14 +474,14 @@ at::Tensor wrapped_linear_prepack(const at::Tensor& weight,
 #endif // USE_FBGEMM
 }

-at::Tensor wrapped_quantized_linear_prepacked(const at::Tensor& input, const at::Tensor& input_scale,
+at::Tensor _wrapped_quantized_linear_prepacked(const at::Tensor& input, const at::Tensor& input_scale,
     const at::Tensor& input_zero_point,
     const at::Tensor& packed_weight,
     const at::Tensor& output_scale,
    const at::Tensor& output_zero_point,
     [[maybe_unused]] const int64_t out_channel);

-at::Tensor wrapped_quantized_linear_prepacked(const at::Tensor& input, const at::Tensor& input_scale,
+at::Tensor _wrapped_quantized_linear_prepacked(const at::Tensor& input, const at::Tensor& input_scale,
     const at::Tensor& input_zero_point,
     const at::Tensor& packed_weight,
     const at::Tensor& output_scale,
@@ -507,12 +507,12 @@ at::Tensor wrapped_quantized_linear_prepacked(const at::Tensor& input, const at:
 #endif // USE_FBGEMM
 }

-at::Tensor wrapped_linear_prepack_meta(const at::Tensor& weight,
+at::Tensor _wrapped_linear_prepack_meta(const at::Tensor& weight,
     [[maybe_unused]] const at::Tensor& weight_scale,
     [[maybe_unused]] const at::Tensor& weight_zero_point,
     [[maybe_unused]] const at::Tensor& bias);

-at::Tensor wrapped_linear_prepack_meta(const at::Tensor& weight,
+at::Tensor _wrapped_linear_prepack_meta(const at::Tensor& weight,
     [[maybe_unused]] const at::Tensor& weight_scale,
     [[maybe_unused]] const at::Tensor& weight_zero_point,
     [[maybe_unused]] const at::Tensor& bias) {
@@ -530,7 +530,7 @@ at::Tensor wrapped_linear_prepack_meta(const at::Tensor& weight,
 #endif // USE_FBGEMM
 }

-at::Tensor wrapped_quantized_linear_prepacked_meta(const at::Tensor& input,
+at::Tensor _wrapped_quantized_linear_prepacked_meta(const at::Tensor& input,
     [[maybe_unused]] const at::Tensor& input_scale,
     [[maybe_unused]] const at::Tensor& input_zero_point,
     [[maybe_unused]] const at::Tensor& packed_weight,
@@ -538,7 +538,7 @@ at::Tensor wrapped_quantized_linear_prepacked_meta(const at::Tensor& input,
     [[maybe_unused]] const at::Tensor& output_zero_point,
     const int64_t out_channel);

-at::Tensor wrapped_quantized_linear_prepacked_meta(const at::Tensor& input,
+at::Tensor _wrapped_quantized_linear_prepacked_meta(const at::Tensor& input,
     [[maybe_unused]] const at::Tensor& input_scale,
     [[maybe_unused]] const at::Tensor& input_zero_point,
     [[maybe_unused]] const at::Tensor& packed_weight,
@@ -695,21 +695,21 @@ TORCH_LIBRARY_IMPL(_quantized, CPU, m) {
   m.impl(TORCH_SELECTIVE_NAME("_quantized::linear_prepack_fp16_legacy"), TORCH_FN(QLinearPackWeightFp16Legacy::run));
   m.impl(TORCH_SELECTIVE_NAME("_quantized::wrapped_quantized_linear"), TORCH_FN(wrapped_quantized_linear));
   m.impl(
-      TORCH_SELECTIVE_NAME("_quantized::wrapped_linear_prepack"),
-      wrapped_linear_prepack);
+      TORCH_SELECTIVE_NAME("_quantized::_wrapped_linear_prepack"),
+      _wrapped_linear_prepack);
   m.impl(
-      TORCH_SELECTIVE_NAME("_quantized::wrapped_quantized_linear_prepacked"),
-      wrapped_quantized_linear_prepacked);
+      TORCH_SELECTIVE_NAME("_quantized::_wrapped_quantized_linear_prepacked"),
+      _wrapped_quantized_linear_prepacked);
 }

 TORCH_LIBRARY_IMPL(_quantized, Meta, m) {
   m.impl(TORCH_SELECTIVE_NAME("_quantized::wrapped_quantized_linear"), TORCH_FN(wrapped_quantized_linear_meta));
   m.impl(
-      TORCH_SELECTIVE_NAME("_quantized::wrapped_linear_prepack"),
-      wrapped_linear_prepack_meta);
+      TORCH_SELECTIVE_NAME("_quantized::_wrapped_linear_prepack"),
+      _wrapped_linear_prepack_meta);
   m.impl(
-      TORCH_SELECTIVE_NAME("_quantized::wrapped_quantized_linear_prepacked"),
-      wrapped_quantized_linear_prepacked_meta);
+      TORCH_SELECTIVE_NAME("_quantized::_wrapped_quantized_linear_prepacked"),
+      _wrapped_quantized_linear_prepacked_meta);
 }

 TORCH_LIBRARY_IMPL(onednn, CPU, m) {
@@ -251,8 +251,8 @@ TORCH_LIBRARY(_quantized, m) {
   m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_fbgemm_pack_gemm_matrix_fp16(Tensor W) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_fbgemm_linear_fp16_weight(Tensor X, Tensor W, Tensor B, int out_channel) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_quantized_linear(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor W, Tensor W_scale, Tensor W_zero_point, Tensor B, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor Y"));
-  m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_linear_prepack(Tensor W, Tensor W_scale, Tensor W_zero_point, Tensor B) -> Tensor"));
-  m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_quantized_linear_prepacked(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor W_prepack, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor Y"));
+  m.def(TORCH_SELECTIVE_SCHEMA("_quantized::_wrapped_linear_prepack(Tensor W, Tensor W_scale, Tensor W_zero_point, Tensor B) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA("_quantized::_wrapped_quantized_linear_prepacked(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor W_prepack, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor Y"));
 }

 TORCH_LIBRARY(onednn, m) {
@@ -46,6 +46,15 @@
   desc: |
     This tag indicates that the operator should be passed Tensors following
     the same stride permutation as observed in eager when compiled in inductor.
+    Only one of {needs_fixed_stride_order, flexible_layout} can apply; if
+    multiple are assigned then we assume the most restrictive one.
+- tag: flexible_layout
+  desc: |
+    This tag indicates that the custom operator can accept inputs with varying
+    strides/storage_offset and that when compiled, Inductor is allowed to change
+    the strides/storage_offset of inputs to the custom operator.
+    Only one of {needs_fixed_stride_order, flexible_layout} can apply; if
+    multiple are assigned then we assume the most restrictive one.

 # NOTE [Core ATen Ops]
 - tag: core
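A hedged sketch of attaching the new tag to a custom op so Inductor may adjust input strides. The op name is hypothetical, the tags= keyword on torch.library.define is assumed to be present, and torch.Tag.flexible_layout only exists on builds containing this tags.yaml change:

import torch
from torch.library import define, impl

# Hypothetical op; tags= and torch.Tag.flexible_layout are assumptions that
# only hold on builds that contain the flexible_layout tag.
define("mylib::scale", "(Tensor x, float f) -> Tensor",
       tags=(torch.Tag.flexible_layout,))

@impl("mylib::scale", "cpu")
def scale(x, f):
    return x * f

print(torch.ops.mylib.scale(torch.randn(4, 4), 2.0).shape)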
@@ -90,7 +90,7 @@ detectron2_maskrcnn_r_50_fpn,fail_to_run,0

-dlrm,fail_to_run,0
+dlrm,pass,0

@@ -90,7 +90,7 @@ detectron2_maskrcnn_r_50_fpn,fail_to_run,0

-dlrm,fail_to_run,0
+dlrm,pass,0

@@ -74,7 +74,7 @@ detectron2_fasterrcnn_r_50_fpn,fail_to_run,0

-dlrm,fail_to_run,0
+dlrm,pass,0

@@ -74,7 +74,7 @@ detectron2_fasterrcnn_r_50_fpn,fail_to_run,0

-dlrm,fail_to_run,0
+dlrm,pass,0
@@ -3,7 +3,12 @@ import csv
import dataclasses
import os

from generate import run_llama2_7b_bf16, run_llama2_7b_int8, run_mixtral_8x7b_int8
from generate import (
    get_arch_name,
    run_llama2_7b_bf16,
    run_llama2_7b_int8,
    run_mixtral_8x7b_int8,
)

import torch
import torch.nn as nn
@@ -24,6 +29,7 @@ class Experiment:
    actual: float
    dtype: str
    device: str
    arch: str  # GPU name for CUDA or CPU arch for CPU
    is_model: bool = False

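For context, the new `arch` field flows into the benchmark CSV through `dataclasses.astuple`, which the driver loop at the end of this script already uses. A hedged, self-contained sketch of that flow (the fields before `actual` and all the values below are made up for illustration):

```python
# Illustrative only: each Experiment row is flattened with dataclasses.astuple
# before being written as one CSV row, so the new arch field becomes an extra column.
import csv
import dataclasses
import platform

@dataclasses.dataclass
class Experiment:
    name: str          # assumed field
    metric: str        # assumed field
    target: float      # assumed field
    actual: float
    dtype: str
    device: str
    arch: str  # GPU name for CUDA or CPU arch for CPU
    is_model: bool = False

rows = [Experiment("mlp_layer_norm_gelu", "flops_utilization", 0.80, 0.79,
                   "bfloat16", "cpu", platform.machine())]
with open("gpt_fast_benchmark.csv", "w", newline="") as f:
    csv.writer(f).writerows(dataclasses.astuple(r) for r in rows)
```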
@ -71,7 +77,12 @@ def run_mlp_layer_norm_gelu(device: str = "cuda"):
|
||||
for _ in range(WARMUP_ITER):
|
||||
compiled_mod(x)
|
||||
|
||||
us_per_iter = benchmarker.benchmark_gpu(lambda: compiled_mod(x)) * 1000
|
||||
benchmark_fn = (
|
||||
benchmarker.benchmark_gpu
|
||||
if device == "cuda"
|
||||
else benchmarker.benchmark_cpu
|
||||
)
|
||||
us_per_iter = benchmark_fn(lambda: compiled_mod(x)) * 1000
|
||||
flops_utilization += us_per_iter * flops / 1e9 / A100_40G_BF16_TFLOPS
|
||||
|
||||
flops_utilization = flops_utilization / len(input_shapes)
|
||||
@ -84,6 +95,7 @@ def run_mlp_layer_norm_gelu(device: str = "cuda"):
|
||||
f"{flops_utilization:.02f}",
|
||||
dtype_str,
|
||||
device,
|
||||
get_arch_name(),
|
||||
)
|
||||
)
|
||||
return results
|
||||
@ -108,7 +120,12 @@ def run_layer_norm(device: str = "cuda"):
|
||||
for _ in range(WARMUP_ITER):
|
||||
compiled_mod(x)
|
||||
|
||||
us_per_iter = benchmarker.benchmark_gpu(lambda: compiled_mod(x)) * 1000
|
||||
benchmark_fn = (
|
||||
benchmarker.benchmark_gpu
|
||||
if device == "cuda"
|
||||
else benchmarker.benchmark_cpu
|
||||
)
|
||||
us_per_iter = benchmark_fn(lambda: compiled_mod(x)) * 1000
|
||||
memory_bandwidth += (1e6 / us_per_iter) * 2 * BS * D * dtype.itemsize / 1e9
|
||||
|
||||
memory_bandwidth = memory_bandwidth / len(input_shapes)
|
||||
@ -121,6 +138,7 @@ def run_layer_norm(device: str = "cuda"):
|
||||
f"{memory_bandwidth:.02f}",
|
||||
dtype_str,
|
||||
device,
|
||||
get_arch_name(),
|
||||
)
|
||||
)
|
||||
return results
|
||||
@ -151,9 +169,12 @@ def run_gather_gemv(device: str = "cuda"):
|
||||
for _ in range(WARMUP_ITER):
|
||||
compiled_fn(W, score_idxs, x)
|
||||
|
||||
us_per_iter = (
|
||||
benchmarker.benchmark_gpu(lambda: compiled_fn(W, score_idxs, x)) * 1000
|
||||
benchmark_fn = (
|
||||
benchmarker.benchmark_gpu
|
||||
if device == "cuda"
|
||||
else benchmarker.benchmark_cpu
|
||||
)
|
||||
us_per_iter = benchmark_fn(lambda: compiled_fn(W, score_idxs, x)) * 1000
|
||||
memory_bandwidth += (1e6 / us_per_iter) * 2 * D * D * dtype.itemsize / 1e9
|
||||
|
||||
memory_bandwidth = memory_bandwidth / len(input_shapes)
|
||||
@ -166,6 +187,7 @@ def run_gather_gemv(device: str = "cuda"):
|
||||
f"{memory_bandwidth:.02f}",
|
||||
dtype_str,
|
||||
device,
|
||||
get_arch_name(),
|
||||
)
|
||||
)
|
||||
return results
|
||||
@ -186,15 +208,20 @@ def run_gemv(device: str = "cuda"):
|
||||
def gemv(W, x):
|
||||
return W.to(x.dtype) @ x
|
||||
|
||||
W = torch.randn(D, D, device="cuda").to(dtype=dtype)
|
||||
x = torch.randn(D, device="cuda", dtype=torch.bfloat16)
|
||||
W = torch.randn(D, D, device=device).to(dtype=dtype)
|
||||
x = torch.randn(D, device=device, dtype=torch.bfloat16)
|
||||
|
||||
compiled_fn = torch.compile(gemv, dynamic=False)
|
||||
|
||||
for _ in range(WARMUP_ITER):
|
||||
compiled_fn(W, x)
|
||||
|
||||
us_per_iter = benchmarker.benchmark_gpu(lambda: compiled_fn(W, x)) * 1000
|
||||
benchmark_fn = (
|
||||
benchmarker.benchmark_gpu
|
||||
if device == "cuda"
|
||||
else benchmarker.benchmark_cpu
|
||||
)
|
||||
us_per_iter = benchmark_fn(lambda: compiled_fn(W, x)) * 1000
|
||||
memory_bandwidth += (1e6 / us_per_iter) * D * D * dtype.itemsize / 1e9
|
||||
|
||||
memory_bandwidth = memory_bandwidth / len(input_shapes)
|
||||
@ -207,6 +234,7 @@ def run_gemv(device: str = "cuda"):
|
||||
f"{memory_bandwidth:.02f}",
|
||||
dtype_str,
|
||||
device,
|
||||
get_arch_name(),
|
||||
)
|
||||
)
|
||||
return results
|
||||
@ -252,7 +280,13 @@ def main(output_file=DEFAULT_OUTPUT_FILE):
|
||||
results = []
|
||||
|
||||
for func in all_experiments:
|
||||
lst = func()
|
||||
try:
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
except AssertionError:
|
||||
# This happens when torch is compiled with CUDA turning off completely
|
||||
device = "cpu"
|
||||
|
||||
lst = func(device)
|
||||
for x in lst:
|
||||
results.append(dataclasses.astuple(x))
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import dataclasses
|
||||
import itertools
|
||||
import platform
|
||||
import time
|
||||
from typing import Optional, Tuple
|
||||
|
||||
@ -41,6 +42,14 @@ def device_sync(device):
|
||||
print(f"device={device} is not yet suppported")
|
||||
|
||||
|
||||
def get_arch_name() -> str:
|
||||
if torch.cuda.is_available():
|
||||
return torch.cuda.get_device_name()
|
||||
else:
|
||||
# This returns x86_64 or arm64 (for aarch64)
|
||||
return platform.machine()
|
||||
|
||||
|
||||
def multinomial_sample_one_no_sync(
|
||||
probs_sort,
|
||||
): # Does multinomial sampling without a cuda synchronization
|
||||
@ -198,7 +207,7 @@ def run_experiment(
|
||||
) -> None:
|
||||
print(f"Loading model {x.name}")
|
||||
t0 = time.time()
|
||||
model = _load_model(x)
|
||||
model = _load_model(x, device=device)
|
||||
device_sync(device=device) # MKG
|
||||
print(f"Time to load model: {time.time() - t0:.02f} seconds")
|
||||
|
||||
@ -257,7 +266,9 @@ def run_llama2_7b_bf16(device: str = "cuda"):
|
||||
1253,
|
||||
162,
|
||||
)
|
||||
token_per_sec, memory_bandwidth, compilation_time = run_experiment(model)
|
||||
token_per_sec, memory_bandwidth, compilation_time = run_experiment(
|
||||
model, device=device
|
||||
)
|
||||
return [
|
||||
Experiment(
|
||||
model.name,
|
||||
@ -266,6 +277,7 @@ def run_llama2_7b_bf16(device: str = "cuda"):
|
||||
f"{token_per_sec:.02f}",
|
||||
model.mode,
|
||||
device,
|
||||
get_arch_name(),
|
||||
True,
|
||||
),
|
||||
Experiment(
|
||||
@ -275,6 +287,7 @@ def run_llama2_7b_bf16(device: str = "cuda"):
|
||||
f"{memory_bandwidth:.02f}",
|
||||
model.mode,
|
||||
device,
|
||||
get_arch_name(),
|
||||
True,
|
||||
),
|
||||
Experiment(
|
||||
@ -284,6 +297,7 @@ def run_llama2_7b_bf16(device: str = "cuda"):
|
||||
f"{compilation_time:.02f}",
|
||||
model.mode,
|
||||
device,
|
||||
get_arch_name(),
|
||||
True,
|
||||
),
|
||||
]
|
||||
@ -302,7 +316,9 @@ def run_llama2_7b_int8(device: str = "cuda"):
|
||||
957,
|
||||
172,
|
||||
)
|
||||
token_per_sec, memory_bandwidth, compilation_time = run_experiment(model)
|
||||
token_per_sec, memory_bandwidth, compilation_time = run_experiment(
|
||||
model, device=device
|
||||
)
|
||||
return [
|
||||
Experiment(
|
||||
model.name,
|
||||
@ -311,6 +327,7 @@ def run_llama2_7b_int8(device: str = "cuda"):
|
||||
f"{token_per_sec:.02f}",
|
||||
model.mode,
|
||||
device,
|
||||
get_arch_name(),
|
||||
True,
|
||||
),
|
||||
Experiment(
|
||||
@ -320,6 +337,7 @@ def run_llama2_7b_int8(device: str = "cuda"):
|
||||
f"{memory_bandwidth:.02f}",
|
||||
model.mode,
|
||||
device,
|
||||
get_arch_name(),
|
||||
True,
|
||||
),
|
||||
Experiment(
|
||||
@ -329,6 +347,7 @@ def run_llama2_7b_int8(device: str = "cuda"):
|
||||
f"{compilation_time:.02f}",
|
||||
model.mode,
|
||||
device,
|
||||
get_arch_name(),
|
||||
True,
|
||||
),
|
||||
]
|
||||
@ -348,7 +367,9 @@ def run_mixtral_8x7b_int8(device: str = "cuda"):
|
||||
1130,
|
||||
162,
|
||||
)
|
||||
token_per_sec, memory_bandwidth, compilation_time = run_experiment(model)
|
||||
token_per_sec, memory_bandwidth, compilation_time = run_experiment(
|
||||
model, device=device
|
||||
)
|
||||
return [
|
||||
Experiment(
|
||||
model.name,
|
||||
@ -357,6 +378,7 @@ def run_mixtral_8x7b_int8(device: str = "cuda"):
|
||||
f"{token_per_sec:.02f}",
|
||||
model.mode,
|
||||
device,
|
||||
get_arch_name(),
|
||||
True,
|
||||
),
|
||||
Experiment(
|
||||
@ -366,6 +388,7 @@ def run_mixtral_8x7b_int8(device: str = "cuda"):
|
||||
f"{memory_bandwidth:.02f}",
|
||||
model.mode,
|
||||
device,
|
||||
get_arch_name(),
|
||||
True,
|
||||
),
|
||||
Experiment(
|
||||
@ -375,6 +398,7 @@ def run_mixtral_8x7b_int8(device: str = "cuda"):
|
||||
f"{compilation_time:.02f}",
|
||||
model.mode,
|
||||
device,
|
||||
get_arch_name(),
|
||||
True,
|
||||
),
|
||||
]
|
||||
|
||||
@ -149,6 +149,8 @@ const char* toString(DispatchKey t) {
|
||||
return "AutocastXLA";
|
||||
case DispatchKey::AutocastPrivateUse1:
|
||||
return "AutocastPrivateUse1";
|
||||
case DispatchKey::AutocastMPS:
|
||||
return "AutocastMPS";
|
||||
|
||||
case DispatchKey::FuncTorchBatched:
|
||||
return "FuncTorchBatched";
|
||||
@ -297,6 +299,7 @@ c10::DispatchKey parseDispatchKey(const std::string& k) {
|
||||
{"AutocastCUDA", c10::DispatchKey::AutocastCUDA},
|
||||
{"AutocastXLA", c10::DispatchKey::AutocastXLA},
|
||||
{"AutocastPrivateUse1", c10::DispatchKey::AutocastPrivateUse1},
|
||||
{"AutocastMPS", c10::DispatchKey::AutocastMPS},
|
||||
{"FuncTorchBatched", c10::DispatchKey::FuncTorchBatched},
|
||||
{"BatchedNestedTensor", c10::DispatchKey::BatchedNestedTensor},
|
||||
{"FuncTorchVmapMode", c10::DispatchKey::FuncTorchVmapMode},
|
||||
|
||||
@ -359,6 +359,7 @@ enum class DispatchKey : uint16_t {
|
||||
AutocastXLA,
|
||||
// AutocastXLA is only being used for TPUs. XLA GPUs continue to use
|
||||
// AutocastCUDA.
|
||||
AutocastMPS,
|
||||
AutocastCUDA,
|
||||
AutocastPrivateUse1,
|
||||
|
||||
|
||||
@ -655,6 +655,7 @@ constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({
|
||||
|
||||
constexpr DispatchKeySet autocast_dispatch_keyset = DispatchKeySet({
|
||||
DispatchKey::AutocastCPU,
|
||||
DispatchKey::AutocastMPS,
|
||||
DispatchKey::AutocastCUDA,
|
||||
DispatchKey::AutocastXPU,
|
||||
DispatchKey::AutocastIPU,
|
||||
@ -671,6 +672,7 @@ constexpr DispatchKeySet default_included_set = DispatchKeySet({
|
||||
|
||||
constexpr DispatchKeySet default_excluded_set = DispatchKeySet({
|
||||
DispatchKey::AutocastCPU,
|
||||
DispatchKey::AutocastMPS,
|
||||
DispatchKey::AutocastCUDA,
|
||||
DispatchKey::AutocastXPU,
|
||||
DispatchKey::AutocastIPU,
|
||||
@ -863,6 +865,7 @@ inline DispatchKeySet getAutocastRelatedKeySetFromBackend(BackendComponent t) {
|
||||
constexpr auto autocast_xla_ks = DispatchKeySet(DispatchKey::AutocastXLA);
|
||||
constexpr auto autocast_privateuse1_ks =
|
||||
DispatchKeySet(DispatchKey::AutocastPrivateUse1);
|
||||
constexpr auto autocast_mps_ks = DispatchKeySet(DispatchKey::AutocastMPS);
|
||||
switch (t) {
|
||||
case BackendComponent::CPUBit:
|
||||
return autocast_cpu_ks;
|
||||
@ -878,6 +881,8 @@ inline DispatchKeySet getAutocastRelatedKeySetFromBackend(BackendComponent t) {
|
||||
return autocast_xla_ks;
|
||||
case BackendComponent::PrivateUse1Bit:
|
||||
return autocast_privateuse1_ks;
|
||||
case BackendComponent::MPSBit:
|
||||
return autocast_mps_ks;
|
||||
default:
|
||||
return DispatchKeySet();
|
||||
}
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
#include <c10/cuda/CUDAFunctions.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/util/CallOnce.h>
|
||||
#include <c10/util/Gauge.h>
|
||||
#include <c10/util/ScopeExit.h>
|
||||
#include <c10/util/UniqueVoidPtr.h>
|
||||
#include <c10/util/flat_hash_map.h>
|
||||
@ -1429,6 +1430,12 @@ class DeviceCachingAllocator {
|
||||
if (block->size >= CUDAAllocatorConfig::max_split_size())
|
||||
stats.oversize_allocations.increase(1);
|
||||
|
||||
auto allocated_bytes_gauge =
|
||||
STATIC_GAUGE(pytorch.CUDACachingAllocator.allocated_bytes);
|
||||
allocated_bytes_gauge.record(
|
||||
stats.allocated_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
|
||||
.current);
|
||||
|
||||
c10::reportMemoryUsageToProfiler(
|
||||
block->ptr,
|
||||
static_cast<int64_t>(block->size),
|
||||
@ -1456,6 +1463,11 @@ class DeviceCachingAllocator {
|
||||
stats.allocation[stat_type].decrease(1);
|
||||
stats.allocated_bytes[stat_type].decrease(block->size);
|
||||
});
|
||||
auto allocated_bytes_gauge =
|
||||
STATIC_GAUGE(pytorch.CUDACachingAllocator.allocated_bytes);
|
||||
allocated_bytes_gauge.record(
|
||||
stats.allocated_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
|
||||
.current);
|
||||
|
||||
record_trace(
|
||||
TraceEntry::FREE_REQUESTED,
|
||||
@ -2245,6 +2257,11 @@ class DeviceCachingAllocator {
|
||||
for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
|
||||
stats.reserved_bytes[stat_type].increase(mapped_range.size);
|
||||
});
|
||||
auto reserved_bytes_gauge =
|
||||
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
|
||||
reserved_bytes_gauge.record(
|
||||
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
|
||||
.current);
|
||||
|
||||
stats.num_device_alloc++;
|
||||
record_trace(
|
||||
@ -2683,6 +2700,11 @@ class DeviceCachingAllocator {
|
||||
});
|
||||
if (size >= CUDAAllocatorConfig::max_split_size())
|
||||
stats.oversize_segments.increase(1);
|
||||
auto reserved_bytes_gauge =
|
||||
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
|
||||
reserved_bytes_gauge.record(
|
||||
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
|
||||
.current);
|
||||
|
||||
// p.block came from new, not cudaMalloc. It should not be nullptr here.
|
||||
TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr);
|
||||
@ -2820,6 +2842,11 @@ class DeviceCachingAllocator {
|
||||
stats.segment[stat_type].decrease(1);
|
||||
stats.reserved_bytes[stat_type].decrease(block->size);
|
||||
});
|
||||
auto reserved_bytes_gauge =
|
||||
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
|
||||
reserved_bytes_gauge.record(
|
||||
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
|
||||
.current);
|
||||
|
||||
if (block->size >= CUDAAllocatorConfig::max_split_size())
|
||||
stats.oversize_segments.decrease(1);
|
||||
@ -2876,6 +2903,11 @@ class DeviceCachingAllocator {
|
||||
for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
|
||||
stats.reserved_bytes[stat_type].decrease(unmapped.size);
|
||||
});
|
||||
auto reserved_bytes_gauge =
|
||||
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
|
||||
reserved_bytes_gauge.record(
|
||||
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
|
||||
.current);
|
||||
|
||||
if (block->pool->owner_PrivatePool) {
|
||||
// The cudaFreed block belonged to a CUDA graph's PrivatePool.
|
||||
|
||||
@ -68,6 +68,12 @@ template <
|
||||
inline T expm1(T a) {
|
||||
return std::expm1(float(a));
|
||||
}
|
||||
template <
|
||||
typename T,
|
||||
typename std::enable_if_t<is_reduced_floating_point_v<T>, int> = 0>
|
||||
inline bool isfinite(T a) {
|
||||
return std::isfinite(float(a));
|
||||
}
|
||||
template <
|
||||
typename T,
|
||||
typename std::enable_if_t<is_reduced_floating_point_v<T>, int> = 0>
|
||||
@ -237,10 +243,9 @@ C10_HOST_DEVICE inline T nextafter(T from, T to) {
|
||||
// Reference:
|
||||
// https://git.musl-libc.org/cgit/musl/tree/src/math/nextafter.c
|
||||
using int_repr_t = uint16_t;
|
||||
using float_t = T;
|
||||
constexpr uint8_t bits = 16;
|
||||
union {
|
||||
float_t f;
|
||||
T f;
|
||||
int_repr_t i;
|
||||
} ufrom = {from}, uto = {to};
|
||||
|
||||
|
||||
@@ -261,19 +261,19 @@ struct alignas(sizeof(T) * 2) complex {
#endif

    if (abs_c >= abs_d) {
      if (abs_c == 0 && abs_d == 0) {
      if (abs_c == U(0) && abs_d == U(0)) {
        /* divide by zeros should yield a complex inf or nan */
        real_ = a / abs_c;
        imag_ = b / abs_d;
      } else {
        auto rat = d / c;
        auto scl = 1.0 / (c + d * rat);
        auto scl = U(1.0) / (c + d * rat);
        real_ = (a + b * rat) * scl;
        imag_ = (b - a * rat) * scl;
      }
    } else {
      auto rat = c / d;
      auto scl = 1.0 / (d + c * rat);
      auto scl = U(1.0) / (d + c * rat);
      real_ = (a * rat + b) * scl;
      imag_ = (b * rat - a) * scl;
    }

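The change from `1.0` to `U(1.0)` keeps the reciprocal in the value type `U` instead of silently promoting to `double`. For reference, the branch structure above is the usual Smith-style complex division: when `|c| >= |d|`, with r = d/c,

```latex
\frac{a + b i}{c + d i}
  = \frac{(a + b r) + (b - a r)\,i}{c + d r},
\qquad r = \frac{d}{c},\quad s = \frac{1}{c + d r},
```

so `real_ = (a + b * rat) * scl` and `imag_ = (b - a * rat) * scl`; the `|c| < |d|` branch is symmetric with r = c/d and s = 1/(d + c r).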
@ -9,6 +9,8 @@
|
||||
|
||||
namespace c10::xpu::XPUCachingAllocator {
|
||||
|
||||
using namespace c10::CachingDeviceAllocator;
|
||||
|
||||
// newly allocated memory with 512-byte alignment.
|
||||
constexpr size_t kDeviceAlignment = 512;
|
||||
// all sizes are rounded to at least 512 bytes
|
||||
@ -117,6 +119,7 @@ struct AllocParams {
|
||||
BlockPool* pool;
|
||||
size_t alloc_size;
|
||||
Block* block;
|
||||
StatTypes stat_types = {};
|
||||
};
|
||||
|
||||
} // anonymous namespace
|
||||
@ -124,6 +127,7 @@ struct AllocParams {
|
||||
class DeviceCachingAllocator {
|
||||
private:
|
||||
mutable std::recursive_mutex mutex;
|
||||
DeviceStats stats;
|
||||
BlockPool large_blocks; // unallocated cached blocks larger than 1 MB
|
||||
BlockPool small_blocks; // unallocated cached blocks 1 MB or smaller
|
||||
ska::flat_hash_set<Block*> active_blocks; // allocated or in use by a stream
|
||||
@ -173,6 +177,12 @@ class DeviceCachingAllocator {
|
||||
active_blocks.erase(block);
|
||||
bool inserted = pool.blocks.insert(block).second;
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted);
|
||||
|
||||
StatTypes stat_types = get_stat_types_for_pool(pool);
|
||||
for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
|
||||
stats.active_bytes[stat_type].decrease(block->size);
|
||||
stats.requested_bytes[stat_type].decrease(block->requested_size);
|
||||
});
|
||||
}
|
||||
|
||||
void process_events() {
|
||||
@ -250,6 +260,9 @@ class DeviceCachingAllocator {
|
||||
return false;
|
||||
}
|
||||
p.block = new Block(device, p.queue(), size, p.pool, ptr);
|
||||
for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
|
||||
stats.reserved_bytes[stat_type].increase(size);
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -281,6 +294,12 @@ class DeviceCachingAllocator {
|
||||
sycl::free(block->ptr, xpu::get_device_context());
|
||||
auto* pool = block->pool;
|
||||
pool->blocks.erase(block);
|
||||
|
||||
StatTypes stat_types = get_stat_types_for_pool(*pool);
|
||||
for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
|
||||
stats.reserved_bytes[stat_type].decrease(block->size);
|
||||
});
|
||||
|
||||
delete block;
|
||||
}
|
||||
|
||||
@ -314,6 +333,14 @@ class DeviceCachingAllocator {
|
||||
}
|
||||
}
|
||||
|
||||
StatTypes get_stat_types_for_pool(const BlockPool& pool) {
|
||||
StatTypes stat_types = {};
|
||||
stat_types[static_cast<size_t>(StatType::AGGREGATE)] = true;
|
||||
stat_types[static_cast<size_t>(
|
||||
pool.is_small ? StatType::SMALL_POOL : StatType::LARGE_POOL)] = true;
|
||||
return stat_types;
|
||||
}
|
||||
|
||||
Block* alloc_found_block(
|
||||
AllocParams params,
|
||||
size_t orig_size,
|
||||
@ -350,6 +377,12 @@ class DeviceCachingAllocator {
|
||||
bool inserted = active_blocks.insert(block).second;
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted)
|
||||
|
||||
for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) {
|
||||
stats.allocated_bytes[stat_type].increase(block->size);
|
||||
stats.active_bytes[stat_type].increase(block->size);
|
||||
stats.requested_bytes[stat_type].increase(block->requested_size);
|
||||
});
|
||||
|
||||
return block;
|
||||
}
|
||||
|
||||
@ -376,6 +409,7 @@ class DeviceCachingAllocator {
|
||||
auto& pool = get_pool(size);
|
||||
const size_t alloc_size = get_allocation_size(size);
|
||||
AllocParams params(device, size, &queue, &pool, alloc_size);
|
||||
params.stat_types = get_stat_types_for_pool(pool);
|
||||
|
||||
// First, try to get a block from the existing pool.
|
||||
bool block_found = get_free_block(params);
|
||||
@@ -384,9 +418,32 @@
      block_found = alloc_block(params) ||
          (release_cached_blocks() && alloc_block(params));
    }
    TORCH_CHECK(
        block_found,
        "XPU out of memory, please use `empty_cache` to release all unoccupied cached memory.");
    if (!block_found) {
      c10::xpu::DeviceProp device_prop;
      c10::xpu::get_device_properties(&device_prop, device);
      auto device_total = device_prop.global_mem_size;
      auto allocated_bytes =
          stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
              .current;
      auto reserved_bytes =
          stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
              .current;
      TORCH_CHECK_WITH(
          OutOfMemoryError,
          false,
          "XPU out of memory. Tried to allocate ",
          format_size(alloc_size),
          ". GPU ",
          static_cast<int>(device),
          " has a total capacity of ",
          format_size(device_total),
          ". Of the allocated memory ",
          format_size(allocated_bytes),
          " is allocated by PyTorch, and ",
          format_size(reserved_bytes - allocated_bytes),
          " is reserved by PyTorch but unallocated.",
          " Please use `empty_cache` to release all unoccupied cached memory.");
    }
    bool split_remainder = should_split(params.block, params.size());
    return alloc_found_block(std::move(params), orig_size, split_remainder);
  }
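On the Python side, the improved message still points at `empty_cache` as the remedy. A hedged sketch of acting on that advice follows; it assumes the XPU allocator surfaces `torch.OutOfMemoryError`, which is not shown in this diff.

```python
# Hedged sketch: retry an allocation once after releasing cached XPU blocks.
# Adjust the exception type if your build raises something else for XPU OOM.
import torch

def empty_then_retry(shape, device="xpu"):
    try:
        return torch.empty(shape, device=device)
    except torch.OutOfMemoryError:
        torch.xpu.empty_cache()  # release all unoccupied cached memory
        return torch.empty(shape, device=device)
```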
@ -395,6 +452,11 @@ class DeviceCachingAllocator {
|
||||
std::scoped_lock<std::recursive_mutex> lock(mutex);
|
||||
block->allocated = false;
|
||||
|
||||
StatTypes stat_types = get_stat_types_for_pool(*block->pool);
|
||||
for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
|
||||
stats.allocated_bytes[stat_type].decrease(block->size);
|
||||
});
|
||||
|
||||
if (!block->stream_uses.empty()) {
|
||||
insert_events(block);
|
||||
} else {
|
||||
@ -414,6 +476,35 @@ class DeviceCachingAllocator {
|
||||
std::scoped_lock<std::recursive_mutex> lock(mutex);
|
||||
release_cached_blocks();
|
||||
}
|
||||
|
||||
DeviceStats getStats() {
|
||||
std::scoped_lock<std::recursive_mutex> lock(mutex);
|
||||
return stats;
|
||||
}
|
||||
|
||||
void resetAccumulatedStats() {
|
||||
std::scoped_lock<std::recursive_mutex> lock(mutex);
|
||||
|
||||
for (const auto statType :
|
||||
c10::irange(static_cast<size_t>(StatType::NUM_TYPES))) {
|
||||
stats.allocated_bytes[statType].reset_accumulated();
|
||||
stats.reserved_bytes[statType].reset_accumulated();
|
||||
stats.active_bytes[statType].reset_accumulated();
|
||||
stats.requested_bytes[statType].reset_accumulated();
|
||||
}
|
||||
}
|
||||
|
||||
void resetPeakStats() {
|
||||
std::scoped_lock<std::recursive_mutex> lock(mutex);
|
||||
|
||||
for (const auto statType :
|
||||
c10::irange(static_cast<size_t>(StatType::NUM_TYPES))) {
|
||||
stats.allocated_bytes[statType].reset_peak();
|
||||
stats.reserved_bytes[statType].reset_peak();
|
||||
stats.active_bytes[statType].reset_peak();
|
||||
stats.requested_bytes[statType].reset_peak();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void local_raw_delete(void* ptr);
|
||||
@ -547,6 +638,30 @@ class XPUAllocator : public Allocator {
|
||||
void copy_data(void* dest, const void* src, std::size_t count) const final {
|
||||
xpu::getCurrentXPUStream().queue().memcpy(dest, src, count);
|
||||
}
|
||||
|
||||
void assertValidDevice(DeviceIndex device) {
|
||||
const auto device_num = device_allocators.size();
|
||||
TORCH_CHECK(
|
||||
0 <= device && device < static_cast<int64_t>(device_num),
|
||||
"Invalid device argument ",
|
||||
device,
|
||||
": did you call init?");
|
||||
}
|
||||
|
||||
DeviceStats getDeviceStats(DeviceIndex device) {
|
||||
assertValidDevice(device);
|
||||
return device_allocators[device]->getStats();
|
||||
}
|
||||
|
||||
void resetPeakStats(DeviceIndex device) {
|
||||
assertValidDevice(device);
|
||||
device_allocators[device]->resetPeakStats();
|
||||
}
|
||||
|
||||
void resetAccumulatedStats(DeviceIndex device) {
|
||||
assertValidDevice(device);
|
||||
device_allocators[device]->resetAccumulatedStats();
|
||||
}
|
||||
};
|
||||
|
||||
static XPUAllocator allocator;
|
||||
@ -567,6 +682,18 @@ void emptyCache() {
|
||||
return allocator.emptyCache();
|
||||
}
|
||||
|
||||
void resetPeakStats(DeviceIndex device) {
|
||||
return allocator.resetPeakStats(device);
|
||||
}
|
||||
|
||||
void resetAccumulatedStats(DeviceIndex device) {
|
||||
return allocator.resetAccumulatedStats(device);
|
||||
}
|
||||
|
||||
DeviceStats getDeviceStats(DeviceIndex device) {
|
||||
return allocator.getDeviceStats(device);
|
||||
}
|
||||
|
||||
void* raw_alloc(size_t size) {
|
||||
return allocator.raw_alloc(size);
|
||||
}
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/Allocator.h>
|
||||
#include <c10/core/CachingDeviceAllocator.h>
|
||||
#include <c10/xpu/XPUStream.h>
|
||||
|
||||
namespace c10::xpu::XPUCachingAllocator {
|
||||
@ -11,6 +11,13 @@ C10_XPU_API void init(DeviceIndex device_count);
|
||||
|
||||
C10_XPU_API void emptyCache();
|
||||
|
||||
C10_XPU_API void resetPeakStats(DeviceIndex device);
|
||||
|
||||
C10_XPU_API void resetAccumulatedStats(DeviceIndex device);
|
||||
|
||||
C10_XPU_API c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
|
||||
DeviceIndex device);
|
||||
|
||||
C10_XPU_API void* raw_alloc(size_t size);
|
||||
|
||||
C10_XPU_API void raw_delete(void* ptr);
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <cstdint>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
|
||||
@ -43,7 +43,9 @@ IF(NOT MKLDNN_FOUND)
|
||||
endif()
|
||||
endif()
|
||||
if(LINUX)
|
||||
set(ABI_NEUTRAL_FLAGS -fpreview-breaking-changes)
|
||||
set(DNNL_CXX_FLAGS "-DCMAKE_CXX_FLAGS=-fpreview-breaking-changes")
|
||||
else()
|
||||
set(DNNL_CXX_FLAGS "")
|
||||
endif()
|
||||
ExternalProject_Add(xpu_mkldnn_proj
|
||||
SOURCE_DIR ${MKLDNN_ROOT}
|
||||
@ -51,7 +53,7 @@ IF(NOT MKLDNN_FOUND)
|
||||
BUILD_IN_SOURCE 0
|
||||
CMAKE_ARGS -DCMAKE_C_COMPILER=icx
|
||||
-DCMAKE_CXX_COMPILER=${SYCL_CXX_DRIVER}
|
||||
-DCMAKE_CXX_FLAGS=${ABI_NEUTRAL_FLAGS}
|
||||
${DNNL_CXX_FLAGS}
|
||||
-DDNNL_GPU_RUNTIME=SYCL
|
||||
-DDNNL_CPU_RUNTIME=THREADPOOL
|
||||
-DDNNL_BUILD_TESTS=OFF
|
||||
@ -85,13 +87,18 @@ IF(NOT MKLDNN_FOUND)
|
||||
SET(ONEDNN_BUILD_GRAPH ON CACHE BOOL "" FORCE)
|
||||
ENDIF(NOT APPLE AND NOT WIN32 AND NOT BUILD_LITE_INTERPRETER)
|
||||
|
||||
IF(EXISTS "${MKLDNN_ROOT}/include/oneapi/dnnl/dnnl_ukernel.hpp")
|
||||
MESSAGE("-- Will build oneDNN UKERNEL")
|
||||
SET(DNNL_EXPERIMENTAL_UKERNEL ON CACHE BOOL "" FORCE)
|
||||
ENDIF(EXISTS "${MKLDNN_ROOT}/include/oneapi/dnnl/dnnl_ukernel.hpp")
|
||||
|
||||
FIND_PACKAGE(BLAS)
|
||||
FIND_PATH(IDEEP_INCLUDE_DIR ideep.hpp PATHS ${IDEEP_ROOT} PATH_SUFFIXES include)
|
||||
FIND_PATH(MKLDNN_INCLUDE_DIR dnnl.hpp dnnl.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include/oneapi/dnnl)
|
||||
FIND_PATH(MKLDNN_INCLUDE_DIR dnnl.hpp dnnl.h dnnl_ukernel.hpp dnnl_ukernel.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include/oneapi/dnnl)
|
||||
IF(NOT MKLDNN_INCLUDE_DIR)
|
||||
MESSAGE("MKLDNN_INCLUDE_DIR not found")
|
||||
EXECUTE_PROCESS(COMMAND git${CMAKE_EXECUTABLE_SUFFIX} submodule update --init mkl-dnn WORKING_DIRECTORY ${IDEEP_ROOT})
|
||||
FIND_PATH(MKLDNN_INCLUDE_DIR dnnl.hpp dnnl.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include)
|
||||
FIND_PATH(MKLDNN_INCLUDE_DIR dnnl.hpp dnnl.h dnnl_ukernel.hpp dnnl_ukernel.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include)
|
||||
ENDIF(NOT MKLDNN_INCLUDE_DIR)
|
||||
IF(BUILD_ONEDNN_GRAPH)
|
||||
FIND_PATH(LLGA_INCLUDE_DIR dnnl_graph.hpp PATHS ${LLGA_ROOT} PATH_SUFFIXES include/oneapi/dnnl)
|
||||
|
||||
@ -283,9 +283,11 @@ The following ops are currently supported:
|
||||
kron
|
||||
meshgrid
|
||||
narrow
|
||||
nn.functional.unfold
|
||||
ravel
|
||||
select
|
||||
split
|
||||
stack
|
||||
t
|
||||
transpose
|
||||
vsplit
|
||||
@ -294,6 +296,7 @@ The following ops are currently supported:
|
||||
Tensor.expand_as
|
||||
Tensor.reshape
|
||||
Tensor.reshape_as
|
||||
Tensor.unfold
|
||||
Tensor.view
|
||||
|
||||
.. This module needs to be documented. Adding here in the meantime
|
||||
|
||||
@@ -398,3 +398,4 @@ The following utility functions are related to serialization:
.. autofunction:: clear_safe_globals
.. autofunction:: get_safe_globals
.. autoclass:: safe_globals
.. autoclass:: skip_data

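`skip_data` is used as a context manager around `torch.save`. A hedged usage sketch (the exact keyword arguments are not shown here and are assumed to default sensibly):

```python
# Hedged sketch: save a checkpoint's structure without serializing tensor bytes.
import torch

state = {"weight": torch.randn(8, 8)}
with torch.serialization.skip_data():
    torch.save(state, "skeleton.pt")  # tensor metadata is written, storage bytes are skipped
```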
@ -28,7 +28,6 @@ The exporter is designed to be modular and extensible. It is composed of the fol
|
||||
- **FX Graph Extractor**: :class:`FXGraphExtractor` extracts the FX graph from the PyTorch model.
|
||||
- **Fake Mode**: :class:`ONNXFakeContext` is a context manager that enables fake mode for large scale models.
|
||||
- **ONNX Program**: :class:`ONNXProgram` is the output of the exporter that contains the exported ONNX graph and diagnostics.
|
||||
- **ONNX Program Serializer**: :class:`ONNXProgramSerializer` serializes the exported model to a file.
|
||||
- **ONNX Diagnostic Options**: :class:`DiagnosticOptions` has a set of options that control the diagnostics emitted by the exporter.
|
||||
|
||||
Dependencies
|
||||
@ -144,15 +143,9 @@ API Reference
|
||||
.. autoclass:: torch.onnx.ONNXProgram
|
||||
:members:
|
||||
|
||||
.. autoclass:: torch.onnx.ONNXProgramSerializer
|
||||
:members:
|
||||
|
||||
.. autoclass:: torch.onnx.ONNXRuntimeOptions
|
||||
:members:
|
||||
|
||||
.. autoclass:: torch.onnx.InvalidExportOptionsError
|
||||
:members:
|
||||
|
||||
.. autoclass:: torch.onnx.OnnxExporterError
|
||||
:members:
|
||||
|
||||
|
||||
@@ -13,7 +13,6 @@ torch.xpu
    device
    device_count
    device_of
    empty_cache
    get_device_capability
    get_device_name
    get_device_properties
@@ -51,7 +50,25 @@ Streams and events
    Stream


Memory management
-----------------
.. autosummary::
    :toctree: generated
    :nosignatures:

    empty_cache
    max_memory_allocated
    max_memory_reserved
    memory_allocated
    memory_reserved
    memory_stats
    memory_stats_as_nested_dict
    reset_accumulated_memory_stats
    reset_peak_memory_stats


.. This module needs to be documented. Adding here in the meantime
.. for tracking purposes
.. py:module:: torch.xpu.memory
.. py:module:: torch.xpu.random
.. py:module:: torch.xpu.streams
.. py:module:: torch.xpu.streams

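A hedged usage sketch for the memory APIs listed above (it requires a PyTorch build with XPU support and at least one XPU device):

```python
# Query and reset allocator statistics on the first XPU device.
import torch

if torch.xpu.is_available():
    x = torch.randn(1024, 1024, device="xpu")
    print(torch.xpu.memory_allocated())      # bytes currently held by live tensors
    print(torch.xpu.memory_reserved())       # bytes reserved by the caching allocator
    print(torch.xpu.max_memory_allocated())  # peak since the last reset
    torch.xpu.reset_peak_memory_stats()
    del x
    torch.xpu.empty_cache()                  # release unoccupied cached blocks
```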
@@ -1,6 +1,6 @@
# Python dependencies required for development
astunparse
expecttest!=0.2.0
expecttest>=0.2.1
hypothesis
numpy
psutil

@@ -93,7 +93,7 @@ annotations from the example above one would write:
* `CHECK-COUNT-EXACTLY-<num>: <pattern>`
  Scans the input and succeeds when a line containing exactly `NUM` entries of
  `PATTERN` is found.
* `CHECK-DAG: pattern`
* `CHECK-DAG: <pattern>`
  Works similar to the usual `CHECK` pragma, but also matches if there exists a
  way to reorder the CHECK-DAG pragmas to satisfy all patterns.
  For example the following pattern:
@@ -110,3 +110,18 @@ annotations from the example above one would write:
  bar
  end
  ```
* `CHECK-SOURCE-HIGHLIGHTED: <pattern>`
  Check for highlighted source ranges. This is useful when writing tests regarding generated error messages that require source code highlighting.
  For example the following pattern:
  ```
  # CHECK-SOURCE-HIGHLIGHTED: raise Exception("raised exception
  ```
  would match the following input:
  ```
  def method_that_raises() -> torch.Tensor:
      raise Exception("raised exception") # noqa: TRY002
      ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
  builtins.Exception: raised exception
  ```
* `CHECK-REGEX: <pattern>`
  Scans the input until `PATTERN` is matched, accepts RE syntax for std::regex.

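The same checks can be driven programmatically through `torch.testing.FileCheck`; the method names mirror the pragmas above. A hedged sketch:

```python
# Run CHECK-style assertions against a TorchScript graph dump.
import torch
from torch.testing import FileCheck

@torch.jit.script
def f(x):
    return x + 1

graph_str = str(f.graph)
FileCheck().check("aten::add").run(graph_str)                          # like CHECK:
FileCheck().check_count("aten::add", 1, exactly=True).run(graph_str)   # like CHECK-COUNT-EXACTLY-1:
```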
@ -0,0 +1,111 @@
|
||||
# Owner(s): ["oncall: distributed"]
|
||||
import copy
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.amp.grad_scaler import GradScaler, OptState
|
||||
from torch.distributed._composable.fsdp import fully_shard
|
||||
from torch.distributed._tensor import init_device_mesh
|
||||
from torch.distributed.tensor.parallel import (
|
||||
ColwiseParallel,
|
||||
parallelize_module,
|
||||
RowwiseParallel,
|
||||
)
|
||||
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
|
||||
from torch.testing._internal.common_fsdp import FSDPTest, MLP
|
||||
from torch.testing._internal.common_utils import run_tests, skipIfRocm
|
||||
|
||||
|
||||
class TestFullyShardGradientScaler(FSDPTest):
|
||||
@skip_if_lt_x_gpu(4)
|
||||
@skipIfRocm
|
||||
def test_gradient_scaler(self):
|
||||
self.run_subtests(
|
||||
{"has_inf": [True, False], "test_2d": [True, False]},
|
||||
self._test_gradient_scaler,
|
||||
)
|
||||
|
||||
def _test_gradient_scaler(self, has_inf: bool, test_2d: bool):
|
||||
torch.manual_seed(0)
|
||||
model = nn.Sequential(
|
||||
*[nn.Linear(4, 4, device="cuda", bias=False) for _ in range(2)]
|
||||
)
|
||||
for layer in model:
|
||||
fully_shard(layer)
|
||||
fully_shard(model)
|
||||
input = torch.randn([4, 4], device="cuda")
|
||||
|
||||
if test_2d:
|
||||
mesh_2d = init_device_mesh(
|
||||
"cuda", (2, self.world_size // 2), mesh_dim_names=("dp", "tp")
|
||||
)
|
||||
dp_mesh, tp_mesh = mesh_2d["dp"], mesh_2d["tp"]
|
||||
model = nn.Sequential(MLP(2), MLP(2), MLP(2))
|
||||
tp_parallelize_plan = {
|
||||
"0.in_proj": ColwiseParallel(),
|
||||
"0.out_proj": RowwiseParallel(),
|
||||
"1.in_proj": ColwiseParallel(),
|
||||
"1.out_proj": RowwiseParallel(),
|
||||
"2.in_proj": ColwiseParallel(),
|
||||
"2.out_proj": RowwiseParallel(),
|
||||
}
|
||||
model = parallelize_module(
|
||||
model,
|
||||
device_mesh=tp_mesh,
|
||||
parallelize_plan=tp_parallelize_plan,
|
||||
)
|
||||
for module in model:
|
||||
fully_shard(module, mesh=dp_mesh)
|
||||
fully_shard(model, mesh=dp_mesh)
|
||||
input = torch.randn((2,), device="cuda")
|
||||
|
||||
loss = model(input).sum()
|
||||
scaler = GradScaler(init_scale=2.0, enabled=True)
|
||||
opt = torch.optim.Adam(model.parameters(), lr=1e-2)
|
||||
scaler.scale(loss).backward()
|
||||
inv_scale = scaler._scale.double().reciprocal().float()
|
||||
if (
|
||||
has_inf is True
|
||||
and opt.param_groups[0]["params"][0].grad._local_tensor.device.index == 1
|
||||
):
|
||||
opt.param_groups[0]["params"][0].grad._local_tensor[0, 0].fill_(
|
||||
float("inf")
|
||||
)
|
||||
inital_grad = opt.param_groups[0]["params"][0].grad.to_local().clone()
|
||||
|
||||
scaler.unscale_(opt)
|
||||
for found_inf in scaler._per_optimizer_states[id(opt)][
|
||||
"found_inf_per_device"
|
||||
].values():
|
||||
self.assertEqual(found_inf, has_inf)
|
||||
self.assertEqual(
|
||||
scaler._per_optimizer_states[id(opt)]["stage"].value,
|
||||
OptState.UNSCALED.value,
|
||||
)
|
||||
unscaled_grad = opt.param_groups[0]["params"][0].grad.to_local().clone()
|
||||
self.assertEqual(unscaled_grad, inital_grad * inv_scale)
|
||||
initial_scale = scaler.get_scale()
|
||||
initial_state = copy.copy(opt.state)
|
||||
|
||||
scaler.step(opt)
|
||||
steped_state = copy.copy(opt.state)
|
||||
if has_inf:
|
||||
# assert parameters are the same before/after
|
||||
self.assertEqual(steped_state, initial_state)
|
||||
else:
|
||||
# new parameters here if no inf found during .unscale_()
|
||||
self.assertNotEqual(steped_state.items(), initial_state.items())
|
||||
|
||||
scaler.update()
|
||||
updated_scale = scaler.get_scale()
|
||||
if has_inf:
|
||||
# assert scale is updated
|
||||
backoff_factor = scaler.get_backoff_factor()
|
||||
self.assertEqual(updated_scale, initial_scale * backoff_factor)
|
||||
else:
|
||||
# scale is not updated
|
||||
self.assertEqual(updated_scale, initial_scale)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
@ -32,6 +32,7 @@ from torch.testing._internal.common_distributed import (
|
||||
skip_if_rocm,
|
||||
)
|
||||
from torch.testing._internal.common_utils import run_tests
|
||||
from torch.testing._internal.distributed.fake_pg import FakeStore
|
||||
from torch.utils._triton import has_triton
|
||||
from torch.utils.checkpoint import checkpoint
|
||||
|
||||
@ -367,35 +368,28 @@ class ReplicateTest(MultiProcessInductorTestCase):
|
||||
fc.run(code)
|
||||
|
||||
|
||||
class DDP_TP_Test(MultiProcessInductorTestCase):
|
||||
@property
|
||||
def world_size(self) -> int:
|
||||
return min(4, torch.cuda.device_count())
|
||||
class DDP_TP_Test(InductorTestCase):
|
||||
def setUp(self):
|
||||
self.rank = 0
|
||||
self.world_size = 4
|
||||
torch.cuda.set_device("cuda:0")
|
||||
|
||||
def setUp(self) -> None:
|
||||
super().setUp()
|
||||
self._spawn_processes()
|
||||
store = FakeStore()
|
||||
dist.init_process_group(
|
||||
backend="fake",
|
||||
world_size=self.world_size,
|
||||
rank=self.rank,
|
||||
store=store,
|
||||
)
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
try:
|
||||
os.remove(self.file_name)
|
||||
except OSError:
|
||||
pass
|
||||
dist.destroy_process_group()
|
||||
|
||||
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
|
||||
@skip_if_rocm
|
||||
@skip_if_lt_x_gpu(4)
|
||||
def test_ddp_tp(self):
|
||||
torch.cuda.set_device(f"cuda:{self.rank}")
|
||||
dist.init_process_group(
|
||||
backend="nccl",
|
||||
rank=self.rank,
|
||||
world_size=self.world_size,
|
||||
store=dist.FileStore(self.file_name, self.world_size),
|
||||
)
|
||||
model = Net().cuda()
|
||||
compiled_replicate_model = deepcopy(model)
|
||||
ref_model = Net()
|
||||
compiled_replicate_model = deepcopy(ref_model)
|
||||
mesh_2d = init_device_mesh(
|
||||
"cuda", (2, self.world_size // 2), mesh_dim_names=("dp", "tp")
|
||||
)
|
||||
@ -407,8 +401,8 @@ class DDP_TP_Test(MultiProcessInductorTestCase):
|
||||
"fc3": ColwiseParallel(),
|
||||
"fc4": RowwiseParallel(),
|
||||
}
|
||||
model = parallelize_module(model, tp_mesh, parallelize_plan)
|
||||
model = replicate(model, device_mesh=dp_mesh)
|
||||
ref_model = parallelize_module(ref_model, tp_mesh, parallelize_plan)
|
||||
ref_model = replicate(ref_model, device_mesh=dp_mesh)
|
||||
compiled_replicate_model = parallelize_module(
|
||||
compiled_replicate_model, tp_mesh, parallelize_plan
|
||||
)
|
||||
@ -416,15 +410,23 @@ class DDP_TP_Test(MultiProcessInductorTestCase):
|
||||
compiled_replicate_model, device_mesh=dp_mesh
|
||||
)
|
||||
compiled_replicate_model = torch.compile(compiled_replicate_model)
|
||||
data = torch.randn([1, DIM]).cuda()
|
||||
data = torch.randn([1, DIM])
|
||||
with compiled_autograd.enable(compiler_fn()):
|
||||
loss = compiled_replicate_model(data).sum()
|
||||
loss.backward()
|
||||
# TODO: We need "pre-dispatch tracing of backward graph" to make this work:
|
||||
# https://github.com/pytorch/pytorch/issues/127797#issuecomment-2291695474
|
||||
with self.assertRaisesRegex(
|
||||
AssertionError,
|
||||
"Expected ProxyTensor, got <class 'torch.distributed._tensor.api.DTensor'>",
|
||||
):
|
||||
loss.backward()
|
||||
|
||||
loss = model(data).sum()
|
||||
loss.backward()
|
||||
for p1, p2 in zip(model.parameters(), compiled_replicate_model.parameters()):
|
||||
self.assertEqual(p1.grad, p2.grad)
|
||||
# ref_loss = ref_model(data).sum()
|
||||
# ref_loss.backward()
|
||||
# for p1, p2 in zip(
|
||||
# ref_model.parameters(), compiled_replicate_model.parameters()
|
||||
# ):
|
||||
# self.assertEqual(p1.grad, p2.grad)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -299,6 +299,70 @@ class TestDTensorCompile(torch._dynamo.test_case.TestCase):
|
||||
self.assertEqual(res, ref)
|
||||
self.assertEqual(cnt.frame_count, 2)
|
||||
|
||||
def test_dynamo_dtensor_from_local_dynamic_shapes(self):
|
||||
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
|
||||
|
||||
# Case 1: all dims dynamic
|
||||
def fn(x):
|
||||
dt = DTensor.from_local(
|
||||
x,
|
||||
mesh,
|
||||
[Replicate()],
|
||||
run_check=False,
|
||||
shape=x.shape,
|
||||
stride=x.stride(),
|
||||
)
|
||||
return dt.to_local() + 2
|
||||
|
||||
inp = torch.randn(4, 6, requires_grad=True)
|
||||
ref = fn(inp)
|
||||
cnt = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
|
||||
res = torch.compile(fn, backend=cnt, fullgraph=True, dynamic=True)(inp)
|
||||
res.sum().backward()
|
||||
|
||||
self.assertEqual(res, ref)
|
||||
self.assertEqual(cnt.frame_count, 1)
|
||||
|
||||
# Case 2: only sizes are dynamic, strides are static
|
||||
def fn(x):
|
||||
dt = DTensor.from_local(
|
||||
x, mesh, [Replicate()], run_check=False, shape=x.shape, stride=(1,)
|
||||
)
|
||||
return dt.to_local() + 2
|
||||
|
||||
inp = torch.randn(4, requires_grad=True)
|
||||
torch._dynamo.mark_dynamic(inp, 0)
|
||||
ref = fn(inp)
|
||||
cnt = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
|
||||
res = torch.compile(fn, backend=cnt, fullgraph=True)(inp)
|
||||
res.sum().backward()
|
||||
|
||||
self.assertEqual(res, ref)
|
||||
self.assertEqual(cnt.frame_count, 1)
|
||||
|
||||
# Case 3: both sizes and strides have a mix of dynamic and static dims
|
||||
def fn(x):
|
||||
dt = DTensor.from_local(
|
||||
x,
|
||||
mesh,
|
||||
[Replicate()],
|
||||
run_check=False,
|
||||
shape=(x.shape[0], x.shape[1], 2),
|
||||
stride=(x.stride()[0], 2, 1),
|
||||
)
|
||||
return dt.to_local() + 2
|
||||
|
||||
inp = torch.randn(4, 6, 2, requires_grad=True)
|
||||
torch._dynamo.mark_dynamic(inp, 0)
|
||||
torch._dynamo.mark_dynamic(inp, 1)
|
||||
ref = fn(inp)
|
||||
cnt = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
|
||||
res = torch.compile(fn, backend=cnt, fullgraph=True)(inp)
|
||||
res.sum().backward()
|
||||
|
||||
self.assertEqual(res, ref)
|
||||
self.assertEqual(cnt.frame_count, 1)
|
||||
|
||||
def test_dynamo_dtensor_recompile(self):
|
||||
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
|
||||
|
||||
|
||||
@ -7,7 +7,13 @@ from typing import cast, List
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from torch import rand, randn, Tensor
|
||||
from torch.distributed._tensor import DeviceMesh, distribute_tensor, Replicate, Shard
|
||||
from torch.distributed._tensor import (
|
||||
DeviceMesh,
|
||||
distribute_tensor,
|
||||
init_device_mesh,
|
||||
Replicate,
|
||||
Shard,
|
||||
)
|
||||
from torch.distributed._tensor.debug import CommDebugMode
|
||||
from torch.distributed._tensor.ops._view_ops import (
|
||||
Broadcast,
|
||||
@ -29,6 +35,10 @@ from torch.utils import _pytree as pytree
|
||||
|
||||
|
||||
class TestViewOps(DTensorTestBase):
|
||||
@property
|
||||
def world_size(self) -> int:
|
||||
return 6
|
||||
|
||||
def test_view_groups(self):
|
||||
self.assertEqual(
|
||||
view_groups([2, 3], [3, 2]),
|
||||
@ -106,8 +116,8 @@ class TestViewOps(DTensorTestBase):
|
||||
view_groups([1, 1, 3, 2, 1, 1], [6, 1, 1, 1]),
|
||||
(
|
||||
Flatten((InputDim(2), InputDim(3))),
|
||||
Singleton(),
|
||||
Singleton(),
|
||||
InputDim(4),
|
||||
InputDim(5),
|
||||
Singleton(),
|
||||
),
|
||||
)
|
||||
@ -116,7 +126,7 @@ class TestViewOps(DTensorTestBase):
|
||||
(
|
||||
Split(InputDim(2), (3, 4), 0),
|
||||
Split(InputDim(2), (3, 4), 1),
|
||||
Singleton(),
|
||||
InputDim(3),
|
||||
Flatten((InputDim(6), InputDim(7))),
|
||||
),
|
||||
)
|
||||
@ -125,10 +135,6 @@ class TestViewOps(DTensorTestBase):
|
||||
(InputDim(0), InputDim(1), InputDim(2)),
|
||||
)
|
||||
|
||||
@property
|
||||
def world_size(self) -> int:
|
||||
return 6
|
||||
|
||||
def call_dt_test(self, op, args, kwargs, device_mesh: DeviceMesh):
|
||||
dim_map = dim_maps[op]
|
||||
rules = dim_map(*args, **kwargs)
|
||||
@ -429,7 +435,7 @@ class TestViewOps(DTensorTestBase):
|
||||
self.dimmap_test(
|
||||
Tensor.view,
|
||||
(randn(1, 1, 42, 1, 24, 1), -1),
|
||||
(Flatten((InputDim(2), InputDim(4))),),
|
||||
(Flatten((InputDim(2), InputDim(input_dim=3), InputDim(4))),),
|
||||
)
|
||||
|
||||
self.dimmap_test(
|
||||
@ -525,6 +531,46 @@ class TestViewOps(DTensorTestBase):
|
||||
)
|
||||
self.assertEqual(out, out_dt.full_tensor())
|
||||
|
||||
@with_comms
|
||||
def test_dtensor_view_op_uneven(self):
|
||||
"""
|
||||
Test two uneven cases for view op:
|
||||
1) the sharded tensor dim is 1 so that only the first rank has an non-empty shard.
|
||||
2) the sharded tensor dim is uneven such that some ranks have full shards,
|
||||
smaller non-empty shards, and empty shards.
|
||||
"""
|
||||
dim0_sizes = [1, self.world_size + 1]
|
||||
for dim0_size in dim0_sizes:
|
||||
p = torch.randn(dim0_size, 2, 2, 2)
|
||||
mesh = init_device_mesh(self.device_type, (self.world_size,))
|
||||
dtensor = distribute_tensor(p, mesh, [Shard(0)])
|
||||
|
||||
with CommDebugMode() as comm_mode:
|
||||
view = dtensor.view(dim0_size, 2, 4)
|
||||
self.assertEqual(len(comm_mode.get_comm_counts()), 0)
|
||||
# when no communication happens, the data pointer should be the same.
|
||||
self.assertEqual(
|
||||
view.to_local().data_ptr(), dtensor.to_local().data_ptr()
|
||||
)
|
||||
|
||||
view = dtensor.view(dim0_size, 4, 2)
|
||||
self.assertEqual(
|
||||
view.to_local().data_ptr(), dtensor.to_local().data_ptr()
|
||||
)
|
||||
self.assertEqual(len(comm_mode.get_comm_counts()), 0)
|
||||
|
||||
view = dtensor.view(dim0_size, 8)
|
||||
self.assertEqual(
|
||||
view.to_local().data_ptr(), dtensor.to_local().data_ptr()
|
||||
)
|
||||
self.assertEqual(len(comm_mode.get_comm_counts()), 0)
|
||||
|
||||
view = dtensor.view(dtensor.shape)
|
||||
self.assertEqual(
|
||||
view.to_local().data_ptr(), dtensor.to_local().data_ptr()
|
||||
)
|
||||
self.assertEqual(len(comm_mode.get_comm_counts()), 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -34,7 +34,7 @@ class FsdpModelStateCheckpoint(DTensorTestBase):
|
||||
"model": model.state_dict(),
|
||||
}
|
||||
|
||||
dist_cp.save_state_dict(
|
||||
dist_cp.save(
|
||||
state_dict=state_dict,
|
||||
storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
|
||||
planner=DefaultSavePlanner(),
|
||||
@ -55,7 +55,7 @@ class FsdpModelStateCheckpoint(DTensorTestBase):
|
||||
"model": model_2.state_dict(),
|
||||
}
|
||||
|
||||
dist_cp.load_state_dict(
|
||||
dist_cp.load(
|
||||
state_dict=state_dict,
|
||||
storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
|
||||
planner=DefaultLoadPlanner(),
|
||||
|
||||
@ -40,7 +40,7 @@ class TestFsdpTpCheckpointConversion(DTensorTestBase):
|
||||
fsdp_state_dict = fsdp_model.state_dict()
|
||||
|
||||
# save fsdp_state_dict to storage
|
||||
dist_cp.save_state_dict(
|
||||
dist_cp.save(
|
||||
state_dict=fsdp_state_dict,
|
||||
storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
|
||||
)
|
||||
|
||||
@ -94,7 +94,7 @@ class TestHSDPCheckpoint(DTensorTestBase):
|
||||
state_dict = {"model": model.state_dict()}
|
||||
state_dict_to_save = deepcopy(state_dict)
|
||||
|
||||
dist_cp.save_state_dict(
|
||||
dist_cp.save(
|
||||
state_dict=state_dict_to_save,
|
||||
storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
|
||||
planner=DefaultSavePlanner(),
|
||||
@ -113,7 +113,7 @@ class TestHSDPCheckpoint(DTensorTestBase):
|
||||
self.assertEqual(v1.placements, v2.placements)
|
||||
self.assertNotEqual(v1.to_local(), v2.to_local())
|
||||
|
||||
dist_cp.load_state_dict(
|
||||
dist_cp.load(
|
||||
state_dict=state_dict_to_save,
|
||||
storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
|
||||
planner=DefaultLoadPlanner(),
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
#
|
||||
# This source code is licensed under the BSD-style license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
import asyncio
|
||||
import ctypes
|
||||
import multiprocessing
|
||||
import os
|
||||
@ -362,6 +363,9 @@ if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS):
|
||||
self.assertTrue(pc._stderr_tail.stopped())
|
||||
self.assertTrue(pc._stdout_tail.stopped())
|
||||
|
||||
def test_pcontext_wait_on_a_child_thread(self):
|
||||
asyncio.run(asyncio.to_thread(self.test_pcontext_wait))
|
||||
|
||||
def test_multiprocess_context_close(self):
|
||||
pc = start_processes(
|
||||
name="sleep",
|
||||
|
||||
@ -25,6 +25,7 @@ from torch.distributed.elastic.rendezvous.c10d_rendezvous_backend import (
|
||||
C10dRendezvousBackend,
|
||||
create_backend,
|
||||
)
|
||||
from torch.distributed.elastic.utils.distributed import get_free_port
|
||||
|
||||
|
||||
class TCPStoreBackendTest(TestCase, RendezvousBackendTestMixin):
|
||||
@ -69,9 +70,11 @@ class CreateBackendTest(TestCase):
|
||||
# For testing, the default parameters used are for tcp. If a test
|
||||
# uses parameters for file store, we set the self._params to
|
||||
# self._params_filestore.
|
||||
|
||||
port = get_free_port()
|
||||
self._params = RendezvousParameters(
|
||||
backend="dummy_backend",
|
||||
endpoint="localhost:29300",
|
||||
endpoint=f"localhost:{port}",
|
||||
run_id="dummy_run_id",
|
||||
min_nodes=1,
|
||||
max_nodes=1,
|
||||
@ -95,7 +98,7 @@ class CreateBackendTest(TestCase):
|
||||
self._expected_temp_dir = tempfile.gettempdir()
|
||||
|
||||
self._expected_endpoint_host = "localhost"
|
||||
self._expected_endpoint_port = 29300
|
||||
self._expected_endpoint_port = port
|
||||
self._expected_store_type = TCPStore
|
||||
self._expected_read_timeout = timedelta(seconds=10)
|
||||
|
||||
@ -173,11 +176,14 @@ class CreateBackendTest(TestCase):
|
||||
def test_create_backend_returns_backend_if_endpoint_port_is_not_specified(
|
||||
self,
|
||||
) -> None:
|
||||
self._params.endpoint = self._expected_endpoint_host
|
||||
# patch default port and pass endpoint with no port specified
|
||||
with mock.patch(
|
||||
"torch.distributed.elastic.rendezvous.c10d_rendezvous_backend.DEFAULT_PORT",
|
||||
self._expected_endpoint_port,
|
||||
):
|
||||
self._params.endpoint = self._expected_endpoint_host
|
||||
|
||||
self._expected_endpoint_port = 29400
|
||||
|
||||
self._assert_create_backend_returns_backend()
|
||||
self._assert_create_backend_returns_backend()
|
||||
|
||||
def test_create_backend_returns_backend_if_endpoint_file_is_not_specified(
|
||||
self,
|
||||
|
||||
@ -1597,6 +1597,23 @@ class CreateHandlerTest(TestCase):
|
||||
create_handler(self._store, self._backend, self._params)
|
||||
record_mock.assert_called_once()
|
||||
|
||||
def test_create_handler_rdzv_local_addr(self) -> None:
|
||||
params = RendezvousParameters(
|
||||
backend=self._backend.name,
|
||||
endpoint="dummy_endpoint",
|
||||
run_id="dummy_run_id",
|
||||
min_nodes=1,
|
||||
max_nodes=1,
|
||||
join_timeout="50",
|
||||
last_call_timeout="60",
|
||||
close_timeout="70",
|
||||
local_addr="127.0.0.2",
|
||||
)
|
||||
store = HashStore()
|
||||
handler = create_handler(store, self._backend, params)
|
||||
rdzv_info = handler.next_rendezvous()
|
||||
self.assertEqual(rdzv_info.bootstrap_store_info.master_addr, "127.0.0.2")
|
||||
|
||||
|
||||
def _ignore_exception(exception_type: Exception, fn: Callable):
|
||||
try:
|
||||
@ -1656,7 +1673,7 @@ class IntegrationTest(TestCase):
|
||||
"min_nodes": 2,
|
||||
"max_nodes": 2,
|
||||
"join_timeout": "5",
|
||||
"local_addr": f"address_{len(self._handlers)}",
|
||||
"local_addr": f"127.0.0.{len(self._handlers)}",
|
||||
}
|
||||
params.update(**kwargs)
|
||||
|
||||
@ -1714,7 +1731,7 @@ class IntegrationTest(TestCase):
|
||||
state_and_token = self._backend.get_state()
|
||||
state = pickle.loads(state_and_token[0])
|
||||
addresses = [node.addr for node in state.redundancy_list]
|
||||
self.assertListEqual(addresses, ["address_2"])
|
||||
self.assertListEqual(addresses, ["127.0.0.2"])
|
||||
|
||||
def test_redundancy_transition_to_wait_list_then_join_rendezvous(self) -> None:
|
||||
handler1 = self._create_handler(
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
# This source code is licensed under the BSD-style license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import signal
|
||||
import time
|
||||
import unittest
|
||||
@ -37,7 +38,7 @@ if not (IS_WINDOWS or IS_MACOS):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.max_interval = 0.01
|
||||
self.file_path = "/tmp/test_file_path_" + str(uuid.uuid4())
|
||||
self.file_path = f"/tmp/test_file_path_{os.getpid()}_{uuid.uuid4()}"
|
||||
self.server = timer.FileTimerServer(
|
||||
self.file_path, "test", self.max_interval
|
||||
)
|
||||
@ -204,7 +205,7 @@ if not (IS_WINDOWS or IS_MACOS):
|
||||
class FileTimerServerTest(TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.file_path = "/tmp/test_file_path_" + str(uuid.uuid4())
|
||||
self.file_path = f"/tmp/test_file_path_{os.getpid()}_{uuid.uuid4()}"
|
||||
self.max_interval = 0.01
|
||||
self.server = timer.FileTimerServer(
|
||||
self.file_path, "test", self.max_interval
|
||||
|
||||
@ -914,9 +914,6 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
|
||||
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
|
||||
torch._dynamo.utils.clear_compilation_metrics()
|
||||
|
||||
# TODO: This should be possible to do inside the function, but
|
||||
device = f"cuda:{self.rank}"
|
||||
|
||||
@torch.compile()
|
||||
def f(x, y):
|
||||
zx = x.shape
|
||||
@ -940,6 +937,28 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
|
||||
for r in res[1:]:
|
||||
self.assertEqual(res[0], r)
|
||||
|
||||
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
|
||||
@config.patch(enable_compiler_collectives=True)
|
||||
def test_compiler_collectives_missing_source(self):
|
||||
with _dynamo_dist_per_rank_init(self.rank, self.world_size):
|
||||
torch._dynamo.utils.clear_compilation_metrics()
|
||||
|
||||
@torch.compile()
|
||||
def f(rank, xs):
|
||||
return xs[rank].sum()
|
||||
|
||||
xs = []
|
||||
for _ in range(self.world_size):
|
||||
xs.append(torch.randn(10, device=self.rank))
|
||||
|
||||
f(self.rank, xs)
|
||||
|
||||
metrics = torch._dynamo.utils.get_compilation_metrics()
|
||||
res = [None] * self.world_size
|
||||
torch.distributed.all_gather_object(res, len(metrics))
|
||||
for r in res[1:]:
|
||||
self.assertEqual(res[0], r)
|
||||
|
||||
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
|
||||
@patch.object(torch._inductor.config, "fx_graph_cache", False)
|
||||
@patch.object(torch._inductor.config, "fx_graph_remote_cache", False)
|
||||
|
||||
@ -1013,22 +1013,17 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
|
||||
return ar
|
||||
|
||||
input = torch.ones(4, 4, device="cuda", requires_grad=True)
|
||||
# TODO implement backwards
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError,
|
||||
"element 0 of tensors does not require grad and does not have a grad_fn",
|
||||
):
|
||||
compiled = torch.compile(
|
||||
func, backend="aot_eager"
|
||||
) # inductor bug with single-op allreduce graph
|
||||
out = compiled(input)
|
||||
out.sum().backward()
|
||||
compiled = torch.compile(
|
||||
func, backend="aot_eager"
|
||||
) # inductor bug with single-op allreduce graph
|
||||
out = compiled(input)
|
||||
out.sum().backward()
|
||||
|
||||
correct_input = input.clone().detach().requires_grad_()
|
||||
correct = func(correct_input)
|
||||
correct.sum().backward()
|
||||
self.assertTrue(same(out, correct))
|
||||
self.assertTrue(same(input.grad, correct_input.grad))
|
||||
correct_input = input.clone().detach().requires_grad_()
|
||||
correct = func(correct_input)
|
||||
correct.sum().backward()
|
||||
self.assertTrue(same(out, correct))
|
||||
self.assertTrue(same(input.grad, correct_input.grad))
|
||||
|
||||
def test_meta(self):
|
||||
x = torch.rand((2, 3, 4), device="meta")
|
||||
|
||||
@ -107,7 +107,7 @@ due to:
|
||||
Traceback (most recent call last):
|
||||
File "test_exc.py", line N, in f
|
||||
raise NotImplementedError
|
||||
torch._dynamo.exc.InternalTorchDynamoError:
|
||||
torch._dynamo.exc.InternalTorchDynamoError: NotImplementedError:
|
||||
|
||||
from user code:
|
||||
File "test_exc.py", line N, in fn001
|
||||
|
||||
@ -239,6 +239,22 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
|
||||
v = v + x
|
||||
return v
|
||||
|
||||
def test_itertools_reconstruct(self):
|
||||
def fn(a):
|
||||
it1 = itertools.repeat(1)
|
||||
it2 = itertools.count(2)
|
||||
for _ in range(3):
|
||||
a += next(it1)
|
||||
a += next(it2)
|
||||
return it1, it2, a
|
||||
|
||||
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
|
||||
i1, i2, a = fn(torch.ones(3, 3))
|
||||
it1, it2, b = opt_fn(torch.ones(3, 3))
|
||||
self.assertEqual(next(i1), next(it1))
|
||||
self.assertEqual(next(i2), next(it2))
|
||||
self.assertEqual(a, b)
|
||||
|
||||
    @make_test
    def test_obj_eq(a, b):
        v = a + b
@@ -507,8 +523,7 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
        empty = collections.deque()
        d.extend(empty)

        # dynamo same() util doesn't support deque so just return a list
        return list(d)
        return d

    @make_test
    def test_slice1(a):
@@ -3115,6 +3130,199 @@ class GraphModule(torch.nn.Module):
            fn(arr, np.s_[..., 1], np.array([3, 3])), np.array([[1, 3], [2, 3]])
        )

    def test_map_return(self):
        def fn(a, b):
            return map(lambda x: x + 1, [a, b])

        opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
        m = opt_fn(torch.randn(3, 3), torch.randn(3, 3))
        self.assertIsInstance(m, map)

    @make_test
    def test_map_max(a, b):
        return max(map(lambda x: x.sum(), [a, b]))

    # max(map(...)) graph breaks
    @unittest.expectedFailure
    @make_test
    def test_map_max_const(a):
        return max(map(lambda x: x, [1, 2, 3])), a + 1

    @make_test
    def test_map_list(a, b):
        return list(map(lambda x: x + 1, [a, b]))

    @make_test
    def test_map_tuple(a, b):
        return tuple(map(lambda x: x + 1, [a, b]))

    @make_test
    def test_map_iter(a, b):
        it = iter(map(lambda x: x + 1, [a, b]))
        return next(it)

    @make_test
    def test_map_zip_dict(a):
        d = dict(
            zip(
                map(lambda x: x + 1, [0, 1, 2]),
                [map(lambda x: x - 1, [y]) for y in [3, 4, 5]],
            )
        )
        return list(d[3])[0], a + 1  # noqa: RUF015

    @make_test
    def test_map_dict_fromkeys(a):
        return dict.fromkeys(map(lambda x: x + 1, [0, 1])), a + 1

    @make_test
    def test_map_set(a):
        return set(map(lambda x: x + 1, [0, 1])), a + 1

    # test_map_sum defined earlier

    @make_test
    def test_map_reduce(a, b):
        return functools.reduce(lambda x, y: x + y, map(lambda x: x + 1, [a, b]))

    @make_test
    def test_map_sorted(a):
        return sorted(map(lambda x: x + 1, [0, 4, 3, 1, 2])), a + 1

    @make_test
    def test_map_list_extend(a, b, c):
        l = [a]
        l.extend(map(lambda x: x + 1, [b, c]))
        return l

    @make_test
    def test_map_list_slice_assign(a, b, c, d, e):
        l = [a, b, c]
        l[1:2] = map(lambda x: x + 1, [d, e])
        return l

    @make_test
    def test_map_deque_extendleft(a, b, c):
        d = collections.deque([a])
        d.extendleft(map(lambda x: x + 1, [b, c]))
        return d

    @make_test
    def test_map_str_join(a):
        return "".join(map(lambda x: x, ["a", "b", "c"])), a + 1

    def test_map_with_graph_break(self):
        def f(a):
            a += 1

            def g(x):
                nonlocal a
                a += 1
                return x + 1

            m = map(g, [1, 2, 3, 4, 5])
            a += next(m)  # won't graph break
            torch._dynamo.graph_break()
            a += next(m)  # will graph break
            return a

        cnts = torch._dynamo.testing.CompileCounter()
        opt_f = torch.compile(f, backend=cnts)
        self.assertEqual(f(torch.ones(3, 3)), opt_f(torch.ones(3, 3)))
        self.assertEqual(cnts.frame_count, 3)

    def test_map_reconstruct(self):
        def fn(a):
            return map(lambda x: x[0] + x[1], zip([1, 2, 3], [1, 2, 3])), a + 1

        opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
        m = opt_fn(torch.ones(3, 3))[0]
        self.assertIsInstance(m, map)
        self.assertEqual(list(m), list(fn(torch.ones(3, 3))[0]))

    def test_zip_reconstruct(self):
        def fn(a):
            return zip([1, 2, 3], map(lambda x: x + 1, [1, 2, 3])), a + 1

        opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
        m = opt_fn(torch.ones(3, 3))[0]
        self.assertIsInstance(m, zip)
        self.assertEqual(list(m), list(fn(torch.ones(3, 3))[0]))

    @make_test
    def test_map_partial_unpack(a, b):
        y = 1

        def f(x):
            nonlocal y
            y += 1
            return x

        l = list(zip([a, b], map(f, [1, 2, 3, 4])))
        return a + y

    @make_test
    def test_map_call_function_ex(a, b):
        def f(x, y):
            return x + y

        return f(*map(lambda x: x + 1, [a, b]))

    @make_test
    def test_map_unpack_twice(a, b):
        m = map(lambda x: x + 1, [a, b])
        l1 = list(m)
        l2 = list(m)
        return l1, l2

    @make_test
    def test_enumerate(a, b):
        return list(enumerate([a, b], start=1)), a + 1

    @make_test
    def test_map_enumerate(a, b):
        return list(enumerate(map(lambda x: x + 1, [a, b]), start=1)), a + 1

    @make_test
    def test_map_infinite(a, b):
        return list(map(lambda x, y: x + y, [a, b], itertools.count(3)))

    @make_test
    def test_map_unpack_vars(a, b):
        x, y = map(lambda x: x + 1, [a, b])
        return x + y

    def test_enumerate_custom(self):
        class MyClass:
            def __iter__(self):
                self.a = 1
                return self

            def __next__(self):
                if self.a > 3:
                    raise StopIteration
                self.a += 1
                return self.a

        def fn(x):
            for i, it in enumerate(MyClass()):
                x += i + it
            return x

        opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
        self.assertEqual(fn(torch.ones(3, 3)), opt_fn(torch.ones(3, 3)))

    def test_enumerate_reconstruct(self):
        def fn(a, b):
            return enumerate([a, b], start=1)

        opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
        inps = (torch.randn(3, 3), torch.randn(3, 3))
        it1 = fn(*inps)
        it2 = opt_fn(*inps)
        self.assertIsInstance(it2, enumerate)
        self.assertEqual(list(it1), list(it2))


def udf_mul(x, y):
    return x * y
@@ -3394,6 +3602,71 @@ class DefaultsTests(torch._dynamo.test_case.TestCase):
        ref = opt_fn(x)
        self.assertEqual(ref, res)

    def test_frozenset_construction(self):
        def fn(x):
            s = frozenset({x})
            t = frozenset(s)
            return len(t)

        opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
        x = torch.randn(4)
        res = fn(x)
        ref = opt_fn(x)
        self.assertEqual(ref, res)

    def test_frozenset_reconstruction(self):
        d = {}
        f = frozenset()
        d[f] = torch.randn(4)

        def fn(x):
            k = frozenset()
            torch._dynamo.graph_break()
            return d[k] * x

        opt_fn = torch.compile(fn, backend="eager")
        x = torch.randn(4)
        res = fn(x)
        ref = opt_fn(x)
        self.assertEqual(ref, res)

    def test_frozenset_illegal_call_method(self):
        def fn_add():
            s = frozenset((1, 2, 3))
            s.add({2})
            return len(s)

        def fn_pop():
            s = frozenset((1, 2, 3))
            s.pop()
            return len(s)

        def fn_update():
            s = frozenset((1, 2, 3))
            s.update({4, 5, 6})
            return len(s)

        def fn_remove():
            s = frozenset((1, 2, 3))
            s.remove(2)
            return len(s)

        def fn_discard():
            s = frozenset((1, 2, 3))
            s.discard(2)
            return len(s)

        def fn_clear():
            s = frozenset((1, 2, 3))
            s.clear()
            return len(s)

        for fn in [fn_add, fn_pop, fn_update, fn_remove, fn_discard, fn_clear]:
            torch._dynamo.reset()
            opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
            with self.assertRaises(torch._dynamo.exc.InternalTorchDynamoError):
                opt_fn()

    def test_is_tensor_tensor(self):
        def fn(x, y):
            if x is y:
@@ -3605,10 +3878,16 @@ class DefaultsTests(torch._dynamo.test_case.TestCase):
        with self.assertRaisesRegex(torch._dynamo.exc.UserError, "zip()"):
            nopython_fn(x, ys[:1], zs)

        with self.assertRaisesRegex(torch._dynamo.exc.UserError, "zip()"):
            nopython_fn(x, ys, zs[:1])

        # Should cause fallback if allow graph break
        with self.assertRaisesRegex(ValueError, "zip()"):
            opt_fn(x, ys[:1], zs)

        with self.assertRaisesRegex(ValueError, "zip()"):
            opt_fn(x, ys, zs[:1])

    def test_fn_with_attr(self):
        def fn(x):
            if fn.pred:

@@ -308,6 +308,19 @@ class MiscTests(torch._inductor.test_case.TestCase):
            "Graph break for an optree C/C++ function optree._C.PyCapsule.flatten. Consider using torch.utils._pytree - https://github.com/pytorch/pytorch/blob/main/torch/utils/_pytree.py",
        )

    def test_scalar_device_movement(self):
        if not torch._dynamo.config.assume_static_by_default:
            self.skipTest("Doesn't work with symints")

        def add_fn(a, b, out):
            res = torch.add(a, b, out=out)
            return res

        res = add_fn(2, 3, torch.tensor(0.0))
        add_fn = torch.compile(add_fn, backend="eager", fullgraph=True)
        res_compiled = add_fn(2, 3, torch.tensor(0.0))
        self.assertEqual(res, res_compiled)

    @skipIfNNModuleInlined("fails internal CI")
    @unittest.skipIf(IS_FBCODE, "inline cpp_extension doesn't work in fbcode")
    def test_cpp_extension_recommends_custom_ops(self):
@@ -3367,6 +3380,21 @@ utils_device.CURRENT_DEVICE == None""".split(
        self.assertTrue(same(obj41.y, obj42.y))
        self.assertEqual(cnts.frame_count, 1)

    def test_thread_local_setattr(self):
        from threading import local

        loc = local()

        @torch.compile(fullgraph=True)
        def fn(x, l):
            l.x = x
            return x + 1

        x = torch.ones(2, 2)
        fn(x, loc)

        self.assertTrue(loc.x is x)

    def test_user_defined_class_name(self):
        class MyClassFoo:
            pass

@@ -47,6 +47,21 @@ class TestHFPretrained(torch._dynamo.test_case.TestCase):
        res = opt_fn(x, tmp)
        self.assertTrue(same(ref, res))

    @maybe_skip
    def test_pretrained_non_const_attr(self):
        def fn(a, tmp):
            if tmp.pruned_heads:
                return a + 1
            else:
                return a - 1

        x = torch.randn(2)
        tmp = PretrainedConfig()
        ref = fn(x, tmp)
        opt_fn = torch.compile(backend="eager", fullgraph=True)(fn)
        res = opt_fn(x, tmp)
        self.assertTrue(same(ref, res))


class TestModelOutput(torch._dynamo.test_case.TestCase):
    @maybe_skip

@@ -1,5 +1,4 @@
# Owner(s): ["module: dynamo"]
from unittest.mock import patch

import torch
import torch._dynamo.test_case
@@ -14,6 +13,17 @@ from torch.utils._device import DeviceContext
from torch.utils._python_dispatch import TorchDispatchMode


class TestMode(BaseTorchFunctionMode):
    def __torch_function__(self, func, types, args, kwargs=None):
        if not kwargs:
            kwargs = {}

        if func == torch.add:
            return torch.zeros(2, 2)

        return super().__torch_function__(func, types, args, kwargs)


class TorchDispatchModeTests(torch._dynamo.test_case.TestCase):
    @classmethod
    def setUpClass(cls):
@@ -57,9 +67,11 @@ class TorchFunctionModeTests(torch._dynamo.test_case.TestCase):

    def setUp(self):
        torch.set_default_device(None)
        torch._dynamo.reset()

    def tearDown(self):
        torch.set_default_device(None)
        torch._dynamo.reset()

    def _run_torch_function_mode_guard_test(self):
        class TestMode1(BaseTorchFunctionMode):
@@ -94,70 +106,6 @@ class TorchFunctionModeTests(torch._dynamo.test_case.TestCase):
            fn(inp)
        self.assertEqual(cnt.frame_count, 4)

    def _run_ignored_mode_types_test(self):
        class IgnoredMode(BaseTorchFunctionMode):
            pass

        cnt = torch._dynamo.testing.CompileCounter()

        @torch.compile(backend=cnt.__call__, fullgraph=True)
        def fn(x):
            return x + 1

        inp = torch.ones(2, 2)

        with patch(
            "torch._dynamo.variables.torch_function.IGNORED_MODES", {IgnoredMode}
        ):
            # initial compile
            fn(inp)

            # no recompile, mode ignored
            # note: the ref stack is length 0, and the stack we are checking against has length 2
            # we want to check both ref stack len > runtime stack, and ref stack len < runtime stack
            with IgnoredMode(), IgnoredMode():
                fn(inp)

            self.assertEqual(cnt.frame_count, 1)

            # recompile due to new mode on the stack
            with BaseTorchFunctionMode(), BaseTorchFunctionMode(), BaseTorchFunctionMode():
                fn(inp)

            self.assertEqual(cnt.frame_count, 2)

            # recompile
            # tests both ref stack len > runtime stack len for the above guard check
            # and ref stack len < runtime stack len for the initial zero mode case
            with BaseTorchFunctionMode(), IgnoredMode(), BaseTorchFunctionMode():
                fn(inp)

            self.assertEqual(cnt.frame_count, 3)

            # no recompile
            with IgnoredMode(), IgnoredMode(), BaseTorchFunctionMode(), BaseTorchFunctionMode():
                fn(inp)

            self.assertEqual(cnt.frame_count, 3)

            # This is tricky, basically the ignored modes are baked into the guard
            # IgnoredMode will be ignored forever by that guard.
            # This is okay since we don't expect to be modifying IGNORED_MODES
            # in the middle of execution except for the purposes of testing.
            torch._dynamo.reset()

            with IgnoredMode():
                fn(inp)

            self.assertEqual(cnt.frame_count, 4)

    @torch._dynamo.config.patch("enable_cpp_guard_manager", False)
    def test_torch_function_mode_guards_ignored_types_py(self):
        self._run_ignored_mode_types_test()

    def test_torch_function_mode_guards_ignored_types_cpp(self):
        self._run_ignored_mode_types_test()

    @torch._dynamo.config.patch("enable_cpp_guard_manager", False)
    def test_torch_function_mode_guards_py(self):
        self._run_torch_function_mode_guard_test()
@@ -324,6 +272,218 @@ class TorchFunctionModeTests(torch._dynamo.test_case.TestCase):
            fn(inp)
        self.assertEqual(cnt.frame_count, 2)

    def test_nested_torch_function_mode(self):
        mode_1_called = False
        mode_2_called = False

        def reset_state():
            nonlocal mode_1_called
            nonlocal mode_2_called
            mode_1_called = False
            mode_2_called = False

        ones = torch.ones(2, 2)
        zeros = torch.zeros(2, 2)

        class TestMode1(BaseTorchFunctionMode):
            def __torch_function__(self, func, types, args, kwargs=None):
                if not kwargs:
                    kwargs = {}

                nonlocal mode_1_called

                mode_1_called = True

                if func == torch.add:
                    return zeros

                return super().__torch_function__(func, types, args, kwargs)

        class TestMode2(BaseTorchFunctionMode):
            def __torch_function__(self, func, types, args, kwargs=None):
                if not kwargs:
                    kwargs = {}

                nonlocal mode_2_called

                mode_2_called = True

                if func == torch.mul:
                    return ones

                return super().__torch_function__(func, types, args, kwargs)

        def fn(x):
            return torch.add(x, 3)

        def fn_2(x):
            return torch.mul(x, 3) + torch.add(x, 3)

        inp = torch.ones(2, 2) + 1

        for fn_i in [fn, fn_2]:
            fn_opt = torch.compile(fn_i, fullgraph=True)
            with TestMode1(), TestMode2():
                expected = fn_i(inp), mode_1_called, mode_2_called
                reset_state()
                actual = fn_opt(inp), mode_1_called, mode_2_called
                reset_state()

            self.assertEqual(expected, actual)

    def test_torch_function_mode_disable(self):
        class TestSubclass(torch.Tensor):
            @classmethod
            def __torch_function__(cls, func, types, args, kwargs=None):
                if not kwargs:
                    kwargs = {}
                if func == torch.add:
                    return torch.ones(2, 2)
                return super().__torch_function__(func, types, args, kwargs)

        class TestMode(BaseTorchFunctionMode):
            def __torch_function__(self, func, types, args, kwargs=None):
                if not kwargs:
                    kwargs = {}

                if func == torch.add:
                    return torch.zeros(2, 2)

                return super().__torch_function__(func, types, args, kwargs)

        def fn(x):
            return torch.add(x, 3)

        inp = (torch.ones(2, 2) + 1).as_subclass(TestSubclass)

        fn_opt = torch.compile(fn, fullgraph=True)
        with TestMode(), torch._dynamo.config.patch(
            "traceable_tensor_subclasses", {TestSubclass}
        ):
            with torch._C.DisableTorchFunctionSubclass():
                expected = fn(inp)
                actual = fn_opt(inp)

            self.assertEqual(expected, actual)

            with torch._C.DisableTorchFunction():
                expected = fn(inp)
                actual = fn_opt(inp)

            self.assertEqual(expected, actual)

    def test_torch_function_mode_highest_priority(self):
        class TestSubclass(torch.Tensor):
            @classmethod
            def __torch_function__(cls, func, types, args, kwargs=None):
                if not kwargs:
                    kwargs = {}
                if func == torch.add:
                    return torch.ones(2, 2)
                return super().__torch_function__(func, types, args, kwargs)

        def fn(x):
            return torch.add(x, 3)

        inp = (torch.ones(2, 2) + 1).as_subclass(TestSubclass)

        fn_opt = torch.compile(fn, fullgraph=True)
        with TestMode(), torch._dynamo.config.patch(
            "traceable_tensor_subclasses", {TestSubclass}
        ):
            expected = fn(inp)
            actual = fn_opt(inp)

        self.assertEqual(expected, actual)

    def test_torch_function_mode_enter_exit(self):
        def fn(x, y):
            with TestMode():
                o = torch.add(x, 3)

            return torch.add(o, y)

        inp = (torch.ones(2, 2) + 1, torch.ones(2, 2) + 2)
        fn_opt = torch.compile(fn, fullgraph=True)

        expected = fn(*inp)
        actual = fn_opt(*inp)

        self.assertEqual(expected, actual)

    def test_torch_function_mode_graph_break(self):
        def fn(x, y):
            with TestMode():
                torch._dynamo.graph_break()
                o = torch.add(x, 3)

            return torch.add(o, y)

        inp = (torch.ones(2, 2) + 1, torch.ones(2, 2) + 2)
        fn_opt = torch.compile(fn)

        expected = fn(*inp)
        actual = fn_opt(*inp)

        self.assertEqual(expected, actual)

    def test_torch_function_mode_and_pop_graph_break(self):
        def fn(x, y):
            with TestMode():
                z = _pop_torch_function_stack()
                torch._dynamo.graph_break()
                _push_on_torch_function_stack(z)
                o = torch.add(x, 3)

            return torch.add(o, y)

        inp = (torch.ones(2, 2) + 1, torch.ones(2, 2) + 2)
        fn_opt = torch.compile(fn)

        expected = fn(*inp)
        actual = fn_opt(*inp)

        self.assertEqual(expected, actual)

    def test_torch_function_mode_restore_on_exc(self):
        @torch._dynamo.disable()
        def err():
            raise RuntimeError("test")

        @torch.compile()
        def fn(x):
            with TestMode():
                x += 1
                err()
                x += 2
                return x

        try:
            fn(torch.ones(2, 2))
        except RuntimeError:
            pass
        self.assertEqual(_len_torch_function_stack(), 0)

    def test_torch_function_mode_and_pop_graph_break_mutation(self):
        def fn(x, y):
            with TestMode():
                z = _pop_torch_function_stack()
                z.y = 5
                torch._dynamo.graph_break()
                _push_on_torch_function_stack(z)
                o = torch.add(x, 3)
                o = torch.mul(o, z.y)

            return torch.add(o, y)

        inp = (torch.ones(2, 2) + 1, torch.ones(2, 2) + 2)
        fn_opt = torch.compile(fn)

        expected = fn(*inp)
        actual = fn_opt(*inp)

        self.assertEqual(expected, actual)


if __name__ == "__main__":
    from torch._dynamo.test_case import run_tests

Some files were not shown because too many files have changed in this diff.