Compare commits


12 Commits

SHA1 Message Date
f10edf1ecd update int8 sdpa 2024-12-03 05:58:44 -05:00
9d645a6025 update int8 sdpa 2024-12-03 00:49:02 -08:00
676da3c16a update fa int8 2024-11-19 01:54:23 -05:00
46769004e5 add kernel for small size 2024-10-30 01:35:28 -07:00
9cb324d903 update int8 sdpa 2024-10-29 02:51:50 -07:00
325db8f2a3 update int8 sdpa 2024-10-21 18:15:19 -07:00
97922c4754 update fa_u8_brgemm 2024-10-17 20:01:34 -07:00
d72ab195da update fa_u8_brgemm 2024-10-17 19:49:32 -07:00
43b5c4101d int8 optimization 2024-10-14 23:45:57 -07:00
b640cf15ab test 2024-08-18 23:34:46 -07:00
67ccb2ce72 update fa u8 brgemm 2024-07-16 19:56:05 -07:00
4a2715e652 add fa u8 brgemm 2024-07-12 01:58:44 -07:00
1601 changed files with 24911 additions and 29480 deletions

View File

@ -6,7 +6,6 @@ from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.x509.oid import NameOID
temp_dir = mkdtemp()
print(temp_dir)

View File

@ -18,7 +18,6 @@ time python test/run_test.py --verbose -i distributed/test_c10d_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_nccl
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
time python test/run_test.py --verbose -i distributed/test_store
time python test/run_test.py --verbose -i distributed/test_symmetric_memory
time python test/run_test.py --verbose -i distributed/test_pg_wrapper

View File

@ -3,7 +3,6 @@ import json
import math
import sys
parser = argparse.ArgumentParser()
parser.add_argument(
"--test-name", dest="test_name", action="store", required=True, help="test name"

View File

@ -3,7 +3,6 @@ import sys
import numpy
sample_data_list = sys.argv[1:]
sample_data_list = [float(v.strip()) for v in sample_data_list]

View File

@ -1,7 +1,6 @@
import json
import sys
data_file_path = sys.argv[1]
commit_hash = sys.argv[2]

View File

@ -1,6 +1,5 @@
import sys
log_file_path = sys.argv[1]
with open(log_file_path) as f:

View File

@ -4,7 +4,6 @@ import os
import subprocess
import sys
COMMON_TESTS = [
(
"Checking that torch is available",

View File

@ -5,7 +5,6 @@ import sys
import yaml
# Need to import modules that lie on an upward-relative path
sys.path.append(os.path.join(sys.path[0], ".."))

View File

@ -118,18 +118,9 @@ if [[ "$PACKAGE_TYPE" == libtorch ]]; then
cd /tmp/libtorch
fi
if [[ "$GPU_ARCH_TYPE" == xpu ]]; then
# Workaround for __mkl_tmp_MOD unbound variable issue, refer https://github.com/pytorch/pytorch/issues/130543
set +u
source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh
fi
# Test the package
/builder/check_binary.sh
# Clean temp files
cd /builder && git clean -ffdx
# =================== The above code will be executed inside Docker container ===================
EOL
echo

View File

@ -100,20 +100,6 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
fi
fi
# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton xpu package
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* && $(uname) == "Linux" ]]; then
TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-xpu.txt)
TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+${TRITON_SHORTHASH}"
fi
if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
else
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}"
fi
fi
cat >"$envfile" <<EOL
# =================== The following code will be executed inside Docker container ===================
export TZ=UTC
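The removed hunk above builds a pip requirement string for the XPU Triton wheel, appending a ten-character short hash from a commit-pin file for dev builds. A minimal Python sketch of that same string construction, for illustration only (the function name and pin-file handling here are assumptions, not part of the original scripts):

```python
# Illustrative sketch only; names are hypothetical, not from the repo.
def triton_xpu_requirement(triton_version: str, build_version: str, pin_file: str) -> str:
    requirement = f"pytorch-triton-xpu=={triton_version}"
    if "dev" in build_version:
        # Mirror `cut -c1-10` on the ci_commit_pins file.
        with open(pin_file) as f:
            shorthash = f.read().strip()[:10]
        requirement = f"{requirement}+{shorthash}"
    return requirement
```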

View File

@ -29,11 +29,6 @@ if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg"
fi
# this is special build with all dependencies packaged
if [[ ${BUILD_NAME} == *-full* ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full"
fi
# Sleep 2 minutes between retries for conda upload
retry () {
"$@" || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@")

View File

@ -8,7 +8,6 @@ import time
import requests
AZURE_PIPELINE_BASE_URL = "https://aiinfra.visualstudio.com/PyTorch/"
AZURE_DEVOPS_PAT_BASE64 = os.environ.get("AZURE_DEVOPS_PAT_BASE64_SECRET", "")
PIPELINE_ID = "911"

View File

@ -2,7 +2,7 @@
# NOTE: **Mirror any changes** to this file the [tool.ruff] config in pyproject.toml
# before we can fully move to use ruff
enable-extensions = G
select = B,C,E,F,G,P,SIM1,SIM911,T4,W,B9,TOR0,TOR1,TOR2,TOR9
select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2,TOR9
max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead

View File

@ -40,7 +40,3 @@ e6ec0efaf87703c5f889cfc20b29be455885d58d
a53cda1ddc15336dc1ff0ce1eff2a49cdc5f882e
# 2024-01-02 clangformat: fused adam #116583
9dc68d1aa9e554d09344a10fff69f7b50b2d23a0
# 2024-06-28 enable UFMT in `torch/storage.py`
d80939e5e9337e8078f11489afefec59fd42f93b
# 2024-06-28 enable UFMT in `torch.utils.data`
7cf0b90e49689d45be91aa539fdf54cf2ea8a9a3

View File

@ -47,5 +47,3 @@ self-hosted-runner:
- macos-latest-xlarge
- macos-13-xlarge
- macos-14-xlarge
# Organization-wide Intel hosted XPU runners
- linux.idc.xpu

View File

@ -36,8 +36,7 @@ runs:
"${DOCKER_IMAGE}"
)
echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" ]]; then
# Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts"
fi
@ -48,9 +47,10 @@ runs:
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh"
- name: Cleanup docker
if: always() && (env.BUILD_ENVIRONMENT == 'linux-s390x-binary-manywheel' || env.GPU_ARCH_TYPE == 'xpu')
if: always() && env.BUILD_ENVIRONMENT == 'linux-s390x-binary-manywheel'
shell: bash
run: |
# on s390x or xpu stop the container for clean worker stop
# on s390x stop the container for clean worker stop
# ignore expansion of "docker ps -q" since it could be empty
# shellcheck disable=SC2046
docker stop "${{ env.CONTAINER_NAME }}" || true
docker stop $(docker ps -q) || true

View File

@ -407,7 +407,7 @@
- torch/_inductor/codegen/cpp_template.py
- torch/_inductor/codegen/cpp_gemm_template.py
- test/inductor/test_mkldnn_pattern_matcher.py
- test/inductor/test_cpu_repro.py
- test/inductor/test_cpu_repo.py
- test/inductor/test_cpu_cpp_wrapper.py
- test/inductor/test_cpu_select_algorithm.py
- aten/src/ATen/cpu/**

View File

@ -1,5 +1,4 @@
#!/usr/bin/env python3
import os
import shutil
import sys
@ -8,7 +7,6 @@ from subprocess import check_call
from tempfile import TemporaryDirectory
from typing import Optional
SCRIPT_DIR = Path(__file__).parent
REPO_DIR = SCRIPT_DIR.parent.parent

View File

@ -5,6 +5,7 @@ import sys
from typing import Any
from github_utils import gh_delete_comment, gh_post_pr_comment
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from label_utils import has_required_labels, is_label_err_comment, LABEL_ERR_MSG
from trymerge import GitHubPR

View File

@ -4,9 +4,11 @@ import json
import os
import re
from typing import Any, cast, Dict, List, Optional
from urllib.error import HTTPError
from github_utils import gh_fetch_url, gh_post_pr_comment, gh_query_issues_by_labels
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from trymerge import get_pr_commit_sha, GitHubPR

View File

@ -10,7 +10,6 @@ import requests
import rockset # type: ignore[import]
from gitutils import retries_decorator
LOGS_QUERY = """
with
shas as (

View File

@ -1,12 +1,10 @@
#!/usr/bin/env python3
import sys
from pathlib import Path
from typing import Any, cast, Dict, List, Set
import yaml
GITHUB_DIR = Path(__file__).parent.parent

View File

@ -1,6 +1,7 @@
import json
import subprocess
import sys
from enum import Enum
from pathlib import Path
from typing import NamedTuple, Optional

View File

@ -9,7 +9,6 @@ from typing import Any, Callable, Dict, List, Set
from github_utils import gh_fetch_json_dict, gh_graphql
from gitutils import GitRepo
SEC_IN_DAY = 24 * 60 * 60
CLOSED_PR_RETENTION = 30 * SEC_IN_DAY
NO_PR_RETENTION = 1.5 * 365 * SEC_IN_DAY
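The retention constants above are plain seconds-based arithmetic: roughly 30 days for closed PRs and about 1.5 years otherwise. A tiny sketch of how such constants might be applied; the `past_retention` helper is hypothetical and not taken from the script:

```python
import time

SEC_IN_DAY = 24 * 60 * 60                      # 86_400 seconds
CLOSED_PR_RETENTION = 30 * SEC_IN_DAY          # ~30 days
NO_PR_RETENTION = 1.5 * 365 * SEC_IN_DAY       # ~1.5 years

# Hypothetical usage: decide whether something is older than its retention window.
def past_retention(last_update_epoch: float, retention_s: float) -> bool:
    return time.time() - last_update_epoch > retention_s
```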

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python3
import sys
from pathlib import Path
import yaml

View File

@ -14,6 +14,7 @@ import json
from typing import Any
import boto3 # type: ignore[import]
from label_utils import gh_get_labels

View File

@ -15,7 +15,6 @@ from urllib.request import Request, urlopen
import yaml
REENABLE_TEST_REGEX = "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) (#|https://github.com/pytorch/pytorch/issues/)([0-9]+)"
PREFIX = "test-config/"
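REENABLE_TEST_REGEX above matches the usual "Fixes #NNNN" / "Closes &lt;issue URL&gt;" phrases case-insensitively, with the issue number in the last capture group. A small, self-contained illustration of how such a pattern extracts issue numbers; the surrounding script is not shown here, so this usage is an assumption:

```python
import re

REENABLE_TEST_REGEX = (
    "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) "
    "(#|https://github.com/pytorch/pytorch/issues/)([0-9]+)"
)

body = "Fixes #12345 and closes https://github.com/pytorch/pytorch/issues/67890"
# Group 6 is the trailing digit run, i.e. the issue number.
issues = [m.group(6) for m in re.finditer(REENABLE_TEST_REGEX, body)]
print(issues)  # ['12345', '67890']
```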

View File

@ -8,13 +8,11 @@ architectures:
* CPU
* Latest CUDA
* Latest ROCM
* Latest XPU
"""
import os
from typing import Dict, List, Optional, Tuple
CUDA_ARCHES = ["11.8", "12.1", "12.4"]
@ -26,7 +24,6 @@ CUDA_ARCHES_CUDNN_VERSION = {"11.8": "9", "12.1": "9", "12.4": "9"}
ROCM_ARCHES = ["6.0", "6.1"]
XPU_ARCHES = ["xpu"]
CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
@ -135,8 +132,6 @@ def arch_type(arch_version: str) -> str:
return "cuda"
elif arch_version in ROCM_ARCHES:
return "rocm"
elif arch_version in XPU_ARCHES:
return "xpu"
elif arch_version in CPU_CXX11_ABI_ARCH:
return "cpu-cxx11-abi"
elif arch_version in CPU_AARCH64_ARCH:
@ -161,7 +156,6 @@ WHEEL_CONTAINER_IMAGES = {
gpu_arch: f"pytorch/manylinux-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
for gpu_arch in ROCM_ARCHES
},
"xpu": f"pytorch/manylinux2_28-builder:xpu-{DEFAULT_TAG}",
"cpu": f"pytorch/manylinux-builder:cpu-{DEFAULT_TAG}",
"cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}",
"cpu-aarch64": f"pytorch/manylinuxaarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
@ -227,7 +221,6 @@ def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
"cuda": f"cu{gpu_arch_version.replace('.', '')}",
"cuda-aarch64": "cu124",
"rocm": f"rocm{gpu_arch_version}",
"xpu": "xpu",
}.get(gpu_arch_type, gpu_arch_version)
@ -338,7 +331,7 @@ def generate_wheels_matrix(
# Define default compute archivectures
arches = ["cpu"]
if os == "linux":
arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES
elif os == "windows":
arches += CUDA_ARCHES
elif os == "linux-aarch64":
@ -361,14 +354,11 @@ def generate_wheels_matrix(
or arch_version == "cpu-aarch64"
or arch_version == "cpu-s390x"
or arch_version == "cuda-aarch64"
or arch_version == "xpu"
else arch_version
)
# TODO: Enable python 3.13 on rocm, xpu, aarch64, windows
if (
gpu_arch_type in ["rocm", "xpu"] or os != "linux"
) and python_version == "3.13":
# TODO: Enable python 3.13 on rocm, aarch64, windows
if (gpu_arch_type == "rocm" or os != "linux") and python_version == "3.13":
continue
# 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
@ -410,7 +400,9 @@ def generate_wheels_matrix(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True",
"devtoolset": "",
"devtoolset": (
"cxx11-abi" if arch_version == "cuda-aarch64" else ""
),
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": (
@ -423,26 +415,6 @@ def generate_wheels_matrix(
),
}
)
# Special build building to use on Colab. PyThon 3.10 for 12.1 CUDA
if python_version == "3.10" and arch_version == "12.1":
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "False",
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": "",
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950
".", "_"
),
}
)
else:
ret.append(
{
@ -453,9 +425,7 @@ def generate_wheels_matrix(
gpu_arch_type, gpu_arch_version
),
"devtoolset": (
"cxx11-abi"
if arch_version in ["cpu-cxx11-abi", "xpu"]
else ""
"cxx11-abi" if arch_version == "cpu-cxx11-abi" else ""
),
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,

View File

@ -8,8 +8,8 @@ from typing import Dict, Iterable, List, Literal, Set
from typing_extensions import TypedDict # Python 3.11+
import generate_binary_build_matrix # type: ignore[import]
import jinja2
import jinja2
Arch = Literal["windows", "linux", "macos"]

View File

@ -16,7 +16,6 @@ from typing import Dict, List
import generate_binary_build_matrix
DOCKER_IMAGE_TYPES = ["runtime", "devel"]

View File

@ -4,11 +4,11 @@ import argparse
import os
import re
import subprocess
from datetime import datetime
from distutils.util import strtobool
from pathlib import Path
LEADING_V_PATTERN = re.compile("^v")
TRAILING_RC_PATTERN = re.compile("-rc[0-9]*$")
LEGACY_BASE_VERSION_SUFFIX_PATTERN = re.compile("a0$")

View File

@ -11,6 +11,7 @@ import sys
import time
import urllib
import urllib.parse
from typing import Any, Callable, Dict, List, Optional, Tuple
from urllib.request import Request, urlopen

View File

@ -3,6 +3,7 @@
import json
import os
import warnings
from dataclasses import dataclass
from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union
from urllib.error import HTTPError

View File

@ -19,7 +19,6 @@ from typing import (
Union,
)
T = TypeVar("T")
RE_GITHUB_URL_MATCH = re.compile("^https://.*@?github.com/(.+)/(.+)$")

View File

@ -1,12 +1,12 @@
"""GitHub Label Utilities."""
import json
from functools import lru_cache
from typing import Any, List, Tuple, TYPE_CHECKING, Union
from github_utils import gh_fetch_url_and_headers, GitHubComment
# TODO: this is a temp workaround to avoid circular dependencies,
# and should be removed once GitHubPR is refactored out of trymerge script.
if TYPE_CHECKING:

View File

@ -9,7 +9,6 @@ from pytest_caching_utils import (
upload_pytest_cache,
)
TEMP_DIR = "./tmp" # a backup location in case one isn't provided

View File

@ -14,7 +14,6 @@ from file_io_utils import (
zip_folder,
)
PYTEST_CACHE_KEY_PREFIX = "pytest_cache"
PYTEST_CACHE_DIR_NAME = ".pytest_cache"
BUCKET = "gha-artifacts"

View File

@ -2,7 +2,7 @@
set -eoux pipefail
SYNC_BRANCH=pytorch-stable-prototype
SYNC_BRANCH=fbcode/pytorch-stable-prototype
git config user.email "fake@example.com"
git config user.name "PyTorch Stable Bot"
@ -11,9 +11,7 @@ git fetch origin main
git fetch origin "$SYNC_BRANCH"
git checkout "$SYNC_BRANCH"
# Using a hardcoded SHA here is a massive speedup as we can skip the entire history of the pytorch GitHub repo.
# This specific SHA was chosen as it was before the "branch point" of the stable branch
for SHA in $(git log ba3b05fdf37ddbc3c301294d6a560a816335e717..origin/main --pretty="%h" --reverse -- torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed)
for SHA in $(git log 4333e122d4b74cdf84351ed2907045c6a767b4cd..origin/main --pretty="%h" --reverse -- torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed)
do
# `git merge-base --is-ancestor` exits with code 0 if the given SHA is an ancestor, and non-0 otherwise
if git merge-base --is-ancestor $SHA HEAD || [[ $(git log --grep="(cherry picked from commit $SHA") ]]
@ -22,12 +20,7 @@ do
continue
fi
echo "Copying $SHA"
git cherry-pick -x "$SHA" -X theirs
git reset --soft HEAD~1
git add torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed
git checkout .
git commit --reuse-message=HEAD@{1}
git clean -f
git cherry-pick -x "$SHA"
done
if [[ "${WITH_PUSH}" == true ]]; then

View File

@ -41,7 +41,7 @@ def main() -> None:
)
options = parser.parse_args()
tagged_images: Dict[str, bool] = {}
tagged_images: Dict[str, bool] = dict()
platform_images = [
generate_binary_build_matrix.WHEEL_CONTAINER_IMAGES,
generate_binary_build_matrix.LIBTORCH_CONTAINER_IMAGES,

View File

@ -7,7 +7,6 @@ cd llm-target-determinator
pip install -q -r requirements.txt
cd ../codellama
pip install -e .
pip install numpy==1.26.0
# Run indexer
cd ../llm-target-determinator

View File

@ -17,7 +17,9 @@ from unittest import main, mock, skip, TestCase
from urllib.error import HTTPError
from github_utils import gh_graphql
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from trymerge import (
categorize_checks,
DRCI_CHECKRUN_NAME,
@ -37,7 +39,6 @@ from trymerge import (
validate_revert,
)
if "GIT_REMOTE_URL" not in os.environ:
os.environ["GIT_REMOTE_URL"] = "https://github.com/pytorch/pytorch"

View File

@ -45,6 +45,7 @@ from github_utils import (
gh_update_pr_state,
GitHubComment,
)
from gitutils import (
are_ghstack_branches_in_sync,
get_git_remote_name,
@ -61,7 +62,6 @@ from label_utils import (
)
from trymerge_explainer import get_revert_message, TryMergeExplainer
# labels
MERGE_IN_PROGRESS_LABEL = "merging"
MERGE_COMPLETE_LABEL = "merged"

View File

@ -11,7 +11,6 @@ from github_utils import gh_post_pr_comment as gh_post_comment
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from trymerge import GitHubPR
SAME_SHA_ERROR = (
"\n```\nAborting rebase because rebasing the branch resulted in the same sha as the target branch.\n"
+ "This usually happens because the PR has already been merged. Please rebase locally and push.\n```"

View File

@ -81,7 +81,7 @@ jobs:
!{{ config["build_name"] }}-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: !{{ config["build_name"] }}-build
{%- if config["gpu_arch_type"] not in ["rocm", "xpu"] %}
{%- if config["gpu_arch_type"] != "rocm" %}
uses: ./.github/workflows/_binary-test-linux.yml
with:!{{ upload.binary_env_as_input(config) }}
build_name: !{{ config["build_name"] }}
@ -101,40 +101,6 @@ jobs:
{%- endif %}
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
{%- elif config["gpu_arch_type"] == "xpu" %}
runs-on: linux.idc.xpu
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config) }}
permissions:
id-token: write
contents: read
steps:
- name: Setup XPU
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v1.7.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- uses: !{{ common.download_artifact_action }}
name: Download Build Artifacts
with:
name: !{{ config["build_name"] }}
path: "${{ runner.temp }}/artifacts/"
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch) }}
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: !{{ config["container_image"] }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
{%- else %}
runs-on: linux.rocm.gpu
timeout-minutes: !{{ common.timeout_minutes }}

.github/workflows/_linux-build-rg.yml (vendored, new file, 105 lines)
View File

@ -0,0 +1,105 @@
name: linux-build-rg
on:
workflow_call:
inputs:
build-environment:
required: true
type: string
description: Top-level label for what's being built/tested.
docker-image-name:
required: true
type: string
description: Name of the base docker image to build with.
build-generates-artifacts:
required: false
type: boolean
default: true
description: If set, upload generated build artifacts.
build-with-debug:
required: false
type: boolean
default: false
description: If set, build in debug mode.
sync-tag:
required: false
type: string
default: ""
description: |
If this is set, our linter will use this to make sure that every other
job with the same `sync-tag` is identical.
cuda-arch-list:
required: false
type: string
default: "5.2"
description: |
List of CUDA architectures CI build should target.
runner-group:
required: false
type: string
default: "arc-lf-linux.2xlarge"
description: Runner group to select group type
test-matrix:
required: false
type: string
description: |
An option JSON description of what test configs to run later on. This
is moved here from the Linux test workflow so that we can apply filter
logic using test-config labels earlier and skip unnecessary builds
s3-bucket:
description: S3 bucket to download artifact
required: false
type: string
default: "gha-artifacts"
aws-role-to-assume:
description: role to assume for downloading artifacts
required: false
type: string
default: ""
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
outputs:
docker-image:
value: ${{ jobs.build.outputs.docker-image }}
description: The docker image containing the built PyTorch.
test-matrix:
value: ${{ jobs.build.outputs.test-matrix }}
description: An optional JSON description of what test configs to run later on.
jobs:
build:
# Don't run on forked repos
if: github.repository_owner == 'pytorch'
runs-on:
group: ${{ inputs.runner-group }}
timeout-minutes: 240
outputs:
docker-image: ${{ steps.linux-build.outputs.docker-image }}
test-matrix: ${{ steps.linux-build.outputs.test-matrix }}
steps:
# [pytorch repo ref]
# Use a pytorch/pytorch reference instead of a reference to the local
# checkout because when we run this action we don't *have* a local
# checkout. In other cases you should prefer a local checkout.
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
- name: Linux Build
id: linux-build
uses: ./.github/actions/linux-build
with:
build-environment: ${{ inputs.build-environment }}
docker-image-name: ${{ inputs.docker-image-name }}
build-generates-artifacts: ${{ inputs.build-generates-artifacts }}
build-with-debug: ${{ inputs.build-with-debug }}
sync-tag: ${{ inputs.sync-tag }}
cuda-arch-list: ${{ inputs.cuda-arch-list }}
test-matrix: ${{ inputs.test-matrix }}
s3-bucket: ${{ inputs.s3-bucket }}
aws-role-to-assume: ${{ inputs.aws-role-to-assume }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

.github/workflows/_linux-test-label.yml (vendored, new file, 85 lines)
View File

@ -0,0 +1,85 @@
name: linux-test-rg
on:
workflow_call:
inputs:
build-environment:
required: true
type: string
description: Top-level label for what's being built/tested.
test-matrix:
required: true
type: string
description: JSON description of what test configs to run.
docker-image:
required: true
type: string
description: Docker image to run in.
sync-tag:
required: false
type: string
default: ""
description: |
If this is set, our linter will use this to make sure that every other
job with the same `sync-tag` is identical.
timeout-minutes:
required: false
type: number
default: 240
description: |
Set the maximum (in minutes) how long the workflow should take to finish
use-gha:
required: false
type: string
default: ""
description: If set to any value, upload to GHA. Otherwise upload to S3.
dashboard-tag:
required: false
type: string
default: ""
s3-bucket:
description: S3 bucket to download artifact
required: false
type: string
default: "gha-artifacts"
aws-role-to-assume:
description: role to assume for downloading artifacts
required: false
type: string
default: ""
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
jobs:
test:
# Don't run on forked repos or empty test matrix
if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
strategy:
matrix: ${{ fromJSON(inputs.test-matrix) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
- name: Linux Test
id: linux-test
uses: ./.github/actions/linux-test
with:
build-environment: ${{ inputs.build-environment }}
test-matrix: ${{ inputs.test-matrix }}
docker-image: ${{ inputs.docker-image }}
sync-tag: ${{ inputs.sync-tag }}
use-gha: ${{ inputs.use-gha }}
dashboard-tag: ${{ inputs.dashboard-tag }}
s3-bucket: ${{ inputs.s3-bucket }}
aws-role-to-assume: ${{ inputs.aws-role-to-assume }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/_linux-test-rg.yml (vendored, new file, 86 lines)
View File

@ -0,0 +1,86 @@
name: linux-test-label
on:
workflow_call:
inputs:
build-environment:
required: true
type: string
description: Top-level label for what's being built/tested.
test-matrix:
required: true
type: string
description: JSON description of what test configs to run.
docker-image:
required: true
type: string
description: Docker image to run in.
sync-tag:
required: false
type: string
default: ""
description: |
If this is set, our linter will use this to make sure that every other
job with the same `sync-tag` is identical.
timeout-minutes:
required: false
type: number
default: 240
description: |
Set the maximum (in minutes) how long the workflow should take to finish
use-gha:
required: false
type: string
default: ""
description: If set to any value, upload to GHA. Otherwise upload to S3.
dashboard-tag:
required: false
type: string
default: ""
s3-bucket:
description: S3 bucket to download artifact
required: false
type: string
default: "gha-artifacts"
aws-role-to-assume:
description: role to assume for downloading artifacts
required: false
type: string
default: ""
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
jobs:
test:
# Don't run on forked repos or empty test matrix
if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
strategy:
matrix: ${{ fromJSON(inputs.test-matrix) }}
fail-fast: false
runs-on:
group: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
- name: Linux Test
id: linux-test
uses: ./.github/actions/linux-test
with:
build-environment: ${{ inputs.build-environment }}
test-matrix: ${{ inputs.test-matrix }}
docker-image: ${{ inputs.docker-image }}
sync-tag: ${{ inputs.sync-tag }}
use-gha: ${{ inputs.use-gha }}
dashboard-tag: ${{ inputs.dashboard-tag }}
s3-bucket: ${{ inputs.s3-bucket }}
aws-role-to-assume: ${{ inputs.aws-role-to-assume }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
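Both new test workflows above guard the job with `toJSON(fromJSON(inputs.test-matrix).include) != '[]'` so that an empty test matrix short-circuits the run. A Python analogue of that check, just to show the intent (the function and its arguments are made up for illustration):

```python
import json

def should_run(repository_owner: str, test_matrix: str) -> bool:
    # Skip forks, and skip when the matrix carries no `include` entries.
    include = json.loads(test_matrix).get("include", [])
    return repository_owner == "pytorch" and len(include) > 0

print(should_run("pytorch", '{"include": []}'))                       # False
print(should_run("pytorch", '{"include": [{"config": "default"}]}'))  # True
```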

View File

@ -54,7 +54,6 @@ jobs:
# Hardcoding below is temporary for testing ALI runners
# This file below should match the script found in .github/scripts/runner_determinator.py
- name: Hardcode runner-determinator script
id: hardcode-script
run: |
cat <<EOF > runner_determinator.py
# flake8: noqa: G004

View File

@ -751,118 +751,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_8-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-xpu
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-xpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_8-xpu-build
runs-on: linux.idc.xpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.8"
permissions:
id-token: write
contents: read
steps:
- name: Setup XPU
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v1.7.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_8-xpu
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux2_28-builder:xpu-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
manywheel-py3_8-xpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_8-xpu-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-xpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_9-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -1577,118 +1465,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_9-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-xpu
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-xpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_9-xpu-build
runs-on: linux.idc.xpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.9"
permissions:
id-token: write
contents: read
steps:
- name: Setup XPU
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v1.7.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_9-xpu
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux2_28-builder:xpu-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
manywheel-py3_9-xpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-xpu-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-xpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -2068,71 +1844,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda12_1-full-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1-full
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_1-full-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_10-cuda12_1-full-build
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1-full
build_environment: linux-binary-manywheel
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_1-full-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda12_1-full-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1-full
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda12_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -2468,118 +2179,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-xpu
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-xpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_10-xpu-build
runs-on: linux.idc.xpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.10"
permissions:
id-token: write
contents: read
steps:
- name: Setup XPU
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v1.7.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_10-xpu
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux2_28-builder:xpu-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
manywheel-py3_10-xpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-xpu-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-xpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -3294,118 +2893,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-xpu
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-xpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_11-xpu-build
runs-on: linux.idc.xpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.11"
permissions:
id-token: write
contents: read
steps:
- name: Setup XPU
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v1.7.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_11-xpu
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux2_28-builder:xpu-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
manywheel-py3_11-xpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-xpu-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-xpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -4120,118 +3607,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-xpu
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-xpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_12-xpu-build
runs-on: linux.idc.xpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.12"
permissions:
id-token: write
contents: read
steps:
- name: Setup XPU
uses: ./.github/actions/setup-xpu
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v1.7.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_12-xpu
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux2_28-builder:xpu-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
manywheel-py3_12-xpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-xpu-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: xpu
GPU_ARCH_TYPE: xpu
DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-xpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml

View File

@ -39,10 +39,10 @@ jobs:
update-vision-commit-hash:
runs-on: ubuntu-latest
environment: update-commit-hash
if: ${{ github.event_name == 'schedule' }}
steps:
- name: update-vision-commit-hash
uses: pytorch/test-infra/.github/actions/update-commit-hash@main
if: ${{ github.event_name == 'schedule' }}
with:
repo-name: vision
branch: main
@ -54,10 +54,10 @@ jobs:
update-audio-commit-hash:
runs-on: ubuntu-latest
environment: update-commit-hash
if: ${{ github.event_name == 'schedule' }}
steps:
- name: update-audio-commit-hash
uses: pytorch/test-infra/.github/actions/update-commit-hash@main
if: ${{ github.event_name == 'schedule' }}
with:
repo-name: audio
branch: main
@ -69,10 +69,10 @@ jobs:
update-executorch-commit-hash:
runs-on: ubuntu-latest
environment: update-commit-hash
if: ${{ github.event_name == 'schedule' }}
steps:
- name: update-executorch-commit-hash
uses: pytorch/test-infra/.github/actions/update-commit-hash@main
if: ${{ github.event_name == 'schedule' }}
with:
repo-name: executorch
branch: main

View File

@ -1,42 +0,0 @@
name: runner-determinator
on:
workflow_dispatch:
pull_request:
branches: [main]
paths:
- .github/workflows/_runner-determinator.yaml
- .github/workflows/_runner_determinator_script_sync.yaml
- .github/workflows/scripts/runner_determinator.py
jobs:
python-script-sync-check:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
sparse-checkout: |
.github
- name: Extract the script from runner_determinator
run: |
# Runner determinator files
RUNNER_DETERMINATOR_WORKFLOW_FILE=.github/workflows/_runner-determinator.yml
RUNNER_DETERMINATOR_PYTHON_SCRIPT_FILE=.github/scripts/runner_determinator.py
# Parse the job file, extract the script and run it, up to the final EOF,
# to generate the python file in the local folder
yq '.jobs.runner-determinator.steps[] | select(.id == "hardcode-script") | .run' \
"${RUNNER_DETERMINATOR_WORKFLOW_FILE}" | sed '/^EOF$/q' | bash
set +e
DIFF="$(diff "$(basename ${RUNNER_DETERMINATOR_PYTHON_SCRIPT_FILE})" ${RUNNER_DETERMINATOR_PYTHON_SCRIPT_FILE})"
IS_DIFF=$?
set -e
if [ $IS_DIFF -eq 0 ]; then
echo "Scripts are in sync! ^_^";
else
echo -e "Scripts are *NOT* in sync:\n ${DIFF}";
exit 1
fi

View File

@ -203,14 +203,25 @@ jobs:
cuda-version: cpu
test-matrix: ${{ needs.win-vs2019-cpu-py3-build.outputs.test-matrix }}
win-vs2019-cuda12_1-py3-build:
name: win-vs2019-cuda12.1-py3
win-vs2019-cuda11_8-py3-build:
name: win-vs2019-cuda11.8-py3
uses: ./.github/workflows/_win-build.yml
needs: get-label-type
with:
build-environment: win-vs2019-cuda12.1-py3
cuda-version: "12.1"
build-environment: win-vs2019-cuda11.8-py3
cuda-version: "11.8"
sync-tag: win-cuda-build
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}windows.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}windows.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}windows.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}windows.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}windows.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}windows.g5.4xlarge.nvidia.gpu" },
{ config: "force_on_cpu", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
]}
linux-focal-rocm6_1-py3_8-build:
name: linux-focal-rocm6.1-py3.8

View File

@ -38,7 +38,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'flake8==6.1.0',
'flake8-bugbear==23.3.23',
'flake8-comprehensions==3.15.0',
'flake8-comprehensions==3.12.0',
'flake8-executable==2.1.3',
'flake8-logging-format==0.9.0',
'flake8-pyi==23.3.1',
@ -1531,6 +1531,10 @@ exclude_patterns = [
'torch/signal/__init__.py',
'torch/signal/windows/__init__.py',
'torch/signal/windows/windows.py',
'torch/sparse/__init__.py',
'torch/sparse/_semi_structured_conversions.py',
'torch/sparse/_triton_ops.py',
'torch/sparse/semi_structured.py',
'torch/special/__init__.py',
'torch/testing/_internal/__init__.py',
'torch/testing/_internal/autocast_test_lists.py',
@ -1779,7 +1783,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'ruff==0.5.2',
'ruff==0.5.0',
]
is_formatter = true

View File

@ -208,6 +208,7 @@ endif()
include(CMakeDependentOption)
option(ATEN_NO_TEST "Do not build ATen test binaries" OFF)
option(BUILD_BINARY "Build C++ binaries" OFF)
option(BUILD_DOCS "Build Caffe2 documentation" OFF)
option(BUILD_CUSTOM_PROTOBUF
"Build and use Caffe2's own protobuf under third_party" ON)
option(BUILD_PYTHON "Build Python binaries" ON)
@ -749,6 +750,7 @@ if(NOT TORCH_BUILD_VERSION)
CACHE STRING "Torch build version" FORCE)
endif()
caffe2_parse_version_str(TORCH ${TORCH_BUILD_VERSION})
caffe2_parse_version_str(CAFFE2 ${TORCH_BUILD_VERSION})
set(TORCH_SOVERSION "${TORCH_VERSION_MAJOR}.${TORCH_VERSION_MINOR}")
# ---[ CMake scripts + modules
@ -1221,6 +1223,45 @@ endif()
add_subdirectory(c10)
add_subdirectory(caffe2)
# --[ Documentation
if(BUILD_DOCS)
# check if Doxygen is installed
find_package(Doxygen)
if(DOXYGEN_FOUND)
message("Generating documentation")
set(DOXYGEN_C_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/.Doxyfile-c)
set(DOXYGEN_C_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/Doxyfile-c)
set(DOXYGEN_P_IN ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/.Doxyfile-python)
set(DOXYGEN_P_OUT ${CMAKE_CURRENT_SOURCE_DIR}/docs/caffe2/Doxyfile-python)
if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs)
file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/docs)
endif()
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs)
configure_file(${DOXYGEN_C_IN} ${DOXYGEN_C_OUT} @ONLY)
configure_file(${DOXYGEN_P_IN} ${DOXYGEN_P_OUT} @ONLY)
add_custom_target(
doc_doxygen_c ALL
COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_C_OUT}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "Generating C++ API documentation with Doxygen"
VERBATIM)
add_custom_target(
doc_doxygen_python ALL
COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_P_OUT}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "Generating Python API documentation with Doxygen"
VERBATIM)
else()
message(
FATAL_ERROR "Doxygen needs to be installed to generate the documentation")
endif()
endif()
# ---[ CMake related files Uninistall option.
if(NOT TARGET caffe2_uninstall)
configure_file(

View File

@ -156,12 +156,12 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd
/torch/csrc/jit/python/init.cpp @mikaylagawarecki
# CUDA and CUDA math libraries
aten/src/ATen/cuda/ @eqy @syed-ahmed
aten/src/ATen/cudnn/ @eqy @syed-ahmed
aten/src/ATen/native/cuda/ @eqy @syed-ahmed
aten/src/ATen/native/cudnn/ @eqy @syed-ahmed
c10/cuda @eqy @syed-ahmed
torch/cuda/ @eqy @syed-ahmed
torch/csrc/cuda/ @eqy @syed-ahmed
torch/backends/cuda/ @eqy @syed-ahmed
torch/backends/cudnn/ @eqy @syed-ahmed
aten/src/ATen/cuda/ @eqy
aten/src/ATen/cudnn/ @eqy
aten/src/ATen/native/cuda/ @eqy
aten/src/ATen/native/cudnn/ @eqy
c10/cuda @eqy
torch/cuda/ @eqy
torch/csrc/cuda/ @eqy
torch/backends/cuda/ @eqy
torch/backends/cudnn/ @eqy

View File

@ -3,7 +3,6 @@ from typing import Dict, List, Optional, Tuple
import torch
from torch import Tensor
OUTPUT_DIR = "src/androidTest/assets/"

View File

@ -119,7 +119,7 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
}
deviceType_ = deviceJniCodeToDeviceType(device);
module_ = torch::jit::load(
std::move(modelPath->toStdString()), std::nullopt, extra_files);
std::move(modelPath->toStdString()), c10::nullopt, extra_files);
if (has_extra) {
static auto putMethod =
facebook::jni::JMap<facebook::jni::JString, facebook::jni::JString>::

View File

@ -84,9 +84,9 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
}
deviceType_ = deviceJniCodeToDeviceType(device);
module_ = torch::jit::_load_for_mobile(
std::move(modelPath->toStdString()), std::nullopt, extra_files);
std::move(modelPath->toStdString()), c10::nullopt, extra_files);
torch::jit::_load_extra_only_for_mobile(
std::move(modelPath->toStdString()), std::nullopt, extra_files);
std::move(modelPath->toStdString()), c10::nullopt, extra_files);
if (has_extra) {
static auto putMethod =
facebook::jni::JMap<facebook::jni::JString, facebook::jni::JString>::

View File

@ -2,7 +2,6 @@ from torchvision import models
import torch
print(torch.version.__version__)
resnet18 = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

View File

@ -9,7 +9,6 @@ from torchvision import models
import torch
# Download and trace the model.
model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)
model.eval()

View File

@ -82,7 +82,6 @@ using acc_type = typename AccumulateType<T, is_cuda>::type;
using type = acc_t; \
};
#define MPS_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::MPS)
#define XPU_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::XPU)
#define CUDA_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::CUDA)
#define CPU_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::CPU)
@ -105,25 +104,6 @@ MPS_ACC_TYPE(c10::complex<Half>, c10::complex<float>);
MPS_ACC_TYPE(c10::complex<float>, c10::complex<float>);
MPS_ACC_TYPE(c10::complex<double>, c10::complex<float>);
XPU_ACC_TYPE(BFloat16, float);
XPU_ACC_TYPE(Half, float);
XPU_ACC_TYPE(Float8_e5m2, float);
XPU_ACC_TYPE(Float8_e4m3fn, float);
XPU_ACC_TYPE(Float8_e5m2fnuz, float);
XPU_ACC_TYPE(Float8_e4m3fnuz, float);
XPU_ACC_TYPE(float, float);
XPU_ACC_TYPE(double, double);
XPU_ACC_TYPE(int8_t, int64_t);
XPU_ACC_TYPE(uint8_t, int64_t);
XPU_ACC_TYPE(char, int64_t);
XPU_ACC_TYPE(int16_t, int64_t);
XPU_ACC_TYPE(int32_t, int64_t);
XPU_ACC_TYPE(int64_t, int64_t);
XPU_ACC_TYPE(bool, bool);
XPU_ACC_TYPE(c10::complex<Half>, c10::complex<float>);
XPU_ACC_TYPE(c10::complex<float>, c10::complex<float>);
XPU_ACC_TYPE(c10::complex<double>, c10::complex<double>);
#if defined(__CUDACC__) || defined(__HIPCC__)
CUDA_ACC_TYPE(half, float);
#endif
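
(Editor's note, not part of the diff: the ACC_TYPE / XPU_ACC_TYPE entries above build a per-device table mapping a storage type to a wider accumulation type so reductions keep precision. A minimal self-contained sketch of the same trait pattern follows; the names AccumulateOf and acc_of_t are made up for illustration and are not ATen's.)

#include <cstdint>
#include <type_traits>

template <typename T> struct AccumulateOf { using type = T; };        // default: accumulate in the same type
template <> struct AccumulateOf<int8_t>   { using type = int64_t; };  // narrow ints widen to int64_t
template <> struct AccumulateOf<uint8_t>  { using type = int64_t; };

template <typename T>
using acc_of_t = typename AccumulateOf<T>::type;

static_assert(std::is_same_v<acc_of_t<uint8_t>, int64_t>,
              "narrow integer types accumulate in int64_t");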

View File

@ -283,7 +283,7 @@ at::BlasBackend Context::blasPreferredBackend() {
if (blas_preferred_backend == at::BlasBackend::Cublaslt) {
static const bool hipblaslt_unsupported = []() {
static const std::vector<std::string> archs = {"gfx90a", "gfx940", "gfx941", "gfx942"};
for (auto index: c10::irange(getNumGPUs())) {
for (auto index = 0; index < at::getNumGPUs(); index++) {
if (!detail::getCUDAHooks().isGPUArch(index, archs)) {
TORCH_WARN_ONCE(
"Attempting to use hipBLASLt on an unsupported architecture! "

View File

@ -73,8 +73,6 @@ class TORCH_API Context {
return at::detail::getPrivateUse1Hooks();
} else if (device_type == at::kMTIA) {
return at::detail::getMTIAHooks();
} else if (device_type == at::kHIP) {
return at::detail::getHIPHooks();
} else {
AT_ERROR(
c10::DeviceTypeName(device_type), " device type not an accelerator.");
@ -96,22 +94,8 @@ class TORCH_API Context {
AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
}
}
bool isPinnedPtr(
const void* data,
std::optional<DeviceType> device_type = std::nullopt) {
auto opt_device_type =
device_type.has_value() ? device_type.value() : at::getAccelerator();
if (!opt_device_type.has_value() || // there is no accelerator
!at::isAccelerator(
opt_device_type.value())) { // passed device not an accelerator
return false;
}
return getAcceleratorHooksInterface(opt_device_type.value())
.isPinnedPtr(data);
}
Allocator* getPinnedMemoryAllocator(
std::optional<DeviceType> device_type = std::nullopt) {
return getAcceleratorHooksInterface(device_type).getPinnedMemoryAllocator();
static bool isPinnedPtr(const void* data) {
return detail::getCUDAHooks().isPinnedPtr(data);
}
static bool hasOpenMP();
static bool hasMKL();
@ -432,73 +416,73 @@ class TORCH_API Context {
TORCH_API Context& globalContext();
inline void init() {
static inline void init() {
globalContext();
}
TORCH_API Allocator* getCPUAllocator();
inline DeprecatedTypeProperties& getDeprecatedTypeProperties(
static inline DeprecatedTypeProperties& getDeprecatedTypeProperties(
Backend p,
ScalarType s) {
return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
p, s);
}
inline DeprecatedTypeProperties& CPU(ScalarType s) {
static inline DeprecatedTypeProperties& CPU(ScalarType s) {
return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
Backend::CPU, s);
}
inline DeprecatedTypeProperties& CUDA(ScalarType s) {
static inline DeprecatedTypeProperties& CUDA(ScalarType s) {
return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
Backend::CUDA, s);
}
inline DeprecatedTypeProperties& HIP(ScalarType s) {
static inline DeprecatedTypeProperties& HIP(ScalarType s) {
return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
Backend::HIP, s);
}
inline DeprecatedTypeProperties& MPS(ScalarType s) {
static inline DeprecatedTypeProperties& MPS(ScalarType s) {
return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
Backend::MPS, s);
}
inline bool hasCUDA() {
static inline bool hasCUDA() {
return globalContext().hasCUDA();
}
inline bool hasMTIA() {
static inline bool hasMTIA() {
return globalContext().hasMTIA();
}
inline bool hasHIP() {
static inline bool hasHIP() {
return globalContext().hasHIP();
}
inline bool hasIPU() {
static inline bool hasIPU() {
return globalContext().hasIPU();
}
inline bool hasXLA() {
static inline bool hasXLA() {
return globalContext().hasXLA();
}
inline bool hasMPS() {
static inline bool hasMPS() {
return globalContext().hasMPS();
}
inline bool hasMAIA() {
static inline bool hasMAIA() {
return globalContext().hasMAIA();
}
inline bool hasXPU() {
static inline bool hasXPU() {
return globalContext().hasXPU();
}
// Despite its name, this function returns the number of *CUDA* GPUs.
inline size_t getNumGPUs() {
static inline size_t getNumGPUs() {
// WARNING: DO NOT ADD LOGIC TO HANDLE OTHER DEVICE TYPES TO THIS
// FUNCTION. If you are interested in interrogating the number of
// devices for a specific device type, add that function to the
@ -517,27 +501,27 @@ inline size_t getNumGPUs() {
}
}
inline bool hasOpenMP() {
static inline bool hasOpenMP() {
return globalContext().hasOpenMP();
}
inline bool hasMKL() {
static inline bool hasMKL() {
return globalContext().hasMKL();
}
inline bool hasLAPACK() {
static inline bool hasLAPACK() {
return globalContext().hasLAPACK();
}
inline bool hasMAGMA() {
static inline bool hasMAGMA() {
return globalContext().hasMAGMA();
}
inline bool hasMKLDNN() {
static inline bool hasMKLDNN() {
return globalContext().hasMKLDNN();
}
inline void manual_seed(uint64_t seed) {
static inline void manual_seed(uint64_t seed) {
auto gen = globalContext().defaultGenerator(c10::DeviceType::CPU);
{
// See Note [Acquire lock when using random generators]

View File

@ -2,7 +2,7 @@
#include <ATen/DeviceAccelerator.h>
namespace at {
std::optional<DeviceType> getAccelerator(bool checked) {
C10_API std::optional<DeviceType> getAccelerator(bool checked) {
#define DETECT_AND_ASSIGN_ACCELERATOR(device_name) \
if (at::has##device_name()) { \
device_type = k##device_name; \
@ -25,8 +25,6 @@ std::optional<DeviceType> getAccelerator(bool checked) {
DETECT_AND_ASSIGN_ACCELERATOR(CUDA)
DETECT_AND_ASSIGN_ACCELERATOR(MTIA)
DETECT_AND_ASSIGN_ACCELERATOR(XPU)
DETECT_AND_ASSIGN_ACCELERATOR(HIP)
DETECT_AND_ASSIGN_ACCELERATOR(MPS)
if (checked) {
TORCH_CHECK(
device_type, "Cannot access accelerator device when none is available.")
@ -36,18 +34,4 @@ std::optional<DeviceType> getAccelerator(bool checked) {
#undef DETECT_AND_ASSIGN_ACCELERATOR
}
bool isAccelerator(c10::DeviceType d) {
switch (d) {
case at::kCUDA:
case at::kMTIA:
case at::kXPU:
case at::kHIP:
case at::kMPS:
case at::kPrivateUse1:
return true;
default:
return false;
}
}
} // namespace at
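
(Editor's note, not part of the diff: DETECT_AND_ASSIGN_ACCELERATOR above expands one "if this backend is available, pick it" branch per device and checks that only one accelerator is present. A condensed stand-alone version with fake availability probes follows; hasCUDA/hasXPU/hasMPS here are stubs, not the real runtime checks.)

#include <cassert>
#include <optional>

enum class DeviceType { CUDA, XPU, MPS };

bool hasCUDA() { return true;  }   // stubbed probes for the sketch
bool hasXPU()  { return false; }
bool hasMPS()  { return false; }

std::optional<DeviceType> getAcceleratorSketch() {
  std::optional<DeviceType> device;
#define DETECT(name)                                     \
  if (has##name()) {                                     \
    assert(!device && "only one accelerator may be up"); \
    device = DeviceType::name;                           \
  }
  DETECT(CUDA)
  DETECT(XPU)
  DETECT(MPS)
#undef DETECT
  return device;
}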

View File

@ -13,7 +13,9 @@
// - It provides a set of common APIs as defined by AcceleratorHooksInterface
//
// As of today, accelerator devices are (in no particular order):
// CUDA, MTIA, XPU, HIP, MPS, PrivateUse1
// CUDA, MTIA, XPU, PrivateUse1
// We want to add the following once all the proper APIs are supported and tested:
// HIP, MPS
namespace at {
@ -22,6 +24,4 @@ namespace at {
// When checked is true, the returned optional always has a value.
TORCH_API std::optional<c10::DeviceType> getAccelerator(bool checked = false);
TORCH_API bool isAccelerator(c10::DeviceType d);
} // namespace at

View File

@ -499,7 +499,7 @@ inline Tensor sum_to(
return _sum_to(std::move(tensor), shape, always_return_non_view);
}
inline bool is_expandable_to(
static inline bool is_expandable_to(
SymIntArrayRef shape,
c10::SymIntArrayRef desired) {
size_t ndim = shape.size();
@ -517,7 +517,7 @@ inline bool is_expandable_to(
return true;
}
inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) {
static inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) {
auto sym_shape = c10::SymIntArrayRef(
reinterpret_cast<const c10::SymInt*>(shape.data()), shape.size());
auto sym_desired = c10::SymIntArrayRef(

View File

@ -303,7 +303,7 @@ Tensor FunctionalInverses::_nested_view_from_buffer_inverse(const Tensor& base,
return Tensor();
}
Tensor FunctionalInverses::_nested_view_from_jagged_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, const Tensor& offsets, const Tensor& dummy, const std::optional<Tensor>& lengths, int64_t ragged_idx, const std::optional<Tensor>& min_seqlen, const std::optional<Tensor>& max_seqlen) {
Tensor FunctionalInverses::_nested_view_from_jagged_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, const Tensor& offsets, const Tensor& dummy, const std::optional<Tensor>& lengths, int64_t ragged_idx, const c10::optional<Tensor>& min_seqlen, const c10::optional<Tensor>& max_seqlen) {
auto values = at::_nested_get_values(mutated_view);
if (inverse_return_mode != InverseReturnMode::NeverView) {
return values;
@ -321,8 +321,8 @@ Tensor FunctionalInverses::_nested_get_values_inverse(const Tensor& base, const
auto max_seqlen = at::_nested_get_max_seqlen(base);
auto nt = at::_nested_view_from_jagged(
mutated_view, offsets, dummy, lengths, ragged_idx,
(min_seqlen.defined() ? std::optional<Tensor>(min_seqlen) : std::nullopt),
(max_seqlen.defined() ? std::optional<Tensor>(max_seqlen) : std::nullopt));
(min_seqlen.defined() ? c10::optional<Tensor>(min_seqlen) : std::nullopt),
(max_seqlen.defined() ? c10::optional<Tensor>(max_seqlen) : std::nullopt));
if (inverse_return_mode != InverseReturnMode::NeverView) {
return nt;

View File

@ -62,7 +62,7 @@ static bool is_allowed_dim_on_scalar_tensor(int64_t dim) {
return dim == 0 || dim == -1;
}
Tensor sum_batching_rule(const Tensor& self, OptionalIntArrayRef opt_dims, bool keepdim, std::optional<ScalarType> dtype) {
Tensor sum_batching_rule(const Tensor& self, OptionalIntArrayRef opt_dims, bool keepdim, optional<ScalarType> dtype) {
if (opt_dims.has_value()) {
auto dims = opt_dims.value();
// PyTorch has a special case where sum(scalar_tensor, dim=0) does not fail
@ -198,7 +198,7 @@ std::vector<Tensor> chunk_batching_rule(const Tensor& self, int64_t chunks, int6
return result;
}
Tensor clamp_batching_rule(const Tensor& self, const std::optional<Scalar>& min, const std::optional<Scalar>& max) {
Tensor clamp_batching_rule(const Tensor& self, const optional<Scalar>& min, const optional<Scalar>& max) {
auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
auto result = at::clamp(self_physical.tensor(), min, max);
return self_physical.getPhysicalToLogicalMap().apply(result);
@ -508,11 +508,11 @@ static void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t
// given (sizes, strides, storage_offset) returns the maximum location that
// can be indexed (or nullopt if such a location doesn't exist, e.g., tensors
// with zero-size dims).
static std::optional<int64_t> maximum_indexable_location(
static optional<int64_t> maximum_indexable_location(
IntArrayRef sizes, IntArrayRef strides, int64_t storage_offset) {
auto result = native::storage_size_for(sizes, strides);
if (result == 0) {
return std::nullopt;
return nullopt;
}
return result + storage_offset;
}
@ -526,7 +526,7 @@ static void checkBasicAsStridedValidForSlice(
int64_t num_batch_dims,
IntArrayRef sizes,
IntArrayRef strides,
std::optional<int64_t> maybe_storage_offset) {
optional<int64_t> maybe_storage_offset) {
auto slice_sizes = physical_tensor.sizes().slice(num_batch_dims);
auto slice_strides = physical_tensor.strides().slice(num_batch_dims);
auto base_offset = physical_tensor.storage_offset();
@ -614,7 +614,7 @@ Tensor as_strided_batching_rule(
const Tensor& tensor,
IntArrayRef sizes,
IntArrayRef strides,
std::optional<int64_t> storage_offset) {
optional<int64_t> storage_offset) {
auto physical_view = at::MultiBatchVmapTransform::logicalToPhysical(tensor);
auto num_batch_dims = physical_view.numBatchDims();
auto physical_sizes = physical_view.getPhysicalShape(sizes);
@ -763,7 +763,7 @@ Tensor pow_scalar_Tensor_batching_rule(const Scalar& other, const Tensor& self)
return makeBatched(output_physical, BatchDims(old_bdims.begin(), old_bdims.end()));
}
Tensor clone_batching_rule(const Tensor& self, std::optional<MemoryFormat> memory_format) {
Tensor clone_batching_rule(const Tensor& self, optional<MemoryFormat> memory_format) {
// Memory format support is a little tricky because vmap is allowed to move
// around batch dimensions and some memory formats are rank-dependent.
// Another weird case is:
@ -958,12 +958,12 @@ Tensor stack_batching_rule(TensorList tensors, int64_t dim) {
// unwrap_and_call<..., at::to> because at::to takes TensorOptions& (!!)
Tensor to_dtype_layout_batching_rule(
const Tensor& self,
std::optional<ScalarType> dtype,
std::optional<Layout> layout,
std::optional<Device> device,
std::optional<bool> pin_memory,
optional<ScalarType> dtype,
optional<Layout> layout,
optional<Device> device,
optional<bool> pin_memory,
bool non_blocking, bool copy,
std::optional<MemoryFormat> memory_format) {
optional<MemoryFormat> memory_format) {
auto options = TensorOptions()
.dtype(dtype)
.layout(layout)
@ -978,10 +978,10 @@ Tensor to_dtype_layout_batching_rule(
Tensor new_zeros_batching_rule(
const Tensor& self,
IntArrayRef size,
std::optional<ScalarType> dtype,
std::optional<Layout> layout,
std::optional<Device> device,
std::optional<bool> pin_memory) {
optional<ScalarType> dtype,
optional<Layout> layout,
optional<Device> device,
optional<bool> pin_memory) {
auto physical_view = MultiBatchVmapTransform::logicalToPhysical(self);
auto physical_size = physical_view.getPhysicalShape(size);
auto options = TensorOptions()
@ -1010,10 +1010,10 @@ Tensor new_empty_strided_batching_rule(
const Tensor& self,
IntArrayRef size,
IntArrayRef stride,
std::optional<ScalarType> dtype,
std::optional<Layout> layout,
std::optional<Device> device,
std::optional<bool> pin_memory) {
optional<ScalarType> dtype,
optional<Layout> layout,
optional<Device> device,
optional<bool> pin_memory) {
auto physical_view = MultiBatchVmapTransform::logicalToPhysical(self);
auto physical_size = physical_view.getPhysicalShape(size);
@ -1181,9 +1181,9 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) {
m.impl(name, unwrap_and_call_method< \
to_type, &Tensor::to, __VA_ARGS__>);\
}
TO_BATCHING_RULE("to.device", Device, ScalarType, bool, bool, std::optional<MemoryFormat>)
TO_BATCHING_RULE("to.dtype", ScalarType, bool, bool, std::optional<MemoryFormat>)
TO_BATCHING_RULE("to.other", const Tensor&, bool, bool, std::optional<MemoryFormat>)
TO_BATCHING_RULE("to.device", Device, ScalarType, bool, bool, optional<MemoryFormat>)
TO_BATCHING_RULE("to.dtype", ScalarType, bool, bool, optional<MemoryFormat>)
TO_BATCHING_RULE("to.other", const Tensor&, bool, bool, optional<MemoryFormat>)
m.impl("to.dtype_layout", to_dtype_layout_batching_rule);
#undef TO_BATCHING_RULE
m.impl("clone", clone_batching_rule);

View File

@ -33,15 +33,15 @@ namespace at {
_(==, x.eq(y), y.eq(x)) \
_(!=, x.ne(y), y.ne(x))
#define DEFINE_OPERATOR(op, body, reverse_scalar_body) \
inline Tensor operator op(const Tensor& x, const Tensor& y) { \
return body; \
} \
inline Tensor operator op(const Tensor& x, const Scalar& y) { \
return body; \
} \
inline Tensor operator op(const Scalar& x, const Tensor& y) { \
return reverse_scalar_body; \
#define DEFINE_OPERATOR(op, body, reverse_scalar_body) \
static inline Tensor operator op(const Tensor& x, const Tensor& y) { \
return body; \
} \
static inline Tensor operator op(const Tensor& x, const Scalar& y) { \
return body; \
} \
static inline Tensor operator op(const Scalar& x, const Tensor& y) { \
return reverse_scalar_body; \
}
AT_FORALL_BINARY_OPS(DEFINE_OPERATOR)
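
(Editor's note, not part of the diff: the hunk only changes "static inline" to "inline", but the surrounding code is a classic X-macro, one list of operators plus one macro that stamps out the overloads. A minimal illustration with made-up types follows.)

#include <iostream>

struct Num { double v; };

#define FORALL_BINARY_OPS(_) \
  _(+, x.v + y.v)            \
  _(*, x.v * y.v)

#define DEFINE_OP(op, body) \
  inline Num operator op(const Num& x, const Num& y) { return Num{body}; }

FORALL_BINARY_OPS(DEFINE_OP)
#undef DEFINE_OP

int main() {
  Num a{2}, b{3};
  std::cout << (a + b).v << " " << (a * b).v << "\n";  // prints: 5 6
  return 0;
}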

View File

@ -65,9 +65,7 @@ inline bool areAnyOptionalTensorSubclassLike(
if (c10::impl::dispatch_mode_enabled())
return true;
return std::any_of(
tensors.begin(),
tensors.end(),
[](const std::optional<Tensor>& opt_tensor) {
tensors.begin(), tensors.end(), [](const optional<Tensor>& opt_tensor) {
return (
opt_tensor.has_value() && isTensorSubclassLike(opt_tensor.value()));
});

View File

@ -113,12 +113,12 @@
namespace at::tracer::impl {
inline bool is_dispatch_enabled() {
static inline bool is_dispatch_enabled() {
return c10::impl::tls_is_dispatch_key_included(at::DispatchKey::Tracer) &&
!c10::impl::tls_is_dispatch_key_excluded(at::DispatchKey::Tracer);
}
inline void set_dispatch_enabled(bool enabled) {
static inline void set_dispatch_enabled(bool enabled) {
TORCH_INTERNAL_ASSERT(
!c10::impl::tls_is_dispatch_key_excluded(at::DispatchKey::Tracer),
"Cannot enable tracing within the scope of NoTracerDispatchMode!");

View File

@ -29,7 +29,7 @@ TORCH_API int _crash_if_asan(int);
// Converts a TensorList (i.e. ArrayRef<Tensor>) to a vector of TensorImpl*
// NB: This is ONLY used by legacy TH bindings, and ONLY used by cat.
// Once cat is ported entirely to ATen this can be deleted!
inline std::vector<TensorImpl*> checked_dense_tensor_list_unwrap(
static inline std::vector<TensorImpl*> checked_dense_tensor_list_unwrap(
ArrayRef<Tensor> tensors,
const char* name,
int pos,

View File

@ -42,70 +42,70 @@ TORCH_LIBRARY_IMPL(aten, VmapMode, m) {
#define TENSOROPTIONS std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>
// random operations (out-of-place)
m.impl("bernoulli", unsupportedRandomOp<const Tensor&, std::optional<Generator>>);
m.impl("bernoulli.out", unsupportedRandomOp_<const Tensor&, std::optional<Generator>, Tensor&>);
m.impl("bernoulli.p", unsupportedRandomOp<const Tensor&, double, std::optional<Generator>>);
m.impl("bernoulli_.Tensor", unsupportedRandomOp_<Tensor&, const Tensor&, std::optional<Generator>>);
m.impl("bernoulli_.float", unsupportedRandomOp_<Tensor&, double, std::optional<Generator>>);
m.impl("bernoulli", unsupportedRandomOp<const Tensor&, optional<Generator>>);
m.impl("bernoulli.out", unsupportedRandomOp_<const Tensor&, optional<Generator>, Tensor&>);
m.impl("bernoulli.p", unsupportedRandomOp<const Tensor&, double, optional<Generator>>);
m.impl("bernoulli_.Tensor", unsupportedRandomOp_<Tensor&, const Tensor&, optional<Generator>>);
m.impl("bernoulli_.float", unsupportedRandomOp_<Tensor&, double, optional<Generator>>);
m.impl("cauchy_", unsupportedRandomOp_<Tensor&, double, double, std::optional<Generator>>);
m.impl("exponential_", unsupportedRandomOp_<Tensor&, double, std::optional<Generator>>);
m.impl("geometric_", unsupportedRandomOp_<Tensor&, double, std::optional<Generator>>);
m.impl("log_normal_", unsupportedRandomOp_<Tensor&, double, double, std::optional<Generator>>);
m.impl("multinomial", unsupportedRandomOp<const Tensor&, int64_t, bool, std::optional<Generator>>);
m.impl("multinomial.out", unsupportedRandomOp_<const Tensor&, int64_t, bool, std::optional<Generator>, Tensor&>);
m.impl("cauchy_", unsupportedRandomOp_<Tensor&, double, double, optional<Generator>>);
m.impl("exponential_", unsupportedRandomOp_<Tensor&, double, optional<Generator>>);
m.impl("geometric_", unsupportedRandomOp_<Tensor&, double, optional<Generator>>);
m.impl("log_normal_", unsupportedRandomOp_<Tensor&, double, double, optional<Generator>>);
m.impl("multinomial", unsupportedRandomOp<const Tensor&, int64_t, bool, optional<Generator>>);
m.impl("multinomial.out", unsupportedRandomOp_<const Tensor&, int64_t, bool, optional<Generator>, Tensor&>);
m.impl("normal.Tensor_float", unsupportedRandomOp<const Tensor&, double, std::optional<Generator>>);
m.impl("normal.Tensor_float_out", unsupportedRandomOp_<const Tensor&, double, std::optional<Generator>, Tensor&>);
m.impl("normal.float_Tensor_out", unsupportedRandomOp_<double, const Tensor&, std::optional<Generator>, Tensor&>);
m.impl("normal.float_Tensor", unsupportedRandomOp<double, const Tensor&, std::optional<Generator>>);
m.impl("normal.Tensor_Tensor", unsupportedRandomOp<const Tensor&, const Tensor&, std::optional<Generator>>);
m.impl("normal.Tensor_Tensor_out", unsupportedRandomOp_<const Tensor&, const Tensor&, std::optional<Generator>, Tensor&>);
m.impl("normal.float_float", unsupportedRandomOp<double, double, IntArrayRef, std::optional<Generator>, TENSOROPTIONS>);
m.impl("normal.float_float_out", unsupportedRandomOp_<double, double, IntArrayRef, std::optional<Generator>, Tensor&>);
m.impl("normal_", unsupportedRandomOp_<Tensor&, double, double, std::optional<Generator>>);
m.impl("normal.Tensor_float", unsupportedRandomOp<const Tensor&, double, optional<Generator>>);
m.impl("normal.Tensor_float_out", unsupportedRandomOp_<const Tensor&, double, optional<Generator>, Tensor&>);
m.impl("normal.float_Tensor_out", unsupportedRandomOp_<double, const Tensor&, optional<Generator>, Tensor&>);
m.impl("normal.float_Tensor", unsupportedRandomOp<double, const Tensor&, optional<Generator>>);
m.impl("normal.Tensor_Tensor", unsupportedRandomOp<const Tensor&, const Tensor&, optional<Generator>>);
m.impl("normal.Tensor_Tensor_out", unsupportedRandomOp_<const Tensor&, const Tensor&, optional<Generator>, Tensor&>);
m.impl("normal.float_float", unsupportedRandomOp<double, double, IntArrayRef, optional<Generator>, TENSOROPTIONS>);
m.impl("normal.float_float_out", unsupportedRandomOp_<double, double, IntArrayRef, optional<Generator>, Tensor&>);
m.impl("normal_", unsupportedRandomOp_<Tensor&, double, double, optional<Generator>>);
m.impl("poisson", unsupportedRandomOp<const Tensor&, std::optional<Generator>>);
m.impl("poisson", unsupportedRandomOp<const Tensor&, optional<Generator>>);
m.impl("random_.from", unsupportedRandomOp_<Tensor&, int64_t, std::optional<int64_t>, std::optional<Generator>>);
m.impl("random_.to", unsupportedRandomOp_<Tensor&, int64_t, std::optional<Generator>>);
m.impl("random_", unsupportedRandomOp_<Tensor&, std::optional<Generator>>);
m.impl("random_.from", unsupportedRandomOp_<Tensor&, int64_t, optional<int64_t>, optional<Generator>>);
m.impl("random_.to", unsupportedRandomOp_<Tensor&, int64_t, optional<Generator>>);
m.impl("random_", unsupportedRandomOp_<Tensor&, optional<Generator>>);
m.impl("rand_like", unsupportedRandomOp<const Tensor&, TENSOROPTIONS, std::optional<MemoryFormat>>);
m.impl("randn_like", unsupportedRandomOp<const Tensor&, TENSOROPTIONS, std::optional<MemoryFormat>>);
m.impl("rand_like", unsupportedRandomOp<const Tensor&, TENSOROPTIONS, optional<MemoryFormat>>);
m.impl("randn_like", unsupportedRandomOp<const Tensor&, TENSOROPTIONS, optional<MemoryFormat>>);
m.impl("randint_like", unsupportedRandomOp<const Tensor&, int64_t, TENSOROPTIONS, std::optional<MemoryFormat>>);
m.impl("randint_like.low_dtype", unsupportedRandomOp<const Tensor&, int64_t, int64_t, TENSOROPTIONS, std::optional<MemoryFormat>>);
m.impl("randint_like", unsupportedRandomOp<const Tensor&, int64_t, TENSOROPTIONS, optional<MemoryFormat>>);
m.impl("randint_like.low_dtype", unsupportedRandomOp<const Tensor&, int64_t, int64_t, TENSOROPTIONS, optional<MemoryFormat>>);
m.impl("rand", unsupportedRandomOp<IntArrayRef, TENSOROPTIONS>);
m.impl("rand.generator", unsupportedRandomOp<IntArrayRef, std::optional<Generator>, TENSOROPTIONS>);
m.impl("rand.names", unsupportedRandomOp<IntArrayRef, std::optional<DimnameList>, TENSOROPTIONS>);
m.impl("rand.generator_with_names", unsupportedRandomOp<IntArrayRef, std::optional<Generator>, std::optional<DimnameList>, TENSOROPTIONS>);
m.impl("rand.generator", unsupportedRandomOp<IntArrayRef, optional<Generator>, TENSOROPTIONS>);
m.impl("rand.names", unsupportedRandomOp<IntArrayRef, optional<DimnameList>, TENSOROPTIONS>);
m.impl("rand.generator_with_names", unsupportedRandomOp<IntArrayRef, optional<Generator>, optional<DimnameList>, TENSOROPTIONS>);
m.impl("rand.out", unsupportedRandomOp_<IntArrayRef, Tensor&>);
m.impl("rand.generator_out", unsupportedRandomOp_<IntArrayRef, std::optional<Generator>, Tensor&>);
m.impl("rand.generator_out", unsupportedRandomOp_<IntArrayRef, optional<Generator>, Tensor&>);
m.impl("randn", unsupportedRandomOp<IntArrayRef, TENSOROPTIONS>);
m.impl("randn.generator", unsupportedRandomOp<IntArrayRef, std::optional<Generator>, TENSOROPTIONS>);
m.impl("randn.names", unsupportedRandomOp<IntArrayRef, std::optional<DimnameList>, TENSOROPTIONS>);
m.impl("randn.generator_with_names", unsupportedRandomOp<IntArrayRef, std::optional<Generator>, std::optional<DimnameList>, TENSOROPTIONS>);
m.impl("randn.generator", unsupportedRandomOp<IntArrayRef, optional<Generator>, TENSOROPTIONS>);
m.impl("randn.names", unsupportedRandomOp<IntArrayRef, optional<DimnameList>, TENSOROPTIONS>);
m.impl("randn.generator_with_names", unsupportedRandomOp<IntArrayRef, optional<Generator>, optional<DimnameList>, TENSOROPTIONS>);
m.impl("randn.out", unsupportedRandomOp_<IntArrayRef, Tensor&>);
m.impl("randn.generator_out", unsupportedRandomOp_<IntArrayRef, std::optional<Generator>, Tensor&>);
m.impl("randn.generator_out", unsupportedRandomOp_<IntArrayRef, optional<Generator>, Tensor&>);
m.impl("randperm", unsupportedRandomOp<int64_t, TENSOROPTIONS>);
m.impl("randperm.generator", unsupportedRandomOp<int64_t, std::optional<Generator>, TENSOROPTIONS>);
m.impl("randperm.generator", unsupportedRandomOp<int64_t, optional<Generator>, TENSOROPTIONS>);
m.impl("randperm.out", unsupportedRandomOp_<int64_t, Tensor&>);
m.impl("randperm.generator_out", unsupportedRandomOp_<int64_t, std::optional<Generator>, Tensor&>);
m.impl("randperm.generator_out", unsupportedRandomOp_<int64_t, optional<Generator>, Tensor&>);
m.impl("randint", unsupportedRandomOp<int64_t, IntArrayRef, TENSOROPTIONS>);
m.impl("randint.generator", unsupportedRandomOp<int64_t, IntArrayRef, std::optional<Generator>, TENSOROPTIONS>);
m.impl("randint.generator", unsupportedRandomOp<int64_t, IntArrayRef, optional<Generator>, TENSOROPTIONS>);
m.impl("randint.low", unsupportedRandomOp<int64_t, int64_t, IntArrayRef, TENSOROPTIONS>);
m.impl("randint.low_generator", unsupportedRandomOp<int64_t, int64_t, IntArrayRef, std::optional<Generator>, TENSOROPTIONS>);
m.impl("randint.low_generator", unsupportedRandomOp<int64_t, int64_t, IntArrayRef, optional<Generator>, TENSOROPTIONS>);
m.impl("randint.out", unsupportedRandomOp_<int64_t, IntArrayRef, Tensor&>);
m.impl("randint.generator_out", unsupportedRandomOp_<int64_t, IntArrayRef, std::optional<Generator>, Tensor&>);
m.impl("randint.generator_out", unsupportedRandomOp_<int64_t, IntArrayRef, optional<Generator>, Tensor&>);
m.impl("randint.low_out", unsupportedRandomOp_<int64_t, int64_t, IntArrayRef, Tensor&>);
m.impl("randint.low_generator_out", unsupportedRandomOp_<int64_t, int64_t, IntArrayRef, std::optional<Generator>, Tensor&>);
m.impl("randint.low_generator_out", unsupportedRandomOp_<int64_t, int64_t, IntArrayRef, optional<Generator>, Tensor&>);
m.impl("uniform_", unsupportedRandomOp_<Tensor&, double, double, std::optional<Generator>>);
m.impl("uniform_", unsupportedRandomOp_<Tensor&, double, double, optional<Generator>>);
#undef TENSOROPTIONS
}
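
(Editor's note, not part of the diff: each registration above points at a templated "unsupported" kernel whose only job is to raise a clear error when a random op is reached under vmap. The general shape of such a kernel is sketched below with a plain exception standing in for TORCH_CHECK; the name and message are illustrative.)

#include <stdexcept>

template <typename... Args>
void unsupportedRandomOpSketch(Args&&...) {
  throw std::runtime_error(
      "vmap: random operations are not supported inside vmap");
}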

View File

@ -13,7 +13,7 @@ namespace at {
constexpr size_t dim_bitset_size = 64;
inline std::bitset<dim_bitset_size> dim_list_to_bitset(
static inline std::bitset<dim_bitset_size> dim_list_to_bitset(
OptionalIntArrayRef opt_dims,
size_t ndims) {
TORCH_CHECK(

View File

@ -296,7 +296,7 @@ TORCH_API Tensor cached_cast(
const Tensor& arg,
c10::DeviceType device_type = c10::DeviceType::CUDA);
// Overload to process std::optional<Tensor>
// Overload to process optional<Tensor>
inline std::optional<Tensor> cached_cast(
at::ScalarType to_type,
const std::optional<Tensor>& arg,

View File

@ -364,7 +364,7 @@ public:
bool is(const Dict& rhs) const;
// private API for now because the return type will change to TypePtr
// instead of std::optional<TypePtr> once types are mandatory.
// instead of optional<TypePtr> once types are mandatory.
TypePtr keyType() const;
TypePtr valueType() const;

View File

@ -18,7 +18,7 @@ TORCH_API std::ostream& print(
std::ostream& stream,
const Tensor& tensor,
int64_t linesize);
inline std::ostream& operator<<(std::ostream & out, const Tensor & t) {
static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) {
return print(out,t,80);
}
TORCH_API void print(const Tensor & t, int64_t linesize=80);

View File

@ -159,7 +159,7 @@ class IListRefTagImpl<IListRefTag::Unboxed, at::OptionalTensorRef>
template <>
class IListRefTagImpl<IListRefTag::Boxed, at::OptionalTensorRef>
: public IListRefTagImplBase<IListRefTag::Boxed, at::OptionalTensorRef, std::optional<at::Tensor>> {
: public IListRefTagImplBase<IListRefTag::Boxed, at::OptionalTensorRef, optional<at::Tensor>> {
public:
/*

View File

@ -18,11 +18,11 @@ static std::vector<at::Tensor> get_tensor_vector() {
return tensors;
}
static std::vector<std::optional<at::Tensor>> get_boxed_opt_tensor_vector() {
std::vector<std::optional<at::Tensor>> optional_tensors;
static std::vector<optional<at::Tensor>> get_boxed_opt_tensor_vector() {
std::vector<optional<at::Tensor>> optional_tensors;
const size_t SIZE = 5;
for (size_t i = 0; i < SIZE * 2; i++) {
auto opt_tensor = (i % 2 == 0) ? std::optional<at::Tensor>(at::empty({0})) : nullopt;
auto opt_tensor = (i % 2 == 0) ? optional<at::Tensor>(at::empty({0})) : nullopt;
optional_tensors.emplace_back(opt_tensor);
}
return optional_tensors;
@ -234,7 +234,7 @@ TEST(ITensorListRefIteratorTest, Unboxed_Iterate) {
TEST(IOptTensorListRefTest, Boxed_Iterate) {
auto vec = get_boxed_opt_tensor_vector();
const List<std::optional<at::Tensor>> boxed(vec);
const List<optional<at::Tensor>> boxed(vec);
at::IOptTensorListRef list(boxed);
size_t i = 0;
for (const auto t : list) {

View File

@ -16,7 +16,7 @@ void NamesMode::set_enabled(bool enabled) {
c10::impl::tls_set_dispatch_key_excluded(DispatchKey::Named, !enabled);
}
const TensorBase& internal_set_names_inplace(const TensorBase& tensor, std::optional<DimnameList> names) {
const TensorBase& internal_set_names_inplace(const TensorBase& tensor, optional<DimnameList> names) {
impl::internal_set_names_inplace(tensor.unsafeGetTensorImpl(), names, /*validate_names=*/true);
return tensor;
}
@ -84,7 +84,7 @@ void check_names_valid_for(TensorImpl* impl, DimnameList names) {
check_names_valid_for(impl->dim(), names);
}
void internal_set_names_inplace(TensorImpl* impl, std::optional<DimnameList> names, bool validate_names) {
void internal_set_names_inplace(TensorImpl* impl, optional<DimnameList> names, bool validate_names) {
TORCH_CHECK(impl->layout() == Layout::Strided,
"NYI: named tensors only support strided layout");
TORCH_CHECK(impl->device().is_cpu() || impl->device().is_cuda() || impl->device().is_xpu() || impl->device().is_privateuseone(),
@ -130,7 +130,7 @@ void internal_set_names_inplace(TensorImpl* impl, std::vector<Dimname>&& names,
optional<DimnameList> get_opt_names(const TensorImpl* impl) {
const auto* meta = get_named_tensor_meta(impl);
if (meta == nullptr) {
return std::nullopt;
return nullopt;
} else {
return meta->names();
}

View File

@ -16,7 +16,7 @@ class TensorBase;
// actually exists outside of c10 and needs to be moved in.
// TensorImpl has a unique_ptr<NamedTensorMetaInterface> field.
// XXX: Ideally we would just put std::optional<vector<Dimname>> into TensorImpl.
// XXX: Ideally we would just put optional<vector<Dimname>> into TensorImpl.
//
// This class has an important invariant: there must be at least ONE
// non-wildcard

View File

@ -93,7 +93,7 @@ torch::jit::Stack boxArgs(Args... args) {
}
template <class T>
inline constexpr size_t boxed_size_one() {
static inline constexpr size_t boxed_size_one() {
static_assert(!std::is_same<std::decay_t<T>, c10::TensorOptions>::value, "need to patch this path to support TensorOptions passed by reference");
return 1;
}

View File

@ -393,9 +393,9 @@ namespace impl {
};
template<class T, bool AllowDeprecatedTypes>
struct ivalue_to_arg<optional<ArrayRef<T>>, AllowDeprecatedTypes> final {
// If an argument is std::optional<ArrayRef<T>>, convert the IValue to an std::optional<std::vector<T>> and pass that
// to the operator. OptionalArray<T> is basically a std::optional<std::vector<T>> but implicitly convertible
// to std::optional<ArrayRef<T>>.
// If an argument is optional<ArrayRef<T>>, convert the IValue to an optional<std::vector<T>> and pass that
// to the operator. OptionalArray<T> is basically a optional<std::vector<T>> but implicitly convertible
// to optional<ArrayRef<T>>.
static OptionalArray<T> call(IValue& v) {
return ivalue_to_arg<OptionalArray<T>, AllowDeprecatedTypes>::call(v);
}
@ -404,8 +404,8 @@ namespace impl {
template<class T, bool AllowDeprecatedTypes>
struct ivalue_to_arg<OptionalArrayRef<T>, AllowDeprecatedTypes> final {
// If an argument is OptionalArrayRef<T>, convert the IValue to an
// std::optional<std::vector<T>> and pass that to the operator. OptionalArray<T>
// is basically a std::optional<std::vector<T>> but implicitly convertible to
// optional<std::vector<T>> and pass that to the operator. OptionalArray<T>
// is basically a optional<std::vector<T>> but implicitly convertible to
// OptionalArrayRef<T>
static OptionalArray<T> call(IValue& v) {
return ivalue_to_arg<OptionalArray<T>, AllowDeprecatedTypes>::call(v);

View File

@ -325,7 +325,7 @@ struct TORCH_API FunctionSchema {
std::optional<AliasAnalysisKind> alias_kind_;
template <typename T>
void checkArg(const IValue& value, const Argument& argument, std::optional<size_t> pos) const;
void checkArg(const IValue& value, const Argument& argument, optional<size_t> pos) const;
void checkSchema() const {
bool seen_default_arg = false;

View File

@ -328,7 +328,7 @@ template<typename T>
inline void FunctionSchema::checkArg(
const IValue& value,
const Argument& argument,
std::optional<size_t> pos) const {
optional<size_t> pos) const {
if (value.isTensor() && argument.type() == TensorType::get()) {
// Fast-path for the common case
return;

View File

@ -87,7 +87,7 @@ struct StreamData3Holder : c10::intrusive_ptr_target {
} // namespace ivalue
// This is an owning wrapper for a std::optional<std::vector<T>>
// that can be implicitly converted to a (non-owning) std::optional<ArrayRef<T>>.
// that can be implicitly converted to a (non-owning) optional<ArrayRef<T>>.
// Its purpose is to be used in generated code to keep the vector alive
// either until the end of a statement (as a temporary), or as a saved arg
// in autograd.
@ -120,14 +120,14 @@ struct OptionalArray {
operator std::optional<c10::ArrayRef<T>>() {
if (!list) {
return std::nullopt;
return nullopt;
}
return *list;
}
operator c10::OptionalArrayRef<T>() {
if (!list) {
return std::nullopt;
return nullopt;
}
return *list;
}
@ -1021,9 +1021,9 @@ struct TORCH_API IValue final {
// ToOptional: convert a IValue to the Optional obj that accepts both T and
// None
template <typename T>
std::optional<T> toOptional();
optional<T> toOptional();
template <typename T>
std::optional<T> toOptional() const;
optional<T> toOptional() const;
/// @private [doxygen private]
/// this is a shallow comparison of two IValues to test the object identity

View File

@ -1375,7 +1375,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
// The device that was current when markCompleted was called, which we'll
// restore when invoking callbacks. It's optional because we'll only store it
// if the future completes successfully.
std::optional<c10::Device> currentDevice_;
optional<c10::Device> currentDevice_;
// The events that correspond to the completion of the async I/O kernels. They
// are recorded on the appropriate streams when the future is marked completed
@ -1748,7 +1748,7 @@ template <class T>
struct _fake_type {};
// generic_to<T> converts an IValue from a generic list or generic dict
// to a concrete list/dict type like List<T>, Dict<...> or std::optional<T>.
// to a concrete list/dict type like List<T>, Dict<...> or optional<T>.
// Note that in the case of lists, this only works for IValue-based lists,
// i.e. not for int64_t, double, ...
// generic_to<T> is an implementation detail of IValue::to<T> and not
@ -1949,7 +1949,7 @@ inline T IValue::to() && {
template <>
inline std::optional<c10::string_view> IValue::to() && {
// In the default implementation, the IValue is destroyed with std::move.
// But if the unboxed type is std::optional<string_view> we cannot destroy
// But if the unboxed type is optional<string_view> we cannot destroy
// the IValue.
return generic_to(*this, _fake_type<std::optional<c10::string_view>>{});
}
@ -2366,7 +2366,7 @@ inline std::optional<std::reference_wrapper<const std::string>> IValue::
if (isNone()) {
return std::nullopt;
}
AT_ASSERT(isString(), "Expected std::optional<string> but got ", tagKind());
AT_ASSERT(isString(), "Expected optional<string> but got ", tagKind());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
"called toOptionalStringRef on null intrusive_ptr IValue");
@ -2390,17 +2390,17 @@ inline PyObject* IValue::toPyObject() const {
}
template <typename T>
inline std::optional<T> IValue::toOptional() {
inline optional<T> IValue::toOptional() {
if (this->isNone()) {
return std::nullopt;
return nullopt;
}
return this->to<T>();
}
template <typename T>
inline std::optional<T> IValue::toOptional() const {
inline optional<T> IValue::toOptional() const {
if (this->isNone()) {
return std::nullopt;
return nullopt;
}
return this->to<T>();
}

View File

@ -2043,7 +2043,7 @@ template <class T, bool fake>
struct getMaybeFakeTypePtr_<std::optional<T>, fake> final {
static const auto& call() {
static auto inner_type = getMaybeFakeTypePtr_<T, fake>::call();
// The "per std::optional<T>" static singleton needs to live in a .cpp file,
// The "per optional<T>" static singleton needs to live in a .cpp file,
// otherwise we'll end up with one singleton instance per shared library.
static auto type = OptionalType::get(inner_type);
return type;
@ -2055,7 +2055,7 @@ template<>
struct getTypePtr_<at::OptionalIntArrayRef> final {
static const auto& call() {
static auto inner_type = getMaybeFakeTypePtr_<IntArrayRef, false>::call();
// The "per std::optional<T>" static singleton needs to live in a .cpp file,
// The "per optional<T>" static singleton needs to live in a .cpp file,
// otherwise we'll end up with one singleton instance per shared library.
static auto type = OptionalType::get(inner_type);
return type;
@ -2065,7 +2065,7 @@ struct getTypePtr_<at::OptionalIntArrayRef> final {
template <bool fake>
struct getMaybeFakeTypePtr_<at::OptionalSymIntArrayRef, fake> final {
static const auto& call() {
// The "per std::optional<T>" static singleton needs to live in a .cpp file,
// The "per optional<T>" static singleton needs to live in a .cpp file,
// otherwise we'll end up with one singleton instance per shared library.
static auto inner_type = getMaybeFakeTypePtr_<SymIntArrayRef, fake>::call();
static auto type = OptionalType::get(inner_type);

View File

@ -455,7 +455,7 @@ struct TORCH_API Type {
// this method.
std::string annotation_str(const TypePrinter& printer) const {
if (printer) {
// the printer can return std::nullopt to fall through to the default impl
// the printer can return nullopt to fall through to the default impl
if (auto renamed = printer(*this)) {
return *renamed;
}

View File

@ -9,11 +9,11 @@
* [Note: hacky wrapper removal for optional tensor]
*
* The kernel implementation takes an optional tensor marked in the schema as
* Tensor? but the C++ function takes Tensor instead of the std::optional<Tensor>
* Tensor? but the C++ function takes Tensor instead of the optional<Tensor>
* expected by the dispatcher.
*
* To remove the hacky wrapper, the C++ function is changed to take
* std::optional<Tensor> and unwrap the Tensor value at the beginning of
* optional<Tensor> and unwrap the Tensor value at the beginning of
* the function, e.g.:
* > c10::MaybeOwned<Tensor> weight_maybe_owned =
* > at::borrow_from_optional_tensor(weight_opt);
@ -62,7 +62,7 @@ inline void check_and_update_common_device(optional<Device>& common_device, cons
}
}
inline void check_and_update_common_device(optional<Device>& common_device, const std::optional<at::Tensor>& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) {
inline void check_and_update_common_device(optional<Device>& common_device, const optional<at::Tensor>& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) {
if (tensor.has_value()) {
check_and_update_common_device(common_device, tensor.value(), methodName, argName);
}
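
(Editor's note, not part of the diff: the note above describes unwrapping an optional tensor argument once, at the top of the kernel, so the body can work with a plain reference. The borrowing idea in miniature, with a made-up Buffer type instead of Tensor and no claim about at::borrow_from_optional_tensor's actual signature:)

#include <optional>
#include <vector>

using Buffer = std::vector<float>;

const Buffer& unwrap_or_empty(const std::optional<Buffer>& maybe) {
  static const Buffer empty;                  // shared fallback, never modified
  return maybe.has_value() ? *maybe : empty;  // no copy in either branch
}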

View File

@ -434,7 +434,7 @@ public:
std::optional<std::variant<OperatorName, FunctionSchema>> schemaOrName_;
std::vector<KernelRegistrationConfig> kernels;
std::optional<AliasAnalysisKind> aliasAnalysisKind_;
optional<AliasAnalysisKind> aliasAnalysisKind_;
friend class RegisterOperators;
friend class Library;
};

View File

@ -133,6 +133,32 @@ struct VecConvert<int32_t, 1, uint8_t, 1> {
}
};
template <>
struct VecConvert<int32_t, 1, float, 1> {
static inline VectorizedN<int32_t, 1> apply(
const VectorizedN<float, 1>& src) {
return Vectorized<int32_t>(_mm256_cvttps_epi32(src[0]));
}
};
template <>
struct VecConvert<float, 1, int32_t, 1> {
static inline VectorizedN<float, 1> apply(
const VectorizedN<int32_t, 1>& src) {
return Vectorized<float>(_mm256_cvtepi32_ps(src[0]));
}
};
template <>
struct VecConvert<int16_t, 1, uint8_t, 1> {
static inline VectorizedN<int16_t, 1> apply(
const VectorizedN<uint8_t, 1>& src) {
auto src128 = _mm256_castsi256_si128(src[0]);
return Vectorized<int16_t>(_mm256_cvtepu8_epi16(src128));
}
};
template <typename dst_t, typename src_t>
struct VecConvert<
dst_t,
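
(Editor's note, not part of the diff: the new VecConvert specializations above wrap single-instruction lane-wise conversions. Outside the Vectorized<> wrapper the same AVX2 intrinsics look like this; compile with -mavx2.)

#include <immintrin.h>
#include <cstdio>

int main() {
  float in[8] = {0.5f, 1.9f, -2.7f, 3.0f, 4.4f, -5.5f, 6.6f, 7.7f};
  __m256  f    = _mm256_loadu_ps(in);
  __m256i i    = _mm256_cvttps_epi32(f);   // float -> int32, truncating toward zero
  __m256  back = _mm256_cvtepi32_ps(i);    // int32 -> float

  int   out_i[8];
  float out_f[8];
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out_i), i);
  _mm256_storeu_ps(out_f, back);
  std::printf("%d %d %d %.1f\n", out_i[0], out_i[1], out_i[2], out_f[2]);  // prints: 0 1 -2 -2.0
  return 0;
}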

View File

@ -246,6 +246,12 @@ public:
return _mm256_floor_pd(values);
}
Vectorized<double> frac() const;
double reduce_add() const {
return values[0];
}
double reduce_max() const {
return values[0];
}
Vectorized<double> neg() const {
return _mm256_xor_pd(_mm256_set1_pd(-0.), values);
}

View File

@ -342,6 +342,12 @@ public:
}
return loadu(tmp);
}
float reduce_add() const {
return values[0];
}
float reduce_max() const {
return values[0];
}
Vectorized<float> neg() const {
return _mm256_xor_ps(_mm256_set1_ps(-0.f), values);
}

View File

@ -241,6 +241,12 @@ public:
Vectorized<int32_t> abs() const {
return _mm256_abs_epi32(values);
}
int32_t reduce_add() const {
return values[0];
}
int32_t reduce_max() const {
return values[0];
}
Vectorized<int32_t> real() const {
return *this;
}

View File

@ -11,6 +11,7 @@
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#endif
#include <iostream>
namespace at {
namespace vec {
@ -43,6 +44,9 @@ static inline void cvtbf16_fp32(const __m512i& a, __m512& o1, __m512& o2) {
}
static inline __m256i cvtfp32_bf16(const __m512& src) {
// #if defined(CPU_CAPABILITY_AVX512_BF16)
// return reinterpret_cast<__m256i>(_mm512_cvtneps_pbh(src));
// #else
__m512i value = _mm512_castps_si512(src);
__m512i nan = _mm512_set1_epi32(0xffff);
auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q);
@ -59,6 +63,7 @@ static inline __m256i cvtfp32_bf16(const __m512& src) {
// Check NaN before converting back to bf16
t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value);
return _mm512_cvtusepi32_epi16(t_value);
// #endif
}
static inline __m512i cvtfp32_bf16(const __m512& a, const __m512& b) {
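
(Editor's note, not part of the diff: the vector code above rounds fp32 to bf16 with round-to-nearest-even by adding 0x7FFF plus the lowest kept bit and truncating to the top 16 bits, forcing NaN lanes to a NaN encoding. A scalar restatement of that logic, one value at a time:)

#include <cmath>
#include <cstdint>
#include <cstring>

uint16_t fp32_to_bf16_rne(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  if (std::isnan(f)) {
    return 0xFFFF;                    // a quiet-NaN encoding; the vector path blends in 0xFFFF
  }
  uint32_t lsb = (bits >> 16) & 1;    // lowest bit that survives truncation
  bits += 0x7FFF + lsb;               // round to nearest, ties to even
  return static_cast<uint16_t>(bits >> 16);
}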

View File

@ -117,6 +117,49 @@ struct VecConvert<int32_t, 1, uint8_t, 1> {
}
};
template <>
struct VecConvert<int32_t, 1, float, 1> {
static inline VectorizedN<int32_t, 1> apply(
const VectorizedN<float, 1>& src) {
return Vectorized<int32_t>(_mm512_cvttps_epi32(src[0]));
}
};
template <>
struct VecConvert<float, 1, int32_t, 1> {
static inline VectorizedN<float, 1> apply(
const VectorizedN<int32_t, 1>& src) {
return Vectorized<float>(_mm512_cvtepi32_ps(src[0]));
}
};
template <>
struct VecConvert<int16_t, 1, uint8_t, 1> {
static inline VectorizedN<int16_t, 1> apply(
const VectorizedN<uint8_t, 1>& src) {
auto src256 = _mm512_castsi512_si256(src[0]);
return Vectorized<int16_t>(_mm512_cvtepu8_epi16(src256));
}
};
template <>
struct VecConvert<int8_t, 1, int32_t, 1> {
static inline VectorizedN<int8_t, 1> apply(
const VectorizedN<int32_t, 1>& src) {
auto src128 = _mm512_cvtepi32_epi8(src[0]);
return Vectorized<int8_t>(_mm512_castsi128_si512(src128));
}
};
template <>
struct VecConvert<int8_t, 1, int16_t, 1> {
static inline VectorizedN<int8_t, 1> apply(
const VectorizedN<int16_t, 1>& src) {
auto src256 = _mm512_cvtepi16_epi8(src[0]);
return Vectorized<int8_t>(_mm512_castsi256_si512(src256));
}
};
template <typename dst_t, typename src_t>
struct VecConvert<
dst_t,

View File

@ -255,6 +255,12 @@ public:
return _mm512_floor_pd(values);
}
Vectorized<double> frac() const;
double reduce_add() const {
return values[0];
}
double reduce_max() const {
return values[0];
}
Vectorized<double> neg() const {
return _mm512_xor_pd(_mm512_set1_pd(-0.), values);
}

View File

@ -236,27 +236,27 @@ public:
}
Vectorized<float> exp_u20() const {
// A faster version of exp with ULP=20
static __m512 vec_factorial_1 =
const __m512 vec_factorial_1 =
_mm512_set1_ps(0.999999701f); // 1/factorial(1)
static __m512 vec_factorial_2 =
const __m512 vec_factorial_2 =
_mm512_set1_ps(0.499991506f); // 1/factorial(2)
static __m512 vec_factorial_3 =
const __m512 vec_factorial_3 =
_mm512_set1_ps(0.166676521f); // 1/factorial(3)
static __m512 vec_factorial_4 =
const __m512 vec_factorial_4 =
_mm512_set1_ps(0.0418978221f); // 1/factorial(4)
static __m512 vec_factorial_5 =
const __m512 vec_factorial_5 =
_mm512_set1_ps(0.00828929059f); // 1/factorial(5)
static __m512 vec_exp_log2ef =
const __m512 vec_exp_log2ef =
_mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e)
static __m512 vec_half = _mm512_set1_ps(0.5f);
static __m512 vec_one = _mm512_set1_ps(1.f);
static __m512 vec_zero = _mm512_set1_ps(0.f);
static __m512 vec_two = _mm512_set1_ps(2.f);
static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2)
static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
static __m512i vec_127 = _mm512_set1_epi32(0x0000007f);
static int n_mantissa_bits = 23;
const __m512 vec_half = _mm512_set1_ps(0.5f);
const __m512 vec_one = _mm512_set1_ps(1.f);
const __m512 vec_zero = _mm512_set1_ps(0.f);
const __m512 vec_two = _mm512_set1_ps(2.f);
const __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2)
const __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
const __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
const __m512i vec_127 = _mm512_set1_epi32(0x0000007f);
const int n_mantissa_bits = 23;
// exp(x) =
// = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem
@ -364,6 +364,12 @@ public:
}
return loadu(tmp);
}
float reduce_add() const {
return _mm512_reduce_add_ps(values);
}
float reduce_max() const {
return _mm512_reduce_max_ps(values);
}
Vectorized<float> neg() const {
return _mm512_xor_ps(_mm512_set1_ps(-0.f), values);
}
@ -473,26 +479,26 @@ inline Vectorized<float> Vectorized<float>::frac() const {
// either input is a NaN.
template <>
Vectorized<float> inline maximum(const Vectorized<float>& a, const Vectorized<float>& b) {
auto zero_vec = _mm512_set1_epi32(0);
auto max = _mm512_max_ps(a, b);
auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
auto isnan = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, isnan_mask,
0xFFFFFFFF));
// Exploit the fact that all-ones is a NaN.
return _mm512_or_ps(max, isnan);
// auto zero_vec = _mm512_set1_epi32(0);
return _mm512_max_ps(a, b);
// auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
// auto isnan = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, isnan_mask,
// 0xFFFFFFFF));
// // Exploit the fact that all-ones is a NaN.
// return _mm512_or_ps(max, isnan);
}
// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
// either input is a NaN.
template <>
Vectorized<float> inline minimum(const Vectorized<float>& a, const Vectorized<float>& b) {
auto zero_vec = _mm512_set1_epi32(0);
auto min = _mm512_min_ps(a, b);
auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
auto isnan = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, isnan_mask,
0xFFFFFFFF));
// auto zero_vec = _mm512_set1_epi32(0);
return _mm512_min_ps(a, b);
// auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
// auto isnan = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, isnan_mask,
// 0xFFFFFFFF));
// Exploit the fact that all-ones is a NaN.
return _mm512_or_ps(min, isnan);
// return _mm512_or_ps(min, isnan);
}
template <>
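
(Editor's note, not part of the diff: the commented-out lines implement the NaN-propagating maximum described in the comment above, while the bare _mm512_max_ps returns its second operand when either input is NaN. A scalar analogue of the propagate-NaN trick, relying on the all-ones bit pattern being a NaN:)

#include <cmath>
#include <cstdint>
#include <cstring>

float maximum_propagate_nan(float a, float b) {
  float m = std::fmax(a, b);             // stand-in for the vector max
  if (std::isnan(a) || std::isnan(b)) {  // the vector code builds this test as a lane mask
    uint32_t all_ones = 0xFFFFFFFFu;     // all-ones is a quiet-NaN bit pattern
    std::memcpy(&m, &all_ones, sizeof(m));
  }
  return m;
}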

View File

@ -267,6 +267,12 @@ public:
Vectorized<int32_t> abs() const {
return _mm512_abs_epi32(values);
}
int32_t reduce_add() const {
return _mm512_reduce_add_epi32(values);
}
int32_t reduce_max() const {
return _mm512_reduce_max_epi32(values);
}
Vectorized<int32_t> real() const {
return *this;
}
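
(Editor's note, not part of the diff: the new reduce_add()/reduce_max() members above expose horizontal reductions; on AVX-512 they map onto the _mm512_reduce_* helper intrinsics. Stand-alone usage, compile with -mavx512f:)

#include <immintrin.h>
#include <cstdio>

int main() {
  int v[16];
  for (int i = 0; i < 16; ++i) v[i] = i + 1;  // 1..16
  __m512i x = _mm512_loadu_si512(v);
  std::printf("sum=%d max=%d\n",
              _mm512_reduce_add_epi32(x),     // 136
              _mm512_reduce_max_epi32(x));    // 16
  return 0;
}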

Some files were not shown because too many files have changed in this diff.