Compare commits

3 Commits

Author SHA1 Message Date
e38ac55aae cleanup and remove alert test 2025-09-10 22:46:12 +00:00
f20c28eba1 update 2025-09-02 17:48:42 +00:00
daac58237f check in 2025-08-28 23:43:31 +00:00
246 changed files with 2531 additions and 9495 deletions

View File

@ -81,8 +81,8 @@ elif [[ "$image" == *riscv* ]]; then
DOCKERFILE="ubuntu-cross-riscv/Dockerfile"
fi
_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152
_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
if [[ "$image" == *rocm* ]]; then
_UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
_UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
@ -114,19 +114,31 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
CUDA_VERSION=13.0.0
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
KATEX=yes
@ -213,8 +225,7 @@ case "$tag" in
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
# TODO (huydhn): Upgrade this to Python >= 3.10
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
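
Note: the case statement above maps each Docker image tag to the build variables (CUDA version, Python version, GCC version, UCX/UCC commits, benchmark flags) baked into that image. As a rough sketch only, with the tag list trimmed and the fields simplified, the same selection could be expressed as a lookup table in Python:

# Illustrative sketch only -- not part of this change. It mirrors the shape of
# the case statement: one image tag selects one bundle of build settings.
IMAGE_CONFIGS = {
    "pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks": dict(
        CUDA_VERSION="12.8.1", ANACONDA_PYTHON_VERSION="3.10", GCC_VERSION="9",
        VISION=True, KATEX=True, TRITON=True, INDUCTOR_BENCHMARKS=True,
    ),
    "pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks": dict(
        CUDA_VERSION="12.8.1", ANACONDA_PYTHON_VERSION="3.12", GCC_VERSION="9",
        VISION=True, KATEX=True, TRITON=True, INDUCTOR_BENCHMARKS=True,
    ),
    "pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks": dict(
        CUDA_VERSION="12.8.1", ANACONDA_PYTHON_VERSION="3.13", GCC_VERSION="9",
        VISION=True, KATEX=True, TRITON=True, INDUCTOR_BENCHMARKS=True,
    ),
}

def config_for(tag: str) -> dict:
    # Unknown tags get an empty config here; the real script handles them separately.
    return IMAGE_CONFIGS.get(tag, {})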

View File

@ -1 +1 @@
74a23feff57432129df84d8099e622773cf77925
22bc29b4d503fc895ff73bc720ff396e9723465f

View File

@ -44,12 +44,8 @@ function install_ucc() {
./autogen.sh
if [[ -n "$CUDA_VERSION" && $CUDA_VERSION == 13* ]]; then
NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86"
else
# We only run distributed tests on Tesla M60 and A10G
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
fi
# We only run distributed tests on Tesla M60 and A10G
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
if [[ -n "$ROCM_VERSION" ]]; then
if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
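
Note: this hunk drops the CUDA 13 special case in install_ucc, so NVCC_GENCODE is always the Tesla M60 (sm_52) plus A10G (compute_86) pair used by the distributed test runners. A minimal Python sketch of the branch being removed, for reference only, with CUDA_VERSION handling simplified:

from typing import Optional

def nvcc_gencode(cuda_version: Optional[str]) -> str:
    # Sketch of the removed selection: CUDA 13 builds previously dropped the
    # sm_52 target; after this change every build uses the combined pair below.
    if cuda_version and cuda_version.startswith("13"):
        return "-gencode=arch=compute_86,code=compute_86"
    return ("-gencode=arch=compute_52,code=sm_52 "
            "-gencode=arch=compute_86,code=compute_86")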

View File

@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably

View File

@ -66,7 +66,6 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
# (optional) Install UCC
ARG UCX_COMMIT
ARG UCC_COMMIT
ARG CUDA_VERSION
ENV UCX_COMMIT $UCX_COMMIT
ENV UCC_COMMIT $UCC_COMMIT
ENV UCX_HOME /usr

View File

@ -35,10 +35,11 @@ fi
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
# Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then

View File

@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd
python -mpip install -r requirements.txt
# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1
python -mpip install --no-input -r requirements.txt
setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to

View File

@ -496,14 +496,6 @@ test_inductor_cpp_wrapper_shard() {
-k 'take' \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
python test/run_test.py \
--include inductor/test_mkldnn_pattern_matcher \
-k 'xpu' \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
fi
}
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG

View File

@ -213,8 +213,7 @@ pip install requests ninja typing-extensions
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is build as part of tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is built as part of the tensorpipe submodule
export USE_DISTRIBUTED=1
export USE_MKLDNN=OFF

View File

@ -57,21 +57,6 @@ runs:
submodules: ${{ inputs.submodules }}
show-progress: false
- name: Clean submodules post checkout
id: clean-submodules
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
shell: bash
env:
NO_SUDO: ${{ inputs.no-sudo }}
run: |
cd "${GITHUB_WORKSPACE}"
# Clean stale submodule dirs
if [ -z "${NO_SUDO}" ]; then
sudo git submodule foreach --recursive git clean -ffdx
else
git submodule foreach --recursive git clean -ffdx
fi
- name: Clean workspace (try again)
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' &&
(steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }}

View File

@ -1 +1 @@
1c66402d0fa47ea74d365dcaa468d397da481918
10a5002c6195bd95e34df8fe28ff8a2d55a2a922

View File

@ -1 +1 @@
752d2e1c364e4195093e4f3f2fc33e3ae1840707
321938e9ac4000e0cb37e328359a7fd3026bc672

View File

@ -1 +1 @@
763e5b78d4fcd74a9e812256656c075f99d9a781
a1c6ee92c85e8b0955c20892ed68f032a6015c09

View File

@ -244,6 +244,8 @@ def generate_libtorch_matrix(
arches.remove("13.0")
elif os == "windows":
arches += CUDA_ARCHES
if "13.0" in arches:
arches.remove("13.0")
if libtorch_variants is None:
libtorch_variants = [
"shared-with-deps",
@ -308,6 +310,8 @@ def generate_wheels_matrix(
arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
elif os == "windows":
arches += CUDA_ARCHES + XPU_ARCHES
if "13.0" in arches:
arches.remove("13.0")
elif os == "linux-aarch64":
# Separate new if as the CPU type is different and
# uses different build/test scripts
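
Note: both hunks add the same guard for Windows: CUDA 13.0 is removed from the architecture list before the libtorch and wheel matrix entries are generated. A hedged sketch of the pattern, with stand-in values rather than the module's real constants:

# Stand-in values for illustration; the real CUDA_ARCHES list lives in
# generate_binary_build_matrix.py and may differ.
CUDA_ARCHES = ["12.6", "12.8", "13.0"]

def windows_arches(base_arches: list) -> list:
    arches = list(base_arches) + CUDA_ARCHES
    # Skip CUDA 13.0 binaries on Windows, as in this change.
    if "13.0" in arches:
        arches.remove("13.0")
    return arches

print(windows_arches(["cpu"]))  # ['cpu', '12.6', '12.8']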

View File

@ -27,7 +27,6 @@ from trymerge import (
get_drci_classifications,
gh_get_team_members,
GitHubPR,
iter_issue_timeline_until_comment,
JobCheckState,
main as trymerge_main,
MandatoryChecksMissingError,
@ -35,8 +34,6 @@ from trymerge import (
RE_GHSTACK_DESC,
read_merge_rules,
remove_job_name_suffix,
sha_from_committed_event,
sha_from_force_push_after,
validate_revert,
)
@ -1141,176 +1138,5 @@ Pull Request resolved: https://github.com/pytorch/pytorch/pull/154394"""
)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch(
"trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
)
class TestTimelineFunctions(TestCase):
"""Tests for the new timeline-related functions"""
def test_sha_from_committed_event(self, *args: Any) -> None:
"""Test extracting SHA from committed event"""
# Based on actual GitHub API format - committed events have "sha" at top level
event = {
"event": "committed",
"sha": "fb21ce932ded6670c918804a0d9151b773770a7c",
}
self.assertEqual(
sha_from_committed_event(event), "fb21ce932ded6670c918804a0d9151b773770a7c"
)
# Test with missing SHA
event_no_sha = {"event": "committed"}
self.assertIsNone(sha_from_committed_event(event_no_sha))
def test_sha_from_force_push_after(self, *args: Any) -> None:
"""Test extracting SHA from force push event"""
# NOTE: The current function doesn't handle the actual GitHub API format
# Real force push events have "commit_id" at top level, but this function
# looks for "after", "after_commit", "after_sha", or "head_sha" fields
# Test with the legacy format the current function handles
event_legacy = {
"event": "head_ref_force_pushed",
"after": {"sha": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e"},
}
self.assertEqual(
sha_from_force_push_after(event_legacy),
"ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
)
# Test with current GitHub API format (should return None with current implementation)
event_real_api = {
"event": "head_ref_force_pushed",
"commit_id": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
}
self.assertEqual(
sha_from_force_push_after(event_real_api),
"ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
) # Current function doesn't handle commit_id
# Test with missing SHA
event_no_sha = {"event": "head_ref_force_pushed"}
self.assertIsNone(sha_from_force_push_after(event_no_sha))
@mock.patch("trymerge.gh_fetch_json_list")
def test_iter_issue_timeline_until_comment(
self, mock_gh_fetch_json_list: Any, *args: Any
) -> None:
"""Test timeline iteration until target comment"""
# Mock timeline data based on actual GitHub API format
timeline_data = [
{"event": "commented", "id": 100, "body": "first comment"},
{"event": "committed", "sha": "fb21ce932ded6670c918804a0d9151b773770a7c"},
{"event": "commented", "id": 200, "body": "target comment"},
{"event": "commented", "id": 300, "body": "after target"},
]
mock_gh_fetch_json_list.return_value = timeline_data
# Test iteration stops at target comment
events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 200))
self.assertEqual(len(events), 3) # Should stop at target comment
self.assertEqual(events[0]["event"], "commented")
self.assertEqual(events[0]["id"], 100)
self.assertEqual(events[1]["event"], "committed")
self.assertEqual(events[1]["sha"], "fb21ce932ded6670c918804a0d9151b773770a7c")
self.assertEqual(events[2]["event"], "commented")
self.assertEqual(events[2]["id"], 200)
@mock.patch("trymerge.gh_fetch_json_list")
def test_iter_issue_timeline_until_comment_not_found(
self, mock_gh_fetch_json_list: Any, *args: Any
) -> None:
"""Test timeline iteration when target comment is not found"""
# Mock empty timeline
mock_gh_fetch_json_list.return_value = []
events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 999))
self.assertEqual(len(events), 0)
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_commit_after_comment(
self, mock_iter_timeline: Any, *args: Any
) -> None:
"""Test get_commit_sha_at_comment returns correct SHA after comment"""
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "commented", "id": 100},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit2")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_force_push_before_comment(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "head_ref_force_pushed", "commit_id": "commit3"},
{"event": "commented", "id": 100},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_force_push_before_comment_legacy_mode(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "committed", "sha": "commit2"},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
{"event": "commented", "id": 100},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_multiple_comments(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "committed", "sha": "commit1"},
{"event": "commented", "id": 100},
{"event": "committed", "sha": "commit2"},
{"event": "commented", "id": 200},
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
{"event": "commented", "id": 300},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(200)
self.assertEqual(sha, "commit2")
sha = pr.get_commit_sha_at_comment(300)
self.assertEqual(sha, "commit3")
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_no_events(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.return_value = [
{"event": "commented", "id": 100},
{"event": "labeled", "label": {"name": "test"}},
]
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertIsNone(sha)
@mock.patch("trymerge.iter_issue_timeline_until_comment")
def test_get_commit_sha_at_comment_exception(
self, mock_iter_timeline: Any, *args: Any
) -> None:
mock_iter_timeline.side_effect = Exception("API error")
pr = GitHubPR("pytorch", "pytorch", 77700)
sha = pr.get_commit_sha_at_comment(100)
self.assertIsNone(sha)
if __name__ == "__main__":
main()

View File

@ -450,63 +450,6 @@ HAS_NO_CONNECTED_DIFF_TITLE = (
IGNORABLE_FAILED_CHECKS_THESHOLD = 10
def iter_issue_timeline_until_comment(
org: str, repo: str, issue_number: int, target_comment_id: int, max_pages: int = 200
) -> Any:
"""
Yield timeline entries in order until (and including) the entry whose id == target_comment_id
for a 'commented' event. Stops once the target comment is encountered.
"""
page = 1
while page <= max_pages:
url = (
f"https://api.github.com/repos/{org}/{repo}/issues/{issue_number}/timeline"
)
params = {"per_page": 100, "page": page}
batch = gh_fetch_json_list(url, params)
if not batch:
return
for ev in batch:
# The target is the issue comment row with event == "commented" and id == issue_comment_id
if ev.get("event") == "commented" and ev.get("id") == target_comment_id:
yield ev # nothing in the timeline after this matters, so stop early
return
yield ev
if len(batch) < 100:
return
page += 1
# If we got here without finding the comment, then we either hit a bug or some github PR
# has a _really_ long timeline.
# The max # of pages found on any pytorch/pytorch PR at the time of this change was 41
raise RuntimeError(
f"Could not find a merge commit in the first {max_pages} pages of the timeline at url {url}."
f"This is most likely a bug, please report it to the @pytorch/pytorch-dev-infra team."
)
def sha_from_committed_event(ev: dict[str, Any]) -> Optional[str]:
"""Extract SHA from committed event in timeline"""
return ev.get("sha")
def sha_from_force_push_after(ev: dict[str, Any]) -> Optional[str]:
"""Extract SHA from force push event in timeline"""
# The current GitHub API format
commit_id = ev.get("commit_id")
if commit_id:
return str(commit_id)
# Legacy format
after = ev.get("after") or ev.get("after_commit") or {}
if isinstance(after, dict):
return after.get("sha") or after.get("oid")
return ev.get("after_sha") or ev.get("head_sha")
def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any:
rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no)
return rc["data"]["repository"]["pullRequest"]
@ -900,44 +843,6 @@ class GitHubPR:
def get_commit_count(self) -> int:
return int(self.info["commits_with_authors"]["totalCount"])
def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]:
"""
Get the PR head commit SHA that was present when a specific comment was posted.
This ensures we only merge the state of the PR at the time the merge command was issued,
not any subsequent commits that may have been pushed after.
Returns None if no head-changing events found before the comment or if the comment was not found.
"""
head = None
try:
for event in iter_issue_timeline_until_comment(
self.org, self.project, self.pr_num, comment_id
):
etype = event.get("event")
if etype == "committed":
sha = sha_from_committed_event(event)
if sha:
head = sha
print(f"Timeline: Found commit event for SHA {sha}")
elif etype == "head_ref_force_pushed":
sha = sha_from_force_push_after(event)
if sha:
head = sha
print(f"Timeline: Found force push event for SHA {sha}")
elif etype == "commented":
if event.get("id") == comment_id:
print(f"Timeline: Found final comment with sha {sha}")
return head
except Exception as e:
print(
f"Warning: Failed to reconstruct timeline for comment {comment_id}: {e}"
)
return None
print(f"Did not find comment with id {comment_id} in the PR timeline")
return None
def get_pr_creator_login(self) -> str:
return cast(str, self.info["author"]["login"])
@ -1329,14 +1234,11 @@ class GitHubPR:
skip_all_rule_checks: bool = False,
) -> list["GitHubPR"]:
"""
:param skip_all_rule_checks: If true, skips all rule checks on ghstack PRs, useful for dry-running merge locally
:param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally
"""
branch_to_merge_into = self.default_branch() if branch is None else branch
if repo.current_branch() != branch_to_merge_into:
repo.checkout(branch_to_merge_into)
# It's okay to skip the commit SHA check for ghstack PRs since
# authoring requires write access to the repo.
if self.is_ghstack_pr():
return self.merge_ghstack_into(
repo,
@ -1347,41 +1249,14 @@ class GitHubPR:
msg = self.gen_commit_message()
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
# Determine which commit SHA to merge
commit_to_merge = None
if not comment_id:
raise ValueError("Must provide --comment-id when merging regular PRs")
# Get the commit SHA that was present when the comment was made
commit_to_merge = self.get_commit_sha_at_comment(comment_id)
if not commit_to_merge:
raise RuntimeError(
f"Could not find commit that was pushed before comment {comment_id}"
)
# Validate that this commit is the latest commit on the PR
latest_commit = self.last_commit_sha()
if commit_to_merge != latest_commit:
raise RuntimeError(
f"Commit {commit_to_merge} was HEAD when comment {comment_id} was posted "
f"but now the latest commit on the PR is {latest_commit}. "
f"Please re-issue the merge command to merge the latest commit."
)
print(f"Merging commit {commit_to_merge} locally")
repo.fetch(commit_to_merge, pr_branch_name)
repo.fetch(self.last_commit_sha(), pr_branch_name)
repo._run_git("merge", "--squash", pr_branch_name)
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
# Did the PR change since we started the merge?
pulled_sha = repo.show_ref(pr_branch_name)
latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
if (
pulled_sha != latest_pr_status.last_commit_sha()
or pulled_sha != commit_to_merge
):
if pulled_sha != latest_pr_status.last_commit_sha():
raise RuntimeError(
"PR has been updated since CI checks last passed. Please rerun the merge command."
)
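
Note: the trymerge.py changes above remove the timeline-based check that resolved which commit was HEAD when the merge comment was posted (iter_issue_timeline_until_comment, sha_from_committed_event, sha_from_force_push_after, GitHubPR.get_commit_sha_at_comment) and return to fetching last_commit_sha() directly, keeping only the post-fetch freshness check. For reference, a condensed sketch of the removed resolution logic, with pagination and error handling elided:

from typing import Iterable, Optional

def head_sha_at_comment(events: Iterable[dict], comment_id: int) -> Optional[str]:
    # Walk timeline events in order, tracking the most recent head-changing SHA,
    # and return it once the target comment is reached (None if never found).
    head: Optional[str] = None
    for ev in events:
        etype = ev.get("event")
        if etype == "committed":
            head = ev.get("sha") or head
        elif etype == "head_ref_force_pushed":
            after = ev.get("after") or {}
            head = ev.get("commit_id") or after.get("sha") or head
        elif etype == "commented" and ev.get("id") == comment_id:
            return head
    return None

The removed production code additionally paginated the GitHub timeline API (up to 200 pages) and raised an error if the target comment was never found.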

View File

@ -77,7 +77,6 @@ jobs:
run: |
git config --global core.longpaths true
git config --global core.symlinks true
git config --global core.ignorecase false
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported

View File

@ -70,7 +70,6 @@ jobs:
run: |
git config --global core.longpaths true
git config --global core.symlinks true
git config --global core.ignorecase false
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported

View File

@ -275,7 +275,7 @@ jobs:
- name: Change permissions
if: ${{ always() && steps.test.conclusion }}
run: |
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1000:1000 test"
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"
- name: Print remaining test logs
shell: bash

View File

@ -50,9 +50,10 @@ jobs:
runner: [linux.12xlarge]
docker-image-name: [
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,
@ -63,7 +64,7 @@ jobs:
pytorch-linux-jammy-rocm-n-py3-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12,
pytorch-linux-jammy-py3.10-gcc11,
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-xpu-n-1-py3,
pytorch-linux-jammy-xpu-n-py3,

View File

@ -1038,253 +1038,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda13_0-shared-with-deps-debug
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-debug-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda13_0-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda13_0-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-debug-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda13_0-shared-with-deps-debug-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda13_0-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -1038,253 +1038,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda13_0-shared-with-deps-release
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda13_0-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda13_0-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda13_0-shared-with-deps-release-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda13_0-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

File diff suppressed because it is too large

View File

@ -18,13 +18,13 @@ permissions:
contents: read
jobs:
inductor-build:
linux-jammy-cpu-py3_9-gcc11-inductor-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: inductor-build
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
# Use metal host for benchmark jobs
test-matrix: |
{ include: [
@ -32,13 +32,13 @@ jobs:
]}
secrets: inherit
inductor-micro-benchmark-test:
name: inductor-micro-benchmark-test
linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit

View File

@ -32,13 +32,13 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
nightly-dynamo-benchmarks-build:
name: nightly-dynamo-benchmarks-build
linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build:
name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
{ include: [
@ -51,13 +51,13 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
nightly-dynamo-benchmarks-test:
name: nightly-dynamo-benchmarks-test
linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-test:
name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: nightly-dynamo-benchmarks-build
needs: linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit

View File

@ -84,8 +84,9 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
# NB: Keep this in sync with trunk.yml
build:
name: build
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -127,7 +128,7 @@ jobs:
secrets: inherit
test-periodically:
name: test-periodically
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '15 0,12 * * 1-6'
@ -144,7 +145,7 @@ jobs:
secrets: inherit
test-weekly:
name: test-weekly
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
@ -161,7 +162,7 @@ jobs:
secrets: inherit
test:
name: test
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
# The pull_request trigger is used in PR to bump transformers pin which always

View File

@ -69,14 +69,14 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-zen-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cpu_x86_zen", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" },
@ -95,16 +95,16 @@ jobs:
selected-test-configs: ${{ inputs.benchmark_configs }}
secrets: inherit
inductor-test-nightly:
name: inductor-test-nightly
linux-jammy-zen-cpu-py3_9-gcc11-inductor-test-nightly:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false
@ -112,16 +112,17 @@ jobs:
monitor-data-collect-interval: 4
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-zen-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false

View File

@ -74,14 +74,14 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xl.spr-metal" },
@ -101,16 +101,16 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-test-nightly-freezing:
name: inductor-test-nightly-freezing
linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly-freezing:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false
@ -118,16 +118,16 @@ jobs:
monitor-data-collect-interval: 4
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests
disable-monitor: false

View File

@ -79,6 +79,7 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
# NB: Keep this in sync with trunk.yml
build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml

View File

@ -31,8 +31,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
periodic-dynamo-benchmarks-build:
name: periodic-dynamo-benchmarks-build
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build:
name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
@ -57,33 +57,23 @@ jobs:
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
periodic-dynamo-benchmarks-test:
name: periodic-dynamo-benchmarks-test
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test:
name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
rocm-periodic-dynamo-benchmarks-build:
linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build:
if: github.repository_owner == 'pytorch'
name: rocm-periodic-dynamo-benchmarks-build
name: rocm-py3_10-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
@ -109,21 +99,21 @@ jobs:
]}
secrets: inherit
rocm-periodic-dynamo-benchmarks-test:
linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test:
permissions:
id-token: write
contents: read
name: rocm-periodic-dynamo-benchmarks-test
name: rocm-py3_10-periodic-dynamo-benchmarks
uses: ./.github/workflows/_rocm-test.yml
needs: rocm-periodic-dynamo-benchmarks-build
needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-rocm-py3_10
docker-image: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
inductor-smoke-build:
name: inductor-smoke-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
@ -139,23 +129,23 @@ jobs:
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
inductor-smoke-test:
name: inductor-smoke-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: inductor-smoke-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-cpu-build:
name: periodic-dynamo-benchmarks-cpu-build
linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build:
name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
{ include: [
@ -170,6 +160,68 @@ jobs:
{ config: "cpu_inductor_freezing_avx2_torchbench", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-test:
name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
test-matrix: |
{ include: [
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
test-matrix: |
{ include: [
{ config: "cpu_inductor_freezing_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" },
{ config: "cpu_inductor_freezing_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "cpu_inductor_freezing_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
@ -195,12 +247,12 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
periodic-dynamo-benchmarks-cpu-test:
name: periodic-dynamo-benchmarks-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -28,8 +28,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -47,18 +47,44 @@ jobs:
]}
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-halide-build:
name: inductor-halide-build
linux-jammy-cuda12_8-py3_12-gcc9-inductor-build:
name: cuda12.8-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_12-gcc9-inductor-test:
name: cuda12.8-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_12-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_12-inductor-halide-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -71,18 +97,18 @@ jobs:
]}
secrets: inherit
inductor-halide-test:
name: inductor-halide-test
linux-jammy-cpu-py3_12-inductor-halide-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-test.yml
needs: inductor-halide-build
needs: linux-jammy-cpu-py3_12-inductor-halide-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }}
secrets: inherit
inductor-triton-cpu-build:
name: inductor-triton-cpu-build
linux-jammy-cpu-py3_12-inductor-triton-cpu-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -95,23 +121,23 @@ jobs:
]}
secrets: inherit
inductor-triton-cpu-test:
linux-jammy-cpu-py3_12-inductor-triton-cpu-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu
uses: ./.github/workflows/_linux-test.yml
needs: inductor-triton-cpu-build
needs: linux-jammy-cpu-py3_12-inductor-triton-cpu-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.inductor-triton-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-triton-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
@ -122,12 +148,37 @@ jobs:
]}
secrets: inherit
inductor-cpu-test:
name: inductor-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_13-gcc9-inductor-build:
name: cuda12.8-py3.13-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_13-gcc9-inductor-test:
name: cuda12.8-py3.13-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_13-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -44,8 +44,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
inductor-build:
name: inductor-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -53,6 +53,7 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
test-matrix: |
{ include: [
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
@ -64,24 +65,25 @@ jobs:
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
inductor-test:
name: inductor-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
test-matrix: |
{ include: [
{ config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
@ -96,12 +98,12 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-cpu-test:
name: inductor-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -24,38 +24,38 @@ permissions:
contents: read
jobs:
opbenchmark-build:
linux-jammy-cpu-py3_9-gcc11-opbenchmark-build:
if: github.repository_owner == 'pytorch'
name: opbenchmark-build
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit
opbenchmark-on-demand-build:
linux-jammy-cpu-py3_9-gcc11-opbenchmark-on-demand-build:
if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }}
name: opbenchmark-on-demand-build
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit
opbenchmark-test:
name: opbenchmark-test
linux-jammy-cpu-py3_9-gcc11-opbenchmark-test:
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-test.yml
needs: opbenchmark-build
needs: linux-jammy-cpu-py3_9-gcc11-opbenchmark-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.test-matrix }}
secrets: inherit

View File

@ -170,38 +170,6 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc11-build:
name: linux-jammy-cuda13.0-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
cuda-arch-list: 7.5
build-environment: linux-jammy-cuda13.0-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc11-test:
name: linux-jammy-cuda13.0-py3.10-gcc11
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda13_0-py3_10-gcc11-build
- target-determination
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml

View File

@ -224,12 +224,13 @@ jobs:
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
secrets: inherit
inductor-build:
name: inductor-build
# NB: Keep this in sync with inductor-perf-test-nightly.yml
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
secrets: inherit
@ -241,7 +242,7 @@ jobs:
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },

View File

@ -1801,26 +1801,3 @@ command = [
"python3",
"tools/linter/adapters/gb_registry_linter.py",
]
[[linter]]
code = 'DISTRIBUTED_C10D_DIRECT_ACCESS'
include_patterns = ['**/*.py']
exclude_patterns = [
'torch/distributed/_distributed_c10d.py',
'fb/**',
'**/fb/**',
]
command = [
'python3',
'tools/linter/adapters/grep_linter.py',
'--pattern=torch\._C\._distributed_c10d',
'--linter-name=DISTRIBUTED_C10D_DIRECT_ACCESS',
'--error-name=direct access to torch._C._distributed_c10d',
"""--error-description=\
Never access torch._C._distributed_c10d directly in code. Always \
import from and use torch.distributed._distributed_c10d which is \
guaranteed to have all functions available\
""",
'--',
'@{{PATHSFILE}}'
]

View File

@ -22,6 +22,7 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",

View File

@ -181,9 +181,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF
@ -262,11 +261,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@ -431,10 +430,11 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
)
else()

View File

@ -216,7 +216,7 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention
if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp})
target_include_directories(flash_attention SYSTEM PUBLIC
target_include_directories(flash_attention PUBLIC
${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc
${PROJECT_SOURCE_DIR}/third_party/flash-attention/include
${PROJECT_SOURCE_DIR}/third_party/cutlass/include

View File

@ -279,45 +279,10 @@ bool Context::userEnabledOverrideableSDP() const {
return enabled_overrideable;
}
static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
#ifdef USE_ROCM
static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32";
#endif
bool Context::checkCuBLASConfigDeterministic() {
// If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
// is set to deterministic setting
if (hasCUDART()) {
const auto workspace_config = c10::utils::get_env(cublas_config_var_name);
return (workspace_config == cublas_deterministic_configs[0] || workspace_config == cublas_deterministic_configs[1]);
}
return true;
}
void Context::alertCuBLASConfigNotDeterministic() const {
static const bool cublas_config_deterministic = checkCuBLASConfigDeterministic();
if (C10_LIKELY(!deterministicAlgorithms() || cublas_config_deterministic)) {
return;
}
auto msg = c10::str(
"Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ",
"`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ",
"it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this ",
"case, you must set an environment variable before running your PyTorch application: ",
cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ",
cublas_config_var_name, "=", cublas_deterministic_configs[1], ". For more information, go to ",
"https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility"
);
if (deterministicAlgorithmsWarnOnly()) {
TORCH_WARN(msg);
} else {
TORCH_CHECK(false, msg);
}
}
bool Context::benchmarkCuDNN() const {
return benchmark_cudnn;
}
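For reference, a minimal standalone sketch of the workspace-config check that this hunk removes, with std::getenv standing in for c10::utils::get_env; the two accepted values mirror the cublas_deterministic_configs array above.

// Returns true only when CUBLAS_WORKSPACE_CONFIG is set to one of the two
// settings cuBLAS documents as deterministic. Unset or any other value counts
// as non-deterministic in this sketch.
#include <cstdlib>
#include <string>

bool cublas_workspace_config_is_deterministic() {
  const char* cfg = std::getenv("CUBLAS_WORKSPACE_CONFIG");
  if (cfg == nullptr) {
    return false;
  }
  const std::string value(cfg);
  return value == ":4096:8" || value == ":16:8";
}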

View File

@ -310,13 +310,7 @@ class TORCH_API Context {
//
// * Throw an error when `Context::deterministicAlgorithms()` is true. Most
// of the time, this should be accomplished by calling
// `at::globalContext().alertNotDeterminstic()`. However, if the
// nondeterministic behavior is caused by the CuBLAS workspace
// configuration in CUDA >= 10.2,
// `at::globalContext().alertCuBLASConfigNotDeterministic()` should be
// called instead (in this case, a comment explaining why the operation is
// nondeterministic is not necessary). See below for details on these
// methods.
// `at::globalContext().alertNotDeterminstic().
//
// * Have an entry in the list of nondeterministic PyTorch operations in the
// docstring of `use_deterministic_algorithms()` in torch/__init__.py
@ -340,12 +334,6 @@ class TORCH_API Context {
// Throws an error if `Context::deterministicAlgorithms()` is true
static void alertNotDeterministic(std::string_view const& caller);
// Throws an error if `Context::deterministicAlgorithms()` is true, CUDA
// >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or
// ":4096:8". For more details:
// https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
void alertCuBLASConfigNotDeterministic() const;
void setFloat32MatmulPrecision(const std::string& s);
void setFloat32Precision(
const std::string& backend,
@ -429,7 +417,6 @@ class TORCH_API Context {
}
private:
static bool checkCuBLASConfigDeterministic();
std::array<c10::once_flag, at::COMPILE_TIME_MAX_DEVICE_TYPES> init_;
bool enabled_cudnn = true;
bool deterministic_cudnn = false;
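As a hedged illustration of the pattern the note above describes, a nondeterministic operation guards itself by alerting before doing any work; the op name below is a placeholder, not an operator touched by this diff.

#include <ATen/Context.h>

// Throws when deterministic algorithms are required, or only warns when the
// warn-only mode was requested.
void my_nondeterministic_op_guard() {
  at::globalContext().alertNotDeterministic("my_nondeterministic_op");
}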

View File

@ -436,7 +436,6 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented");
}
globalContext().alertCuBLASConfigNotDeterministic();
cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -570,8 +569,6 @@ inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_D
template <>
void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -583,8 +580,6 @@ void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
template <>
void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -596,8 +591,6 @@ void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
template <>
void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -611,8 +604,6 @@ void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::co
template <>
void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<float>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -626,8 +617,6 @@ void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::com
template <typename C_Dtype>
inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -697,8 +686,6 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP
template <typename C_Dtype>
inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
BGEMM_CHECK_ARGVALUES(at::BFloat16);
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
@ -1027,8 +1014,6 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dty
template <>
void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1040,8 +1025,6 @@ void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
template <>
void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1053,8 +1036,6 @@ void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
template <>
void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1068,8 +1049,6 @@ void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::comp
template <>
void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<float>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1083,8 +1062,6 @@ void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::compl
template <typename C_Dtype>
inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1191,7 +1168,6 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
template <typename C_Dtype>
inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1947,11 +1923,11 @@ void scaled_gemm(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
cublasLtMatmulDescAttributes_t matmulDescA = CUBLASLT_MATMUL_DESC_A_SCALE_POINTER;
cublasLtMatmulDescAttributes_t matmulDescB = CUBLASLT_MATMUL_DESC_B_SCALE_POINTER;
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
// hipblaslt supported row-wise before cublas, and did so their own way (via
// the SCALE_POINTERSs), but then migrated to match how cublas does it (via
// the SCALE_MODEs). Here we check for this early custom mode.
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise);
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
if (use_rowwise) {
matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
@ -1966,8 +1942,12 @@ void scaled_gemm(
}
#endif
}
#else
// rowwise isn't supported using cublaslt or older hipblaslt
#elif (CUDA_VERSION < 12080) && !defined(USE_ROCM)
// hipblaslt supported row-wise before cublas, and did so their own way (via
// the SCALE_POINTERSs), but then migrated to match how cublas does it (via
// the SCALE_MODEs). Here we check for this early custom mode.
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise);
// rowwise isn't supported using older cublaslt or older hipblaslt
TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt");
#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
computeDesc.setAttribute(matmulDescA, mat1_scale_ptr);
@ -2411,8 +2391,6 @@ void trsmBatched<c10::complex<double>>(
template <>
void gemv<c10::complex<double>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2428,8 +2406,6 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
// gemv is bw bound, and does not benefit from TF32. But the precision
// loss still happens on TF32. So we disable it here.
NoTF32Guard disable_tf32;
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2442,8 +2418,6 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
template <>
void gemv<double>(CUDABLAS_GEMV_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2457,8 +2431,6 @@ void gemv<float>(CUDABLAS_GEMV_ARGTYPES(float)) {
// gemv is bw bound, and does not benefit from TF32. But the precision
// loss still happens on TF32. So we disable it here.
NoTF32Guard disable_tf32;
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);

View File

@ -185,17 +185,6 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
// right: "lro, summed, ro" permuted with rpermutation and the three flattened
// then the permuted output is a view of bmm(left, right)
// finally, opermutation reverts the permutation to the original order of dimensions
// By default the output is "lro, lo, 1-for-summed-dims, ro" with original shape dimensions.
// However, if all dimensions from the right operand appear before those from the left
// operand in the final output, we can swap the operands so that bmm directly produces
// the result in the correct memory order.
bool swap_lo_ro = !lo.empty() && !ro.empty() && ro.back() < lo.front();
if (swap_lo_ro) {
std::swap(left, right);
std::swap(lo, ro);
std::swap(lo_size, ro_size);
}
auto out_num_dim = lro.size() + lo.size() + sum_dims_.size() + ro.size();
std::vector<SymInt> out_size;
out_size.reserve(out_num_dim);
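A small self-contained sketch of the swap heuristic removed above, operating on plain index vectors rather than Tensors (an assumption of this sketch):

#include <cstdint>
#include <utility>
#include <vector>

// If every right-only output dim precedes every left-only output dim, swapping
// the operands lets bmm produce the output directly in the desired order.
void maybe_swap_operands(std::vector<int64_t>& lo,  // left-only output dims
                         std::vector<int64_t>& ro,  // right-only output dims
                         int64_t& lo_size,
                         int64_t& ro_size) {
  const bool swap_lo_ro = !lo.empty() && !ro.empty() && ro.back() < lo.front();
  if (swap_lo_ro) {
    std::swap(lo, ro);
    std::swap(lo_size, ro_size);
  }
}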

View File

@ -156,7 +156,7 @@ void cpu_padding(
int64_t offset_h = ndim >= 2 ? p.offsets[ndim - 2] : 0;
int64_t offset_w = p.offsets[ndim - 1];
// do vectorized copy when output is overlapped with input on W,
// do vectorized copy whe output is overlapped with input on W,
// only applies to positive padding
auto loop = [=](scalar_t* out, const scalar_t* in, bool positive_padding) {
if (positive_padding) {

View File

@ -318,7 +318,7 @@ batch_norm_cpu_collect_stats_channels_last_impl(
//
// The optimal THRESHOLD to tile was found empirically.
// When C > THRESHOLD, C is large enough that the benefit from tiling and vectorization outweigh the synchronization overhead.
// When C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization.
// Wehn C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization.
//
// When num_threads == 1, always use Method 2 as there is no synchronization overhead.
//

View File

@ -1665,7 +1665,7 @@ const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum) {
bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/false);
bool allowed_device = _scaled_mm_allowed_device();
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+");
TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed");

View File

@ -20,7 +20,7 @@
// SegmentReduce compilation with CUDA-12.9 causes NVCC crash on Windows
// See https://github.com/pytorch/pytorch/issues/156181
#if !(defined(_WIN32) && CUDART_VERSION == 12090)
#if !defined(_WIN32) || CUDART_VERSION < 12090
namespace at::native {
@ -606,4 +606,4 @@ REGISTER_DISPATCH(
} // namespace at::native
#endif
#endif
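The two guard forms in this hunk are not equivalent; a quick standalone check of both predicates (win32 standing for defined(_WIN32), cudart for CUDART_VERSION, version values illustrative) shows where they diverge:

#include <cstdio>

int main() {
  const long versions[] = {12080, 12090, 13000};
  for (int win32 = 0; win32 <= 1; ++win32) {
    for (long cudart : versions) {
      const bool guard_a = !(win32 && cudart == 12090);  // !(WIN32 && ==12090)
      const bool guard_b = !win32 || cudart < 12090;     // !WIN32 || <12090
      std::printf("win32=%d cudart=%ld  a=%d b=%d%s\n", win32, cudart,
                  guard_a, guard_b, guard_a != guard_b ? "  <- differ" : "");
    }
  }
  return 0;
}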

View File

@ -464,7 +464,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
}
#endif
int32_t trailingSize;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
if (isInOutAligned) {
// in this case we can and should flatten the tensors after the cat dim
// we want to view the tensors as if consisting of `alignment`-sized elements
@ -476,15 +475,16 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
// then we need to divide last out size by elems_per_vec,
// and divide all strides except last by elems_per_vec (last stride is 1 always)
// for input, we will fix up the sizes and strides in the kernel directly
kernelOutputParam = outputParam;
nDims = dimension + 1;
constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
auto out_size = dimension == 0 ? out.numel() : outputParam.tensorStride[dimension-1];
outputParam.tensorSize[dimension] = out_size / elems_per_vec;
trailingSize = outputParam.tensorStride[dimension];
kernelOutputParam.tensorStride[dimension] = 1;
for (int i = 0; i < dimension; ++i) {
kernelOutputParam.tensorStride[i] /= elems_per_vec;
outputParam.tensorStride[dimension] = 1;
for (int i = 0; i < nDims; ++i) {
if (i!=dimension) {
outputParam.tensorStride[i] /= elems_per_vec;
}
}
}
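A simplified sketch of the size/stride fix-up discussed in this hunk; the struct, field names, and helper are stand-ins, and the real code additionally derives the new concat-dim size from the output strides.

#include <cstddef>
#include <cstdint>

template <int MaxDims>
struct SizeStride {
  uint32_t tensorSize[MaxDims];
  uint32_t tensorStride[MaxDims];
};

// Viewing the flattened output as `alignment`-byte vectors shrinks the concat
// dim and every outer stride by elems_per_vec; the innermost stride becomes 1.
template <int MaxDims>
void shrink_for_vectorized_cat(SizeStride<MaxDims>& out, int dimension,
                               std::size_t alignment, std::size_t elem_size) {
  const auto elems_per_vec = alignment / elem_size;
  out.tensorSize[dimension] /= elems_per_vec;
  out.tensorStride[dimension] = 1;
  for (int i = 0; i < dimension; ++i) {
    out.tensorStride[i] /= elems_per_vec;
  }
}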
@ -505,7 +505,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
(char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
(char*)data, catMetaData, outputParam, dimension, trailingSize);\
} else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\

View File

@ -1,7 +1,5 @@
#include <ATen/core/op_registration/op_registration.h>
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include <ATen/native/mkldnn/xpu/qconv.h>
#include <c10/core/MemoryFormat.h>
#include <c10/core/ScalarType.h>
#include <torch/library.h>
@ -9,7 +7,7 @@
using namespace at::native::onednn;
namespace at::native::xpu {
inline c10::ScalarType QConvoneDNNXPU::qconv_decide_out_dtype(
static inline c10::ScalarType qconv_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype) {
bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
@ -21,7 +19,7 @@ inline c10::ScalarType QConvoneDNNXPU::qconv_decide_out_dtype(
return dst_dtype;
}
at::Tensor QConvoneDNNXPU::qconv_prepack_xpu(
static at::Tensor qconv_prepack_xpu(
at::Tensor weight,
at::Tensor weight_scales,
double input_scale,
@ -35,265 +33,222 @@ at::Tensor QConvoneDNNXPU::qconv_prepack_xpu(
return weight;
}
at::Tensor QConvoneDNNXPU::run_pointwise(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
if (act.dim() == 3 || act.dim() == 5) {
TORCH_CHECK(
attr == "none",
"quantized pointwise conv",
act.dim() - 2,
"d doesn't support unary_post_op fusion. Got unary_post_op:",
attr,
".");
} else {
TORCH_CHECK(
attr == "none" || attr == "relu" || attr == "hardtanh" ||
attr == "hardswish" || attr == "swish",
"We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:",
attr,
".");
class QConvoneDNNXPU final {
public:
static at::Tensor run_pointwise(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
if (act.dim() == 3 || act.dim() == 5) {
TORCH_CHECK(
attr == "none",
"quantized pointwise conv",
act.dim() - 2,
"d doesn't support unary_post_op fusion. Got unary_post_op:",
attr,
".");
} else {
TORCH_CHECK(
attr == "none" || attr == "relu" || attr == "hardtanh" ||
attr == "hardswish" || attr == "swish",
"We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:",
attr,
".");
}
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested
? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
Tensor output =
at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
return quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
inv_output_scale,
output_zero_point,
/*accum*/ std::nullopt,
/*accum_scale*/ 0.0,
/*accum_zero_point*/ 0,
/*output_dtype*/ output_dtype,
/*binary_attr*/ std::nullopt,
/*binary_alpha*/ std::nullopt,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested ? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
Tensor output =
at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
return quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
inv_output_scale,
output_zero_point,
/*accum*/ std::nullopt,
/*accum_scale*/ 0.0,
/*accum_zero_point*/ 0,
/*output_dtype*/ output_dtype,
/*binary_attr*/ std::nullopt,
/*binary_alpha*/ std::nullopt,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
at::Tensor QConvoneDNNXPU::run_pointwise_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
return run_pointwise(
act,
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
groups,
output_scale,
output_zero_point,
output_dtype,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
at::Tensor QConvoneDNNXPU::run_pointwise_binary(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm) {
TORCH_CHECK(
act.dim() == 4 && binary_attr == "sum" &&
(!unary_attr.has_value() ||
(unary_attr.has_value() &&
(unary_attr.value() == "none" || unary_attr.value() == "relu"))),
"post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ",
binary_attr,
" unary_post_op: ",
unary_attr.has_value() ? unary_attr.value() : "none",
".")
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested ? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
bool has_accum_postop_sum = binary_attr == "sum";
Tensor output = has_accum_postop_sum
? accum
: at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
output = quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
output_scale,
output_zero_point,
/*accum*/ accum,
/*accum_scale*/ accum_scale,
/*accum_zero_point*/ accum_zero_point,
/*output_dtype*/ output_dtype,
/*binary_attr*/ binary_attr,
/*binary_alpha*/ alpha,
/*unary_attr*/ unary_attr,
/*unary_scalars*/ unary_scalars,
/*unary_algorithm*/ unary_algorithm);
if (!has_accum_postop_sum) {
return output;
} else {
return accum;
static at::Tensor run_pointwise_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
return run_pointwise(
act,
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
groups,
output_scale,
output_zero_point,
output_dtype,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
}
at::Tensor QConvoneDNNXPU::run_pointwise_binary_tensor(
at::Tensor act, // contains quantized values but not QTensor
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight, // contains quantized values but not QTensor
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum, // contains quantized values but not QTensor
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm) {
return run_pointwise_binary(
act,
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight,
weight_scales,
weight_zero_points,
accum,
bias,
stride,
padding,
dilation,
groups,
output_scale,
output_zero_point,
output_dtype,
accum_scale,
accum_zero_point,
binary_attr,
alpha,
unary_attr,
unary_scalars,
unary_algorithm);
}
static at::Tensor run_pointwise_binary(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm) {
TORCH_CHECK(
act.dim() == 4 && binary_attr == "sum" &&
(!unary_attr.has_value() ||
(unary_attr.has_value() &&
(unary_attr.value() == "none" || unary_attr.value() == "relu"))),
"post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ",
binary_attr,
" unary_post_op: ",
unary_attr.has_value() ? unary_attr.value() : "none",
".")
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
auto mfmt = is_channels_last_suggested
? get_cl_tag_by_ndim(act.ndimension())
: at::MemoryFormat::Contiguous;
Tensor input_ = act.contiguous(mfmt);
Tensor weight_ = weight.contiguous(mfmt);
auto dst_tz = conv_dst_size(
input_.ndimension(),
input_.sizes(),
weight_.sizes(),
padding.vec(),
padding.vec(),
stride.vec(),
dilation.vec());
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
bool has_accum_postop_sum = binary_attr == "sum";
Tensor output = has_accum_postop_sum
? accum
: at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
output = quantized_convolution(
act,
act_scale,
act_zero_point,
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
/*transposed*/ false,
groups,
output,
output_scale,
output_zero_point,
/*accum*/ accum,
/*accum_scale*/ accum_scale,
/*accum_zero_point*/ accum_zero_point,
/*output_dtype*/ output_dtype,
/*binary_attr*/ binary_attr,
/*binary_alpha*/ alpha,
/*unary_attr*/ unary_attr,
/*unary_scalars*/ unary_scalars,
/*unary_algorithm*/ unary_algorithm);
if (!has_accum_postop_sum) {
return output;
} else {
return accum;
}
}
};
TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv_prepack"),
TORCH_FN(QConvoneDNNXPU::qconv_prepack_xpu));
TORCH_FN(xpu::qconv_prepack_xpu));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv1d_pointwise"),
QConvoneDNNXPU::run_pointwise);
@ -312,9 +267,6 @@ TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv_pointwise.tensor"),
QConvoneDNNXPU::run_pointwise_tensor);
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv2d_pointwise.binary_tensor"),
QConvoneDNNXPU::run_pointwise_binary_tensor);
}
} // namespace at::native::xpu

View File

@ -1,111 +0,0 @@
#pragma once
#include <ATen/Config.h>
#include <ATen/Tensor.h>
namespace at::native::xpu {
class QConvoneDNNXPU final {
public:
C10_API static at::Tensor run_pointwise(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm);
C10_API static at::Tensor run_pointwise_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm);
C10_API static at::Tensor run_pointwise_binary(
at::Tensor act,
double act_scale,
int64_t act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm);
C10_API static at::Tensor run_pointwise_binary_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
at::Tensor accum,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double accum_scale,
int64_t accum_zero_point,
std::string_view binary_attr,
std::optional<at::Scalar> alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm);
static inline c10::ScalarType qconv_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype);
static at::Tensor qconv_prepack_xpu(
at::Tensor weight,
at::Tensor weight_scales,
double input_scale,
int64_t input_zero_point,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
std::optional<torch::List<int64_t>> input_shape);
};
} // namespace at::native::xpu

View File

@ -1,14 +1,13 @@
#include <torch/library.h>
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include <ATen/native/mkldnn/xpu/qlinear.h>
#include <c10/core/ScalarType.h>
using namespace at::native::onednn;
namespace at::native::xpu {
inline c10::ScalarType QLinearOnednnXPU::qlinear_decide_out_dtype(
static inline c10::ScalarType qlinear_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype) {
bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
@ -20,7 +19,7 @@ inline c10::ScalarType QLinearOnednnXPU::qlinear_decide_out_dtype(
return dst_dtype;
}
Tensor QLinearOnednnXPU::q_linear_pointwise(
static Tensor q_linear_pointwise(
Tensor act,
double act_scale,
int64_t act_zero_point,
@ -79,7 +78,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise(
return qout;
}
Tensor QLinearOnednnXPU::q_linear_pointwise_tensor(
static Tensor q_linear_pointwise_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
@ -138,7 +137,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_tensor(
return qout;
}
Tensor QLinearOnednnXPU::q_linear_pointwise_binary(
static Tensor q_linear_pointwise_binary(
Tensor act,
double act_scale,
int64_t act_zero_point,
@ -209,7 +208,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_binary(
return dim == 3 ? qout.reshape({act.size(0), -1, N}) : qout;
}
Tensor QLinearOnednnXPU::q_linear_pointwise_binary_tensor(
static Tensor q_linear_pointwise_binary_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
@ -249,7 +248,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_binary_tensor(
unary_post_op_algorithm);
}
Tensor QLinearOnednnXPU::q_linear_prepack_onednn(
static at::Tensor q_linear_prepack_onednn(
at::Tensor weight,
std::optional<torch::List<int64_t>> input_shape) {
at::Tensor weight_transposed = weight.transpose(0, 1);
@ -259,19 +258,19 @@ Tensor QLinearOnednnXPU::q_linear_prepack_onednn(
TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise));
TORCH_FN(q_linear_pointwise));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.tensor"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_tensor));
TORCH_FN(q_linear_pointwise_tensor));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_prepack"),
TORCH_FN(QLinearOnednnXPU::q_linear_prepack_onednn));
TORCH_FN(q_linear_prepack_onednn));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary));
TORCH_FN(q_linear_pointwise_binary));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary_tensor));
TORCH_FN(q_linear_pointwise_binary_tensor));
}
} // namespace at::native::xpu
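
For orientation, the TORCH_LIBRARY_IMPL block above wires these XPU kernels into the onednn::qlinear_pointwise* ops. A rough Python sketch of the affine-quantized linear math they implement, with illustrative scales, zero points, and shapes (the real kernels additionally fuse post-ops and dispatch through oneDNN):

import torch

# y = ((x_int - x_zp) * x_scale) @ (((w_int - w_zp) * w_scale).T) + bias
x_int = torch.randint(0, 256, (2, 4), dtype=torch.uint8)    # quantized activation
w_int = torch.randint(-128, 128, (3, 4), dtype=torch.int8)  # quantized weight
x_scale, x_zp = 0.1, 128
w_scale, w_zp = 0.05, 0
bias = torch.zeros(3)

x_fp = (x_int.to(torch.float32) - x_zp) * x_scale  # dequantize activation
w_fp = (w_int.to(torch.float32) - w_zp) * w_scale  # dequantize weight
y = x_fp @ w_fp.t() + bias                          # plain linear in fp32
print(y.shape)  # torch.Size([2, 3])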

View File

@ -1,91 +0,0 @@
#pragma once
#include <ATen/Config.h>
#include <ATen/Tensor.h>
#include <ATen/core/List.h>
namespace at::native::xpu {
class QLinearOnednnXPU final {
public:
C10_API static Tensor q_linear_pointwise(
Tensor act,
double act_scale,
int64_t act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view post_op_name,
torch::List<std::optional<at::Scalar>> post_op_args,
std::string_view post_op_algorithm);
C10_API static Tensor q_linear_pointwise_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view post_op_name,
torch::List<std::optional<at::Scalar>> post_op_args,
std::string_view post_op_algorithm);
C10_API static Tensor q_linear_pointwise_binary(
Tensor act,
double act_scale,
int64_t act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<at::Tensor> other,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double other_scale,
int64_t other_zero_point,
std::string_view binary_post_op,
double binary_alpha,
std::string_view unary_post_op,
torch::List<std::optional<at::Scalar>> unary_post_op_args,
std::string_view unary_post_op_algorithm);
C10_API static Tensor q_linear_pointwise_binary_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<at::Tensor> other,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
double other_scale,
int64_t other_zero_point,
std::string_view binary_post_op,
double binary_alpha,
std::string_view unary_post_op,
torch::List<std::optional<at::Scalar>> unary_post_op_args,
std::string_view unary_post_op_algorithm);
C10_API static Tensor q_linear_prepack_onednn(
at::Tensor weight,
std::optional<torch::List<int64_t>> input_shape);
static inline c10::ScalarType qlinear_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype);
}; // class QLinearOnednnXPU
} // namespace at::native::xpu

View File

@ -2,7 +2,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/MemoryOverlap.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
@ -12,85 +11,10 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/kthvalue_native.h>
#include <ATen/ops/sort.h>
#include <ATen/ops/sort_native.h>
#endif
namespace at::native {
namespace {
void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& values, Tensor& indices) {
using namespace mps;
if (self.dim() == 0 && self.numel() == 1) {
values.copy_(self);
indices.zero_();
return;
}
// Handle empty tensors
if (self.numel() == 0) {
values.copy_(self);
indices.copy_(values.toType(at::ScalarType::Long));
return;
}
// issue #154890, raising error to prevent crash within MPSGraph until
// workaround is implemented.
TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890");
auto stream = getCurrentMPSStream();
struct CachedGraph : public MPSCachedGraph {
CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
MPSGraphTensor *selfTensor = nil, *valuesTensor = nil, *indicesTensor = nil;
};
// MPSGraph kthvalue is always sorted.
@autoreleasepool {
// Input as placeholders
MPSShape* input_shape = getMPSShape(self);
NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
std::string key = std::string("kthvalue:") + [ns_shape_key UTF8String] + ":" + getMPSTypeString(self) + ":k" +
std::to_string(k) + ":dim" + std::to_string(dim);
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self), input_shape);
MPSGraphTensor* castInputTensor = newCachedGraph->selfTensor;
MPSDataType dataType = getMPSDataType(self);
// #issue 104398441 sortWithTensor and argsortWithTensor
if (dataType != MPSDataTypeInt32 && dataType != MPSDataTypeFloat32 && dataType != MPSDataTypeFloat16) {
dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32;
castInputTensor = [mpsGraph castTensor:newCachedGraph->selfTensor toType:dataType name:@"castInputTensor"];
}
MPSGraphTensor* sortedTensor = [mpsGraph sortWithTensor:castInputTensor
axis:(NSUInteger)dim
descending:false
name:nil];
sortedTensor = [mpsGraph sliceTensor:sortedTensor
dimension:(NSUInteger)dim
start:((NSUInteger)k - 1)
length:1
name:nil];
MPSGraphTensor* argSortedTensor = [mpsGraph argSortWithTensor:castInputTensor
axis:(NSInteger)dim
descending:false
name:@"kthvalue_out"];
argSortedTensor = [mpsGraph sliceTensor:argSortedTensor
dimension:dim
start:((NSUInteger)k - 1)
length:1
name:nil];
newCachedGraph->valuesTensor = sortedTensor;
newCachedGraph->indicesTensor = argSortedTensor;
});
Placeholder inputPlaceholder = Placeholder(cachedGraph->selfTensor, self);
// Outputs as placeholders
Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values);
Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices);
// Create dictionary of inputs and outputs
auto feeds = dictionaryFromPlaceholders(inputPlaceholder);
auto results = dictionaryFromPlaceholders(valuesPlaceholder, indicesPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, results);
}
}
} // anonymous namespace
// sort
TORCH_IMPL_FUNC(sort_stable_out_mps)
@ -157,31 +81,4 @@ TORCH_IMPL_FUNC(sort_stable_out_mps)
runMPSGraph(stream, cachedGraph->graph(), feeds, results);
}
}
std::tuple<Tensor&, Tensor&> kthvalue_out_mps(const Tensor& self,
int64_t k,
int64_t dim_,
bool keepdim,
Tensor& values,
Tensor& indices) {
// See note [Writing Nondeterministic Operations]
// If there are duplicate elements of the kth value, the procedure for choosing which
// of the duplicates to use for the indices output is nondeterministic.
at::globalContext().alertNotDeterministic("kthvalue MPS");
int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
int64_t slicesize = self.dim() == 0 ? 1 : self.size(dim);
TORCH_CHECK(k >= 1 && k <= slicesize, "kthvalue(): selected number k out of range for dimension ", dim);
at::assert_no_overlap(self, values);
_reduction_with_indices_allocate_or_resize_output(values, indices, self, dim, keepdim);
kthvalue_out_mps_impl(self, k, dim, values, indices);
if (!keepdim) {
values.squeeze_(dim);
indices.squeeze_(dim);
}
return std::forward_as_tuple(values, indices);
}
} // namespace at::native
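
For reference, the removed kernel backed torch.kthvalue on MPS by sorting along the target dim and slicing out the (k-1)-th element. A minimal CPU illustration of the operator's semantics (values and shapes are illustrative); the index is nondeterministic when the k-th value is duplicated, which is what the alertNotDeterministic call above flags:

import torch

x = torch.tensor([[3.0, 1.0, 2.0],
                  [9.0, 7.0, 8.0]])
# k-th smallest along dim=1, plus the index it came from
values, indices = torch.kthvalue(x, k=2, dim=1)
print(values)   # tensor([2., 8.])
print(indices)  # tensor([2, 2])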

View File

@ -335,9 +335,6 @@ static void isin_Tensor_Tensor_out_mps(const Tensor& elements,
}
static void is_posneginf_helper(TensorIteratorBase& iter, bool is_neg) {
if (iter.numel() == 0) {
return;
}
const auto& self = iter.input(0);
auto& out = iter.output(0);
@autoreleasepool {

View File

@ -417,7 +417,6 @@ TORCH_IMPL_FUNC(sgn_out_mps)(const Tensor& self, const Tensor& output) {
Tensor& conj_physical_out_mps(const Tensor& self, Tensor& result) {
TORCH_CHECK(self.is_complex());
TORCH_CHECK(self.dtype() != at::kComplexDouble);
mps::unary_op(self, result, "conj", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
return [mpsGraph conjugateWithTensor:inputTensor name:nil];
});

View File

@ -340,8 +340,8 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: abs
SparseCPU, SparseCUDA, SparseMPS: abs_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr
SparseCPU, SparseCUDA: abs_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs
tags: [core, pointwise]
@ -350,16 +350,16 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: abs_
SparseCPU, SparseCUDA, SparseMPS: abs_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_
SparseCPU, SparseCUDA: abs_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs_
- func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS, MTIA: abs_out
SparseCPU, SparseCUDA, SparseMPS: abs_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_out
SparseCPU, SparseCUDA: abs_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_out
tags: pointwise
# Note [Adding an alias]
@ -428,7 +428,7 @@
variants: function, method
structured_delegate: sgn.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse
SparseCPU, SparseCUDA: sgn_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn
tags: pointwise
@ -437,7 +437,7 @@
variants: method
structured_delegate: sgn.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_
SparseCPU, SparseCUDA: sgn_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn_
tags: pointwise
@ -448,7 +448,7 @@
dispatch:
CPU, CUDA: sgn_out
MPS: sgn_out_mps
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_out
SparseCPU, SparseCUDA: sgn_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_out
tags: pointwise
@ -476,7 +476,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: _conj_physical
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr
autogen: _conj_physical.out
- func: conj_physical(Tensor self) -> Tensor
@ -487,8 +487,8 @@
dispatch:
CPU, CUDA: conj_physical_out
MPS: conj_physical_out_mps
SparseCPU, SparseCUDA, SparseMPS: conj_physical_out_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr_out
SparseCPU, SparseCUDA: conj_physical_out_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr_out
tags: pointwise
- func: conj_physical_(Tensor(a!) self) -> Tensor(a!)
@ -554,7 +554,7 @@
structured_delegate: add.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse
SparseCPU, SparseCUDA, SparseMeta: add_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr
MkldnnCPU: mkldnn_add
ZeroTensor: add_zerotensor
@ -566,7 +566,7 @@
variants: method
structured_delegate: add.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse_
SparseCPU, SparseCUDA, SparseMeta: add_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_
MkldnnCPU: mkldnn_add_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add__Tensor
@ -582,7 +582,6 @@
dispatch:
SparseCPU, SparseMeta: add_out_sparse_cpu
SparseCUDA: add_out_sparse_cuda
SparseMPS: add_out_sparse_mps
SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu
SparseCsrCUDA: add_out_sparse_compressed_cuda
MkldnnCPU: mkldnn_add_out
@ -875,7 +874,7 @@
variants: function, method
structured_delegate: asinh.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse
SparseCPU, SparseCUDA: asinh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr
tags: [core, pointwise]
@ -883,7 +882,7 @@
variants: function, method
structured_delegate: asinh.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_
SparseCPU, SparseCUDA: asinh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_
tags: pointwise
@ -893,7 +892,7 @@
dispatch:
CPU, CUDA: asinh_out
MPS: asinh_out_mps
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_out
SparseCPU, SparseCUDA: asinh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_out
tags: pointwise
@ -910,7 +909,7 @@
structured_delegate: atanh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse
SparseCPU, SparseCUDA: atanh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr
tags: [core, pointwise]
@ -918,7 +917,7 @@
structured_delegate: atanh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_
SparseCPU, SparseCUDA: atanh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_
tags: pointwise
@ -928,7 +927,7 @@
dispatch:
CPU, CUDA: atanh_out
MPS: atanh_out_mps
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_out
SparseCPU, SparseCUDA: atanh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_out
tags: pointwise
# arctanh, alias for atanh
@ -965,7 +964,7 @@
variants: function, method
structured_delegate: asin.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asin_sparse
SparseCPU, SparseCUDA: asin_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr
tags: [core, pointwise]
@ -974,7 +973,7 @@
variants: function, method
structured_delegate: asin.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: asin_sparse_
SparseCPU, SparseCUDA: asin_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_
tags: pointwise
@ -984,7 +983,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: asin_out
SparseCPU, SparseCUDA, SparseMPS: asin_sparse_out
SparseCPU, SparseCUDA: asin_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_out
tags: pointwise
@ -1002,7 +1001,7 @@
structured_delegate: atan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atan_sparse
SparseCPU, SparseCUDA: atan_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr
tags: [core, pointwise]
@ -1011,7 +1010,7 @@
structured_delegate: atan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: atan_sparse_
SparseCPU, SparseCUDA: atan_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_
tags: pointwise
@ -1021,7 +1020,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: atan_out
SparseCPU, SparseCUDA, SparseMPS: atan_sparse_out
SparseCPU, SparseCUDA: atan_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_out
tags: pointwise
@ -1460,7 +1459,7 @@
structured_delegate: ceil.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse
SparseCPU, SparseCUDA: ceil_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr
tags: [core, pointwise]
@ -1469,7 +1468,7 @@
structured_delegate: ceil.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_
SparseCPU, SparseCUDA: ceil_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_
tags: pointwise
@ -1479,7 +1478,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: ceil_out
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_out
SparseCPU, SparseCUDA: ceil_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_out
tags: pointwise
@ -2407,7 +2406,7 @@
MPS: empty_mps
Meta: empty_meta_symint
MkldnnCPU: empty_mkldnn
SparseCPU, SparseCUDA, SparseMPS: empty_sparse
SparseCPU, SparseCUDA: empty_sparse
SparseMeta: empty_sparse_symint
SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
SparseCsrMeta: empty_sparse_compressed_symint
@ -2535,7 +2534,7 @@
structured_delegate: erf.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: erf_sparse
SparseCPU, SparseCUDA: erf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr
tags: [core, pointwise]
@ -2544,7 +2543,7 @@
structured_delegate: erf.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: erf_sparse_
SparseCPU, SparseCUDA: erf_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_
tags: pointwise
@ -2554,7 +2553,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: erf_out
SparseCPU, SparseCUDA, SparseMPS: erf_sparse_out
SparseCPU, SparseCUDA: erf_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_out
tags: pointwise
@ -2620,7 +2619,7 @@
structured_delegate: expm1.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse
SparseCPU, SparseCUDA: expm1_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr
tags: [core, pointwise]
@ -2629,7 +2628,7 @@
structured_delegate: expm1.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_
SparseCPU, SparseCUDA: expm1_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_
tags: pointwise
@ -2639,7 +2638,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: expm1_out
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_out
SparseCPU, SparseCUDA: expm1_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_out
tags: pointwise
@ -2738,7 +2737,7 @@
structured_delegate: floor.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: floor_sparse
SparseCPU, SparseCUDA: floor_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr
tags: [core, pointwise]
@ -2747,7 +2746,7 @@
structured_delegate: floor.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: floor_sparse_
SparseCPU, SparseCUDA: floor_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_
tags: pointwise
@ -2757,7 +2756,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: floor_out
SparseCPU, SparseCUDA, SparseMPS: floor_sparse_out
SparseCPU, SparseCUDA: floor_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_out
tags: pointwise
@ -2765,7 +2764,7 @@
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CPU, CUDA, MPS, MTIA: floor_divide
CPU, CUDA, MPS: floor_divide
SparseCPU, SparseCUDA: floor_divide_sparse
- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@ -2799,7 +2798,7 @@
structured_delegate: frac.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: frac_sparse
SparseCPU, SparseCUDA: frac_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr
tags: pointwise
@ -2808,7 +2807,7 @@
structured_delegate: frac.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: frac_sparse_
SparseCPU, SparseCUDA: frac_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_
tags: pointwise
@ -2819,7 +2818,7 @@
dispatch:
CPU, CUDA: frac_out
MPS: frac_out_mps
SparseCPU, SparseCUDA, SparseMPS: frac_sparse_out
SparseCPU, SparseCUDA: frac_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_out
tags: pointwise
@ -3209,7 +3208,7 @@
dispatch:
CPU, CUDA, MPS, MTIA: isnan
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isnan
SparseCPU, SparseCUDA, SparseMPS: isnan_sparse
SparseCPU, SparseCUDA: isnan_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isnan_sparse_csr
autogen: isnan.out
tags: [core, pointwise]
@ -3290,7 +3289,6 @@
dispatch:
CPU: kthvalue_out_cpu
CUDA: kthvalue_out_cuda
MPS: kthvalue_out_mps
- func: kthvalue.dimname(Tensor self, SymInt k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
variants: function, method
@ -3338,21 +3336,21 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: nan_to_num
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse
SparseCPU, SparseCUDA: nan_to_num_sparse
tags: pointwise
- func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!)
variants: function, method
dispatch:
CompositeExplicitAutograd: nan_to_num_
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_
SparseCPU, SparseCUDA: nan_to_num_sparse_
tags: pointwise
- func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA, MTIA: nan_to_num_out
MPS: nan_to_num_out_mps
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_out
SparseCPU, SparseCUDA: nan_to_num_sparse_out
tags: pointwise
- func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
@ -3555,7 +3553,7 @@
structured_delegate: log1p.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse
SparseCPU, SparseCUDA: log1p_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr
tags: [core, pointwise]
@ -3564,7 +3562,7 @@
structured_delegate: log1p.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_
SparseCPU, SparseCUDA: log1p_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_
tags: pointwise
@ -3574,7 +3572,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: log1p_out
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_out
SparseCPU, SparseCUDA: log1p_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_out
tags: pointwise
@ -4666,7 +4664,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: rad2deg
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse
SparseCPU, SparseCUDA: rad2deg_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr
tags: pointwise
@ -4674,14 +4672,14 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: rad2deg_
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_
SparseCPU, SparseCUDA: rad2deg_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_
tags: pointwise
- func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeExplicitAutograd: rad2deg_out
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_out
SparseCPU, SparseCUDA: rad2deg_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_out
tags: pointwise
@ -4689,7 +4687,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: deg2rad
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse
SparseCPU, SparseCUDA: deg2rad_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr
tags: pointwise
@ -4697,14 +4695,14 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: deg2rad_
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_
SparseCPU, SparseCUDA: deg2rad_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_
tags: pointwise
- func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeExplicitAutograd: deg2rad_out
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_out
SparseCPU, SparseCUDA: deg2rad_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_out
tags: pointwise
@ -4930,7 +4928,7 @@
structured_delegate: neg.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: neg_sparse
SparseCPU, SparseCUDA: neg_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg
tags: [core, pointwise]
@ -4940,7 +4938,7 @@
structured_delegate: neg.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: neg_sparse_
SparseCPU, SparseCUDA: neg_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg_
tags: pointwise
@ -4951,7 +4949,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: neg_out
SparseCPU, SparseCUDA, SparseMPS: neg_out_sparse
SparseCPU, SparseCUDA: neg_out_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_out
tags: pointwise
# Alias for neg
@ -5035,7 +5033,7 @@
structured_delegate: round.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: round_sparse
SparseCPU, SparseCUDA: round_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr
tags: [core, pointwise]
@ -5044,7 +5042,7 @@
structured_delegate: round.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: round_sparse_
SparseCPU, SparseCUDA: round_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_
tags: pointwise
@ -5054,7 +5052,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: round_out
SparseCPU, SparseCUDA, SparseMPS: round_sparse_out
SparseCPU, SparseCUDA: round_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_out
tags: pointwise
@ -5097,7 +5095,7 @@
QuantizedCPU: relu_quantized_cpu
QuantizedCUDA: relu_quantized_cuda
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu
SparseCPU, SparseCUDA, SparseMPS: relu_sparse
SparseCPU, SparseCUDA: relu_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr
tags: [core, pointwise]
@ -5112,7 +5110,7 @@
QuantizedCPU: relu_quantized_cpu_
QuantizedCUDA: relu_quantized_cuda_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu_
SparseCPU, SparseCUDA, SparseMPS: relu_sparse_
SparseCPU, SparseCUDA: relu_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr_
autogen: relu.out
tags: pointwise
@ -5399,7 +5397,7 @@
variants: function, method
dispatch:
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr
SparseCPU, SparseCUDA, SparseMPS: sin_sparse
SparseCPU, SparseCUDA: sin_sparse
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sin
tags: [core, pointwise]
@ -5409,7 +5407,7 @@
variants: function, method
dispatch:
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_
SparseCPU, SparseCUDA, SparseMPS: sin_sparse_
SparseCPU, SparseCUDA: sin_sparse_
tags: pointwise
- func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -5419,7 +5417,7 @@
dispatch:
CPU, CUDA, MPS, MTIA: sin_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_out
SparseCPU, SparseCUDA, SparseMPS: sin_sparse_out
SparseCPU, SparseCUDA: sin_sparse_out
tags: pointwise
- func: sinc(Tensor self) -> Tensor
@ -5444,7 +5442,7 @@
structured_delegate: sinh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse
SparseCPU, SparseCUDA: sinh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr
tags: [core, pointwise]
@ -5453,7 +5451,7 @@
structured_delegate: sinh.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_
SparseCPU, SparseCUDA: sinh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_
tags: pointwise
@ -5463,7 +5461,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: sinh_out
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_out
SparseCPU, SparseCUDA: sinh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_out
# Returns a copy of this `Variable` that is detached from its autograd graph.
@ -5906,7 +5904,7 @@
variants: function, method
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sqrt
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse
SparseCPU, SparseCUDA: sqrt_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr
tags: [core, pointwise]
@ -5915,7 +5913,7 @@
structured_delegate: sqrt.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_
SparseCPU, SparseCUDA: sqrt_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_
tags: pointwise
@ -5925,7 +5923,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: sqrt_out
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_out
SparseCPU, SparseCUDA: sqrt_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_out
tags: pointwise
@ -6063,7 +6061,7 @@
structured_delegate: tan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: tan_sparse
SparseCPU, SparseCUDA: tan_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr
tags: [core, pointwise]
@ -6072,7 +6070,7 @@
structured_delegate: tan.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: tan_sparse_
SparseCPU, SparseCUDA: tan_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_
tags: pointwise
@ -6082,7 +6080,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: tan_out
SparseCPU, SparseCUDA, SparseMPS: tan_sparse_out
SparseCPU, SparseCUDA: tan_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_out
tags: pointwise
@ -6093,7 +6091,7 @@
dispatch:
QuantizedCPU: tanh_quantized_cpu
MkldnnCPU: mkldnn_tanh
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse
SparseCPU, SparseCUDA: tanh_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh
tags: [core, pointwise]
@ -6104,7 +6102,7 @@
variants: function, method
dispatch:
MkldnnCPU: mkldnn_tanh_
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_
SparseCPU, SparseCUDA: tanh_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh_
tags: pointwise
@ -6115,7 +6113,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS, MTIA: tanh_out
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_out
SparseCPU, SparseCUDA: tanh_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_out
tags: pointwise
@ -6387,8 +6385,8 @@
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr
SparseCPU, SparseCUDA: trunc_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr
tags: [core, pointwise]
- func: trunc_(Tensor(a!) self) -> Tensor(a!)
@ -6396,8 +6394,8 @@
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_
SparseCPU, SparseCUDA: trunc_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_
tags: pointwise
- func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -6406,8 +6404,8 @@
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS: trunc_out
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_out
SparseCPU, SparseCUDA: trunc_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_out
tags: pointwise
# Alias for trunc
@ -7369,8 +7367,8 @@
- func: _to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sparse_to_dense
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: sparse_compressed_to_dense
SparseCPU, SparseCUDA: sparse_to_dense
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_dense
MkldnnCPU: mkldnn_to_dense
autogen: _to_dense.out
@ -7396,8 +7394,8 @@
- func: dense_dim(Tensor self) -> int
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: dense_dim_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: dense_dim_sparse_csr
SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr
CompositeExplicitAutograd: dense_dim_default
device_check: NoCheck
device_guard: False
@ -7530,7 +7528,7 @@
device_check: NoCheck # Allows copy into different device
variants: function
dispatch:
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: copy_sparse_
SparseCPU, SparseCUDA, SparseMeta: copy_sparse_
autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out
# By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors
@ -9721,7 +9719,7 @@
structured_delegate: sign.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sign_sparse
SparseCPU, SparseCUDA: sign_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr
tags: [core, pointwise]
@ -9730,7 +9728,7 @@
structured_delegate: sign.out
variants: method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sign_sparse_
SparseCPU, SparseCUDA: sign_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_
tags: pointwise
@ -9741,7 +9739,7 @@
dispatch:
CPU, CUDA: sign_out
MPS: sign_out_mps
SparseCPU, SparseCUDA, SparseMPS: sign_sparse_out
SparseCPU, SparseCUDA: sign_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_out
tags: pointwise
@ -9749,7 +9747,7 @@
variants: function, method
structured_delegate: signbit.out
dispatch:
SparseCPU, SparseCUDA, SparseMPS: signbit_sparse
SparseCPU, SparseCUDA: signbit_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr
tags: pointwise
@ -9760,7 +9758,7 @@
CPU: signbit_out
CUDA: signbit_out
MPS: signbit_out_mps
SparseCPU, SparseCUDA, SparseMPS: signbit_sparse_out
SparseCPU, SparseCUDA: signbit_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr_out
tags: pointwise
@ -13264,7 +13262,7 @@
dispatch:
CompositeExplicitAutograd: isinf
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isinf
SparseCPU, SparseCUDA, SparseMPS: isinf_sparse
SparseCPU, SparseCUDA: isinf_sparse
SparseMeta: isinf_sparse_meta
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isinf_sparse_csr
autogen: isinf.out
@ -13280,7 +13278,7 @@
structured_delegate: isposinf.out
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isposinf
SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse
SparseCPU, SparseCUDA: isposinf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr
tags: pointwise
@ -13289,7 +13287,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: isposinf_out
SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse_out
SparseCPU, SparseCUDA: isposinf_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr_out
tags: pointwise
@ -13298,7 +13296,7 @@
structured_delegate: isneginf.out
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isneginf
SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse
SparseCPU, SparseCUDA: isneginf_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr
tags: pointwise
@ -13307,7 +13305,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: isneginf_out
SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse_out
SparseCPU, SparseCUDA: isneginf_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr_out
tags: pointwise
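
For context, the dispatch keys edited above select which kernel runs per backend and layout; this hunk removes the SparseMPS/SparseCsrMPS registrations. A small CPU illustration of what such an entry controls (sparse COO abs routes through the SparseCPU kernel):

import torch

x = torch.tensor([[-1.0, 0.0],
                  [0.0, 2.0]]).to_sparse()  # sparse COO, CPU
y = x.abs()                                  # dispatches to abs_sparse via SparseCPU
print(y.to_dense())
# tensor([[1., 0.],
#         [0., 2.]])
# With the SparseMPS entries removed, the same call on a sparse MPS tensor no
# longer has a dedicated registration in this dispatch table.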

View File

@ -1,73 +0,0 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/SparseTensorUtils.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/sparse/SparseStubs.h>
#include <ATen/native/sparse/FlattenIndicesCommon.h>
#include <ATen/ExpandUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_coalesce_native.h>
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
#include <ATen/ops/empty_native.h>
#include <ATen/ops/zeros_native.h>
#endif
namespace at::native {
namespace {
using namespace mps;
using namespace at::sparse;
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/FlattenIndices_metallib.h>
#endif
Tensor flatten_indices_mps(const Tensor& indices, IntArrayRef size) {
TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D");
TORCH_CHECK(static_cast<size_t>(indices.size(0)) == size.size(),
"flatten_indices: indices.size(0) must equal size.size()");
const int64_t sparse_dim = indices.size(0);
const int64_t nnz = indices.size(1);
if (nnz == 0) {
return at::empty({0}, indices.options().dtype(kLong));
}
// Row-major multipliers for flattening: mul[d] = prod_{j>d}(size[j])
std::vector<int64_t> row_muls(sparse_dim);
row_muls[sparse_dim - 1] = 1;
for (int64_t i = sparse_dim - 2; i >= 0; --i) {
row_muls[i] = row_muls[i + 1] * size[i + 1];
}
auto flat_indices = at::empty({nnz}, indices.options().dtype(kLong));
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel");
auto encoder = stream->commandEncoder();
[encoder setComputePipelineState:pipeline];
mtl_setArgs(encoder,
indices,
row_muls,
flat_indices,
static_cast<uint>(sparse_dim),
indices.strides()
);
mtl_dispatch1DJob(encoder, pipeline, nnz);
}
});
return flat_indices;
}
} // namespace
REGISTER_MPS_DISPATCH(flatten_indices_stub, &flatten_indices_mps)
} // namespace at::native
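
The deleted helper (and its replacement in the next file) flattens 2-D COO indices into linear offsets with row-major multipliers mul[d] = prod_{j>d} size[j]. A small Python sketch of the same scheme; the helper name is illustrative, not part of the diff:

import torch

def flatten_coo_indices(indices: torch.Tensor, size: list) -> torch.Tensor:
    # indices: [sparse_dim, nnz]; returns [nnz] row-major linear offsets
    sparse_dim = indices.size(0)
    muls = torch.ones(sparse_dim, dtype=torch.long)
    for d in range(sparse_dim - 2, -1, -1):
        muls[d] = muls[d + 1] * size[d + 1]
    return (indices * muls.unsqueeze(1)).sum(dim=0)

idx = torch.tensor([[0, 1, 2],
                    [3, 2, 0]])
print(flatten_coo_indices(idx, [3, 4]))  # tensor([3, 6, 8]); e.g. (1, 2) -> 1*4 + 2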

View File

@ -20,9 +20,46 @@ using namespace at::sparse;
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/Coalesce_metallib.h>
#include <ATen/native/mps/Sparse_metallib.h>
#endif
static Tensor flatten_indices(const Tensor& indices, IntArrayRef size) {
TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D");
TORCH_CHECK(static_cast<size_t>(indices.size(0)) == size.size(),
"flatten_indices: indices.size(0) must equal size.size()");
int64_t sparse_dim = indices.size(0);
int64_t nnz = indices.size(1);
if (nnz == 0) {
return at::empty({0}, indices.options().dtype(kLong));
}
std::vector<int64_t> strides(sparse_dim);
strides[sparse_dim - 1] = 1;
for (int64_t i = sparse_dim - 2; i >= 0; i--) {
strides[i] = strides[i + 1] * size[i + 1];
}
Tensor flat_indices = at::empty({nnz}, indices.options().dtype(kLong));
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel");
auto encoder = stream->commandEncoder();
[encoder setComputePipelineState:pipeline];
mtl_setArgs(encoder, indices, strides, flat_indices, sparse_dim, nnz);
mtl_dispatch1DJob(encoder, pipeline, nnz);
}
});
return flat_indices;
}
static Tensor compute_output_positions(const Tensor& is_unique) {
int64_t nnz = is_unique.size(0);

View File

@ -1,169 +0,0 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/SparseTensorUtils.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_coalesce_native.h>
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
#include <ATen/ops/cat.h>
#include <ATen/ops/add_native.h>
#include <ATen/ops/empty_native.h>
#include <ATen/ops/zeros_native.h>
#include <ATen/ops/result_type.h>
#include <ATen/ops/copy_sparse_to_sparse.h>
#include <ATen/ops/mul.h>
#endif
namespace at::native {
using namespace at::sparse;
Tensor& add_out_dense_sparse_mps(Tensor& out, const Tensor& dense, const SparseTensor& sparse, const Scalar& alpha);
Tensor& add_out_dense_sparse_mps(
Tensor& out,
const Tensor& dense,
const SparseTensor& sparse,
const Scalar& alpha) {
TORCH_CHECK(dense.is_mps(), "add: expected 'self' to be an MPS tensor, got ", dense.device());
TORCH_CHECK(sparse.is_mps(), "add: expected 'other' to be an MPS tensor, got ", sparse.device());
TORCH_CHECK(out.is_mps(), "add: expected 'out' to be an MPS tensor, got ", out.device());
TORCH_CHECK(dense.sizes().equals(sparse.sizes()),
"add: expected 'self' and 'other' to have same size, but self has size ",
dense.sizes(), " while other has size ", sparse.sizes(),
" (FYI: dense-sparse addition does not currently support broadcasting)");
const int64_t nnz = sparse._nnz();
if (nnz == 0) {
out.resize_as_(dense);
out.copy_(dense);
return out;
}
auto commonDtype = at::result_type(dense, sparse);
TORCH_CHECK(canCast(commonDtype, out.scalar_type()),
"Can't convert result type ", commonDtype, " to output ", out.scalar_type());
Tensor r;
const bool need_separate_buffer = out.is_same(dense) || (out.scalar_type() != commonDtype);
if (need_separate_buffer) {
r = at::empty(dense.sizes(), out.options().dtype(commonDtype));
} else {
r = out;
r.resize_as_(dense);
}
Tensor dense_buffer = dense.to(commonDtype);
if (!r.is_same(dense_buffer)) {
r.copy_(dense_buffer);
}
Tensor indices = sparse._indices();
Tensor values = sparse._values().to(commonDtype);
if (values.numel() == 0) {
if (!out.is_same(r)) {
out.resize_as_(dense);
out.copy_(r);
}
return out;
}
const int64_t nDim = r.dim();
const int64_t nDimI = sparse.sparse_dim();
TORCH_CHECK(nDimI >= 0 && nDimI <= nDim,
"Invalid sparse_dim=", nDimI, " for dense tensor of dim ", nDim);
Tensor indices1D = at::sparse::flatten_indices(indices, sparse.sizes()).contiguous();
int64_t view_rows = 1;
int64_t view_cols = 1;
for (int64_t i = 0; i < nDimI; i++) {
view_rows *= r.size(i);
}
for (int64_t i = nDimI; i < nDim; i++) {
view_cols *= r.size(i);
}
if (view_cols == 1) {
Tensor r_flat = r.reshape({view_rows});
Tensor values_1d = values.reshape({nnz});
r_flat.index_add_(0, indices1D, values_1d, alpha);
} else {
Tensor r_view = r.view({view_rows, view_cols});
Tensor values_2d = values.reshape({nnz, view_cols});
r_view.index_add_(0, indices1D, values_2d, alpha);
}
if (!out.is_same(r)) {
out.resize_as_(dense);
out.copy_(r);
}
return out;
}
SparseTensor& add_out_sparse_mps(const SparseTensor& self,
const SparseTensor& other,
const Scalar& alpha,
SparseTensor& out) {
TORCH_CHECK(other.is_sparse(), "add(sparse, dense) is not supported. Use add(dense, sparse) instead.");
TORCH_CHECK(self.is_mps(), "add: expected 'self' to be MPS, but got ", self.device());
TORCH_CHECK(other.is_mps(), "add: expected 'other' to be MPS, but got ", other.device());
TORCH_CHECK(out.is_mps(), "add: expected 'out' to be MPS, but got ", out.device());
if (!self.is_sparse()) {
return add_out_dense_sparse_mps(out, self, other, alpha);
}
auto commonDtype = at::result_type(self, other);
TORCH_CHECK(canCast(commonDtype, out.scalar_type()),
"Can't convert result type ", commonDtype, " to output ", out.scalar_type());
TORCH_CHECK(self.sizes().equals(other.sizes()),
"add: expected 'self' and 'other' to have same size, but ", self.sizes(), " != ", other.sizes());
TORCH_CHECK(is_same_density(self, other),
"add: expected 'self' and 'other' to have same density, but 'self' has ",
self.sparse_dim(), " sparse dimensions while 'other' has ", other.sparse_dim(), " sparse dimensions");
if (other._nnz() == 0) {
out.resize_as_(self);
Tensor vals = self._values();
if (vals.scalar_type() != out.scalar_type()) {
vals = vals.to(out.scalar_type());
}
alias_into_sparse(out, self._indices(), vals);
out._coalesced_(self.is_coalesced());
return out;
}
Tensor t_indices_ = self._indices();
Tensor s_indices_ = other._indices();
Tensor t_values_ = self._values().to(commonDtype);
Tensor s_values_ = other._values().to(commonDtype);
if (!alpha.isIntegral(false) || alpha.to<double>() != 1.0) {
s_values_ = at::mul(s_values_, alpha);
}
Tensor r_indices_ = at::cat({t_indices_, s_indices_}, 1);
Tensor r_values_ = at::cat({t_values_, s_values_ }, 0);
SparseTensor tmp = empty({0}, out.options().dtype(commonDtype));
tmp.resize_as_(other);
alias_into_sparse(tmp, r_indices_, r_values_);
tmp = _coalesce_sparse_mps(tmp);
out.resize_as_(other);
Tensor out_vals = tmp._values();
if (out.scalar_type() != commonDtype) {
out_vals = out_vals.to(out.scalar_type());
}
alias_into_sparse(out, tmp._indices(), out_vals);
out._coalesced_(tmp.is_coalesced());
return out;
}
} // namespace at::native
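
The deleted file implemented out = dense + alpha * sparse on MPS by flattening the sparse dims and scattering values with index_add_. A rough CPU equivalent of that strategy, assuming illustrative shapes and alpha:

import torch

dense = torch.zeros(3, 4)
sparse = torch.sparse_coo_tensor(
    torch.tensor([[0, 2], [1, 3]]),  # indices
    torch.tensor([10.0, 20.0]),      # values
    (3, 4),
).coalesce()
alpha = 2.0

flat = dense.reshape(-1).clone()
lin_idx = sparse.indices()[0] * dense.size(1) + sparse.indices()[1]
flat.index_add_(0, lin_idx, sparse.values() * alpha)
result = flat.view_as(dense)

assert torch.equal(result, dense + alpha * sparse.to_dense())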

View File

@ -1,19 +0,0 @@
#include <metal_stdlib>
using namespace metal;
kernel void flatten_indices_kernel(
device const long* indices [[ buffer(0) ]],
device const long* row_muls [[ buffer(1) ]],
device long* flat_indices [[ buffer(2) ]],
constant uint& sparse_dim [[ buffer(3) ]],
constant long2& idx_strides [[ buffer(4) ]],
uint gid [[ thread_position_in_grid ]]) {
long flat = 0;
for (uint d = 0; d < sparse_dim; ++d) {
long off = (long)d * idx_strides.x + (long)gid * idx_strides.y;
long v = indices[off];
flat += v * row_muls[d];
}
flat_indices[gid] = flat;
}

View File

@ -2,6 +2,19 @@
#include <metal_stdlib>
using namespace metal;
kernel void flatten_indices_kernel(
device const int64_t* indices [[buffer(0)]],
device const int64_t* strides [[buffer(1)]],
device int64_t* flat_indices [[buffer(2)]],
constant uint& sparse_dim [[buffer(3)]],
constant uint& nnz [[buffer(4)]],
uint gid [[thread_position_in_grid]]) {
int64_t flat_idx = 0;
for (uint d = 0; d < sparse_dim; d++) {
flat_idx += indices[d * nnz + gid] * strides[d];
}
flat_indices[gid] = flat_idx;
}
kernel void compute_output_positions_kernel(
device const bool* is_unique [[buffer(0)]],

View File

@ -10,13 +10,8 @@ using namespace at::native::memory;
constexpr int buffer_size = 1024;
#if defined(CUDA_VERSION) && CUDA_VERSION < 13000
__managed__ double4 buffer1[buffer_size];
__managed__ double4 buffer2[buffer_size];
#else
__managed__ double4_16a buffer1[buffer_size];
__managed__ double4_16a buffer2[buffer_size];
#endif
void reset_buffers() {
for (int i = 0; i < buffer_size; i++) {

View File

@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4



View File

@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4


View File

@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4


View File

@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4


View File

@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4


View File

@ -1427,13 +1427,23 @@ class AOTInductorModelCache:
inductor_configs = {}
if mode == "max-autotune":
inductor_configs["max_autotune"] = True
ep = torch.export.export(
model_clone,
example_args,
example_kwargs,
dynamic_shapes=dynamic_shapes,
strict=False,
)
# We can't support this in non-strict
if hasattr(model_clone, "name") and model.name == "levit_128":
ep = torch.export.export(
model_clone,
example_args,
example_kwargs,
dynamic_shapes=dynamic_shapes,
strict=True,
)
else:
ep = torch.export.export(
model_clone,
example_args,
example_kwargs,
dynamic_shapes=dynamic_shapes,
strict=True,
)
with torch.no_grad():
package_path = torch._inductor.aoti_compile_and_package(
ep, inductor_configs=inductor_configs
@ -2317,6 +2327,7 @@ class BenchmarkRunner:
# no need for n iterations
# the logic should be the same to self.model_iter_fn (forward_pass)
with self.autocast(**self.autocast_arg):
model_copy.name = name
optimized_model_iter_fn = optimize_ctx(
model_copy, example_inputs
)
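
For context on the export change above, strict=True captures the model through TorchDynamo tracing while strict=False (non-strict) traces with the Python interpreter; the benchmark special-cases one model that only exports under strict mode. A minimal sketch of the call with an illustrative module:

import torch

class Tiny(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1

ep = torch.export.export(Tiny(), (torch.randn(2, 3),), strict=True)
print(ep)  # ExportedProgram with the captured graph and input specs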

View File

@ -759,13 +759,6 @@ libtorch_cuda_distributed_extra_sources = [
"torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
]
libtorch_nvshmem_sources = [
"torch/csrc/distributed/c10d/cuda/utils.cpp",
"torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
"torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu",
"torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu",
]
libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources
libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_sources + [

View File

@ -61,14 +61,11 @@ void* get_symbol(const char* name, int version) {
}
#endif
// As of CUDA 13, this API is deprecated.
#if defined(CUDA_VERSION) && (CUDA_VERSION < 13000)
// This falls back to the old API to try getting the symbol again.
if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres);
st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
return out;
}
#endif
// If the symbol cannot be resolved, report and return nullptr;
// the caller is responsible for checking the pointer.

View File

@ -540,9 +540,11 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
)
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
if(NOT WIN32)
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
if(USE_DISTRIBUTED)
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
if(NOT WIN32)
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
endif()
endif()
endif()
@ -566,30 +568,32 @@ if(USE_CUDA)
list(APPEND Caffe2_GPU_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
)
endif()
if(USE_DISTRIBUTED)
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
)
endif()
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
if(CMAKE_COMPILER_IS_GNUCXX)
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
endif()
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
if(CMAKE_COMPILER_IS_GNUCXX)
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
endif()
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
endif()
endif()
set_source_files_properties(
${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
@ -622,9 +626,11 @@ if(USE_ROCM)
list(APPEND Caffe2_HIP_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
if(USE_DISTRIBUTED)
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
endif()
endif()
# caffe2_nvrtc's stubs to driver APIs are useful for HIP.
# See NOTE [ ATen NVRTC Stub and HIP ]
@ -1057,7 +1063,7 @@ elseif(USE_CUDA)
UNFUSE_FMA # Addressing issue #121558
)
target_sources(torch_cuda PRIVATE $<TARGET_OBJECTS:flash_attention>)
target_include_directories(torch_cuda SYSTEM PUBLIC
target_include_directories(torch_cuda PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc>
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/flash-attention/include>
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/cutlass/include>
@ -1345,10 +1351,12 @@ if(BUILD_TEST)
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
if(USE_DISTRIBUTED)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
endif()
endif()
if(NOT NO_API)
add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api)
@ -1453,40 +1461,46 @@ if(BUILD_LITE_INTERPRETER)
endif()
endif()
if(USE_GLOO AND USE_C10D_GLOO)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
endif()
if(USE_UCC AND USE_C10D_UCC)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
if(USE_CUDA)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
# Pass USE_DISTRIBUTED to torch_cpu, as some code in jit/pickler.cpp and
# jit/unpickler.cpp needs to be compiled only when USE_DISTRIBUTED is set
if(USE_DISTRIBUTED)
target_compile_definitions(torch_cpu PUBLIC USE_DISTRIBUTED)
if(USE_GLOO AND USE_C10D_GLOO)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
endif()
endif()
if(USE_NCCL AND USE_C10D_NCCL)
if(USE_ROCM)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
if(USE_UCC AND USE_C10D_UCC)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
if(USE_CUDA)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
endif()
endif()
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
if(USE_NCCL AND USE_C10D_NCCL)
if(USE_ROCM)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
endif()
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
endif()
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
endif()
# Pass USE_RPC in order to reduce use of
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
# need to be removed when RPC is supported
if(NOT WIN32)
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
endif()
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
# can only be compiled with USE_TENSORPIPE is set.
if(USE_TENSORPIPE)
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
endif()
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
endif()
# Pass USE_RPC in order to reduce use of
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
# need to be removed when RPC is supported
if(NOT WIN32)
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
endif()
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
# can only be compiled with USE_TENSORPIPE is set.
if(USE_TENSORPIPE)
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
endif()
if(NOT INTERN_BUILD_MOBILE)
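
These build files appear to re-gate the c10d/distributed sources and test targets behind USE_DISTRIBUTED. At runtime, Python code typically probes for that support rather than assuming it; a small sketch:

import torch

if torch.distributed.is_available():
    # the build included USE_DISTRIBUTED; individual backends may still vary
    print("gloo:", torch.distributed.is_gloo_available())
    print("nccl:", torch.distributed.is_nccl_available())
else:
    print("this build was compiled without distributed support")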

View File

@ -1126,7 +1126,7 @@ if(USE_CUDA AND CUDA_VERSION VERSION_LESS 13.0)
include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
endif()
if(USE_TENSORPIPE)
if(USE_DISTRIBUTED AND USE_TENSORPIPE)
if(MSVC)
message(WARNING "Tensorpipe cannot be used on Windows.")
else()

View File

@ -191,11 +191,13 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}")
message(STATUS " USE_XNNPACK : ${USE_XNNPACK}")
message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}")
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
if(${USE_DISTRIBUTED})
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
endif()
if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}")
endif()

View File

@ -3331,6 +3331,13 @@ def coverage_post_process(app, exception):
if not isinstance(app.builder, CoverageBuilder):
return
if not torch.distributed.is_available():
raise RuntimeError(
"The coverage tool cannot run with a version "
"of PyTorch that was built with USE_DISTRIBUTED=0 "
"as this module's API changes."
)
# These are all the modules that have "automodule" in an rst file
# These modules are the ones for which coverage is checked
# Here, we make sure that no module is missing from that list

View File

@ -24,12 +24,17 @@ For Intel Client GPU
+-------------------------------------+----------------------------------------------------------------------------------------------------+
| Supported OS | Validated Hardware |
+=====================================+====================================================================================================+
|| Windows 11 & Ubuntu 24.04/25.04 || Intel® Arc A-Series Graphics (CodeName: Alchemist) |
|| Windows 11 & Ubuntu 24.10 || Intel® Arc A-Series Graphics (CodeName: Alchemist) |
|| || Intel® Arc B-Series Graphics (CodeName: Battlemage) |
|| || Intel® Core™ Ultra Processors with Intel® Arc™ Graphics (CodeName: Meteor Lake-H) |
|| || Intel® Core™ Ultra Desktop Processors (Series 2) with Intel® Arc™ Graphics (CodeName: Lunar Lake) |
|| || Intel® Core™ Ultra Mobile Processors (Series 2) with Intel® Arc™ Graphics (CodeName: Arrow Lake-H)|
+-------------------------------------+----------------------------------------------------------------------------------------------------+
|| Ubuntu 24.04 & WSL2 (Ubuntu 24.04) || Intel® Arc A-Series Graphics (CodeName: Alchemist) |
|| || Intel® Core™ Ultra Processors with Intel® Arc™ Graphics (CodeName: Meteor Lake-H) |
|| || Intel® Core™ Ultra Desktop Processors (Series 2) with Intel® Arc™ Graphics (CodeName: Lunar Lake) |
|| || Intel® Core™ Ultra Mobile Processors (Series 2) with Intel® Arc™ Graphics (CodeName: Arrow Lake-H)|
+-------------------------------------+----------------------------------------------------------------------------------------------------+
Intel GPU support (Prototype) has been available since PyTorch* 2.5 for Intel® Client GPUs and Intel® Data Center GPU Max Series on both Linux and Windows, bringing Intel GPUs and the SYCL* software stack into the official PyTorch stack with a consistent user experience across more AI application scenarios.

View File

@ -125,10 +125,6 @@ deterministic implementation will be used::
[[ 0.1509, 1.8027],
[ 0.0333, -1.1444]]], device='cuda:0')
Furthermore, if you are using CUDA tensors, and your CUDA version is 10.2 or greater, you
should set the environment variable `CUBLAS_WORKSPACE_CONFIG` according to CUDA documentation:
`<https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility>`_
CUDA convolution determinism
----------------------------
While disabling CUDA convolution benchmarking (discussed above) ensures that

View File

@ -32,7 +32,7 @@ torch.cuda.memory._dump_snapshot("my_snapshot.pickle")
## Using the visualizer
Open <https://pytorch.org/memory_viz> and drag/drop the pickled snapshot file into the visualizer.
Open [pytorch.org/memory_viz](https://pytorch.org/memory_viz) and drag/drop the pickled snapshot file into the visualizer.
The visualizer is a javascript application that runs locally on your computer. It does not upload any snapshot data.

View File

@ -1,4 +1,4 @@
if(NOT WIN32)
if(USE_DISTRIBUTED AND NOT WIN32)
set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd")
set(DIST_AUTOGRAD_TEST_SOURCES
${TORCH_ROOT}/test/cpp/common/main.cpp

View File

@ -15,8 +15,6 @@ set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
set(CMAKE_CXX_VISIBILITY_PRESET hidden)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
if(APPLE)
set(CMAKE_INSTALL_RPATH "@loader_path/lib;@loader_path")
elseif(UNIX)
@ -36,7 +34,6 @@ else()
message(FATAL_ERROR "Cannot find Python directory")
endif()
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
include(${PROJECT_SOURCE_DIR}/cmake/TorchPythonTargets.cmake)
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/openreg)

View File

@ -164,13 +164,13 @@ at::Tensor view(const at::Tensor& self, c10::SymIntArrayRef size) {
// LITERALINCLUDE START: FALLBACK IMPL
void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
static const std::unordered_set<c10::OperatorName> cpu_fallback_blocklist = {
static const std::unordered_set<c10::OperatorName> cpu_fallback_blacklist = {
c10::OperatorName("aten::abs", ""),
c10::OperatorName("aten::abs", "out"),
};
const auto& op_name = op.schema().operator_name();
if (cpu_fallback_blocklist.count(op_name)) {
if (cpu_fallback_blacklist.count(op_name)) {
TORCH_CHECK(
false,
"Operator '",

View File

@ -1,146 +0,0 @@
#pragma once
#include <include/openreg.h>
#include "OpenRegException.h"
#include "OpenRegStream.h"
namespace c10::openreg {
struct OpenRegEvent {
OpenRegEvent(bool enable_timing) noexcept : enable_timing_{enable_timing} {}
~OpenRegEvent() {
if (is_created_) {
OPENREG_CHECK(orEventDestroy(event_));
}
}
OpenRegEvent(const OpenRegEvent&) = delete;
OpenRegEvent& operator=(const OpenRegEvent&) = delete;
OpenRegEvent(OpenRegEvent&& other) noexcept {
moveHelper(std::move(other));
}
OpenRegEvent& operator=(OpenRegEvent&& other) noexcept {
if (this != &other) {
moveHelper(std::move(other));
}
return *this;
}
operator orEvent_t() const {
return event();
}
std::optional<at::Device> device() const {
if (is_created_) {
return at::Device(at::kPrivateUse1, device_index_);
} else {
return std::nullopt;
}
}
bool isCreated() const {
return is_created_;
}
DeviceIndex device_index() const {
return device_index_;
}
orEvent_t event() const {
return event_;
}
bool query() const {
if (!is_created_) {
return true;
}
orError_t err = orEventQuery(event_);
if (err == orSuccess) {
return true;
}
return false;
}
void record() {
record(getCurrentOpenRegStream());
}
void recordOnce(const OpenRegStream& stream) {
if (!was_recorded_)
record(stream);
}
void record(const OpenRegStream& stream) {
if (!is_created_) {
createEvent(stream.device_index());
}
TORCH_CHECK(
device_index_ == stream.device_index(),
"Event device ",
device_index_,
" does not match recording stream's device ",
stream.device_index(),
".");
OPENREG_CHECK(orEventRecord(event_, stream));
was_recorded_ = true;
}
void block(const OpenRegStream& stream) {
if (is_created_) {
OPENREG_CHECK(orStreamWaitEvent(stream, event_, 0));
}
}
float elapsed_time(const OpenRegEvent& other) const {
TORCH_CHECK_VALUE(
!(enable_timing_ & orEventDisableTiming) &&
!(other.enable_timing_ & orEventDisableTiming),
"Both events must be created with argument 'enable_timing=True'.");
TORCH_CHECK_VALUE(
is_created_ && other.isCreated(),
"Both events must be recorded before calculating elapsed time.");
TORCH_CHECK(
query() && other.query(),
"Both events must be completed before calculating elapsed time.");
float time_ms = 0;
OPENREG_CHECK(orEventElapsedTime(&time_ms, event_, other.event_));
return time_ms;
}
void synchronize() const {
if (is_created_) {
OPENREG_CHECK(orEventSynchronize(event_));
}
}
private:
unsigned int enable_timing_{orEventDisableTiming};
bool is_created_{false};
bool was_recorded_{false};
DeviceIndex device_index_{-1};
orEvent_t event_{};
void createEvent(DeviceIndex device_index) {
device_index_ = device_index;
OPENREG_CHECK(orEventCreateWithFlags(&event_, enable_timing_));
is_created_ = true;
}
void moveHelper(OpenRegEvent&& other) {
std::swap(enable_timing_, other.enable_timing_);
std::swap(is_created_, other.is_created_);
std::swap(was_recorded_, other.was_recorded_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
}
};
} // namespace c10::openreg

View File

@ -1,9 +0,0 @@
#include "OpenRegException.h"
void orCheckFail(
const char* func,
const char* file,
uint32_t line,
const char* msg) {
throw ::c10::Error({func, file, line}, msg);
}

View File

@ -1,20 +0,0 @@
#pragma once
#include <include/openreg.h>
#include <c10/util/Exception.h>
void orCheckFail(
const char* func,
const char* file,
uint32_t line,
const char* msg = "");
#define OPENREG_CHECK(EXPR, ...) \
do { \
const orError_t __err = EXPR; \
if (__err != orSuccess) { \
orCheckFail( \
__func__, __FILE__, static_cast<uint32_t>(__LINE__), ##__VA_ARGS__); \
} \
} while (0)

View File

@ -1,6 +1,5 @@
#include <include/openreg.h>
#include "OpenRegException.h"
#include "OpenRegFunctions.h"
namespace c10::openreg {

View File

@ -1,10 +1,14 @@
#pragma once
#ifdef _WIN32
#define OPENREG_EXPORT __declspec(dllexport)
#else
#define OPENREG_EXPORT __attribute__((visibility("default")))
#endif
#include <c10/core/Device.h>
#include <c10/macros/Macros.h>
#include <include/Macros.h>
#include <limits>
namespace c10::openreg {

View File

@ -1,253 +0,0 @@
#include "OpenRegStream.h"
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
#include <array>
#include <atomic>
#include <cstdint>
#include <deque>
namespace c10::openreg {
namespace {
// Global stream state and constants
static c10::once_flag init_flag;
static DeviceIndex num_devices = -1;
static constexpr int kStreamsPerPoolBits = 5;
static constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
static constexpr int kStreamTypeBits = 2;
/*
* The stream pools are lazily initialized when the first queue is requested
* for a device. The device flags track the initialization of each device. When
 * a queue is requested, the next queue in the pool is returned in a
* round-robin fashion, see Note [Stream Management].
*/
static std::deque<c10::once_flag> device_flags;
static std::vector<std::array<
std::array<orStream_t, kStreamsPerPool>,
c10::openreg::max_compile_time_stream_priorities>>
streams;
static std::deque<
std::array<std::atomic<uint32_t>, max_compile_time_stream_priorities>>
priority_counters;
static thread_local std::unique_ptr<StreamId[]> current_streams = nullptr;
/*
* Note [StreamId assignment]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~
* How do we assign stream IDs?
*
* -- 56 bits -- -- 5 bits -- -- 2 bits -- -- 1 bit --
* zeros StreamIdIndex StreamIdType Ext/native stream
* ignored for ext ignored for ext
*
* Where StreamIdType:
* 00 = default stream
* 01 = normal stream
* 11 = external stream
*
 * For an external stream, the StreamId is an orStream_t pointer. This means the last
 * bit will always be 0. So when constructing a StreamId for a native stream we
 * set the last bit to 1 to distinguish between native and external streams.
*
* StreamId is 64-bit, so we can just rely on regular promotion rules.
* We rely on StreamIdIndex and StreamIdType being non-negative;
*/
using StreamIdIndex = uint8_t;
enum class StreamIdType : uint8_t {
DEFAULT = 0x0,
NORMAL = 0x1,
EXT = 0x3,
};
inline std::ostream& operator<<(std::ostream& stream, StreamIdType s) {
switch (s) {
case StreamIdType::DEFAULT:
return stream << "DEFAULT";
case StreamIdType::NORMAL:
return stream << "NORMAL";
case StreamIdType::EXT:
return stream << "EXT";
default:
break;
}
return stream << static_cast<int16_t>(s);
}
static inline StreamIdType streamIdType(StreamId s) {
// Externally allocated streams have their id being the orStream_ptr
// so the last bit will be 0
if (!(s & 1)) {
return StreamIdType(StreamIdType::EXT);
}
int mask_for_type = (1 << kStreamTypeBits) - 1;
auto st = static_cast<StreamIdType>((s >> 1) & mask_for_type);
TORCH_CHECK(
st == StreamIdType::DEFAULT || st == StreamIdType::NORMAL,
"invalid StreamId: ",
s);
return st;
}
static inline size_t streamIdIndex(StreamId s) {
return static_cast<size_t>(
(s >> (kStreamTypeBits + 1)) & ((1 << kStreamsPerPoolBits) - 1));
}
StreamId makeStreamId(StreamIdType st, size_t si) {
if (st == StreamIdType::EXT) {
return static_cast<StreamId>(0);
}
return (static_cast<StreamId>(si) << (kStreamTypeBits + 1)) |
(static_cast<StreamId>(st) << 1) | 1;
}
static void initGlobalStreamState() {
num_devices = device_count();
device_flags.resize(num_devices);
streams.resize(num_devices);
priority_counters.resize(num_devices);
}
static void initSingleDeviceStream(
int priority,
DeviceIndex device_index,
int i) {
auto& stream = streams[device_index][priority][i];
OPENREG_CHECK(orStreamCreateWithPriority(&stream, 0, priority));
priority_counters[device_index][priority] = 0;
}
// Creates stream pools for the specified device. It should be called only once.
static void initDeviceStreamState(DeviceIndex device_index) {
for (const auto i : c10::irange(kStreamsPerPool)) {
for (const auto p : c10::irange(max_compile_time_stream_priorities)) {
initSingleDeviceStream(p, device_index, i);
}
}
}
static void initOpenRegStreamsOnce() {
c10::call_once(init_flag, initGlobalStreamState);
if (current_streams) {
return;
}
// Inits current streams (thread local) to the last queue in the "normal
// priority" queue pool. Note: the queue pool has not been initialized yet.
// It will be initialized in initDeviceStreamState for the specified device.
current_streams = std::make_unique<StreamId[]>(num_devices);
for (const auto i : c10::irange(num_devices)) {
current_streams[i] = makeStreamId(StreamIdType::DEFAULT, 0);
}
}
static uint32_t get_idx(std::atomic<uint32_t>& counter) {
auto raw_idx = counter++;
return raw_idx % kStreamsPerPool;
}
OpenRegStream OpenRegStreamForId(DeviceIndex device_index, StreamId stream_id) {
return OpenRegStream(
OpenRegStream::UNCHECKED,
Stream(
Stream::UNSAFE,
c10::Device(DeviceType::PrivateUse1, device_index),
stream_id));
}
} // anonymous namespace
// See Note [StreamId assignment]
orStream_t OpenRegStream::stream() const {
c10::DeviceIndex device_index = stream_.device_index();
StreamId stream_id = stream_.id();
StreamIdType st = streamIdType(stream_id);
size_t si = streamIdIndex(stream_id);
switch (st) {
// The index 0 stream is default as well.
case StreamIdType::DEFAULT:
case StreamIdType::NORMAL:
return streams[device_index][static_cast<uint8_t>(st)][si];
case StreamIdType::EXT:
return reinterpret_cast<orStream_t>(stream_id);
default:
TORCH_CHECK(
false,
"Unrecognized stream ",
stream_,
" (I didn't recognize the stream type, ",
st,
").",
" Did you manufacture the StreamId yourself? Don't do that;");
}
}
// Returns a stream from the requested pool
// Note: when called the first time on a device, this will create the
// stream pools for that device.
OpenRegStream getStreamFromPool(const int priority, DeviceIndex device_index) {
initOpenRegStreamsOnce();
if (device_index == -1) {
device_index = current_device();
}
c10::call_once(
device_flags[device_index], initDeviceStreamState, device_index);
auto pri_idx =
std::clamp(priority, 0, max_compile_time_stream_priorities - 1);
const auto idx = get_idx(priority_counters[device_index][pri_idx]);
auto id_type = static_cast<StreamIdType>(pri_idx);
return OpenRegStreamForId(device_index, makeStreamId(id_type, idx));
}
OpenRegStream getStreamFromPool(const bool isHighPriority, DeviceIndex device) {
initOpenRegStreamsOnce();
int priority = 0;
return getStreamFromPool(priority, device);
}
OpenRegStream getStreamFromExternal(
orStream_t ext_stream,
DeviceIndex device_index) {
return OpenRegStreamForId(
device_index, reinterpret_cast<int64_t>(ext_stream));
}
OpenRegStream getDefaultOpenRegStream(DeviceIndex device_index) {
initOpenRegStreamsOnce();
if (device_index == -1) {
device_index = current_device();
}
return OpenRegStreamForId(
device_index, makeStreamId(StreamIdType::DEFAULT, 0));
}
OpenRegStream getCurrentOpenRegStream(DeviceIndex device_index) {
initOpenRegStreamsOnce();
if (device_index == -1) {
device_index = current_device();
}
return OpenRegStreamForId(device_index, current_streams[device_index]);
}
void setCurrentOpenRegStream(OpenRegStream stream) {
initOpenRegStreamsOnce();
current_streams[stream.device_index()] = stream.id();
}
std::ostream& operator<<(std::ostream& stream, const OpenRegStream& s) {
return stream << s.unwrap();
}
} // namespace c10::openreg

View File

@ -1,162 +0,0 @@
#pragma once
#include <include/openreg.h>
#include "OpenRegException.h"
#include "OpenRegFunctions.h"
#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/util/Exception.h>
namespace c10::openreg {
static constexpr int max_compile_time_stream_priorities = 1;
class OpenRegStream {
public:
enum Unchecked { UNCHECKED };
explicit OpenRegStream(Stream stream) : stream_(stream) {
TORCH_CHECK(stream_.device_type() == DeviceType::PrivateUse1);
}
explicit OpenRegStream(Unchecked, Stream stream) : stream_(stream) {}
bool operator==(const OpenRegStream& other) const noexcept {
return unwrap() == other.unwrap();
}
bool operator!=(const OpenRegStream& other) const noexcept {
return unwrap() != other.unwrap();
}
operator orStream_t() const {
return stream();
}
operator Stream() const {
return unwrap();
}
DeviceType device_type() const {
return DeviceType::PrivateUse1;
}
DeviceIndex device_index() const {
return stream_.device_index();
}
Device device() const {
return Device(DeviceType::PrivateUse1, device_index());
}
StreamId id() const {
return stream_.id();
}
bool query() const {
DeviceGuard guard{stream_.device()};
if (orStreamQuery(stream()) == orSuccess) {
return true;
}
return false;
}
void synchronize() const {
DeviceGuard guard{stream_.device()};
OPENREG_CHECK(orStreamSynchronize(stream()));
}
int priority() const {
DeviceGuard guard{stream_.device()};
int priority = 0;
OPENREG_CHECK(orStreamGetPriority(stream(), &priority));
return priority;
}
orStream_t stream() const;
Stream unwrap() const {
return stream_;
}
struct c10::StreamData3 pack3() const {
return stream_.pack3();
}
static OpenRegStream unpack3(
StreamId stream_id,
DeviceIndex device_index,
DeviceType device_type) {
return OpenRegStream(Stream::unpack3(stream_id, device_index, device_type));
}
private:
Stream stream_;
};
/*
* Get a stream from the pool in a round-robin fashion.
*
* You can request a stream from the highest priority pool by setting
* isHighPriority to true for a specific device.
*/
OPENREG_EXPORT OpenRegStream
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
/*
* Get a stream from the pool in a round-robin fashion.
*
* You can request a stream by setting a priority value for a specific device.
 * The lower the priority number, the higher the priority.
*/
OPENREG_EXPORT OpenRegStream
getStreamFromPool(const int priority, DeviceIndex device = -1);
/*
 * Get an OpenRegStream from an externally allocated one.
*
* This is mainly for interoperability with different libraries where we
* want to operate on a non-torch allocated stream for data exchange or similar
* purposes
*/
OPENREG_EXPORT OpenRegStream
getStreamFromExternal(orStream_t ext_stream, DeviceIndex device_index);
/*
* Get the default OpenReg stream, for the passed OpenReg device, or for the
* current device if no device index is passed.
*/
OPENREG_EXPORT OpenRegStream
getDefaultOpenRegStream(DeviceIndex device_index = -1);
/*
* Get the current OpenReg stream, for the passed OpenReg device, or for the
* current device if no device index is passed.
*/
OPENREG_EXPORT OpenRegStream
getCurrentOpenRegStream(DeviceIndex device_index = -1);
/*
* Set the current stream on the device of the passed in stream to be the passed
* in stream.
*/
OPENREG_EXPORT void setCurrentOpenRegStream(OpenRegStream stream);
OPENREG_EXPORT std::ostream& operator<<(
std::ostream& stream,
const OpenRegStream& s);
} // namespace c10::openreg
namespace std {
template <>
struct hash<c10::openreg::OpenRegStream> {
size_t operator()(c10::openreg::OpenRegStream s) const noexcept {
return std::hash<c10::Stream>{}(s.unwrap());
}
};
} // namespace std

View File

@ -1,7 +0,0 @@
#pragma once
#ifdef _WIN32
#define OPENREG_EXPORT __declspec(dllexport)
#else
#define OPENREG_EXPORT __attribute__((visibility("default")))
#endif

View File

@ -53,7 +53,7 @@ def build_deps():
".",
"--target",
"install",
"--config", # For multi-config generators
"--config",
"Release",
"--",
]

View File

@ -1,14 +1,7 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(TORCH_OPENREG CXX C)
option(USE_TEST "Build and run unit tests" ON)
set(LIBRARY_NAME openreg)
set(LIBRARY_TEST ortests)
file(GLOB_RECURSE SOURCE_FILES
"${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/*.cpp"
)
add_library(${LIBRARY_NAME} SHARED ${SOURCE_FILES})
@ -20,26 +13,3 @@ install(TARGETS ${LIBRARY_NAME}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
if(USE_TEST)
enable_testing()
include(${CMAKE_CURRENT_LIST_DIR}/cmake/GTestTargets.cmake)
file(GLOB_RECURSE TEST_FILES
"${CMAKE_CURRENT_SOURCE_DIR}/tests/*.cpp"
)
add_executable(${LIBRARY_TEST} ${TEST_FILES})
target_link_libraries(${LIBRARY_TEST}
PRIVATE
${LIBRARY_NAME}
gtest
gtest_main
)
add_test(NAME alltests COMMAND ${LIBRARY_TEST})
add_custom_command(TARGET ${LIBRARY_TEST}
POST_BUILD
COMMAND ${CMAKE_CTEST_COMMAND} -C Release --output-on-failure --verbose)
endif()

View File

@ -4,9 +4,9 @@
OpenReg is a C++ backend library that simulates the behavior of a CUDA-like device on a CPU. Its core objective is **not to accelerate computation or improve performance**, but rather to **simulate modern CUDA programming, enabling developers to prototype and test in an environment without actual GPU hardware**. The current design principles are as follows:
* **API Consistency**: Provide an interface consistent with the CUDA Runtime API, allowing upper-level applications (like PyTorch's `PrivateUse1` backend) to switch and test seamlessly.
* **API Consistency**: Provide an interface consistent with the CUDA Runtime API, allowing upper-level applications (like PyTorch's PrivateUse1 backend) to switch and test seamlessly.
* **Functional Consistency**: Provide behavior consistent with the CUDA Runtime, such as memory isolation, device context management, etc.
* **Completeness**: Aim to support `PrivateUse1` device integration and safeguard the third-party device integration mechanism, without striving to cover all capabilities of the CUDA Runtime.
* **Completeness**: Aim to support PrivateUse1 device integration and safeguard the third-party device integration mechanism, without striving to cover all capabilities of the CUDA Runtime.
## Directory Structure
@ -14,34 +14,19 @@ The project's code is organized with a clear structure and separation of respons
```text
openreg/
├── README.md # Comprehensive introduction of OpenReg.
├── CMakeLists.txt # Top-level CMake build script, used to compile and generate libopenreg.so
├── cmake/
│ └── GTestTargets.cmake # Utilities for fetching GoogleTest.
├── CMakeLists.txt # Top-level CMake build script, used to compile and generate libopenreg.so
├── include/
│ ├── openreg.h # Public API header file, external users only need to include this file
│ └── openreg.inl # Public API header file, as an extension of openreg.h, cannot be included separately.
├── example/
│ └── example.cpp # Example for OpenReg.
├── tests/
│ ├── event_tests.cpp # Testcases about OpenReg Event.
│ ├── stream_tests.cpp # Testcases about OpenReg Stream.
│ ├── device_tests.cpp # Testcases about OpenReg Device.
│ └── memory_tests.cpp # Testcases about OpenReg Memory.
│ └── openreg.h # Public API header file, external users only need to include this file
└── csrc/
├── device.cpp # Implementation of device management APIs
├── memory.cpp # Implementation of memory management APIs
└── stream.cpp # Implementation of stream and event APIs.
├── device.cpp # Implementation of device management-related APIs
└── memory.cpp # Implementation of APIs for memory management, copying, and protection
```
* `CMakeLists.txt`: Responsible for compiling and linking all source files under the `csrc/` directory to generate the final `libopenreg.so` shared library.
* `include`: Defines all externally exposed APIs, data structures, and enums.
* `openreg.h`: Defines all externally exposed C-style APIs.
* `openreg.inl`: Defines all externally exposed C++ APIs.
* `include/openreg.h`: Defines all externally exposed C-style APIs, data structures, and enums. It is the "public face" of this library.
* `csrc/`: Contains the C++ implementation source code for all core functionalities.
* `device.cpp`: Implements the core functions of device management: device discovery and context management.
* `memory.cpp`: Implements the core functions of memory management: allocation, free, copy and memory protection.
* `stream.cpp`: Implements the core functions of streams and events: creation, destruction, recording, synchronization, and so on.
* `device.cpp`: Implements device discovery (`orGetDeviceCount`) and thread context management (`orSetDevice`/`orGetDevice`).
* `memory.cpp`: Implements the core functions of memory allocation (`orMalloc`/`orMallocHost`), deallocation, copying, and memory protection (`orMemoryProtect`, `orMemoryUnprotect`).
* `CMakeLists.txt`: Responsible for compiling and linking all source files under the `csrc/` directory to generate the final `libopenreg.so` shared library.
## Implemented APIs
@ -49,49 +34,25 @@ OpenReg currently provides a set of APIs covering basic memory and device manage
### Device Management APIs
| OpenReg | CUDA | Feature Description |
| :------------------------------- | :--------------------------------- | :--------------------------------- |
| `orGetDeviceCount` | `cudaGetDeviceCount` | Get the number of available GPUs |
| `orSetDevice` | `cudaSetDevice` | Set the active GPU |
| `orGetDevice` | `cudaGetDevice` | Get the current GPU |
| `orDeviceSynchronize` | `cudaDeviceSynchronize` | Wait for all GPU tasks to finish |
| `orDeviceGetStreamPriorityRange` | `cudaDeviceGetStreamPriorityRange` | Get the range of stream priorities |
| OpenReg | CUDA | Feature Description |
| :------------------- | :------------------- | :------------------------------------------------ |
| `orGetDeviceCount` | `cudaGetDeviceCount` | Get the number of devices |
| `orSetDevice` | `cudaSetDevice` | Set the current device for the current thread |
| `orGetDevice` | `cudaGetDevice` | Get the current device for the current thread |
### Memory Management APIs
| OpenReg | CUDA | Feature Description |
| :----------------------- | :------------------------- | :---------------------------------------- |
| `orMalloc` | `cudaMalloc` | Allocate device memory |
| `orFree` | `cudaFree` | Free device memory |
| `orMallocHost` | `cudaMallocHost` | Allocate page-locked (Pinned) host memory |
| `orFreeHost` | `cudaFreeHost` | Free page-locked host memory |
| `orMemcpy` | `cudaMemcpy` | Synchronous memory copy |
| `orMemcpyAsync` | `cudaMemcpyAsync` | Asynchronous memory copy |
| `orPointerGetAttributes` | `cudaPointerGetAttributes` | Get pointer attributes |
### Stream APIs
| OpenReg | CUDA | Feature Description |
| :--------------------------- | :----------------------------- | :------------------------------------- |
| `orStreamCreate` | `cudaStreamCreate` | Create a default-priority stream |
| `orStreamCreateWithPriority` | `cudaStreamCreateWithPriority` | Create a stream with a given priority |
| `orStreamDestroy` | `cudaStreamDestroy` | Destroy a stream |
| `orStreamQuery` | `cudaStreamQuery` | Check if a stream has completed |
| `orStreamSynchronize` | `cudaStreamSynchronize` | Wait for a stream to complete |
| `orStreamWaitEvent` | `cudaStreamWaitEvent` | Make a stream wait for an event |
| `orStreamGetPriority` | `cudaStreamGetPriority` | Get a stream's priority |
### Event APIs
| OpenReg | CUDA | Feature Description |
| :----------------------- | :------------------------- | :---------------------------------- |
| `orEventCreate` | `cudaEventCreate` | Create an event with default flag |
| `orEventCreateWithFlags` | `cudaEventCreateWithFlags` | Create an event with specific flag |
| `orEventDestroy` | `cudaEventDestroy` | Destroy an event |
| `orEventRecord` | `cudaEventRecord` | Record an event in a stream |
| `orEventSynchronize` | `cudaEventSynchronize` | Wait for an event to complete |
| `orEventQuery` | `cudaEventQuery` | Check if an event has completed |
| `orEventElapsedTime` | `cudaEventElapsedTime` | Get time elapsed between two events |
| OpenReg | CUDA | Feature Description |
| :----------------------- | :--------------------------- | :----------------------------------------- |
| `orMalloc` | `cudaMalloc` | Allocate device memory |
| `orFree` | `cudaFree` | Free device memory |
| `orMallocHost` | `cudaMallocHost` | Allocate page-locked (Pinned) host memory |
| `orFreeHost` | `cudaFreeHost` | Free page-locked host memory |
| `orMemcpy` | `cudaMemcpy` | Synchronous memory copy |
| `orMemcpyAsync` | `cudaMemcpyAsync` | Asynchronous memory copy |
| `orPointerGetAttributes` | `cudaPointerGetAttributes` | Get pointer attributes |
| `orMemoryUnprotect` | - | (Internal use) Unprotect memory |
| `orMemoryProtect` | - | (Internal use) Restore memory protection |
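
Taken together, the tables above mirror the familiar CUDA Runtime pattern of device selection, allocation, and explicit copies. The following is a minimal sketch of how these calls compose; it uses only APIs listed in `openreg.h`, and the include path and link line are assumptions borrowed from the usage example later in this README.

```cpp
// Minimal sketch: copy a buffer to a simulated device and back using the OpenReg C API.
#include "include/openreg.h"

#include <cstdio>
#include <cstdlib>
#include <vector>

#define OR_CHECK(expr)                                       \
  do {                                                       \
    orError_t err_ = (expr);                                 \
    if (err_ != orSuccess) {                                 \
      std::fprintf(stderr, "%s failed (%d)\n", #expr, err_); \
      std::exit(EXIT_FAILURE);                               \
    }                                                        \
  } while (0)

int main() {
  int count = 0;
  OR_CHECK(orGetDeviceCount(&count));  // OpenReg simulates a small, fixed number of devices
  OR_CHECK(orSetDevice(0));

  constexpr size_t kBytes = 256 * sizeof(int);
  std::vector<int> host_in(256, 7), host_out(256, 0);

  void* dev = nullptr;
  OR_CHECK(orMalloc(&dev, kBytes));
  OR_CHECK(orMemcpy(dev, host_in.data(), kBytes, orMemcpyHostToDevice));
  OR_CHECK(orMemcpy(host_out.data(), dev, kBytes, orMemcpyDeviceToHost));

  orPointerAttributes attr{};
  OR_CHECK(orPointerGetAttributes(&attr, dev));  // attr.type should report device memory
  std::printf("roundtrip ok: %d, device: %d\n", host_out[0] == 7, attr.device);

  OR_CHECK(orFree(dev));
  return 0;
}
```

Compile and run it the same way as `example.cpp`, linking against `libopenreg.so`.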
## Implementation Principles
@ -110,42 +71,67 @@ Simulating device memory, host memory, and memory copies:
2. **Deallocation**: Memory is freed using `munmap`.
3. **Authorization**: When a legitimate memory access is required, an RAII guard restores the memory permissions to `PROT_READ | PROT_WRITE`; the permissions are automatically reverted to `PROT_NONE` when the scope is exited (a sketch of such a guard follows).
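
The authorization step is essentially an RAII guard around `mprotect`. Below is a minimal, self-contained sketch of that idea written directly against POSIX `mmap`/`mprotect` (Linux-only); the class name `ProtectionGuard` is illustrative and not part of OpenReg, which implements the equivalent logic internally.

```cpp
// Illustrative RAII guard: "device" memory stays PROT_NONE except inside a guard's scope.
#include <sys/mman.h>

#include <cstddef>
#include <cstring>
#include <stdexcept>

class ProtectionGuard {  // hypothetical name, for illustration only
 public:
  ProtectionGuard(void* ptr, size_t size) : ptr_(ptr), size_(size) {
    // Temporarily allow CPU access to the simulated device allocation.
    if (::mprotect(ptr_, size_, PROT_READ | PROT_WRITE) != 0)
      throw std::runtime_error("mprotect(RW) failed");
  }
  ~ProtectionGuard() {
    // Revert to no-access on scope exit, restoring the fault-on-touch behavior.
    ::mprotect(ptr_, size_, PROT_NONE);
  }
  ProtectionGuard(const ProtectionGuard&) = delete;
  ProtectionGuard& operator=(const ProtectionGuard&) = delete;

 private:
  void* ptr_;
  size_t size_;
};

int main() {
  const size_t size = 4096;  // one page, so the mapping is page-aligned by construction
  void* fake_device_mem =
      ::mmap(nullptr, size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (fake_device_mem == MAP_FAILED) return 1;
  // Direct access here would fault; a guarded scope makes the access legal.
  {
    ProtectionGuard guard(fake_device_mem, size);
    std::memset(fake_device_mem, 0, size);
  }
  ::munmap(fake_device_mem, size);
  return 0;
}
```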
### Stream & Event Principles
Simulating creation, release, and synchronization for events and streams:
1. **Event**: Each event is encapsulated as a task function and placed into a stream, which acts as a thread. Upon completion of the task, a flag within the event is flipped to simulate the event's status.
2. **Stream**: When a stream is created, a new thread is spawned that sequentially processes each task in the stream's task queue. Tasks can be wrappers around kernel functions or events.
3. **Synchronization**: Synchronization between streams and events is achieved using threads, condition variables, and mutexes (see the sketch after this list).
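
As a rough illustration of the stream half of this design, the sketch below models a stream as a worker thread draining a FIFO task queue guarded by a mutex and condition variable. It mirrors the structure of OpenReg's `stream.cpp` but is simplified; `MiniStream` and its methods are illustrative names, not library APIs.

```cpp
// Simplified stream model: one worker thread per stream, executing queued tasks in FIFO order.
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>

class MiniStream {  // illustrative only; the real type is the opaque orStream
 public:
  MiniStream() : worker_([this] { run(); }) {}
  ~MiniStream() {
    {
      std::lock_guard<std::mutex> lock(mtx_);
      stop_ = true;
    }
    cv_.notify_one();
    worker_.join();  // drains remaining tasks, akin to synchronize-then-destroy
  }

  // Enqueue a task; asynchronous APIs such as kernel launches reduce to this.
  void enqueue(std::function<void()> task) {
    {
      std::lock_guard<std::mutex> lock(mtx_);
      tasks_.push(std::move(task));
    }
    cv_.notify_one();
  }

 private:
  void run() {
    for (;;) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lock(mtx_);
        cv_.wait(lock, [this] { return stop_ || !tasks_.empty(); });
        if (stop_ && tasks_.empty()) return;
        task = std::move(tasks_.front());
        tasks_.pop();
      }
      task();  // run outside the lock so long tasks do not block enqueue()
    }
  }

  std::queue<std::function<void()>> tasks_;
  std::mutex mtx_;
  std::condition_variable cv_;
  bool stop_ = false;
  std::thread worker_;  // declared last so all other members are initialized first
};

int main() {
  MiniStream stream;
  stream.enqueue([] { std::puts("task 1"); });
  stream.enqueue([] { std::puts("task 2"); });
}
```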
## Usage Example
Please refer to [example](example/example.cpp) for example.
The following is a simple code snippet demonstrating how to use the core features of the OpenReg library.
The command to compile example.cpp is as follows:
```cpp
#include "openreg.h"
#include <iostream>
#include <vector>
#include <cstdio>
```Shell
mkdir build
#define OR_CHECK(call) do { \
orError_t err = call; \
if (err != orSuccess) { \
fprintf(stderr, "OR Error code %d in %s at line %d\n", err, __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while (0)
pushd build
cmake ..
make -j 32
popd
int main() {
int device_count = 0;
OR_CHECK(orGetDeviceCount(&device_count));
std::cout << "Found " << device_count << " simulated devices." << std::endl;
g++ -o out example/example.cpp -L ./build -lopenreg
LD_LIBRARY_PATH=./build ./out
```
int current_device = -1;
OR_CHECK(orSetDevice(1));
OR_CHECK(orGetDevice(&current_device));
std::cout << "Set current device to " << current_device << "." << std::endl;
The output is as follows:
const int n = 1024;
const size_t size = n * sizeof(int);
int *h_a, *d_a;
OR_CHECK(orMallocHost((void**)&h_a, size));
OR_CHECK(orMalloc((void**)&d_a, size));
```Shell
Current environment have 2 devices
Current is 0 device
All tasks have been submitted.
Kernel execution time: 0.238168 ms
Verification PASSED!
orPointerAttributes attr;
OR_CHECK(orPointerGetAttributes(&attr, d_a));
std::cout << "Pointer " << (void*)d_a << " is of type " << attr.type
<< " on device " << attr.device << std::endl;
for (int i = 0; i < n; ++i) {
h_a[i] = i;
}
OR_CHECK(orMemcpy(d_a, h_a, size, orMemcpyHostToDevice));
std::cout << "Data copied from Host to Device." << std::endl;
// std::cout << "Trying to access device memory directly from CPU..." << std::endl;
// int val = d_a[0]; // CRASH!
// Clean up resources
OR_CHECK(orFree(d_a));
OR_CHECK(orFreeHost(h_a));
std::cout << "Resources freed." << std::endl;
return 0;
}
```
## Next Steps
The most basic functions of the OpenReg backend are currently supported; they will be optimized and expanded based on the needs of PyTorch integration.
To better support PrivateUse1 device integration, the following capabilities are planned for the future:
* **Stream Support**: Provide the ability to simulate CUDA Streams.
* **Event Support**: Provide the ability to simulate CUDA Events.
* **Cross-Platform Support**: Add support for Windows and macOS (low priority).

View File

@ -1,12 +0,0 @@
set(GTest_REL_PATH "../../../../../../../third_party/googletest")
get_filename_component(GTest_DIR "${CMAKE_CURRENT_LIST_DIR}/${GTest_REL_PATH}" ABSOLUTE)
if(EXISTS "${GTest_DIR}/CMakeLists.txt")
message(STATUS "Found GTest: ${GTest_DIR}")
set(BUILD_GMOCK OFF CACHE BOOL "Disable GMock build")
set(INSTALL_GTEST OFF CACHE BOOL "Disable GTest install")
add_subdirectory(${GTest_DIR} "${CMAKE_BINARY_DIR}/gtest")
else()
message(FATAL_ERROR "GTest Not Found")
endif()

View File

@ -1,12 +1,10 @@
#include <include/openreg.h>
namespace {
// Total device numbers
constexpr int DEVICE_COUNT = 2;
// Current device index
thread_local int gCurrentDevice = 0;
} // namespace
orError_t orGetDeviceCount(int* count) {

View File

@ -1,20 +1,10 @@
#include "memory.h"
#include <include/openreg.h>
#include <map>
#include <mutex>
namespace {
struct Block {
orMemoryType type = orMemoryType::orMemoryTypeUnmanaged;
int device = -1;
void* pointer = nullptr;
size_t size = 0;
int refcount{0};
};
class MemoryManager {
public:
static MemoryManager& getInstance() {
@ -48,7 +38,7 @@ class MemoryManager {
}
}
m_registry[mem] = {type, current_device, mem, aligned_size, 0};
m_registry[mem] = {type, current_device, mem, aligned_size};
*ptr = mem;
return orSuccess;
}
@ -61,15 +51,14 @@ class MemoryManager {
auto it = m_registry.find(ptr);
if (it == m_registry.end())
return orErrorUnknown;
const auto& info = it->second;
if (info.type == orMemoryType::orMemoryTypeDevice) {
openreg::mprotect(info.pointer, info.size, F_PROT_READ | F_PROT_WRITE);
openreg::munmap(info.pointer, info.size);
} else {
openreg::free(info.pointer);
}
m_registry.erase(it);
return orSuccess;
}
@ -81,39 +70,36 @@ class MemoryManager {
orMemcpyKind kind) {
if (!dst || !src || count == 0)
return orErrorUnknown;
std::lock_guard<std::mutex> lock(m_mutex);
Block* dst_info = getBlockInfoNoLock(dst);
Block* src_info = getBlockInfoNoLock(src);
orPointerAttributes dst_info = getPointerInfo(dst);
orPointerAttributes src_info = getPointerInfo(src);
switch (kind) {
case orMemcpyHostToDevice:
if ((!dst_info || dst_info->type != orMemoryType::orMemoryTypeDevice) ||
(src_info && src_info->type == orMemoryType::orMemoryTypeDevice))
if (dst_info.type != orMemoryType::orMemoryTypeDevice ||
src_info.type == orMemoryType::orMemoryTypeDevice)
return orErrorUnknown;
break;
case orMemcpyDeviceToHost:
if ((dst_info && dst_info->type == orMemoryType::orMemoryTypeDevice) ||
(!src_info || src_info->type != orMemoryType::orMemoryTypeDevice))
if (dst_info.type == orMemoryType::orMemoryTypeDevice ||
src_info.type != orMemoryType::orMemoryTypeDevice)
return orErrorUnknown;
break;
case orMemcpyDeviceToDevice:
if ((!dst_info || dst_info->type != orMemoryType::orMemoryTypeDevice) ||
(!src_info || src_info->type != orMemoryType::orMemoryTypeDevice))
if (dst_info.type != orMemoryType::orMemoryTypeDevice ||
src_info.type != orMemoryType::orMemoryTypeDevice)
return orErrorUnknown;
break;
case orMemcpyHostToHost:
if ((dst_info && dst_info->type == orMemoryType::orMemoryTypeDevice) ||
(src_info && src_info->type == orMemoryType::orMemoryTypeDevice))
if (dst_info.type == orMemoryType::orMemoryTypeDevice ||
src_info.type == orMemoryType::orMemoryTypeDevice)
return orErrorUnknown;
break;
}
unprotectNoLock(dst_info);
unprotectNoLock(src_info);
::memcpy(dst, src, count);
protectNoLock(dst_info);
protectNoLock(src_info);
{
ScopedMemoryProtector dst_protector(dst_info);
ScopedMemoryProtector src_protector(src_info);
::memcpy(dst, src, count);
}
return orSuccess;
}
@ -125,16 +111,17 @@ class MemoryManager {
return orErrorUnknown;
std::lock_guard<std::mutex> lock(m_mutex);
Block* info = getBlockInfoNoLock(ptr);
orPointerAttributes info = getPointerInfo(ptr);
if (!info) {
attributes->type = orMemoryType::orMemoryTypeUnmanaged;
attributes->type = info.type;
if (info.type == orMemoryType::orMemoryTypeUnmanaged) {
attributes->device = -1;
attributes->pointer = const_cast<void*>(ptr);
attributes->size = 0;
} else {
attributes->type = info->type;
attributes->device = info->device;
attributes->pointer = info->pointer;
attributes->device = info.device;
attributes->pointer = info.pointer;
attributes->size = info.size;
}
return orSuccess;
@ -142,61 +129,71 @@ class MemoryManager {
orError_t unprotect(void* ptr) {
std::lock_guard<std::mutex> lock(m_mutex);
return unprotectNoLock(getBlockInfoNoLock(ptr));
orPointerAttributes info = getPointerInfo(ptr);
if (info.type != orMemoryType::orMemoryTypeDevice) {
return orErrorUnknown;
}
if (openreg::mprotect(
info.pointer, info.size, F_PROT_READ | F_PROT_WRITE) != 0) {
return orErrorUnknown;
}
return orSuccess;
}
orError_t protect(void* ptr) {
std::lock_guard<std::mutex> lock(m_mutex);
return protectNoLock(getBlockInfoNoLock(ptr));
orPointerAttributes info = getPointerInfo(ptr);
if (info.type != orMemoryType::orMemoryTypeDevice) {
return orErrorUnknown;
}
if (openreg::mprotect(info.pointer, info.size, F_PROT_NONE) != 0) {
return orErrorUnknown;
}
return orSuccess;
}
private:
class ScopedMemoryProtector {
public:
ScopedMemoryProtector(const orPointerAttributes& info)
: m_info(info), m_protected(false) {
if (m_info.type == orMemoryType::orMemoryTypeDevice) {
if (openreg::mprotect(
m_info.pointer, m_info.size, F_PROT_READ | F_PROT_WRITE) == 0) {
m_protected = true;
}
}
}
~ScopedMemoryProtector() {
if (m_protected) {
openreg::mprotect(m_info.pointer, m_info.size, F_PROT_NONE);
}
}
ScopedMemoryProtector(const ScopedMemoryProtector&) = delete;
ScopedMemoryProtector& operator=(const ScopedMemoryProtector&) = delete;
private:
orPointerAttributes m_info;
bool m_protected;
};
MemoryManager() = default;
orError_t unprotectNoLock(Block* info) {
if (info && info->type == orMemoryType::orMemoryTypeDevice) {
if (info->refcount == 0) {
if (openreg::mprotect(
info->pointer, info->size, F_PROT_READ | F_PROT_WRITE) != 0) {
return orErrorUnknown;
}
}
info->refcount++;
}
return orSuccess;
}
orError_t protectNoLock(Block* info) {
if (info && info->type == orMemoryType::orMemoryTypeDevice) {
if (info->refcount == 1) {
if (openreg::mprotect(info->pointer, info->size, F_PROT_NONE) != 0) {
return orErrorUnknown;
}
}
info->refcount--;
}
return orSuccess;
}
Block* getBlockInfoNoLock(const void* ptr) {
orPointerAttributes getPointerInfo(const void* ptr) {
auto it = m_registry.upper_bound(const_cast<void*>(ptr));
if (it != m_registry.begin()) {
--it;
const char* p_char = static_cast<const char*>(ptr);
const char* base_char = static_cast<const char*>(it->first);
if (p_char >= base_char && p_char < (base_char + it->second.size)) {
return &it->second;
return it->second;
}
}
return nullptr;
return {};
}
std::map<void*, Block> m_registry;
std::map<void*, orPointerAttributes> m_registry;
std::mutex m_mutex;
};
@ -228,22 +225,6 @@ orError_t orMemcpy(
return MemoryManager::getInstance().memcpy(dst, src, count, kind);
}
orError_t orMemcpyAsync(
void* dst,
const void* src,
size_t count,
orMemcpyKind kind,
orStream_t stream) {
if (!stream) {
return orErrorUnknown;
}
auto& mm = MemoryManager::getInstance();
return orLaunchKernel(
stream, &MemoryManager::memcpy, &mm, dst, src, count, kind);
}
orError_t orPointerGetAttributes(
orPointerAttributes* attributes,
const void* ptr) {

View File

@ -4,6 +4,8 @@
#include <cstdlib>
#include <cstring>
#include <include/openreg.h>
#if defined(_WIN32)
#include <windows.h>
#else

View File

@ -1,313 +0,0 @@
#include <include/openreg.h>
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <mutex>
#include <queue>
#include <set>
#include <thread>
static std::mutex g_mutex;
static std::once_flag g_flag;
static std::vector<std::set<orStream_t>> g_streams_per_device;
static void initialize_registries() {
int device_count = 0;
orGetDeviceCount(&device_count);
g_streams_per_device.resize(device_count);
}
struct orEventImpl {
std::mutex mtx;
std::condition_variable cv;
std::atomic<bool> completed{true};
int device_index = -1;
bool timing_enabled{false};
std::chrono::high_resolution_clock::time_point completion_time;
};
struct orEvent {
std::shared_ptr<orEventImpl> impl;
};
struct orStream {
std::queue<std::function<void()>> tasks;
std::mutex mtx;
std::condition_variable cv;
std::thread worker;
std::atomic<bool> stop_flag{false};
int device_index = -1;
orStream() {
worker = std::thread([this] {
while (true) {
std::function<void()> task;
{
std::unique_lock<std::mutex> lock(this->mtx);
this->cv.wait(lock, [this] {
return this->stop_flag.load() || !this->tasks.empty();
});
if (this->stop_flag.load() && this->tasks.empty()) {
return;
}
task = std::move(this->tasks.front());
this->tasks.pop();
}
task();
}
});
}
~orStream() {
stop_flag.store(true);
cv.notify_one();
worker.join();
}
};
orError_t openreg::addTaskToStream(
orStream_t stream,
std::function<void()> task) {
if (!stream)
return orErrorUnknown;
{
std::lock_guard<std::mutex> lock(stream->mtx);
stream->tasks.push(std::move(task));
}
stream->cv.notify_one();
return orSuccess;
}
orError_t orEventCreateWithFlags(orEvent_t* event, unsigned int flags) {
if (!event)
return orErrorUnknown;
auto impl = std::make_shared<orEventImpl>();
orGetDevice(&(impl->device_index));
if (flags & orEventEnableTiming) {
impl->timing_enabled = true;
}
*event = new orEvent{std::move(impl)};
return orSuccess;
}
orError_t orEventCreate(orEvent_t* event) {
return orEventCreateWithFlags(event, orEventDisableTiming);
}
orError_t orEventDestroy(orEvent_t event) {
if (!event)
return orErrorUnknown;
delete event;
return orSuccess;
}
orError_t orEventRecord(orEvent_t event, orStream_t stream) {
if (!event || !stream)
return orErrorUnknown;
auto event_impl = event->impl;
event_impl->completed.store(false);
auto record_task = [event_impl]() {
if (event_impl->timing_enabled) {
event_impl->completion_time = std::chrono::high_resolution_clock::now();
}
{
std::lock_guard<std::mutex> lock(event_impl->mtx);
event_impl->completed.store(true);
}
event_impl->cv.notify_all();
};
return openreg::addTaskToStream(stream, record_task);
}
orError_t orEventSynchronize(orEvent_t event) {
if (!event)
return orErrorUnknown;
auto event_impl = event->impl;
std::unique_lock<std::mutex> lock(event_impl->mtx);
event_impl->cv.wait(lock, [&] { return event_impl->completed.load(); });
return orSuccess;
}
orError_t orEventQuery(orEvent_t event) {
if (!event)
return orErrorUnknown;
return event->impl->completed.load() ? orSuccess : orErrorNotReady;
}
orError_t orEventElapsedTime(float* ms, orEvent_t start, orEvent_t end) {
if (!ms || !start || !end)
return orErrorUnknown;
auto start_impl = start->impl;
auto end_impl = end->impl;
if (start_impl->device_index != end_impl->device_index) {
return orErrorUnknown;
}
if (!start_impl->timing_enabled || !end_impl->timing_enabled) {
return orErrorUnknown;
}
if (!start_impl->completed.load() || !end_impl->completed.load()) {
return orErrorUnknown;
}
auto duration = end_impl->completion_time - start_impl->completion_time;
*ms = std::chrono::duration_cast<std::chrono::duration<float, std::milli>>(
duration)
.count();
return orSuccess;
}
orError_t orStreamCreateWithPriority(
orStream_t* stream,
[[maybe_unused]] unsigned int flag,
int priority) {
if (!stream) {
return orErrorUnknown;
}
int min_p, max_p;
orDeviceGetStreamPriorityRange(&min_p, &max_p);
if (priority < min_p || priority > max_p) {
return orErrorUnknown;
}
int current_device = 0;
orGetDevice(&current_device);
orStream_t new_stream = nullptr;
new_stream = new orStream();
new_stream->device_index = current_device;
{
std::lock_guard<std::mutex> lock(g_mutex);
std::call_once(g_flag, initialize_registries);
g_streams_per_device[current_device].insert(new_stream);
}
*stream = new_stream;
return orSuccess;
}
orError_t orStreamCreate(orStream_t* stream) {
int min_p, max_p;
orDeviceGetStreamPriorityRange(&min_p, &max_p);
return orStreamCreateWithPriority(stream, 0, max_p);
}
orError_t orStreamGetPriority(
[[maybe_unused]] orStream_t stream,
int* priority) {
// Since OpenReg has only one priority level, the following code
// returns 0 directly for convenience.
*priority = 0;
return orSuccess;
}
orError_t orStreamDestroy(orStream_t stream) {
if (!stream)
return orErrorUnknown;
{
std::lock_guard<std::mutex> lock(g_mutex);
int device_idx = stream->device_index;
if (device_idx >= 0 && device_idx < g_streams_per_device.size()) {
g_streams_per_device[device_idx].erase(stream);
}
}
delete stream;
return orSuccess;
}
orError_t orStreamQuery(orStream_t stream) {
if (!stream) {
return orErrorUnknown;
}
std::lock_guard<std::mutex> lock(stream->mtx);
return stream->tasks.empty() ? orSuccess : orErrorNotReady;
}
orError_t orStreamSynchronize(orStream_t stream) {
if (!stream)
return orErrorUnknown;
orEvent_t event;
orEventCreate(&event);
orEventRecord(event, stream);
orError_t status = orEventSynchronize(event);
orEventDestroy(event);
return status;
}
orError_t orStreamWaitEvent(orStream_t stream, orEvent_t event, unsigned int) {
if (!stream || !event)
return orErrorUnknown;
auto event_impl = event->impl;
auto wait_task = [event_impl]() {
std::unique_lock<std::mutex> lock(event_impl->mtx);
event_impl->cv.wait(lock, [&] { return event_impl->completed.load(); });
};
return openreg::addTaskToStream(stream, wait_task);
}
orError_t orDeviceGetStreamPriorityRange(
int* leastPriority,
int* greatestPriority) {
if (!leastPriority || !greatestPriority) {
return orErrorUnknown;
}
// OpenReg has only one priority now.
*leastPriority = 0;
*greatestPriority = 0;
return orSuccess;
}
orError_t orDeviceSynchronize(void) {
int current_device = 0;
orGetDevice(&current_device);
std::vector<orStream_t> streams;
{
std::lock_guard<std::mutex> lock(g_mutex);
std::call_once(g_flag, initialize_registries);
auto& streams_on_device = g_streams_per_device[current_device];
streams.assign(streams_on_device.begin(), streams_on_device.end());
}
for (orStream_t stream : streams) {
orError_t status = orStreamSynchronize(stream);
if (status != orSuccess) {
return status;
}
}
return orSuccess;
}

View File

@ -1,112 +0,0 @@
#include "include/openreg.h"
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>
struct MemoryGuard {
MemoryGuard(void* ptr) : ptr_(ptr) {
orMemoryUnprotect(ptr_);
}
~MemoryGuard() {
orMemoryProtect(ptr_);
}
private:
void* ptr_{};
};
void add_kernel(float* out, float* a, float* b, int num) {
for (int i = 0; i < num; ++i) {
out[i] = a[i] + b[i];
}
}
int main() {
int device_count = 0;
orGetDeviceCount(&device_count);
std::cout << "Current environment have " << device_count << " devices"
<< std::endl;
orSetDevice(0);
int current_device = -1;
orGetDevice(&current_device);
std::cout << "Current is " << current_device << " device" << std::endl;
constexpr int num = 50000;
constexpr size_t size = num * sizeof(float);
std::vector<float> host_a(num), host_b(num), host_out(num, 0.0f);
std::iota(host_a.begin(), host_a.end(), 0.0f);
for (int i = 0; i < num; ++i) {
host_b[i] = 2.0f;
}
float *dev_a, *dev_b, *dev_out;
orMalloc((void**)&dev_a, size);
orMalloc((void**)&dev_b, size);
orMalloc((void**)&dev_out, size);
// There will be subsequent memory access operations, so memory protection
// needs to be released
MemoryGuard a{dev_a};
MemoryGuard b{dev_b};
MemoryGuard c{dev_out};
orStream_t stream1, stream2;
orEvent_t start_event, stop_event;
orStreamCreate(&stream1);
orStreamCreate(&stream2);
orEventCreateWithFlags(&start_event, orEventEnableTiming);
orEventCreateWithFlags(&stop_event, orEventEnableTiming);
// Copy input from host to device
orMemcpyAsync(dev_a, host_a.data(), size, orMemcpyHostToDevice, stream1);
orMemcpyAsync(dev_b, host_b.data(), size, orMemcpyHostToDevice, stream1);
// Submit the compute kernel and two events that are used for timing.
orEventRecord(start_event, stream1);
orLaunchKernel(stream1, add_kernel, dev_out, dev_a, dev_b, num);
orEventRecord(stop_event, stream1);
// Synchronization between streams.
orStreamWaitEvent(stream2, stop_event, 0);
orMemcpyAsync(host_out.data(), dev_out, size, orMemcpyDeviceToHost, stream2);
orStreamSynchronize(stream2);
std::cout << "All tasks have been submitted." << std::endl;
float elapsed_ms = 0.0f;
orEventElapsedTime(&elapsed_ms, start_event, stop_event);
std::cout << "Kernel execution time: " << elapsed_ms << " ms" << std::endl;
bool success = true;
for (int i = 0; i < num; ++i) {
if (std::abs(host_out[i] - (host_a[i] + host_b[i])) > 1e-5) {
std::cout << "Verification FAILED at index " << i << "! Expected "
<< (host_a[i] + host_b[i]) << ", got " << host_out[i]
<< std::endl;
success = false;
break;
}
}
if (success) {
std::cout << "Verification PASSED!" << std::endl;
}
orFree(dev_a);
orFree(dev_b);
orFree(dev_out);
orStreamDestroy(stream1);
orStreamDestroy(stream2);
orEventDestroy(start_event);
orEventDestroy(stop_event);
return 0;
}

View File

@ -3,20 +3,16 @@
#include <cstddef>
#ifdef _WIN32
#define OPENREG_EXPORT __declspec(dllexport)
#define OPENREG_EXPORT __declspec(dllexport)
#else
#define OPENREG_EXPORT __attribute__((visibility("default")))
#define OPENREG_EXPORT __attribute__((visibility("default")))
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef enum orError_t {
orSuccess = 0,
orErrorUnknown = 1,
orErrorNotReady = 2
} orError_t;
typedef enum orError_t { orSuccess = 0, orErrorUnknown = 1 } orError_t;
typedef enum orMemcpyKind {
orMemcpyHostToHost = 0,
@ -35,75 +31,25 @@ struct orPointerAttributes {
orMemoryType type = orMemoryType::orMemoryTypeUnmanaged;
int device;
void* pointer;
size_t size;
};
typedef enum orEventFlags {
orEventDisableTiming = 0x0,
orEventEnableTiming = 0x1,
} orEventFlags;
struct orStream;
struct orEvent;
typedef struct orStream* orStream_t;
typedef struct orEvent* orEvent_t;
// Memory
OPENREG_EXPORT orError_t orMalloc(void** devPtr, size_t size);
OPENREG_EXPORT orError_t orFree(void* devPtr);
OPENREG_EXPORT orError_t orMallocHost(void** hostPtr, size_t size);
OPENREG_EXPORT orError_t orFreeHost(void* hostPtr);
OPENREG_EXPORT orError_t
orMemcpy(void* dst, const void* src, size_t count, orMemcpyKind kind);
OPENREG_EXPORT orError_t orMemcpyAsync(
void* dst,
const void* src,
size_t count,
orMemcpyKind kind,
orStream_t stream);
OPENREG_EXPORT orError_t
orPointerGetAttributes(orPointerAttributes* attributes, const void* ptr);
OPENREG_EXPORT orError_t orMemcpy(void* dst, const void* src, size_t count, orMemcpyKind kind);
OPENREG_EXPORT orError_t orMemoryUnprotect(void* devPtr);
OPENREG_EXPORT orError_t orMemoryProtect(void* devPtr);
// Device
OPENREG_EXPORT orError_t orGetDeviceCount(int* count);
OPENREG_EXPORT orError_t orSetDevice(int device);
OPENREG_EXPORT orError_t orGetDevice(int* device);
OPENREG_EXPORT orError_t
orDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority);
OPENREG_EXPORT orError_t orDeviceSynchronize(void);
// Stream
OPENREG_EXPORT orError_t orStreamCreateWithPriority(
orStream_t* stream,
unsigned int flags,
int priority);
OPENREG_EXPORT orError_t orStreamCreate(orStream_t* stream);
OPENREG_EXPORT orError_t orStreamGetPriority(orStream_t stream, int* priority);
OPENREG_EXPORT orError_t orStreamDestroy(orStream_t stream);
OPENREG_EXPORT orError_t orStreamQuery(orStream_t stream);
OPENREG_EXPORT orError_t orStreamSynchronize(orStream_t stream);
OPENREG_EXPORT orError_t
orStreamWaitEvent(orStream_t stream, orEvent_t event, unsigned int flags);
// Event
OPENREG_EXPORT orError_t
orEventCreateWithFlags(orEvent_t* event, unsigned int flags);
OPENREG_EXPORT orError_t orEventCreate(orEvent_t* event);
OPENREG_EXPORT orError_t orEventDestroy(orEvent_t event);
OPENREG_EXPORT orError_t orEventRecord(orEvent_t event, orStream_t stream);
OPENREG_EXPORT orError_t orEventSynchronize(orEvent_t event);
OPENREG_EXPORT orError_t orEventQuery(orEvent_t event);
OPENREG_EXPORT orError_t
orEventElapsedTime(float* ms, orEvent_t start, orEvent_t end);
OPENREG_EXPORT orError_t orPointerGetAttributes(
orPointerAttributes* attributes,
const void* ptr);
#ifdef __cplusplus
} // extern "C"
#endif
#ifdef __cplusplus
#define OPENREG_H
#include "openreg.inl"
#endif

View File

@ -1,42 +0,0 @@
#ifndef OPENREG_H
#error "Don`t include openreg.inl directly, include openreg.h instead."
#endif
#include <functional>
#include <tuple>
#include <utility>
namespace openreg {
OPENREG_EXPORT orError_t
addTaskToStream(orStream* stream, std::function<void()> task);
}
template <typename Func, typename... Args>
OPENREG_EXPORT inline orError_t orLaunchKernel(
orStream* stream,
Func&& kernel_func,
Args&&... args) {
if (!stream) {
return orErrorUnknown;
}
/*
 * Some tests in PyTorch still use C++11, so we use a conditional macro to
 * select different approaches for different C++ versions.
 *
 * std::apply is only supported in C++17, so for C++11/14 std::bind is
 * a more appropriate approach, although the former has better performance.
*/
#if __cplusplus >= 201703L
auto task = [func = std::forward<Func>(kernel_func),
args_tuple =
std::make_tuple(std::forward<Args>(args)...)]() mutable {
std::apply(func, std::move(args_tuple));
};
#else
auto task =
std::bind(std::forward<Func>(kernel_func), std::forward<Args>(args)...);
#endif
return openreg::addTaskToStream(stream, std::move(task));
}

View File

@ -1,41 +0,0 @@
#include <gtest/gtest.h>
#include <include/openreg.h>
namespace {
class DeviceTest : public ::testing::Test {
protected:
void SetUp() override {
orSetDevice(0);
}
};
TEST_F(DeviceTest, GetDeviceCountValid) {
int count = -1;
EXPECT_EQ(orGetDeviceCount(&count), orSuccess);
EXPECT_EQ(count, 2);
}
TEST_F(DeviceTest, GetDeviceValid) {
int device = -1;
EXPECT_EQ(orGetDevice(&device), orSuccess);
EXPECT_EQ(device, 0);
}
TEST_F(DeviceTest, SetDeviceValid) {
EXPECT_EQ(orSetDevice(1), orSuccess);
int device = -1;
EXPECT_EQ(orGetDevice(&device), orSuccess);
EXPECT_EQ(device, 1);
EXPECT_EQ(orSetDevice(0), orSuccess);
EXPECT_EQ(orGetDevice(&device), orSuccess);
EXPECT_EQ(device, 0);
}
TEST_F(DeviceTest, SetDeviceInvalidNegative) {
EXPECT_EQ(orSetDevice(-1), orErrorUnknown);
}
} // namespace

Some files were not shown because too many files have changed in this diff.