mirror of https://github.com/pytorch/pytorch.git
synced 2025-10-28 02:04:53 +08:00

Compare commits: debug-guar...cublasnowd (3 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | e38ac55aae |  |
|  | f20c28eba1 |  |
|  | daac58237f |  |
@@ -81,8 +81,8 @@ elif [[ "$image" == *riscv* ]]; then
  DOCKERFILE="ubuntu-cross-riscv/Dockerfile"
fi

_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152
_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
if [[ "$image" == *rocm* ]]; then
  _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
  _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
@@ -114,19 +114,31 @@ case "$tag" in
    UCC_COMMIT=${_UCC_COMMIT}
    TRITON=yes
    ;;
  pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
    CUDA_VERSION=13.0.0
  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
    CUDA_VERSION=12.8.1
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    GCC_VERSION=9
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
  pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
    CUDA_VERSION=12.8.1
    ANACONDA_PYTHON_VERSION=3.10
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=9
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
  pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
    CUDA_VERSION=12.8.1
    ANACONDA_PYTHON_VERSION=3.13
    GCC_VERSION=9
    VISION=yes
    KATEX=yes

@@ -213,8 +225,7 @@ case "$tag" in
    NINJA_VERSION=1.9.0
    TRITON=yes
    ;;
  pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
    # TODO (huydhn): Upgrade this to Python >= 3.10
  pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    VISION=yes

@@ -1 +1 @@
74a23feff57432129df84d8099e622773cf77925
22bc29b4d503fc895ff73bc720ff396e9723465f
@@ -44,12 +44,8 @@ function install_ucc() {

  ./autogen.sh

  if [[ -n "$CUDA_VERSION" && $CUDA_VERSION == 13* ]]; then
    NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86"
  else
    # We only run distributed tests on Tesla M60 and A10G
    NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
  fi
  # We only run distributed tests on Tesla M60 and A10G
  NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"

  if [[ -n "$ROCM_VERSION" ]]; then
    if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
@@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2

# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
@@ -66,7 +66,6 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
# (optional) Install UCC
ARG UCX_COMMIT
ARG UCC_COMMIT
ARG CUDA_VERSION
ENV UCX_COMMIT $UCX_COMMIT
ENV UCC_COMMIT $UCC_COMMIT
ENV UCX_HOME /usr
@@ -35,10 +35,11 @@ fi

print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
  USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
  # Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
  # NB: we always build with distributed; USE_DISTRIBUTED turns off all
  # backends (specifically the gloo backend), so test that this case works too
  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then
@@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd

python -mpip install -r requirements.txt

# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1

python -mpip install --no-input -r requirements.txt

setup_test_python() {
  # The CircleCI worker hostname doesn't resolve to an address.
  # This environment variable makes ProcessGroupGloo default to
@@ -496,14 +496,6 @@ test_inductor_cpp_wrapper_shard() {
    -k 'take' \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose

  if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
    python test/run_test.py \
      --include inductor/test_mkldnn_pattern_matcher \
      -k 'xpu' \
      --shard "$1" "$NUM_TEST_SHARDS" \
      --verbose
  fi
}

# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@@ -213,8 +213,7 @@ pip install requests ninja typing-extensions
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
retry brew install libomp

# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is built as part of the tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is built as part of the tensorpipe submodule
export USE_DISTRIBUTED=1

export USE_MKLDNN=OFF
15  .github/actions/checkout-pytorch/action.yml  vendored
@@ -57,21 +57,6 @@ runs:
        submodules: ${{ inputs.submodules }}
        show-progress: false

    - name: Clean submodules post checkout
      id: clean-submodules
      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
      shell: bash
      env:
        NO_SUDO: ${{ inputs.no-sudo }}
      run: |
        cd "${GITHUB_WORKSPACE}"
        # Clean stale submodule dirs
        if [ -z "${NO_SUDO}" ]; then
          sudo git submodule foreach --recursive git clean -ffdx
        else
          git submodule foreach --recursive git clean -ffdx
        fi

    - name: Clean workspace (try again)
      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' &&
        (steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }}
2  .github/ci_commit_pins/audio.txt  vendored
@@ -1 +1 @@
1c66402d0fa47ea74d365dcaa468d397da481918
10a5002c6195bd95e34df8fe28ff8a2d55a2a922
2  .github/ci_commit_pins/vllm.txt  vendored
@@ -1 +1 @@
752d2e1c364e4195093e4f3f2fc33e3ae1840707
321938e9ac4000e0cb37e328359a7fd3026bc672
2  .github/ci_commit_pins/xla.txt  vendored
@@ -1 +1 @@
763e5b78d4fcd74a9e812256656c075f99d9a781
a1c6ee92c85e8b0955c20892ed68f032a6015c09
@@ -244,6 +244,8 @@ def generate_libtorch_matrix(
        arches.remove("13.0")
    elif os == "windows":
        arches += CUDA_ARCHES
        if "13.0" in arches:
            arches.remove("13.0")
    if libtorch_variants is None:
        libtorch_variants = [
            "shared-with-deps",

@@ -308,6 +310,8 @@ def generate_wheels_matrix(
        arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
    elif os == "windows":
        arches += CUDA_ARCHES + XPU_ARCHES
        if "13.0" in arches:
            arches.remove("13.0")
    elif os == "linux-aarch64":
        # Separate new if as the CPU type is different and
        # uses different build/test scripts
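For reference, the Windows branches in both generators above simply drop CUDA 13.0 from the architecture list before the build matrix is emitted. A minimal standalone sketch of that filtering (the arch lists below are illustrative placeholders, not the repository's actual pins):

```python
# Illustrative arch lists; the real values live in generate_binary_build_matrix.py.
CUDA_ARCHES = ["12.6", "12.8", "13.0"]
XPU_ARCHES = ["xpu"]

def windows_wheel_arches() -> list[str]:
    arches = ["cpu"] + CUDA_ARCHES + XPU_ARCHES
    # CUDA 13.0 binaries are not produced on Windows, so filter it out.
    if "13.0" in arches:
        arches.remove("13.0")
    return arches

print(windows_wheel_arches())  # ['cpu', '12.6', '12.8', 'xpu']
```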
174  .github/scripts/test_trymerge.py  vendored
@@ -27,7 +27,6 @@ from trymerge import (
    get_drci_classifications,
    gh_get_team_members,
    GitHubPR,
    iter_issue_timeline_until_comment,
    JobCheckState,
    main as trymerge_main,
    MandatoryChecksMissingError,
@@ -35,8 +34,6 @@ from trymerge import (
    RE_GHSTACK_DESC,
    read_merge_rules,
    remove_job_name_suffix,
    sha_from_committed_event,
    sha_from_force_push_after,
    validate_revert,
)

@@ -1141,176 +1138,5 @@ Pull Request resolved: https://github.com/pytorch/pytorch/pull/154394"""
        )


@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch(
    "trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
)
class TestTimelineFunctions(TestCase):
    """Tests for the new timeline-related functions"""

    def test_sha_from_committed_event(self, *args: Any) -> None:
        """Test extracting SHA from committed event"""
        # Based on actual GitHub API format - committed events have "sha" at top level
        event = {
            "event": "committed",
            "sha": "fb21ce932ded6670c918804a0d9151b773770a7c",
        }
        self.assertEqual(
            sha_from_committed_event(event), "fb21ce932ded6670c918804a0d9151b773770a7c"
        )

        # Test with missing SHA
        event_no_sha = {"event": "committed"}
        self.assertIsNone(sha_from_committed_event(event_no_sha))

    def test_sha_from_force_push_after(self, *args: Any) -> None:
        """Test extracting SHA from force push event"""
        # NOTE: The current function doesn't handle the actual GitHub API format
        # Real force push events have "commit_id" at top level, but this function
        # looks for "after", "after_commit", "after_sha", or "head_sha" fields

        # Test with the legacy format the current function handles
        event_legacy = {
            "event": "head_ref_force_pushed",
            "after": {"sha": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e"},
        }
        self.assertEqual(
            sha_from_force_push_after(event_legacy),
            "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
        )

        # Test with current GitHub API format (should return None with current implementation)
        event_real_api = {
            "event": "head_ref_force_pushed",
            "commit_id": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
        }
        self.assertEqual(
            sha_from_force_push_after(event_real_api),
            "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
        )  # Current function doesn't handle commit_id

        # Test with missing SHA
        event_no_sha = {"event": "head_ref_force_pushed"}
        self.assertIsNone(sha_from_force_push_after(event_no_sha))

    @mock.patch("trymerge.gh_fetch_json_list")
    def test_iter_issue_timeline_until_comment(
        self, mock_gh_fetch_json_list: Any, *args: Any
    ) -> None:
        """Test timeline iteration until target comment"""
        # Mock timeline data based on actual GitHub API format
        timeline_data = [
            {"event": "commented", "id": 100, "body": "first comment"},
            {"event": "committed", "sha": "fb21ce932ded6670c918804a0d9151b773770a7c"},
            {"event": "commented", "id": 200, "body": "target comment"},
            {"event": "commented", "id": 300, "body": "after target"},
        ]
        mock_gh_fetch_json_list.return_value = timeline_data

        # Test iteration stops at target comment
        events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 200))
        self.assertEqual(len(events), 3)  # Should stop at target comment
        self.assertEqual(events[0]["event"], "commented")
        self.assertEqual(events[0]["id"], 100)
        self.assertEqual(events[1]["event"], "committed")
        self.assertEqual(events[1]["sha"], "fb21ce932ded6670c918804a0d9151b773770a7c")
        self.assertEqual(events[2]["event"], "commented")
        self.assertEqual(events[2]["id"], 200)

    @mock.patch("trymerge.gh_fetch_json_list")
    def test_iter_issue_timeline_until_comment_not_found(
        self, mock_gh_fetch_json_list: Any, *args: Any
    ) -> None:
        """Test timeline iteration when target comment is not found"""
        # Mock empty timeline
        mock_gh_fetch_json_list.return_value = []

        events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 999))
        self.assertEqual(len(events), 0)

    @mock.patch("trymerge.iter_issue_timeline_until_comment")
    def test_get_commit_sha_at_comment_commit_after_comment(
        self, mock_iter_timeline: Any, *args: Any
    ) -> None:
        """Test get_commit_sha_at_comment returns correct SHA after comment"""
        mock_iter_timeline.return_value = [
            {"event": "committed", "sha": "commit1"},
            {"event": "committed", "sha": "commit2"},
            {"event": "commented", "id": 100},
            {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
        ]
        pr = GitHubPR("pytorch", "pytorch", 77700)
        sha = pr.get_commit_sha_at_comment(100)
        self.assertEqual(sha, "commit2")

    @mock.patch("trymerge.iter_issue_timeline_until_comment")
    def test_get_commit_sha_at_comment_force_push_before_comment(
        self, mock_iter_timeline: Any, *args: Any
    ) -> None:
        mock_iter_timeline.return_value = [
            {"event": "committed", "sha": "commit1"},
            {"event": "committed", "sha": "commit2"},
            {"event": "head_ref_force_pushed", "commit_id": "commit3"},
            {"event": "commented", "id": 100},
        ]
        pr = GitHubPR("pytorch", "pytorch", 77700)
        sha = pr.get_commit_sha_at_comment(100)
        self.assertEqual(sha, "commit3")

    @mock.patch("trymerge.iter_issue_timeline_until_comment")
    def test_get_commit_sha_at_comment_force_push_before_comment_legacy_mode(
        self, mock_iter_timeline: Any, *args: Any
    ) -> None:
        mock_iter_timeline.return_value = [
            {"event": "committed", "sha": "commit1"},
            {"event": "committed", "sha": "commit2"},
            {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
            {"event": "commented", "id": 100},
        ]
        pr = GitHubPR("pytorch", "pytorch", 77700)
        sha = pr.get_commit_sha_at_comment(100)
        self.assertEqual(sha, "commit3")

    @mock.patch("trymerge.iter_issue_timeline_until_comment")
    def test_get_commit_sha_at_comment_multiple_comments(
        self, mock_iter_timeline: Any, *args: Any
    ) -> None:
        mock_iter_timeline.return_value = [
            {"event": "committed", "sha": "commit1"},
            {"event": "commented", "id": 100},
            {"event": "committed", "sha": "commit2"},
            {"event": "commented", "id": 200},
            {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
            {"event": "commented", "id": 300},
        ]
        pr = GitHubPR("pytorch", "pytorch", 77700)
        sha = pr.get_commit_sha_at_comment(200)
        self.assertEqual(sha, "commit2")
        sha = pr.get_commit_sha_at_comment(300)
        self.assertEqual(sha, "commit3")

    @mock.patch("trymerge.iter_issue_timeline_until_comment")
    def test_get_commit_sha_at_comment_no_events(
        self, mock_iter_timeline: Any, *args: Any
    ) -> None:
        mock_iter_timeline.return_value = [
            {"event": "commented", "id": 100},
            {"event": "labeled", "label": {"name": "test"}},
        ]
        pr = GitHubPR("pytorch", "pytorch", 77700)
        sha = pr.get_commit_sha_at_comment(100)
        self.assertIsNone(sha)

    @mock.patch("trymerge.iter_issue_timeline_until_comment")
    def test_get_commit_sha_at_comment_exception(
        self, mock_iter_timeline: Any, *args: Any
    ) -> None:
        mock_iter_timeline.side_effect = Exception("API error")
        pr = GitHubPR("pytorch", "pytorch", 77700)
        sha = pr.get_commit_sha_at_comment(100)
        self.assertIsNone(sha)


if __name__ == "__main__":
    main()
131  .github/scripts/trymerge.py  vendored
@@ -450,63 +450,6 @@ HAS_NO_CONNECTED_DIFF_TITLE = (
IGNORABLE_FAILED_CHECKS_THESHOLD = 10


def iter_issue_timeline_until_comment(
    org: str, repo: str, issue_number: int, target_comment_id: int, max_pages: int = 200
) -> Any:
    """
    Yield timeline entries in order until (and including) the entry whose id == target_comment_id
    for a 'commented' event. Stops once the target comment is encountered.
    """
    page = 1

    while page <= max_pages:
        url = (
            f"https://api.github.com/repos/{org}/{repo}/issues/{issue_number}/timeline"
        )
        params = {"per_page": 100, "page": page}

        batch = gh_fetch_json_list(url, params)

        if not batch:
            return
        for ev in batch:
            # The target is the issue comment row with event == "commented" and id == issue_comment_id
            if ev.get("event") == "commented" and ev.get("id") == target_comment_id:
                yield ev  # nothing in the timeline after this matters, so stop early
                return
            yield ev
        if len(batch) < 100:
            return
        page += 1

    # If we got here without finding the comment, then we either hit a bug or some github PR
    # has a _really_ long timeline.
    # The max # of pages found on any pytorch/pytorch PR at the time of this change was 41
    raise RuntimeError(
        f"Could not find a merge commit in the first {max_pages} pages of the timeline at url {url}."
        f"This is most likely a bug, please report it to the @pytorch/pytorch-dev-infra team."
    )
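A hypothetical caller of this generator, assuming it runs next to .github/scripts/trymerge.py (the PR number and comment id below are placeholders):

```python
from trymerge import iter_issue_timeline_until_comment, sha_from_committed_event

# Walk the PR timeline and remember the last pushed commit seen before the
# merge-command comment (id 987654321 is a placeholder, not a real comment).
last_head = None
for ev in iter_issue_timeline_until_comment("pytorch", "pytorch", 12345, 987654321):
    if ev.get("event") == "committed":
        last_head = sha_from_committed_event(ev)
print(last_head)
```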
def sha_from_committed_event(ev: dict[str, Any]) -> Optional[str]:
    """Extract SHA from committed event in timeline"""
    return ev.get("sha")


def sha_from_force_push_after(ev: dict[str, Any]) -> Optional[str]:
    """Extract SHA from force push event in timeline"""
    # The current GitHub API format
    commit_id = ev.get("commit_id")
    if commit_id:
        return str(commit_id)

    # Legacy format
    after = ev.get("after") or ev.get("after_commit") or {}
    if isinstance(after, dict):
        return after.get("sha") or after.get("oid")
    return ev.get("after_sha") or ev.get("head_sha")
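To illustrate the two event shapes the helper accepts (dummy SHA, not a real event):

```python
from trymerge import sha_from_force_push_after

# Current GitHub API shape: "commit_id" at the top level of the event.
new_style = {"event": "head_ref_force_pushed", "commit_id": "abc123"}
# Legacy shape: nested "after" object carrying the SHA.
legacy = {"event": "head_ref_force_pushed", "after": {"sha": "abc123"}}

assert sha_from_force_push_after(new_style) == "abc123"
assert sha_from_force_push_after(legacy) == "abc123"
```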
def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any:
    rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no)
    return rc["data"]["repository"]["pullRequest"]

@@ -900,44 +843,6 @@ class GitHubPR:
    def get_commit_count(self) -> int:
        return int(self.info["commits_with_authors"]["totalCount"])

    def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]:
        """
        Get the PR head commit SHA that was present when a specific comment was posted.
        This ensures we only merge the state of the PR at the time the merge command was issued,
        not any subsequent commits that may have been pushed after.

        Returns None if no head-changing events found before the comment or if the comment was not found.
        """
        head = None

        try:
            for event in iter_issue_timeline_until_comment(
                self.org, self.project, self.pr_num, comment_id
            ):
                etype = event.get("event")
                if etype == "committed":
                    sha = sha_from_committed_event(event)
                    if sha:
                        head = sha
                        print(f"Timeline: Found commit event for SHA {sha}")
                elif etype == "head_ref_force_pushed":
                    sha = sha_from_force_push_after(event)
                    if sha:
                        head = sha
                        print(f"Timeline: Found force push event for SHA {sha}")
                elif etype == "commented":
                    if event.get("id") == comment_id:
                        print(f"Timeline: Found final comment with sha {sha}")
                        return head
        except Exception as e:
            print(
                f"Warning: Failed to reconstruct timeline for comment {comment_id}: {e}"
            )
            return None

        print(f"Did not find comment with id {comment_id} in the PR timeline")
        return None

    def get_pr_creator_login(self) -> str:
        return cast(str, self.info["author"]["login"])
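A hedged usage sketch of this method from a driver script (the PR number and comment id are placeholders):

```python
from trymerge import GitHubPR

pr = GitHubPR("pytorch", "pytorch", 77700)          # placeholder PR number
sha_at_command = pr.get_commit_sha_at_comment(100)  # placeholder comment id
if sha_at_command is None:
    raise RuntimeError("Could not determine the PR head at the time of the merge comment")
print(f"Would merge {sha_at_command}")
```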
@@ -1329,14 +1234,11 @@ class GitHubPR:
        skip_all_rule_checks: bool = False,
    ) -> list["GitHubPR"]:
        """
        :param skip_all_rule_checks: If true, skips all rule checks on ghstack PRs, useful for dry-running merge locally
        :param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally
        """
        branch_to_merge_into = self.default_branch() if branch is None else branch
        if repo.current_branch() != branch_to_merge_into:
            repo.checkout(branch_to_merge_into)

        # It's okay to skip the commit SHA check for ghstack PRs since
        # authoring requires write access to the repo.
        if self.is_ghstack_pr():
            return self.merge_ghstack_into(
                repo,

@@ -1347,41 +1249,14 @@ class GitHubPR:

        msg = self.gen_commit_message()
        pr_branch_name = f"__pull-request-{self.pr_num}__init__"

        # Determine which commit SHA to merge
        commit_to_merge = None
        if not comment_id:
            raise ValueError("Must provide --comment-id when merging regular PRs")

        # Get the commit SHA that was present when the comment was made
        commit_to_merge = self.get_commit_sha_at_comment(comment_id)
        if not commit_to_merge:
            raise RuntimeError(
                f"Could not find commit that was pushed before comment {comment_id}"
            )

        # Validate that this commit is the latest commit on the PR
        latest_commit = self.last_commit_sha()
        if commit_to_merge != latest_commit:
            raise RuntimeError(
                f"Commit {commit_to_merge} was HEAD when comment {comment_id} was posted "
                f"but now the latest commit on the PR is {latest_commit}. "
                f"Please re-issue the merge command to merge the latest commit."
            )

        print(f"Merging commit {commit_to_merge} locally")

        repo.fetch(commit_to_merge, pr_branch_name)
        repo.fetch(self.last_commit_sha(), pr_branch_name)
        repo._run_git("merge", "--squash", pr_branch_name)
        repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)

        # Did the PR change since we started the merge?
        pulled_sha = repo.show_ref(pr_branch_name)
        latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
        if (
            pulled_sha != latest_pr_status.last_commit_sha()
            or pulled_sha != commit_to_merge
        ):
        if pulled_sha != latest_pr_status.last_commit_sha():
            raise RuntimeError(
                "PR has been updated since CI checks last passed. Please rerun the merge command."
            )
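Condensed, the freshness check shown in this hunk amounts to the following sketch (placeholder ids, variable names mirroring the diff; not the exact control flow of either side of the change):

```python
from trymerge import GitHubPR

pr = GitHubPR("pytorch", "pytorch", 77700)   # placeholder PR number
comment_id = 100                             # placeholder merge-comment id

commit_to_merge = pr.get_commit_sha_at_comment(comment_id)
latest_commit = pr.last_commit_sha()
# The SHA that was HEAD when the merge comment was posted must still be the
# PR's latest commit before the squash merge is allowed to proceed.
if not commit_to_merge or commit_to_merge != latest_commit:
    raise RuntimeError(
        f"Commit {commit_to_merge} was HEAD when comment {comment_id} was posted, "
        f"but the PR now points at {latest_commit}; re-issue the merge command."
    )
```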
1  .github/workflows/_win-build.yml  vendored
@@ -77,7 +77,6 @@ jobs:
        run: |
          git config --global core.longpaths true
          git config --global core.symlinks true
          git config --global core.ignorecase false

          # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
          # the directory on Windows and prevent GHA from checking out as reported
1  .github/workflows/_win-test.yml  vendored
@@ -70,7 +70,6 @@ jobs:
        run: |
          git config --global core.longpaths true
          git config --global core.symlinks true
          git config --global core.ignorecase false

          # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
          # the directory on Windows and prevent GHA from checking out as reported
2  .github/workflows/_xpu-test.yml  vendored
@@ -275,7 +275,7 @@ jobs:
      - name: Change permissions
        if: ${{ always() && steps.test.conclusion }}
        run: |
          docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1000:1000 test"
          docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"

      - name: Print remaining test logs
        shell: bash
5  .github/workflows/docker-builds.yml  vendored
@@ -50,9 +50,10 @@ jobs:
        runner: [linux.12xlarge]
        docker-image-name: [
          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
          pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
          pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
          pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
          pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
          pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
          pytorch-linux-jammy-py3.10-clang12,
@@ -63,7 +64,7 @@ jobs:
          pytorch-linux-jammy-rocm-n-py3-benchmarks,
          pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12,
          pytorch-linux-jammy-py3.10-gcc11,
          pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
          pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
          pytorch-linux-jammy-py3.12-halide,
          pytorch-linux-jammy-xpu-n-1-py3,
          pytorch-linux-jammy-xpu-n-py3,
250  .github/workflows/generated-windows-binary-libtorch-debug-nightly.yml  generated  vendored
@@ -1038,253 +1038,3 @@ jobs:
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

  libtorch-cuda13_0-shared-with-deps-debug-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: get-label-type
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
    timeout-minutes: 360
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: libtorch
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu130
      GPU_ARCH_VERSION: "13.0"
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: debug
      LIBTORCH_VARIANT: shared-with-deps
      # This is a dummy value for libtorch to work correctly with our batch scripts
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
      #       runner.temp variable, which we need.
      - name: Populate binary env
        shell: bash
        run: |
          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
          echo "system info $(uname -a)"
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
        shell: bash
        run: |
          git config --global core.longpaths true
          git config --global core.symlinks true

          # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
          # the directory on Windows and prevent GHA from checking out as reported
          # in https://github.com/actions/checkout/issues/1018
          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
        run: |
          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
      # removed once Windows Defender is removed from the AMI
      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
        continue-on-error: true
        shell: powershell
        run: |
          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          submodules: recursive
          path: pytorch
          show-progress: false
      - name: Clean PyTorch checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
        working-directory: pytorch
      - name: Populate binary env
        shell: bash
        run: |
          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
      - name: Build PyTorch binary
        shell: bash
        run: |
          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
      - uses: actions/upload-artifact@v4.4.0
        if: always()
        with:
          name: libtorch-cuda13_0-shared-with-deps-debug
          retention-days: 14
          if-no-files-found: error
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
      - name: Wait until all sessions have drained
        shell: powershell
        working-directory: pytorch
        if: always()
        timeout-minutes: 120
        run: |
          .github\scripts\wait_for_ssh_to_drain.ps1
      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
        shell: powershell
        working-directory: pytorch
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1

  libtorch-cuda13_0-shared-with-deps-debug-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
      - libtorch-cuda13_0-shared-with-deps-debug-build
      - get-label-type
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
    timeout-minutes: 360
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: libtorch
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu130
      GPU_ARCH_VERSION: "13.0"
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: debug
      LIBTORCH_VARIANT: shared-with-deps
      # This is a dummy value for libtorch to work correctly with our batch scripts
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
          echo "system info $(uname -a)"
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
        shell: bash
        run: |
          git config --global core.longpaths true
          git config --global core.symlinks true

          # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
          # the directory on Windows and prevent GHA from checking out as reported
          # in https://github.com/actions/checkout/issues/1018
          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
        run: |
          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
      # removed once Windows Defender is removed from the AMI
      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
        continue-on-error: true
        shell: powershell
        run: |
          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          submodules: recursive
          path: pytorch
          show-progress: false
      - name: Clean PyTorch checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
        working-directory: pytorch
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
      #       runner.temp variable, which we need.
      - name: Populate binary env
        shell: bash
        run: |
          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - uses: actions/download-artifact@v4.1.7
        name: Download Build Artifacts
        with:
          name: libtorch-cuda13_0-shared-with-deps-debug
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
      - name: Populate binary env
        shell: bash
        run: |
          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
      - name: Test PyTorch binary
        shell: bash
        run: |
          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
      - name: Wait until all sessions have drained
        shell: powershell
        working-directory: pytorch
        if: always()
        timeout-minutes: 120
        run: |
          .github\scripts\wait_for_ssh_to_drain.ps1
      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
        shell: powershell
        working-directory: pytorch
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1

  libtorch-cuda13_0-shared-with-deps-debug-upload:  # Uploading
    if: ${{ github.repository_owner == 'pytorch' }}
    permissions:
      id-token: write
      contents: read
    needs: libtorch-cuda13_0-shared-with-deps-debug-test
    with:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: libtorch
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu130
      GPU_ARCH_VERSION: "13.0"
      GPU_ARCH_TYPE: cuda
      LIBTORCH_CONFIG: debug
      LIBTORCH_VARIANT: shared-with-deps
      # This is a dummy value for libtorch to work correctly with our batch scripts
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
      build_name: libtorch-cuda13_0-shared-with-deps-debug
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml
250  .github/workflows/generated-windows-binary-libtorch-release-nightly.yml  generated  vendored
@@ -1038,253 +1038,3 @@ jobs:
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

  libtorch-cuda13_0-shared-with-deps-release-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: get-label-type
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
    timeout-minutes: 360
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: libtorch
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu130
      GPU_ARCH_VERSION: "13.0"
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: release
      LIBTORCH_VARIANT: shared-with-deps
      # This is a dummy value for libtorch to work correctly with our batch scripts
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
      #       runner.temp variable, which we need.
      - name: Populate binary env
        shell: bash
        run: |
          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
          echo "system info $(uname -a)"
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
        shell: bash
        run: |
          git config --global core.longpaths true
          git config --global core.symlinks true

          # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
          # the directory on Windows and prevent GHA from checking out as reported
          # in https://github.com/actions/checkout/issues/1018
          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
        run: |
          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
      # removed once Windows Defender is removed from the AMI
      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
        continue-on-error: true
        shell: powershell
        run: |
          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          submodules: recursive
          path: pytorch
          show-progress: false
      - name: Clean PyTorch checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
        working-directory: pytorch
      - name: Populate binary env
        shell: bash
        run: |
          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
      - name: Build PyTorch binary
        shell: bash
        run: |
          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
      - uses: actions/upload-artifact@v4.4.0
        if: always()
        with:
          name: libtorch-cuda13_0-shared-with-deps-release
          retention-days: 14
          if-no-files-found: error
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
      - name: Wait until all sessions have drained
        shell: powershell
        working-directory: pytorch
        if: always()
        timeout-minutes: 120
        run: |
          .github\scripts\wait_for_ssh_to_drain.ps1
      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
        shell: powershell
        working-directory: pytorch
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1

  libtorch-cuda13_0-shared-with-deps-release-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
      - libtorch-cuda13_0-shared-with-deps-release-build
      - get-label-type
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
    timeout-minutes: 360
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: libtorch
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu130
      GPU_ARCH_VERSION: "13.0"
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: release
      LIBTORCH_VARIANT: shared-with-deps
      # This is a dummy value for libtorch to work correctly with our batch scripts
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
          echo "system info $(uname -a)"
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
        shell: bash
        run: |
          git config --global core.longpaths true
          git config --global core.symlinks true

          # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
          # the directory on Windows and prevent GHA from checking out as reported
          # in https://github.com/actions/checkout/issues/1018
          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
        run: |
          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
      # Since it's just a defensive command, the workflow should continue even the command fails. This step can be
      # removed once Windows Defender is removed from the AMI
      - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
        continue-on-error: true
        shell: powershell
        run: |
          Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          submodules: recursive
          path: pytorch
          show-progress: false
      - name: Clean PyTorch checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
        working-directory: pytorch
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
      #       runner.temp variable, which we need.
      - name: Populate binary env
        shell: bash
        run: |
          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - uses: actions/download-artifact@v4.1.7
        name: Download Build Artifacts
        with:
          name: libtorch-cuda13_0-shared-with-deps-release
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
      - name: Populate binary env
        shell: bash
        run: |
          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
      - name: Test PyTorch binary
        shell: bash
        run: |
          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
      - name: Wait until all sessions have drained
        shell: powershell
        working-directory: pytorch
        if: always()
        timeout-minutes: 120
        run: |
          .github\scripts\wait_for_ssh_to_drain.ps1
      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
        shell: powershell
        working-directory: pytorch
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1

  libtorch-cuda13_0-shared-with-deps-release-upload:  # Uploading
    if: ${{ github.repository_owner == 'pytorch' }}
    permissions:
      id-token: write
      contents: read
    needs: libtorch-cuda13_0-shared-with-deps-release-test
    with:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: libtorch
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu130
      GPU_ARCH_VERSION: "13.0"
      GPU_ARCH_TYPE: cuda
      LIBTORCH_CONFIG: release
      LIBTORCH_VARIANT: shared-with-deps
      # This is a dummy value for libtorch to work correctly with our batch scripts
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
      build_name: libtorch-cuda13_0-shared-with-deps-release
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml
1666  .github/workflows/generated-windows-binary-wheel-nightly.yml  generated  vendored
File diff suppressed because it is too large
@@ -18,13 +18,13 @@ permissions:
  contents: read

jobs:
  inductor-build:
  linux-jammy-cpu-py3_9-gcc11-inductor-build:
    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
    name: inductor-build
    name: linux-jammy-cpu-py3.9-gcc11-inductor
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-jammy-py3.9-gcc11
      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
      # Use metal host for benchmark jobs
      test-matrix: |
        { include: [
@@ -32,13 +32,13 @@ jobs:
        ]}
    secrets: inherit

  inductor-micro-benchmark-test:
    name: inductor-micro-benchmark-test
  linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test:
    name: linux-jammy-cpu-py3.9-gcc11-inductor
    uses: ./.github/workflows/_linux-test.yml
    needs: inductor-build
    needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
    with:
      build-environment: linux-jammy-py3.9-gcc11
      docker-image: ${{ needs.inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
      docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
      timeout-minutes: 720
    secrets: inherit
16  .github/workflows/inductor-nightly.yml  vendored
@@ -32,13 +32,13 @@ jobs:
      curr_ref_type: ${{ github.ref_type }}
      opt_out_experiments: lf

  nightly-dynamo-benchmarks-build:
    name: nightly-dynamo-benchmarks-build
  linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build:
    name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks
    uses: ./.github/workflows/_linux-build.yml
    needs: get-default-label-prefix
    with:
      build-environment: linux-jammy-py3.9-gcc11-build
      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
      test-matrix: |
        { include: [
@@ -51,13 +51,13 @@ jobs:
      build-additional-packages: "vision audio torchao"
    secrets: inherit

  nightly-dynamo-benchmarks-test:
    name: nightly-dynamo-benchmarks-test
  linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-test:
    name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks
    uses: ./.github/workflows/_linux-test.yml
    needs: nightly-dynamo-benchmarks-build
    needs: linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build
    with:
      build-environment: linux-jammy-py3.9-gcc11-build
      docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }}
      test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }}
      docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.test-matrix }}
      timeout-minutes: 720
    secrets: inherit
@@ -84,8 +84,9 @@ jobs:
      curr_ref_type: ${{ github.ref_type }}
      opt_out_experiments: lf

  # NB: Keep this in sync with trunk.yml
  build:
    name: build
    name: cuda12.8-py3.10-gcc9-sm90
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
@@ -127,7 +128,7 @@ jobs:
    secrets: inherit

  test-periodically:
    name: test-periodically
    name: cuda12.8-py3.10-gcc9-sm90
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    if: github.event.schedule == '15 0,12 * * 1-6'
@@ -144,7 +145,7 @@ jobs:
    secrets: inherit

  test-weekly:
    name: test-weekly
    name: cuda12.8-py3.10-gcc9-sm90
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    if: github.event.schedule == '0 7 * * 0'
@@ -161,7 +162,7 @@ jobs:
    secrets: inherit

  test:
    name: test
    name: cuda12.8-py3.10-gcc9-sm90
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    # The pull_request trigger is used in PR to bump transformers pin which always
@@ -69,14 +69,14 @@ jobs:
      curr_ref_type: ${{ github.ref_type }}
      opt_out_experiments: lf

  inductor-build:
    name: inductor-build
  linux-jammy-zen-cpu-py3_9-gcc11-inductor-build:
    name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.9-gcc11-build
      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
      test-matrix: |
        { include: [
          { config: "inductor_huggingface_perf_cpu_x86_zen", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" },
@@ -95,16 +95,16 @@ jobs:
      selected-test-configs: ${{ inputs.benchmark_configs }}
    secrets: inherit

  inductor-test-nightly:
    name: inductor-test-nightly
  linux-jammy-zen-cpu-py3_9-gcc11-inductor-test-nightly:
    name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
    uses: ./.github/workflows/_linux-test.yml
    needs: inductor-build
    needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build
    if: github.event.schedule == '0 7 * * *'
    with:
      build-environment: linux-jammy-py3.9-gcc11-build
      dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
      docker-image: ${{ needs.inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
      docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
      timeout-minutes: 720
      # disable monitor in perf tests
      disable-monitor: false
@@ -112,16 +112,17 @@ jobs:
      monitor-data-collect-interval: 4
    secrets: inherit

  inductor-test:
    name: inductor-test

  linux-jammy-zen-cpu-py3_9-gcc11-inductor-test:
    name: linux-jammy-zen-cpu-py3.9-gcc11-inductor
    uses: ./.github/workflows/_linux-test.yml
    needs: inductor-build
    needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build
    if: github.event_name == 'workflow_dispatch'
    with:
      build-environment: linux-jammy-py3.9-gcc11-build
      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
      docker-image: ${{ needs.inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
      docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
      timeout-minutes: 720
      # disable monitor in perf tests
      disable-monitor: false
@ -74,14 +74,14 @@ jobs:
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
opt_out_experiments: lf
|
||||
|
||||
inductor-build:
|
||||
name: inductor-build
|
||||
linux-jammy-cpu-py3_9-gcc11-inductor-build:
|
||||
name: linux-jammy-cpu-py3.9-gcc11-inductor
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "inductor_huggingface_perf_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xl.spr-metal" },
|
||||
@ -101,16 +101,16 @@ jobs:
|
||||
build-additional-packages: "vision audio torchao"
|
||||
secrets: inherit
|
||||
|
||||
inductor-test-nightly-freezing:
|
||||
name: inductor-test-nightly-freezing
|
||||
linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly-freezing:
|
||||
name: linux-jammy-cpu-py3.9-gcc11-inductor
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: inductor-build
|
||||
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
|
||||
if: github.event.schedule == '0 7 * * *'
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
|
||||
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
|
||||
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
|
||||
timeout-minutes: 720
|
||||
# disable monitor in perf tests
|
||||
disable-monitor: false
|
||||
@ -118,16 +118,16 @@ jobs:
|
||||
monitor-data-collect-interval: 4
|
||||
secrets: inherit
|
||||
|
||||
inductor-test:
|
||||
name: inductor-test
|
||||
linux-jammy-cpu-py3_9-gcc11-inductor-test:
|
||||
name: linux-jammy-cpu-py3.9-gcc11-inductor
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: inductor-build
|
||||
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
|
||||
if: github.event_name == 'workflow_dispatch'
|
||||
with:
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }}
|
||||
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
|
||||
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
|
||||
timeout-minutes: 720
|
||||
# disable monitor in perf tests
|
||||
disable-monitor: false
|
||||
|
||||
@ -79,6 +79,7 @@ jobs:
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
opt_out_experiments: lf
|
||||
|
||||
# NB: Keep this in sync with trunk.yml
|
||||
build:
|
||||
name: cuda12.8-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
|
||||
130
.github/workflows/inductor-periodic.yml
vendored
130
.github/workflows/inductor-periodic.yml
vendored
@@ -31,8 +31,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf

periodic-dynamo-benchmarks-build:
name: periodic-dynamo-benchmarks-build
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build:
name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
@@ -57,33 +57,23 @@ jobs:
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit

periodic-dynamo-benchmarks-test:
name: periodic-dynamo-benchmarks-test
linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test:
name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit

rocm-periodic-dynamo-benchmarks-build:
linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build:
if: github.repository_owner == 'pytorch'
name: rocm-periodic-dynamo-benchmarks-build
name: rocm-py3_10-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
@@ -109,21 +99,21 @@ jobs:
]}
secrets: inherit

rocm-periodic-dynamo-benchmarks-test:
linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test:
permissions:
id-token: write
contents: read
name: rocm-periodic-dynamo-benchmarks-test
name: rocm-py3_10-periodic-dynamo-benchmarks
uses: ./.github/workflows/_rocm-test.yml
needs: rocm-periodic-dynamo-benchmarks-build
needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-rocm-py3_10
docker-image: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit

inductor-smoke-build:
name: inductor-smoke-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
@@ -139,23 +129,23 @@ jobs:
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit

inductor-smoke-test:
name: inductor-smoke-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: inductor-smoke-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }}
secrets: inherit

periodic-dynamo-benchmarks-cpu-build:
name: periodic-dynamo-benchmarks-cpu-build
linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build:
name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
{ include: [
@@ -170,6 +160,68 @@ jobs:
{ config: "cpu_inductor_freezing_avx2_torchbench", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit

linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-test:
name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit


linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
test-matrix: |
{ include: [
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
test-matrix: |
{ include: [
{ config: "cpu_inductor_freezing_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" },
{ config: "cpu_inductor_freezing_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "cpu_inductor_freezing_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
@@ -195,12 +247,12 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit

periodic-dynamo-benchmarks-cpu-test:
name: periodic-dynamo-benchmarks-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit

.github/workflows/inductor-unittest.yml (107 changed lines, vendored)
@@ -28,8 +28,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf

inductor-build:
name: inductor-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@@ -47,18 +47,44 @@ jobs:
]}
secrets: inherit

inductor-test:
name: inductor-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit

inductor-halide-build:
name: inductor-halide-build
linux-jammy-cuda12_8-py3_12-gcc9-inductor-build:
name: cuda12.8-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit

linux-jammy-cuda12_8-py3_12-gcc9-inductor-test:
name: cuda12.8-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_12-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-cpu-py3_12-inductor-halide-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@@ -71,18 +97,18 @@ jobs:
]}
secrets: inherit

inductor-halide-test:
name: inductor-halide-test
linux-jammy-cpu-py3_12-inductor-halide-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-test.yml
needs: inductor-halide-build
needs: linux-jammy-cpu-py3_12-inductor-halide-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }}
secrets: inherit

inductor-triton-cpu-build:
name: inductor-triton-cpu-build
linux-jammy-cpu-py3_12-inductor-triton-cpu-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@@ -95,23 +121,23 @@ jobs:
]}
secrets: inherit

inductor-triton-cpu-test:
linux-jammy-cpu-py3_12-inductor-triton-cpu-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu
uses: ./.github/workflows/_linux-test.yml
needs: inductor-triton-cpu-build
needs: linux-jammy-cpu-py3_12-inductor-triton-cpu-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.inductor-triton-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-triton-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.test-matrix }}
secrets: inherit

inductor-cpu-build:
name: inductor-cpu-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
@@ -122,12 +148,37 @@ jobs:
]}
secrets: inherit

inductor-cpu-test:
name: inductor-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-cuda12_8-py3_13-gcc9-inductor-build:
name: cuda12.8-py3.13-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit

linux-jammy-cuda12_8-py3_13-gcc9-inductor-test:
name: cuda12.8-py3.13-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_13-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit

.github/workflows/inductor.yml (32 changed lines, vendored)
@@ -44,8 +44,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf

inductor-build:
name: inductor-build
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@@ -53,6 +53,7 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
test-matrix: |
{ include: [
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
@@ -64,24 +65,25 @@ jobs:
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit

inductor-test:
name: inductor-test
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit

inductor-cpu-build:
name: inductor-cpu-build
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
test-matrix: |
{ include: [
{ config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
@@ -96,12 +98,12 @@ jobs:
build-additional-packages: "vision audio torchao"
secrets: inherit

inductor-cpu-test:
name: inductor-cpu-test
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit

.github/workflows/operator_benchmark.yml (22 changed lines, vendored)
@@ -24,38 +24,38 @@ permissions:
contents: read

jobs:
opbenchmark-build:
linux-jammy-cpu-py3_9-gcc11-opbenchmark-build:
if: github.repository_owner == 'pytorch'
name: opbenchmark-build
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit

opbenchmark-on-demand-build:
linux-jammy-cpu-py3_9-gcc11-opbenchmark-on-demand-build:
if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }}
name: opbenchmark-on-demand-build
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets: inherit

opbenchmark-test:
name: opbenchmark-test
linux-jammy-cpu-py3_9-gcc11-opbenchmark-test:
name: linux-jammy-cpu-py3.9-gcc11-opbenchmark
uses: ./.github/workflows/_linux-test.yml
needs: opbenchmark-build
needs: linux-jammy-cpu-py3_9-gcc11-opbenchmark-build
with:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.test-matrix }}
secrets: inherit

.github/workflows/periodic.yml (32 changed lines, vendored)
@@ -170,38 +170,6 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-cuda13_0-py3_10-gcc11-build:
name: linux-jammy-cuda13.0-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
cuda-arch-list: 7.5
build-environment: linux-jammy-cuda13.0-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
]}
secrets: inherit

linux-jammy-cuda13_0-py3_10-gcc11-test:
name: linux-jammy-cuda13.0-py3.10-gcc11
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda13_0-py3_10-gcc11-build
- target-determination
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml

.github/workflows/trunk.yml (9 changed lines, vendored)
@@ -224,12 +224,13 @@ jobs:
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
secrets: inherit

inductor-build:
name: inductor-build
# NB: Keep this in sync with inductor-perf-test-nightly.yml
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
secrets: inherit
@@ -241,7 +242,7 @@ jobs:
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },

@@ -1801,26 +1801,3 @@ command = [
"python3",
"tools/linter/adapters/gb_registry_linter.py",
]

[[linter]]
code = 'DISTRIBUTED_C10D_DIRECT_ACCESS'
include_patterns = ['**/*.py']
exclude_patterns = [
'torch/distributed/_distributed_c10d.py',
'fb/**',
'**/fb/**',
]
command = [
'python3',
'tools/linter/adapters/grep_linter.py',
'--pattern=torch\._C\._distributed_c10d',
'--linter-name=DISTRIBUTED_C10D_DIRECT_ACCESS',
'--error-name=direct access to torch._C._distributed_c10d',
"""--error-description=\
Never access torch._C._distributed_c10d directly in code. Always \
import from and use torch.distributed._distributed_c10d which is \
guaranteed to have all functions available\
""",
'--',
'@{{PATHSFILE}}'
]

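The linter block removed above enforced an import convention for the distributed C bindings. As a hedged illustration only (the module path comes from the linter's own error description and exclude list, and the attribute shown is hypothetical, not verified against this branch), the rule it described amounts to:

# Discouraged by the removed rule: reaching into the C extension directly.
# import torch._C._distributed_c10d as c10d_direct

# Preferred per the removed rule: go through the Python wrapper module, which
# the error text describes as guaranteed to have all functions available.
from torch.distributed import _distributed_c10d as c10d

print(hasattr(c10d, "ProcessGroup"))  # attribute name used only for illustration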
@@ -22,6 +22,7 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",

@@ -181,9 +181,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()

# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF
@@ -262,11 +261,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@@ -431,10 +430,11 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
)
else()

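Since these CMake options only gate which distributed backends get compiled into a build, it can be useful to query at runtime what a particular wheel actually provides. A minimal sketch using the standard torch.distributed availability checks:

import torch
import torch.distributed as dist

# True only if the build was compiled with distributed support (USE_DISTRIBUTED).
print("distributed available:", dist.is_available())

if dist.is_available():
    # Backend availability mirrors the USE_NCCL / USE_GLOO style build switches above.
    print("nccl:", dist.is_nccl_available())
    print("gloo:", dist.is_gloo_available())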
@@ -216,7 +216,7 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention
if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp})

target_include_directories(flash_attention SYSTEM PUBLIC
target_include_directories(flash_attention PUBLIC
${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc
${PROJECT_SOURCE_DIR}/third_party/flash-attention/include
${PROJECT_SOURCE_DIR}/third_party/cutlass/include

@@ -279,45 +279,10 @@ bool Context::userEnabledOverrideableSDP() const {
return enabled_overrideable;
}

static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
#ifdef USE_ROCM
static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32";
#endif

bool Context::checkCuBLASConfigDeterministic() {
// If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
// is set to deterministic setting
if (hasCUDART()) {
const auto workspace_config = c10::utils::get_env(cublas_config_var_name);
return (workspace_config == cublas_deterministic_configs[0] || workspace_config == cublas_deterministic_configs[1]);
}
return true;
}

void Context::alertCuBLASConfigNotDeterministic() const {
static const bool cublas_config_deterministic = checkCuBLASConfigDeterministic();
if (C10_LIKELY(!deterministicAlgorithms() || cublas_config_deterministic)) {
return;
}

auto msg = c10::str(
"Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ",
"`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ",
"it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this ",
"case, you must set an environment variable before running your PyTorch application: ",
cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ",
cublas_config_var_name, "=", cublas_deterministic_configs[1], ". For more information, go to ",
"https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility"
);

if (deterministicAlgorithmsWarnOnly()) {
TORCH_WARN(msg);
} else {
TORCH_CHECK(false, msg);
}
}

bool Context::benchmarkCuDNN() const {
return benchmark_cudnn;
}

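The alert being removed above points users at CUBLAS_WORKSPACE_CONFIG. A minimal sketch of the workflow its message describes (the two accepted values are taken from the code above; it assumes a CUDA device is present):

import os

# Must be set before cuBLAS is initialized; ":4096:8" and ":16:8" are the two
# configurations the removed check accepted as deterministic.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

import torch

torch.use_deterministic_algorithms(True)
a = torch.randn(64, 64, device="cuda")
b = torch.randn(64, 64, device="cuda")
c = a @ b  # with the old check, a missing/incorrect config could raise here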
@@ -310,13 +310,7 @@ class TORCH_API Context {
//
// * Throw an error when `Context::deterministicAlgorithms()` is true. Most
// of the time, this should be accomplished by calling
// `at::globalContext().alertNotDeterminstic()`. However, if the
// nondeterministic behavior is caused by the CuBLAS workspace
// configuration in CUDA >= 10.2,
// `at::globalContext().alertCuBLASConfigNotDeterministic()` should be
// called instead (in this case, a comment explaining why the operation is
// nondeterministic is not necessary). See below for details on these
// methods.
// `at::globalContext().alertNotDeterminstic().
//
// * Have an entry in the list of nondeterministic PyTorch operations in the
// docstring of `use_deterministic_algorithms()` in torch/__init__.py
@@ -340,12 +334,6 @@ class TORCH_API Context {
// Throws an error if `Context::deterministicAlgorithms()` is true
static void alertNotDeterministic(std::string_view const& caller);

// Throws an error if `Context::deterministicAlgorithms()` is true, CUDA
// >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or
// ":4096:8". For more details:
// https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
void alertCuBLASConfigNotDeterministic() const;

void setFloat32MatmulPrecision(const std::string& s);
void setFloat32Precision(
const std::string& backend,
@@ -429,7 +417,6 @@ class TORCH_API Context {
}

private:
static bool checkCuBLASConfigDeterministic();
std::array<c10::once_flag, at::COMPILE_TIME_MAX_DEVICE_TYPES> init_;
bool enabled_cudnn = true;
bool deterministic_cudnn = false;

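The warn-only branch referenced in the removed code (TORCH_WARN instead of TORCH_CHECK) corresponds to the user-facing warn_only flag; a small sketch of that mode:

import torch

# Request deterministic algorithms, but only warn (instead of erroring) when an
# operation has no deterministic implementation.
torch.use_deterministic_algorithms(True, warn_only=True)

x = torch.randn(8, 8)
y = x @ x  # deterministic on CPU; nondeterministic ops would emit a warning instead of raising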
@@ -436,7 +436,6 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented");
}

globalContext().alertCuBLASConfigNotDeterministic();
cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -570,8 +569,6 @@ inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_D

template <>
void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -583,8 +580,6 @@ void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {

template <>
void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -596,8 +591,6 @@ void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {

template <>
void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -611,8 +604,6 @@ void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::co

template <>
void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<float>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -626,8 +617,6 @@ void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::com

template <typename C_Dtype>
inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -697,8 +686,6 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP

template <typename C_Dtype>
inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
BGEMM_CHECK_ARGVALUES(at::BFloat16);
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
@@ -1027,8 +1014,6 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dty

template <>
void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -1040,8 +1025,6 @@ void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {

template <>
void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -1053,8 +1036,6 @@ void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {

template <>
void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -1068,8 +1049,6 @@ void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::comp

template <>
void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<float>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -1083,8 +1062,6 @@ void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::compl

template <typename C_Dtype>
inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -1191,7 +1168,6 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(

template <typename C_Dtype>
inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@@ -1947,11 +1923,11 @@ void scaled_gemm(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
cublasLtMatmulDescAttributes_t matmulDescA = CUBLASLT_MATMUL_DESC_A_SCALE_POINTER;
cublasLtMatmulDescAttributes_t matmulDescB = CUBLASLT_MATMUL_DESC_B_SCALE_POINTER;
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
// hipblaslt supported row-wise before cublas, and did so their own way (via
// the SCALE_POINTERSs), but then migrated to match how cublas does it (via
// the SCALE_MODEs). Here we check for this early custom mode.
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise);
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
if (use_rowwise) {
matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
@@ -1966,8 +1942,12 @@ void scaled_gemm(
}
#endif
}
#else
// rowwise isn't supported using cublaslt or older hipblaslt
#elif (CUDA_VERSION < 12080) && !defined(USE_ROCM)
// hipblaslt supported row-wise before cublas, and did so their own way (via
// the SCALE_POINTERSs), but then migrated to match how cublas does it (via
// the SCALE_MODEs). Here we check for this early custom mode.
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise);
// rowwise isn't supported using older cublaslt or older hipblaslt
TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt");
#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
computeDesc.setAttribute(matmulDescA, mat1_scale_ptr);
@@ -2411,8 +2391,6 @@ void trsmBatched<c10::complex<double>>(

template <>
void gemv<c10::complex<double>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@@ -2428,8 +2406,6 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
// gemv is bw bound, and does not benefit from TF32. But the precision
// loss still happens on TF32. So we disable it here.
NoTF32Guard disable_tf32;
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@@ -2442,8 +2418,6 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {

template <>
void gemv<double>(CUDABLAS_GEMV_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@@ -2457,8 +2431,6 @@ void gemv<float>(CUDABLAS_GEMV_ARGTYPES(float)) {
// gemv is bw bound, and does not benefit from TF32. But the precision
// loss still happens on TF32. So we disable it here.
NoTF32Guard disable_tf32;
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);

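The NoTF32Guard comments above concern TF32 use inside cuBLAS; the user-facing switch is the matmul TF32 flag, sketched below (assumes a CUDA GPU with TF32 support, otherwise the flag has no effect):

import torch

# Forbid or allow TF32 for float32 matmuls routed through cuBLAS.
torch.backends.cuda.matmul.allow_tf32 = False   # full fp32 precision
a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")
exact = a @ b

torch.backends.cuda.matmul.allow_tf32 = True    # faster, reduced mantissa precision
fast = a @ b
print((exact - fast).abs().max())  # small but typically nonzero difference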
@@ -185,17 +185,6 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
// right: "lro, summed, ro" permuted with rpermutation and the three flattened
// then the permuted output is a view of bmm(left, right)
// finally, opermutation reverts the permutation to the original order of dimensions
// By default the output is "lro, lo, 1-for-summed-dims, ro" with original shape dimensions.
// However, if all dimensions from the right operand appear before those from the left
// operand in the final output, we can swap the operands so that bmm directly produces
// the result in the correct memory order.

bool swap_lo_ro = !lo.empty() && !ro.empty() && ro.back() < lo.front();
if (swap_lo_ro) {
std::swap(left, right);
std::swap(lo, ro);
std::swap(lo_size, ro_size);
}
auto out_num_dim = lro.size() + lo.size() + sum_dims_.size() + ro.size();
std::vector<SymInt> out_size;
out_size.reserve(out_num_dim);

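The comment block removed above describes an operand swap inside einsum's sumproduct_pair. A hedged Python-level illustration of the shape pattern it targets (the contraction itself is just torch.einsum; the swap is an internal layout optimization, not visible in the result):

import torch

A = torch.randn(4, 2, 3)   # batch b, left-only dim i, summed dim k
B = torch.randn(4, 3, 5)   # batch b, summed dim k, right-only dim j

# The output lists the right-only dim j before the left-only dim i, which is
# the case the removed code handled by swapping operands before the internal bmm.
out = torch.einsum("bik,bkj->bji", A, B)
print(out.shape)  # torch.Size([4, 5, 2])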
@@ -156,7 +156,7 @@ void cpu_padding(
int64_t offset_h = ndim >= 2 ? p.offsets[ndim - 2] : 0;
int64_t offset_w = p.offsets[ndim - 1];

// do vectorized copy when output is overlapped with input on W,
// do vectorized copy whe output is overlapped with input on W,
// only applies to positive padding
auto loop = [=](scalar_t* out, const scalar_t* in, bool positive_padding) {
if (positive_padding) {

@@ -318,7 +318,7 @@ batch_norm_cpu_collect_stats_channels_last_impl(
//
// The optimal THRESHOLD to tile was found empirically.
// When C > THRESHOLD, C is large enough that the benefit from tiling and vectorization outweigh the synchronization overhead.
// When C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization.
// Wehn C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization.
//
// When num_threads == 1, always use Method 2 as there is no synchronization overhead.
//

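The heuristic described above applies to the channels-last CPU batch-norm statistics kernel. A minimal usage sketch that exercises that path (the layout call is standard PyTorch; the kernel and tiling choice are internal details):

import torch
import torch.nn as nn

# N=8, C=32, H=W=56 in channels-last memory format, matching the C/NHW terms in the comment.
x = torch.randn(8, 32, 56, 56).to(memory_format=torch.channels_last)
bn = nn.BatchNorm2d(32)
bn.train()                      # training mode collects per-batch statistics
y = bn(x)
print(y.is_contiguous(memory_format=torch.channels_last))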
@@ -1665,7 +1665,7 @@ const std::optional<at::Tensor>& bias,
const std::optional<at::Tensor>& scale_result,
std::optional<c10::ScalarType> out_dtype,
bool use_fast_accum) {
bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/false);
bool allowed_device = _scaled_mm_allowed_device();
TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+");

TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed");

@@ -20,7 +20,7 @@

// SegmentReduce compilation with CUDA-12.9 causes NVCC crash on Windows
// See https://github.com/pytorch/pytorch/issues/156181
#if !(defined(_WIN32) && CUDART_VERSION == 12090)
#if !defined(_WIN32) || CUDART_VERSION < 12090

namespace at::native {

@@ -606,4 +606,4 @@ REGISTER_DISPATCH(

} // namespace at::native

#endif
#endif

@ -464,7 +464,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
}
#endif
int32_t trailingSize;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
if (isInOutAligned) {
// in this case we can and should flatten the tensors after the cat dim
// we want to view the tensors as if consisting of `alignment`-sized elements
@ -476,15 +475,16 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
// then we need to divide last out size by elems_per_vec,
// and divide all strides except last by elems_per_vec (last stride is 1 always)
// for input, we will fix up the sizes and strides in the kernel directly
kernelOutputParam = outputParam;
nDims = dimension + 1;
constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
auto out_size = dimension == 0 ? out.numel() : outputParam.tensorStride[dimension-1];
outputParam.tensorSize[dimension] = out_size / elems_per_vec;
trailingSize = outputParam.tensorStride[dimension];
kernelOutputParam.tensorStride[dimension] = 1;
for (int i = 0; i < dimension; ++i) {
kernelOutputParam.tensorStride[i] /= elems_per_vec;
outputParam.tensorStride[dimension] = 1;
for (int i = 0; i < nDims; ++i) {
if (i!=dimension) {
outputParam.tensorStride[i] /= elems_per_vec;
}
}
}

@ -505,7 +505,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
(char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
(char*)data, catMetaData, outputParam, dimension, trailingSize);\
} else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\

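As the comments in the hunk above describe, viewing the output as alignment-sized vector elements means dividing the cat-dimension size and the leading strides by elems_per_vec while the trailing stride stays 1. The toy below works that arithmetic through one concrete, invented 2-D contiguous shape; it is not the kernel code.

// Toy example of the "view as vector elements" arithmetic: a contiguous
// float tensor of shape (8, 32), concatenated along dim 1, re-viewed as
// 16-byte (float4) elements.  Shapes and names here are invented.
#include <cstddef>
#include <cstdio>

int main() {
  constexpr std::size_t alignment = 16;                             // bytes per vectorized load
  constexpr std::size_t elems_per_vec = alignment / sizeof(float);  // 4 floats per load
  std::size_t size[2]   = {8, 32};   // logical sizes
  std::size_t stride[2] = {32, 1};   // contiguous strides
  size[1]   /= elems_per_vec;        // cat-dim size counted in float4 elements
  stride[0] /= elems_per_vec;        // leading stride shrinks; last stride stays 1
  std::printf("vectorized view: size=(%zu,%zu) stride=(%zu,%zu)\n",
              size[0], size[1], stride[0], stride[1]);
  return 0;
}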
@ -1,7 +1,5 @@
#include <ATen/core/op_registration/op_registration.h>
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include <ATen/native/mkldnn/xpu/qconv.h>

#include <c10/core/MemoryFormat.h>
#include <c10/core/ScalarType.h>
#include <torch/library.h>
@ -9,7 +7,7 @@
using namespace at::native::onednn;
namespace at::native::xpu {

inline c10::ScalarType QConvoneDNNXPU::qconv_decide_out_dtype(
static inline c10::ScalarType qconv_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype) {
bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
@ -21,7 +19,7 @@ inline c10::ScalarType QConvoneDNNXPU::qconv_decide_out_dtype(
return dst_dtype;
}

at::Tensor QConvoneDNNXPU::qconv_prepack_xpu(
static at::Tensor qconv_prepack_xpu(
at::Tensor weight,
at::Tensor weight_scales,
double input_scale,
@ -35,265 +33,222 @@ at::Tensor QConvoneDNNXPU::qconv_prepack_xpu(
return weight;
}

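The hunk above shows only the first line of qconv_decide_out_dtype, so the following is a hedged sketch of what such an output-dtype helper typically looks like (explicit fp32/bf16 requests pass through, everything else keeps the activation's dtype); the exact body in the diff may differ, and the name decide_out_dtype_sketch is invented.

// Hypothetical sketch of an output-dtype decision helper for a quantized conv;
// compiled against libtorch, not taken verbatim from the diff above.
#include <ATen/core/Tensor.h>
#include <c10/core/ScalarType.h>
#include <optional>

static inline c10::ScalarType decide_out_dtype_sketch(
    const at::Tensor& act,
    std::optional<c10::ScalarType> output_dtype) {
  const bool fp32_output = output_dtype.has_value() && *output_dtype == c10::kFloat;
  const bool bf16_output = output_dtype.has_value() && *output_dtype == c10::kBFloat16;
  // Default: keep the activation's (quantized) dtype unless a float output
  // was explicitly requested.
  c10::ScalarType dst_dtype = act.scalar_type();
  if (fp32_output) {
    dst_dtype = c10::kFloat;
  } else if (bf16_output) {
    dst_dtype = c10::kBFloat16;
  }
  return dst_dtype;
}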
at::Tensor QConvoneDNNXPU::run_pointwise(
|
||||
at::Tensor act,
|
||||
double act_scale,
|
||||
int64_t act_zero_point,
|
||||
at::Tensor weight,
|
||||
at::Tensor weight_scales,
|
||||
at::Tensor weight_zero_points,
|
||||
std::optional<at::Tensor> bias,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
int64_t groups,
|
||||
double inv_output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
std::string_view attr,
|
||||
torch::List<std::optional<at::Scalar>> scalars,
|
||||
std::optional<std::string_view> algorithm) {
|
||||
if (act.dim() == 3 || act.dim() == 5) {
|
||||
TORCH_CHECK(
|
||||
attr == "none",
|
||||
"quantized pointwise conv",
|
||||
act.dim() - 2,
|
||||
"d doesn't support unary_post_op fusion. Got unary_post_op:",
|
||||
attr,
|
||||
".");
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
attr == "none" || attr == "relu" || attr == "hardtanh" ||
|
||||
attr == "hardswish" || attr == "swish",
|
||||
"We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:",
|
||||
attr,
|
||||
".");
|
||||
class QConvoneDNNXPU final {
|
||||
public:
|
||||
static at::Tensor run_pointwise(
|
||||
at::Tensor act,
|
||||
double act_scale,
|
||||
int64_t act_zero_point,
|
||||
at::Tensor weight,
|
||||
at::Tensor weight_scales,
|
||||
at::Tensor weight_zero_points,
|
||||
std::optional<at::Tensor> bias,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
int64_t groups,
|
||||
double inv_output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
std::string_view attr,
|
||||
torch::List<std::optional<at::Scalar>> scalars,
|
||||
std::optional<std::string_view> algorithm) {
|
||||
if (act.dim() == 3 || act.dim() == 5) {
|
||||
TORCH_CHECK(
|
||||
attr == "none",
|
||||
"quantized pointwise conv",
|
||||
act.dim() - 2,
|
||||
"d doesn't support unary_post_op fusion. Got unary_post_op:",
|
||||
attr,
|
||||
".");
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
attr == "none" || attr == "relu" || attr == "hardtanh" ||
|
||||
attr == "hardswish" || attr == "swish",
|
||||
"We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:",
|
||||
attr,
|
||||
".");
|
||||
}
|
||||
|
||||
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
|
||||
auto mfmt = is_channels_last_suggested
|
||||
? get_cl_tag_by_ndim(act.ndimension())
|
||||
: at::MemoryFormat::Contiguous;
|
||||
Tensor input_ = act.contiguous(mfmt);
|
||||
Tensor weight_ = weight.contiguous(mfmt);
|
||||
|
||||
auto dst_tz = conv_dst_size(
|
||||
input_.ndimension(),
|
||||
input_.sizes(),
|
||||
weight_.sizes(),
|
||||
padding.vec(),
|
||||
padding.vec(),
|
||||
stride.vec(),
|
||||
dilation.vec());
|
||||
|
||||
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
|
||||
Tensor output =
|
||||
at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
|
||||
|
||||
return quantized_convolution(
|
||||
act,
|
||||
act_scale,
|
||||
act_zero_point,
|
||||
weight,
|
||||
weight_scales,
|
||||
weight_zero_points,
|
||||
bias,
|
||||
stride,
|
||||
padding,
|
||||
dilation,
|
||||
/*transposed*/ false,
|
||||
groups,
|
||||
output,
|
||||
inv_output_scale,
|
||||
output_zero_point,
|
||||
/*accum*/ std::nullopt,
|
||||
/*accum_scale*/ 0.0,
|
||||
/*accum_zero_point*/ 0,
|
||||
/*output_dtype*/ output_dtype,
|
||||
/*binary_attr*/ std::nullopt,
|
||||
/*binary_alpha*/ std::nullopt,
|
||||
/*unary_attr*/ attr,
|
||||
/*unary_scalars*/ scalars,
|
||||
/*unary_algorithm*/ algorithm);
|
||||
}
|
||||
|
||||
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
|
||||
auto mfmt = is_channels_last_suggested ? get_cl_tag_by_ndim(act.ndimension())
|
||||
: at::MemoryFormat::Contiguous;
|
||||
Tensor input_ = act.contiguous(mfmt);
|
||||
Tensor weight_ = weight.contiguous(mfmt);
|
||||
|
||||
auto dst_tz = conv_dst_size(
|
||||
input_.ndimension(),
|
||||
input_.sizes(),
|
||||
weight_.sizes(),
|
||||
padding.vec(),
|
||||
padding.vec(),
|
||||
stride.vec(),
|
||||
dilation.vec());
|
||||
|
||||
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
|
||||
Tensor output =
|
||||
at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
|
||||
|
||||
return quantized_convolution(
|
||||
act,
|
||||
act_scale,
|
||||
act_zero_point,
|
||||
weight,
|
||||
weight_scales,
|
||||
weight_zero_points,
|
||||
bias,
|
||||
stride,
|
||||
padding,
|
||||
dilation,
|
||||
/*transposed*/ false,
|
||||
groups,
|
||||
output,
|
||||
inv_output_scale,
|
||||
output_zero_point,
|
||||
/*accum*/ std::nullopt,
|
||||
/*accum_scale*/ 0.0,
|
||||
/*accum_zero_point*/ 0,
|
||||
/*output_dtype*/ output_dtype,
|
||||
/*binary_attr*/ std::nullopt,
|
||||
/*binary_alpha*/ std::nullopt,
|
||||
/*unary_attr*/ attr,
|
||||
/*unary_scalars*/ scalars,
|
||||
/*unary_algorithm*/ algorithm);
|
||||
}
|
||||
|
||||
at::Tensor QConvoneDNNXPU::run_pointwise_tensor(
|
||||
at::Tensor act,
|
||||
at::Tensor act_scale,
|
||||
at::Tensor act_zero_point,
|
||||
at::Tensor weight,
|
||||
at::Tensor weight_scales,
|
||||
at::Tensor weight_zero_points,
|
||||
std::optional<at::Tensor> bias,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
int64_t groups,
|
||||
double output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
std::string_view attr,
|
||||
torch::List<std::optional<at::Scalar>> scalars,
|
||||
std::optional<std::string_view> algorithm) {
|
||||
return run_pointwise(
|
||||
act,
|
||||
act_scale.item().toDouble(),
|
||||
act_zero_point.item().toLong(),
|
||||
weight,
|
||||
weight_scales,
|
||||
weight_zero_points,
|
||||
bias,
|
||||
stride,
|
||||
padding,
|
||||
dilation,
|
||||
groups,
|
||||
output_scale,
|
||||
output_zero_point,
|
||||
output_dtype,
|
||||
/*unary_attr*/ attr,
|
||||
/*unary_scalars*/ scalars,
|
||||
/*unary_algorithm*/ algorithm);
|
||||
}
|
||||
|
||||
at::Tensor QConvoneDNNXPU::run_pointwise_binary(
|
||||
at::Tensor act,
|
||||
double act_scale,
|
||||
int64_t act_zero_point,
|
||||
at::Tensor weight,
|
||||
at::Tensor weight_scales,
|
||||
at::Tensor weight_zero_points,
|
||||
at::Tensor accum,
|
||||
std::optional<at::Tensor> bias,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
int64_t groups,
|
||||
double output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
double accum_scale,
|
||||
int64_t accum_zero_point,
|
||||
std::string_view binary_attr,
|
||||
std::optional<at::Scalar> alpha,
|
||||
std::optional<std::string_view> unary_attr,
|
||||
torch::List<std::optional<at::Scalar>> unary_scalars,
|
||||
std::optional<std::string_view> unary_algorithm) {
|
||||
TORCH_CHECK(
|
||||
act.dim() == 4 && binary_attr == "sum" &&
|
||||
(!unary_attr.has_value() ||
|
||||
(unary_attr.has_value() &&
|
||||
(unary_attr.value() == "none" || unary_attr.value() == "relu"))),
|
||||
"post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ",
|
||||
binary_attr,
|
||||
" unary_post_op: ",
|
||||
unary_attr.has_value() ? unary_attr.value() : "none",
|
||||
".")
|
||||
|
||||
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
|
||||
auto mfmt = is_channels_last_suggested ? get_cl_tag_by_ndim(act.ndimension())
|
||||
: at::MemoryFormat::Contiguous;
|
||||
Tensor input_ = act.contiguous(mfmt);
|
||||
Tensor weight_ = weight.contiguous(mfmt);
|
||||
|
||||
auto dst_tz = conv_dst_size(
|
||||
input_.ndimension(),
|
||||
input_.sizes(),
|
||||
weight_.sizes(),
|
||||
padding.vec(),
|
||||
padding.vec(),
|
||||
stride.vec(),
|
||||
dilation.vec());
|
||||
|
||||
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
|
||||
bool has_accum_postop_sum = binary_attr == "sum";
|
||||
Tensor output = has_accum_postop_sum
|
||||
? accum
|
||||
: at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
|
||||
|
||||
output = quantized_convolution(
|
||||
act,
|
||||
act_scale,
|
||||
act_zero_point,
|
||||
weight,
|
||||
weight_scales,
|
||||
weight_zero_points,
|
||||
bias,
|
||||
stride,
|
||||
padding,
|
||||
dilation,
|
||||
/*transposed*/ false,
|
||||
groups,
|
||||
output,
|
||||
output_scale,
|
||||
output_zero_point,
|
||||
/*accum*/ accum,
|
||||
/*accum_scale*/ accum_scale,
|
||||
/*accum_zero_point*/ accum_zero_point,
|
||||
/*output_dtype*/ output_dtype,
|
||||
/*binary_attr*/ binary_attr,
|
||||
/*binary_alpha*/ alpha,
|
||||
/*unary_attr*/ unary_attr,
|
||||
/*unary_scalars*/ unary_scalars,
|
||||
/*unary_algorithm*/ unary_algorithm);
|
||||
|
||||
if (!has_accum_postop_sum) {
|
||||
return output;
|
||||
} else {
|
||||
return accum;
|
||||
static at::Tensor run_pointwise_tensor(
|
||||
at::Tensor act,
|
||||
at::Tensor act_scale,
|
||||
at::Tensor act_zero_point,
|
||||
at::Tensor weight,
|
||||
at::Tensor weight_scales,
|
||||
at::Tensor weight_zero_points,
|
||||
std::optional<at::Tensor> bias,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
int64_t groups,
|
||||
double output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
std::string_view attr,
|
||||
torch::List<std::optional<at::Scalar>> scalars,
|
||||
std::optional<std::string_view> algorithm) {
|
||||
return run_pointwise(
|
||||
act,
|
||||
act_scale.item().toDouble(),
|
||||
act_zero_point.item().toLong(),
|
||||
weight,
|
||||
weight_scales,
|
||||
weight_zero_points,
|
||||
bias,
|
||||
stride,
|
||||
padding,
|
||||
dilation,
|
||||
groups,
|
||||
output_scale,
|
||||
output_zero_point,
|
||||
output_dtype,
|
||||
/*unary_attr*/ attr,
|
||||
/*unary_scalars*/ scalars,
|
||||
/*unary_algorithm*/ algorithm);
|
||||
}
|
||||
}
|
||||
|
||||
at::Tensor QConvoneDNNXPU::run_pointwise_binary_tensor(
|
||||
at::Tensor act, // contains quantized values but not QTensor
|
||||
at::Tensor act_scale,
|
||||
at::Tensor act_zero_point,
|
||||
at::Tensor weight, // contains quantized values but not QTensor
|
||||
at::Tensor weight_scales,
|
||||
at::Tensor weight_zero_points,
|
||||
at::Tensor accum, // contains quantized values but not QTensor
|
||||
std::optional<at::Tensor> bias,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
int64_t groups,
|
||||
double output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
double accum_scale,
|
||||
int64_t accum_zero_point,
|
||||
std::string_view binary_attr,
|
||||
std::optional<at::Scalar> alpha,
|
||||
std::optional<std::string_view> unary_attr,
|
||||
torch::List<std::optional<at::Scalar>> unary_scalars,
|
||||
std::optional<std::string_view> unary_algorithm) {
|
||||
return run_pointwise_binary(
|
||||
act,
|
||||
act_scale.item().toDouble(),
|
||||
act_zero_point.item().toLong(),
|
||||
weight,
|
||||
weight_scales,
|
||||
weight_zero_points,
|
||||
accum,
|
||||
bias,
|
||||
stride,
|
||||
padding,
|
||||
dilation,
|
||||
groups,
|
||||
output_scale,
|
||||
output_zero_point,
|
||||
output_dtype,
|
||||
accum_scale,
|
||||
accum_zero_point,
|
||||
binary_attr,
|
||||
alpha,
|
||||
unary_attr,
|
||||
unary_scalars,
|
||||
unary_algorithm);
|
||||
}
|
||||
static at::Tensor run_pointwise_binary(
|
||||
at::Tensor act,
|
||||
double act_scale,
|
||||
int64_t act_zero_point,
|
||||
at::Tensor weight,
|
||||
at::Tensor weight_scales,
|
||||
at::Tensor weight_zero_points,
|
||||
at::Tensor accum,
|
||||
std::optional<at::Tensor> bias,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
int64_t groups,
|
||||
double output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
double accum_scale,
|
||||
int64_t accum_zero_point,
|
||||
std::string_view binary_attr,
|
||||
std::optional<at::Scalar> alpha,
|
||||
std::optional<std::string_view> unary_attr,
|
||||
torch::List<std::optional<at::Scalar>> unary_scalars,
|
||||
std::optional<std::string_view> unary_algorithm) {
|
||||
TORCH_CHECK(
|
||||
act.dim() == 4 && binary_attr == "sum" &&
|
||||
(!unary_attr.has_value() ||
|
||||
(unary_attr.has_value() &&
|
||||
(unary_attr.value() == "none" || unary_attr.value() == "relu"))),
|
||||
"post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ",
|
||||
binary_attr,
|
||||
" unary_post_op: ",
|
||||
unary_attr.has_value() ? unary_attr.value() : "none",
|
||||
".")
|
||||
|
||||
bool is_channels_last_suggested = use_channels_last_for_conv(act, weight);
|
||||
auto mfmt = is_channels_last_suggested
|
||||
? get_cl_tag_by_ndim(act.ndimension())
|
||||
: at::MemoryFormat::Contiguous;
|
||||
Tensor input_ = act.contiguous(mfmt);
|
||||
Tensor weight_ = weight.contiguous(mfmt);
|
||||
|
||||
auto dst_tz = conv_dst_size(
|
||||
input_.ndimension(),
|
||||
input_.sizes(),
|
||||
weight_.sizes(),
|
||||
padding.vec(),
|
||||
padding.vec(),
|
||||
stride.vec(),
|
||||
dilation.vec());
|
||||
|
||||
auto dst_dtype = qconv_decide_out_dtype(act, output_dtype);
|
||||
bool has_accum_postop_sum = binary_attr == "sum";
|
||||
Tensor output = has_accum_postop_sum
|
||||
? accum
|
||||
: at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt));
|
||||
|
||||
output = quantized_convolution(
|
||||
act,
|
||||
act_scale,
|
||||
act_zero_point,
|
||||
weight,
|
||||
weight_scales,
|
||||
weight_zero_points,
|
||||
bias,
|
||||
stride,
|
||||
padding,
|
||||
dilation,
|
||||
/*transposed*/ false,
|
||||
groups,
|
||||
output,
|
||||
output_scale,
|
||||
output_zero_point,
|
||||
/*accum*/ accum,
|
||||
/*accum_scale*/ accum_scale,
|
||||
/*accum_zero_point*/ accum_zero_point,
|
||||
/*output_dtype*/ output_dtype,
|
||||
/*binary_attr*/ binary_attr,
|
||||
/*binary_alpha*/ alpha,
|
||||
/*unary_attr*/ unary_attr,
|
||||
/*unary_scalars*/ unary_scalars,
|
||||
/*unary_algorithm*/ unary_algorithm);
|
||||
|
||||
if (!has_accum_postop_sum) {
|
||||
return output;
|
||||
} else {
|
||||
return accum;
|
||||
}
|
||||
}
|
||||
};

TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv_prepack"),
TORCH_FN(QConvoneDNNXPU::qconv_prepack_xpu));
TORCH_FN(xpu::qconv_prepack_xpu));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv1d_pointwise"),
QConvoneDNNXPU::run_pointwise);
@ -312,9 +267,6 @@ TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv_pointwise.tensor"),
QConvoneDNNXPU::run_pointwise_tensor);
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv2d_pointwise.binary_tensor"),
QConvoneDNNXPU::run_pointwise_binary_tensor);
}

} // namespace at::native::xpu

@ -1,111 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/Config.h>
|
||||
#include <ATen/Tensor.h>
|
||||
|
||||
namespace at::native::xpu {
|
||||
class QConvoneDNNXPU final {
|
||||
public:
|
||||
C10_API static at::Tensor run_pointwise(
|
||||
at::Tensor act,
|
||||
double act_scale,
|
||||
int64_t act_zero_point,
|
||||
at::Tensor weight,
|
||||
at::Tensor weight_scales,
|
||||
at::Tensor weight_zero_points,
|
||||
std::optional<at::Tensor> bias,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
int64_t groups,
|
||||
double inv_output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
std::string_view attr,
|
||||
torch::List<std::optional<at::Scalar>> scalars,
|
||||
std::optional<std::string_view> algorithm);
|
||||
|
||||
C10_API static at::Tensor run_pointwise_tensor(
|
||||
at::Tensor act,
|
||||
at::Tensor act_scale,
|
||||
at::Tensor act_zero_point,
|
||||
at::Tensor weight,
|
||||
at::Tensor weight_scales,
|
||||
at::Tensor weight_zero_points,
|
||||
std::optional<at::Tensor> bias,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
int64_t groups,
|
||||
double output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
std::string_view attr,
|
||||
torch::List<std::optional<at::Scalar>> scalars,
|
||||
std::optional<std::string_view> algorithm);
|
||||
|
||||
C10_API static at::Tensor run_pointwise_binary(
|
||||
at::Tensor act,
|
||||
double act_scale,
|
||||
int64_t act_zero_point,
|
||||
at::Tensor weight,
|
||||
at::Tensor weight_scales,
|
||||
at::Tensor weight_zero_points,
|
||||
at::Tensor accum,
|
||||
std::optional<at::Tensor> bias,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
int64_t groups,
|
||||
double output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
double accum_scale,
|
||||
int64_t accum_zero_point,
|
||||
std::string_view binary_attr,
|
||||
std::optional<at::Scalar> alpha,
|
||||
std::optional<std::string_view> unary_attr,
|
||||
torch::List<std::optional<at::Scalar>> unary_scalars,
|
||||
std::optional<std::string_view> unary_algorithm);
|
||||
|
||||
C10_API static at::Tensor run_pointwise_binary_tensor(
|
||||
at::Tensor act,
|
||||
at::Tensor act_scale,
|
||||
at::Tensor act_zero_point,
|
||||
at::Tensor weight,
|
||||
at::Tensor weight_scales,
|
||||
at::Tensor weight_zero_points,
|
||||
at::Tensor accum,
|
||||
std::optional<at::Tensor> bias,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
int64_t groups,
|
||||
double output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
double accum_scale,
|
||||
int64_t accum_zero_point,
|
||||
std::string_view binary_attr,
|
||||
std::optional<at::Scalar> alpha,
|
||||
std::optional<std::string_view> unary_attr,
|
||||
torch::List<std::optional<at::Scalar>> unary_scalars,
|
||||
std::optional<std::string_view> unary_algorithm);
|
||||
|
||||
static inline c10::ScalarType qconv_decide_out_dtype(
|
||||
const at::Tensor& act,
|
||||
const std::optional<c10::ScalarType> output_dtype);
|
||||
|
||||
static at::Tensor qconv_prepack_xpu(
|
||||
at::Tensor weight,
|
||||
at::Tensor weight_scales,
|
||||
double input_scale,
|
||||
int64_t input_zero_point,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
int64_t groups,
|
||||
std::optional<torch::List<int64_t>> input_shape);
|
||||
};
|
||||
|
||||
} // namespace at::native::xpu
|
||||
@ -1,14 +1,13 @@
#include <torch/library.h>

#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include <ATen/native/mkldnn/xpu/qlinear.h>
#include <c10/core/ScalarType.h>

using namespace at::native::onednn;

namespace at::native::xpu {

inline c10::ScalarType QLinearOnednnXPU::qlinear_decide_out_dtype(
static inline c10::ScalarType qlinear_decide_out_dtype(
const at::Tensor& act,
const std::optional<c10::ScalarType> output_dtype) {
bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
@ -20,7 +19,7 @@ inline c10::ScalarType QLinearOnednnXPU::qlinear_decide_out_dtype(
return dst_dtype;
}

Tensor QLinearOnednnXPU::q_linear_pointwise(
static Tensor q_linear_pointwise(
Tensor act,
double act_scale,
int64_t act_zero_point,
@ -79,7 +78,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise(
return qout;
}

Tensor QLinearOnednnXPU::q_linear_pointwise_tensor(
static Tensor q_linear_pointwise_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
@ -138,7 +137,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_tensor(
return qout;
}

Tensor QLinearOnednnXPU::q_linear_pointwise_binary(
static Tensor q_linear_pointwise_binary(
Tensor act,
double act_scale,
int64_t act_zero_point,
@ -209,7 +208,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_binary(
return dim == 3 ? qout.reshape({act.size(0), -1, N}) : qout;
}

Tensor QLinearOnednnXPU::q_linear_pointwise_binary_tensor(
static Tensor q_linear_pointwise_binary_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
@ -249,7 +248,7 @@ Tensor QLinearOnednnXPU::q_linear_pointwise_binary_tensor(
unary_post_op_algorithm);
}

Tensor QLinearOnednnXPU::q_linear_prepack_onednn(
static at::Tensor q_linear_prepack_onednn(
at::Tensor weight,
std::optional<torch::List<int64_t>> input_shape) {
at::Tensor weight_transposed = weight.transpose(0, 1);
@ -259,19 +258,19 @@ Tensor QLinearOnednnXPU::q_linear_prepack_onednn(
TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise));
TORCH_FN(q_linear_pointwise));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.tensor"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_tensor));
TORCH_FN(q_linear_pointwise_tensor));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_prepack"),
TORCH_FN(QLinearOnednnXPU::q_linear_prepack_onednn));
TORCH_FN(q_linear_prepack_onednn));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary));
TORCH_FN(q_linear_pointwise_binary));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor"),
TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary_tensor));
TORCH_FN(q_linear_pointwise_binary_tensor));
}

} // namespace at::native::xpu

@ -1,91 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/Config.h>
|
||||
#include <ATen/Tensor.h>
|
||||
#include <ATen/core/List.h>
|
||||
|
||||
namespace at::native::xpu {
|
||||
|
||||
class QLinearOnednnXPU final {
|
||||
public:
|
||||
C10_API static Tensor q_linear_pointwise(
|
||||
Tensor act,
|
||||
double act_scale,
|
||||
int64_t act_zero_point,
|
||||
Tensor weight,
|
||||
Tensor weight_scales,
|
||||
Tensor weight_zero_points,
|
||||
std::optional<Tensor> bias,
|
||||
double output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
std::string_view post_op_name,
|
||||
torch::List<std::optional<at::Scalar>> post_op_args,
|
||||
std::string_view post_op_algorithm);
|
||||
|
||||
C10_API static Tensor q_linear_pointwise_tensor(
|
||||
Tensor act,
|
||||
Tensor act_scale,
|
||||
Tensor act_zero_point,
|
||||
Tensor weight,
|
||||
Tensor weight_scales,
|
||||
Tensor weight_zero_points,
|
||||
std::optional<Tensor> bias,
|
||||
double output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
std::string_view post_op_name,
|
||||
torch::List<std::optional<at::Scalar>> post_op_args,
|
||||
std::string_view post_op_algorithm);
|
||||
|
||||
C10_API static Tensor q_linear_pointwise_binary(
|
||||
Tensor act,
|
||||
double act_scale,
|
||||
int64_t act_zero_point,
|
||||
Tensor weight,
|
||||
Tensor weight_scales,
|
||||
Tensor weight_zero_points,
|
||||
std::optional<at::Tensor> other,
|
||||
std::optional<Tensor> bias,
|
||||
double output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
double other_scale,
|
||||
int64_t other_zero_point,
|
||||
std::string_view binary_post_op,
|
||||
double binary_alpha,
|
||||
std::string_view unary_post_op,
|
||||
torch::List<std::optional<at::Scalar>> unary_post_op_args,
|
||||
std::string_view unary_post_op_algorithm);
|
||||
|
||||
C10_API static Tensor q_linear_pointwise_binary_tensor(
|
||||
Tensor act,
|
||||
Tensor act_scale,
|
||||
Tensor act_zero_point,
|
||||
Tensor weight,
|
||||
Tensor weight_scales,
|
||||
Tensor weight_zero_points,
|
||||
std::optional<at::Tensor> other,
|
||||
std::optional<Tensor> bias,
|
||||
double output_scale,
|
||||
int64_t output_zero_point,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
double other_scale,
|
||||
int64_t other_zero_point,
|
||||
std::string_view binary_post_op,
|
||||
double binary_alpha,
|
||||
std::string_view unary_post_op,
|
||||
torch::List<std::optional<at::Scalar>> unary_post_op_args,
|
||||
std::string_view unary_post_op_algorithm);
|
||||
|
||||
C10_API static Tensor q_linear_prepack_onednn(
|
||||
at::Tensor weight,
|
||||
std::optional<torch::List<int64_t>> input_shape);
|
||||
|
||||
static inline c10::ScalarType qlinear_decide_out_dtype(
|
||||
const at::Tensor& act,
|
||||
const std::optional<c10::ScalarType> output_dtype);
|
||||
|
||||
}; // class QLinearOnednnXPU
|
||||
|
||||
} // namespace at::native::xpu
|
||||
@ -2,7 +2,6 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/MemoryOverlap.h>
|
||||
#include <ATen/WrapDimUtils.h>
|
||||
#include <ATen/native/SortingUtils.h>
|
||||
#include <ATen/native/TensorShape.h>
|
||||
#include <ATen/native/TypeProperties.h>
|
||||
#include <ATen/native/mps/MPSGraphVenturaOps.h>
|
||||
@ -12,85 +11,10 @@
|
||||
#include <ATen/Functions.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/kthvalue_native.h>
|
||||
#include <ATen/ops/sort.h>
|
||||
#include <ATen/ops/sort_native.h>
|
||||
#endif
|
||||
namespace at::native {
|
||||
namespace {
|
||||
|
||||
void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& values, Tensor& indices) {
|
||||
using namespace mps;
|
||||
if (self.dim() == 0 && self.numel() == 1) {
|
||||
values.copy_(self);
|
||||
indices.zero_();
|
||||
return;
|
||||
}
|
||||
// Handle empty tensors
|
||||
if (self.numel() == 0) {
|
||||
values.copy_(self);
|
||||
indices.copy_(values.toType(at::ScalarType::Long));
|
||||
return;
|
||||
}
|
||||
// issue #154890, raising error to prevent crash within MPSGraph until
|
||||
// workaround is implemented.
|
||||
TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890");
|
||||
|
||||
auto stream = getCurrentMPSStream();
|
||||
struct CachedGraph : public MPSCachedGraph {
|
||||
CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
|
||||
MPSGraphTensor *selfTensor = nil, *valuesTensor = nil, *indicesTensor = nil;
|
||||
};
|
||||
|
||||
// MPSGraph kthvalue is always sorted.
|
||||
@autoreleasepool {
|
||||
// Input as placeholders
|
||||
MPSShape* input_shape = getMPSShape(self);
|
||||
NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
|
||||
std::string key = std::string("kthvalue:") + [ns_shape_key UTF8String] + ":" + getMPSTypeString(self) + ":k" +
|
||||
std::to_string(k) + ":dim" + std::to_string(dim);
|
||||
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
|
||||
newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self), input_shape);
|
||||
|
||||
MPSGraphTensor* castInputTensor = newCachedGraph->selfTensor;
|
||||
MPSDataType dataType = getMPSDataType(self);
|
||||
// #issue 104398441 sortWithTensor and argsortWithTensor
|
||||
if (dataType != MPSDataTypeInt32 && dataType != MPSDataTypeFloat32 && dataType != MPSDataTypeFloat16) {
|
||||
dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32;
|
||||
castInputTensor = [mpsGraph castTensor:newCachedGraph->selfTensor toType:dataType name:@"castInputTensor"];
|
||||
}
|
||||
MPSGraphTensor* sortedTensor = [mpsGraph sortWithTensor:castInputTensor
|
||||
axis:(NSUInteger)dim
|
||||
descending:false
|
||||
name:nil];
|
||||
sortedTensor = [mpsGraph sliceTensor:sortedTensor
|
||||
dimension:(NSUInteger)dim
|
||||
start:((NSUInteger)k - 1)
|
||||
length:1
|
||||
name:nil];
|
||||
MPSGraphTensor* argSortedTensor = [mpsGraph argSortWithTensor:castInputTensor
|
||||
axis:(NSInteger)dim
|
||||
descending:false
|
||||
name:@"kthvalue_out"];
|
||||
argSortedTensor = [mpsGraph sliceTensor:argSortedTensor
|
||||
dimension:dim
|
||||
start:((NSUInteger)k - 1)
|
||||
length:1
|
||||
name:nil];
|
||||
newCachedGraph->valuesTensor = sortedTensor;
|
||||
newCachedGraph->indicesTensor = argSortedTensor;
|
||||
});
|
||||
Placeholder inputPlaceholder = Placeholder(cachedGraph->selfTensor, self);
|
||||
// Outputs as placeholders
|
||||
Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values);
|
||||
Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices);
|
||||
// Create dictionary of inputs and outputs
|
||||
auto feeds = dictionaryFromPlaceholders(inputPlaceholder);
|
||||
auto results = dictionaryFromPlaceholders(valuesPlaceholder, indicesPlaceholder);
|
||||
runMPSGraph(stream, cachedGraph->graph(), feeds, results);
|
||||
}
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
// sort
|
||||
TORCH_IMPL_FUNC(sort_stable_out_mps)
|
||||
@ -157,31 +81,4 @@ TORCH_IMPL_FUNC(sort_stable_out_mps)
runMPSGraph(stream, cachedGraph->graph(), feeds, results);
}
}

std::tuple<Tensor&, Tensor&> kthvalue_out_mps(const Tensor& self,
int64_t k,
int64_t dim_,
bool keepdim,
Tensor& values,
Tensor& indices) {
// See note [Writing Nondeterministic Operations]
// If there are duplicate elements of the kth value, the procedure for choosing which
// of the duplicates to use for the indices output is nondeterministic.
at::globalContext().alertNotDeterministic("kthvalue MPS");

int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
int64_t slicesize = self.dim() == 0 ? 1 : self.size(dim);
TORCH_CHECK(k >= 1 && k <= slicesize, "kthvalue(): selected number k out of range for dimension ", dim);
at::assert_no_overlap(self, values);
_reduction_with_indices_allocate_or_resize_output(values, indices, self, dim, keepdim);

kthvalue_out_mps_impl(self, k, dim, values, indices);

if (!keepdim) {
values.squeeze_(dim);
indices.squeeze_(dim);
}

return std::forward_as_tuple(values, indices);
}
} // namespace at::native

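For context, a brief hedged usage sketch of the operator implemented above, using the libtorch C++ API on a CPU tensor; the same call dispatches to the MPS kernel when the input lives on an MPS device. The shape and values are invented.

// Minimal kthvalue usage sketch.  k is 1-based: k=2 asks for the second
// smallest value along the chosen dimension.
#include <torch/torch.h>
#include <iostream>

int main() {
  torch::Tensor t = torch::tensor({3.0f, 1.0f, 2.0f, 2.0f});
  auto [values, indices] = torch::kthvalue(t, /*k=*/2, /*dim=*/0, /*keepdim=*/false);
  std::cout << "value: " << values.item<float>() << "\n";   // 2
  // With duplicate elements (the two 2.0 entries) the reported index is
  // nondeterministic, which is exactly what the alert in the kernel above flags.
  std::cout << "index: " << indices.item<int64_t>() << "\n";
  return 0;
}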
@ -335,9 +335,6 @@ static void isin_Tensor_Tensor_out_mps(const Tensor& elements,
}

static void is_posneginf_helper(TensorIteratorBase& iter, bool is_neg) {
if (iter.numel() == 0) {
return;
}
const auto& self = iter.input(0);
auto& out = iter.output(0);
@autoreleasepool {

@ -417,7 +417,6 @@ TORCH_IMPL_FUNC(sgn_out_mps)(const Tensor& self, const Tensor& output) {

Tensor& conj_physical_out_mps(const Tensor& self, Tensor& result) {
TORCH_CHECK(self.is_complex());
TORCH_CHECK(self.dtype() != at::kComplexDouble);
mps::unary_op(self, result, "conj", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
return [mpsGraph conjugateWithTensor:inputTensor name:nil];
});

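A short hedged usage sketch for the conj_physical path above, using the libtorch C++ API on a CPU complex tensor; the check in the MPS kernel above rejects complex double on that backend, so the example sticks to complex float. Values are invented.

// Minimal conj_physical usage sketch: materializes the complex conjugate
// instead of only setting the conjugate bit (which plain torch::conj does).
#include <torch/torch.h>
#include <iostream>

int main() {
  auto real = torch::tensor({1.0f, 3.0f});
  auto imag = torch::tensor({2.0f, -4.0f});
  auto t = torch::complex(real, imag);   // complex float tensor [1+2i, 3-4i]
  auto out = at::conj_physical(t);       // [1-2i, 3+4i]
  std::cout << out << "\n";
  return 0;
}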
@ -340,8 +340,8 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: abs
SparseCPU, SparseCUDA, SparseMPS: abs_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr
SparseCPU, SparseCUDA: abs_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs
tags: [core, pointwise]

@ -350,16 +350,16 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: abs_
SparseCPU, SparseCUDA, SparseMPS: abs_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_
SparseCPU, SparseCUDA: abs_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs_

- func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS, MTIA: abs_out
SparseCPU, SparseCUDA, SparseMPS: abs_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_out
SparseCPU, SparseCUDA: abs_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_out
tags: pointwise

# Note [Adding an alias]
|
||||
@ -428,7 +428,7 @@
|
||||
variants: function, method
|
||||
structured_delegate: sgn.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse
|
||||
SparseCPU, SparseCUDA: sgn_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn
|
||||
tags: pointwise
|
||||
@ -437,7 +437,7 @@
|
||||
variants: method
|
||||
structured_delegate: sgn.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_
|
||||
SparseCPU, SparseCUDA: sgn_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn_
|
||||
tags: pointwise
|
||||
@ -448,7 +448,7 @@
|
||||
dispatch:
|
||||
CPU, CUDA: sgn_out
|
||||
MPS: sgn_out_mps
|
||||
SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_out
|
||||
SparseCPU, SparseCUDA: sgn_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -476,7 +476,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: _conj_physical
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr
|
||||
autogen: _conj_physical.out
|
||||
|
||||
- func: conj_physical(Tensor self) -> Tensor
|
||||
@ -487,8 +487,8 @@
|
||||
dispatch:
|
||||
CPU, CUDA: conj_physical_out
|
||||
MPS: conj_physical_out_mps
|
||||
SparseCPU, SparseCUDA, SparseMPS: conj_physical_out_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr_out
|
||||
SparseCPU, SparseCUDA: conj_physical_out_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
- func: conj_physical_(Tensor(a!) self) -> Tensor(a!)
|
||||
@ -554,7 +554,7 @@
|
||||
structured_delegate: add.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse
|
||||
SparseCPU, SparseCUDA, SparseMeta: add_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr
|
||||
MkldnnCPU: mkldnn_add
|
||||
ZeroTensor: add_zerotensor
|
||||
@ -566,7 +566,7 @@
|
||||
variants: method
|
||||
structured_delegate: add.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse_
|
||||
SparseCPU, SparseCUDA, SparseMeta: add_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_
|
||||
MkldnnCPU: mkldnn_add_
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add__Tensor
|
||||
@ -582,7 +582,6 @@
|
||||
dispatch:
|
||||
SparseCPU, SparseMeta: add_out_sparse_cpu
|
||||
SparseCUDA: add_out_sparse_cuda
|
||||
SparseMPS: add_out_sparse_mps
|
||||
SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu
|
||||
SparseCsrCUDA: add_out_sparse_compressed_cuda
|
||||
MkldnnCPU: mkldnn_add_out
|
||||
@ -875,7 +874,7 @@
|
||||
variants: function, method
|
||||
structured_delegate: asinh.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse
|
||||
SparseCPU, SparseCUDA: asinh_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -883,7 +882,7 @@
|
||||
variants: function, method
|
||||
structured_delegate: asinh.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_
|
||||
SparseCPU, SparseCUDA: asinh_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -893,7 +892,7 @@
|
||||
dispatch:
|
||||
CPU, CUDA: asinh_out
|
||||
MPS: asinh_out_mps
|
||||
SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_out
|
||||
SparseCPU, SparseCUDA: asinh_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -910,7 +909,7 @@
|
||||
structured_delegate: atanh.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse
|
||||
SparseCPU, SparseCUDA: atanh_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -918,7 +917,7 @@
|
||||
structured_delegate: atanh.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_
|
||||
SparseCPU, SparseCUDA: atanh_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -928,7 +927,7 @@
|
||||
dispatch:
|
||||
CPU, CUDA: atanh_out
|
||||
MPS: atanh_out_mps
|
||||
SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_out
|
||||
SparseCPU, SparseCUDA: atanh_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_out
|
||||
tags: pointwise
|
||||
# arctanh, alias for atanh
|
||||
@ -965,7 +964,7 @@
|
||||
variants: function, method
|
||||
structured_delegate: asin.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: asin_sparse
|
||||
SparseCPU, SparseCUDA: asin_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -974,7 +973,7 @@
|
||||
variants: function, method
|
||||
structured_delegate: asin.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: asin_sparse_
|
||||
SparseCPU, SparseCUDA: asin_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -984,7 +983,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: asin_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: asin_sparse_out
|
||||
SparseCPU, SparseCUDA: asin_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -1002,7 +1001,7 @@
|
||||
structured_delegate: atan.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: atan_sparse
|
||||
SparseCPU, SparseCUDA: atan_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -1011,7 +1010,7 @@
|
||||
structured_delegate: atan.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: atan_sparse_
|
||||
SparseCPU, SparseCUDA: atan_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -1021,7 +1020,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: atan_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: atan_sparse_out
|
||||
SparseCPU, SparseCUDA: atan_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -1460,7 +1459,7 @@
|
||||
structured_delegate: ceil.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse
|
||||
SparseCPU, SparseCUDA: ceil_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -1469,7 +1468,7 @@
|
||||
structured_delegate: ceil.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_
|
||||
SparseCPU, SparseCUDA: ceil_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -1479,7 +1478,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: ceil_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_out
|
||||
SparseCPU, SparseCUDA: ceil_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -2407,7 +2406,7 @@
|
||||
MPS: empty_mps
|
||||
Meta: empty_meta_symint
|
||||
MkldnnCPU: empty_mkldnn
|
||||
SparseCPU, SparseCUDA, SparseMPS: empty_sparse
|
||||
SparseCPU, SparseCUDA: empty_sparse
|
||||
SparseMeta: empty_sparse_symint
|
||||
SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
|
||||
SparseCsrMeta: empty_sparse_compressed_symint
|
||||
@ -2535,7 +2534,7 @@
|
||||
structured_delegate: erf.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: erf_sparse
|
||||
SparseCPU, SparseCUDA: erf_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -2544,7 +2543,7 @@
|
||||
structured_delegate: erf.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: erf_sparse_
|
||||
SparseCPU, SparseCUDA: erf_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -2554,7 +2553,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS, MTIA: erf_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: erf_sparse_out
|
||||
SparseCPU, SparseCUDA: erf_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -2620,7 +2619,7 @@
|
||||
structured_delegate: expm1.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse
|
||||
SparseCPU, SparseCUDA: expm1_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -2629,7 +2628,7 @@
|
||||
structured_delegate: expm1.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_
|
||||
SparseCPU, SparseCUDA: expm1_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -2639,7 +2638,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: expm1_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_out
|
||||
SparseCPU, SparseCUDA: expm1_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -2738,7 +2737,7 @@
|
||||
structured_delegate: floor.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: floor_sparse
|
||||
SparseCPU, SparseCUDA: floor_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -2747,7 +2746,7 @@
|
||||
structured_delegate: floor.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: floor_sparse_
|
||||
SparseCPU, SparseCUDA: floor_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -2757,7 +2756,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: floor_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: floor_sparse_out
|
||||
SparseCPU, SparseCUDA: floor_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -2765,7 +2764,7 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CPU, CUDA, MPS, MTIA: floor_divide
|
||||
CPU, CUDA, MPS: floor_divide
|
||||
SparseCPU, SparseCUDA: floor_divide_sparse
|
||||
|
||||
- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
|
||||
@ -2799,7 +2798,7 @@
|
||||
structured_delegate: frac.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: frac_sparse
|
||||
SparseCPU, SparseCUDA: frac_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr
|
||||
tags: pointwise
|
||||
|
||||
@ -2808,7 +2807,7 @@
|
||||
structured_delegate: frac.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: frac_sparse_
|
||||
SparseCPU, SparseCUDA: frac_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -2819,7 +2818,7 @@
|
||||
dispatch:
|
||||
CPU, CUDA: frac_out
|
||||
MPS: frac_out_mps
|
||||
SparseCPU, SparseCUDA, SparseMPS: frac_sparse_out
|
||||
SparseCPU, SparseCUDA: frac_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -3209,7 +3208,7 @@
|
||||
dispatch:
|
||||
CPU, CUDA, MPS, MTIA: isnan
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isnan
|
||||
SparseCPU, SparseCUDA, SparseMPS: isnan_sparse
|
||||
SparseCPU, SparseCUDA: isnan_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isnan_sparse_csr
|
||||
autogen: isnan.out
|
||||
tags: [core, pointwise]
|
||||
@ -3290,7 +3289,6 @@
|
||||
dispatch:
|
||||
CPU: kthvalue_out_cpu
|
||||
CUDA: kthvalue_out_cuda
|
||||
MPS: kthvalue_out_mps
|
||||
|
||||
- func: kthvalue.dimname(Tensor self, SymInt k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
|
||||
variants: function, method
|
||||
@ -3338,21 +3336,21 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: nan_to_num
|
||||
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse
|
||||
SparseCPU, SparseCUDA: nan_to_num_sparse
|
||||
tags: pointwise
|
||||
|
||||
- func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!)
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: nan_to_num_
|
||||
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_
|
||||
SparseCPU, SparseCUDA: nan_to_num_sparse_
|
||||
tags: pointwise
|
||||
|
||||
- func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!)
|
||||
dispatch:
|
||||
CPU, CUDA, MTIA: nan_to_num_out
|
||||
MPS: nan_to_num_out_mps
|
||||
SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_out
|
||||
SparseCPU, SparseCUDA: nan_to_num_sparse_out
|
||||
tags: pointwise
|
||||
|
||||
- func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
|
||||
@ -3555,7 +3553,7 @@
|
||||
structured_delegate: log1p.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse
|
||||
SparseCPU, SparseCUDA: log1p_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -3564,7 +3562,7 @@
|
||||
structured_delegate: log1p.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_
|
||||
SparseCPU, SparseCUDA: log1p_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -3574,7 +3572,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: log1p_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_out
|
||||
SparseCPU, SparseCUDA: log1p_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -4666,7 +4664,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: rad2deg
|
||||
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse
|
||||
SparseCPU, SparseCUDA: rad2deg_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr
|
||||
tags: pointwise
|
||||
|
||||
@ -4674,14 +4672,14 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: rad2deg_
|
||||
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_
|
||||
SparseCPU, SparseCUDA: rad2deg_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
- func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: rad2deg_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_out
|
||||
SparseCPU, SparseCUDA: rad2deg_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -4689,7 +4687,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: deg2rad
|
||||
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse
|
||||
SparseCPU, SparseCUDA: deg2rad_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr
|
||||
tags: pointwise
|
||||
|
||||
@ -4697,14 +4695,14 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: deg2rad_
|
||||
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_
|
||||
SparseCPU, SparseCUDA: deg2rad_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
- func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: deg2rad_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_out
|
||||
SparseCPU, SparseCUDA: deg2rad_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -4930,7 +4928,7 @@
|
||||
structured_delegate: neg.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: neg_sparse
|
||||
SparseCPU, SparseCUDA: neg_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg
|
||||
tags: [core, pointwise]
|
||||
@ -4940,7 +4938,7 @@
|
||||
structured_delegate: neg.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: neg_sparse_
|
||||
SparseCPU, SparseCUDA: neg_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg_
|
||||
tags: pointwise
|
||||
@ -4951,7 +4949,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS, MTIA: neg_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: neg_out_sparse
|
||||
SparseCPU, SparseCUDA: neg_out_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_out
|
||||
tags: pointwise
|
||||
# Alias for neg
|
||||
@ -5035,7 +5033,7 @@
|
||||
structured_delegate: round.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: round_sparse
|
||||
SparseCPU, SparseCUDA: round_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -5044,7 +5042,7 @@
|
||||
structured_delegate: round.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: round_sparse_
|
||||
SparseCPU, SparseCUDA: round_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -5054,7 +5052,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: round_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: round_sparse_out
|
||||
SparseCPU, SparseCUDA: round_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -5097,7 +5095,7 @@
|
||||
QuantizedCPU: relu_quantized_cpu
|
||||
QuantizedCUDA: relu_quantized_cuda
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu
|
||||
SparseCPU, SparseCUDA, SparseMPS: relu_sparse
|
||||
SparseCPU, SparseCUDA: relu_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -5112,7 +5110,7 @@
|
||||
QuantizedCPU: relu_quantized_cpu_
|
||||
QuantizedCUDA: relu_quantized_cuda_
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu_
|
||||
SparseCPU, SparseCUDA, SparseMPS: relu_sparse_
|
||||
SparseCPU, SparseCUDA: relu_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr_
|
||||
autogen: relu.out
|
||||
tags: pointwise
|
||||
@ -5399,7 +5397,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr
|
||||
SparseCPU, SparseCUDA, SparseMPS: sin_sparse
|
||||
SparseCPU, SparseCUDA: sin_sparse
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sin
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -5409,7 +5407,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_
|
||||
SparseCPU, SparseCUDA, SparseMPS: sin_sparse_
|
||||
SparseCPU, SparseCUDA: sin_sparse_
|
||||
tags: pointwise
|
||||
|
||||
- func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
|
||||
@ -5419,7 +5417,7 @@
|
||||
dispatch:
|
||||
CPU, CUDA, MPS, MTIA: sin_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: sin_sparse_out
|
||||
SparseCPU, SparseCUDA: sin_sparse_out
|
||||
tags: pointwise
|
||||
|
||||
- func: sinc(Tensor self) -> Tensor
|
||||
@ -5444,7 +5442,7 @@
|
||||
structured_delegate: sinh.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse
|
||||
SparseCPU, SparseCUDA: sinh_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -5453,7 +5451,7 @@
|
||||
structured_delegate: sinh.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_
|
||||
SparseCPU, SparseCUDA: sinh_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -5463,7 +5461,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: sinh_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_out
|
||||
SparseCPU, SparseCUDA: sinh_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_out
|
||||
|
||||
# Returns a copy of this `Variable` that is detached from its autograd graph.
|
||||
@ -5906,7 +5904,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sqrt
|
||||
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse
|
||||
SparseCPU, SparseCUDA: sqrt_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -5915,7 +5913,7 @@
|
||||
structured_delegate: sqrt.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_
|
||||
SparseCPU, SparseCUDA: sqrt_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -5925,7 +5923,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS, MTIA: sqrt_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_out
|
||||
SparseCPU, SparseCUDA: sqrt_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -6063,7 +6061,7 @@
|
||||
structured_delegate: tan.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: tan_sparse
|
||||
SparseCPU, SparseCUDA: tan_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -6072,7 +6070,7 @@
|
||||
structured_delegate: tan.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: tan_sparse_
|
||||
SparseCPU, SparseCUDA: tan_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -6082,7 +6080,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: tan_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: tan_sparse_out
|
||||
SparseCPU, SparseCUDA: tan_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -6093,7 +6091,7 @@
|
||||
dispatch:
|
||||
QuantizedCPU: tanh_quantized_cpu
|
||||
MkldnnCPU: mkldnn_tanh
|
||||
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse
|
||||
SparseCPU, SparseCUDA: tanh_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh
|
||||
tags: [core, pointwise]
|
||||
@ -6104,7 +6102,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
MkldnnCPU: mkldnn_tanh_
|
||||
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_
|
||||
SparseCPU, SparseCUDA: tanh_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh_
|
||||
tags: pointwise
|
||||
@ -6115,7 +6113,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS, MTIA: tanh_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_out
|
||||
SparseCPU, SparseCUDA: tanh_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -6387,8 +6385,8 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr
|
||||
SparseCPU, SparseCUDA: trunc_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
- func: trunc_(Tensor(a!) self) -> Tensor(a!)
|
||||
@ -6396,8 +6394,8 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_
|
||||
SparseCPU, SparseCUDA: trunc_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
- func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
|
||||
@ -6406,8 +6404,8 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: trunc_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_out
|
||||
SparseCPU, SparseCUDA: trunc_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_out
|
||||
tags: pointwise
|
||||
# Alias for trunc
|
||||
|
||||
@ -7369,8 +7367,8 @@
|
||||
- func: _to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor
|
||||
variants: method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: sparse_to_dense
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: sparse_compressed_to_dense
|
||||
SparseCPU, SparseCUDA: sparse_to_dense
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_dense
|
||||
MkldnnCPU: mkldnn_to_dense
|
||||
autogen: _to_dense.out
|
||||
|
||||
@ -7396,8 +7394,8 @@
|
||||
- func: dense_dim(Tensor self) -> int
|
||||
variants: method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: dense_dim_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: dense_dim_sparse_csr
|
||||
SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr
|
||||
CompositeExplicitAutograd: dense_dim_default
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
@ -7530,7 +7528,7 @@
|
||||
device_check: NoCheck # Allows copy into different device
|
||||
variants: function
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: copy_sparse_
|
||||
SparseCPU, SparseCUDA, SparseMeta: copy_sparse_
|
||||
autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out
|
||||
|
||||
# By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors
|
||||
@ -9721,7 +9719,7 @@
|
||||
structured_delegate: sign.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: sign_sparse
|
||||
SparseCPU, SparseCUDA: sign_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr
|
||||
tags: [core, pointwise]
|
||||
|
||||
@ -9730,7 +9728,7 @@
|
||||
structured_delegate: sign.out
|
||||
variants: method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: sign_sparse_
|
||||
SparseCPU, SparseCUDA: sign_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -9741,7 +9739,7 @@
|
||||
dispatch:
|
||||
CPU, CUDA: sign_out
|
||||
MPS: sign_out_mps
|
||||
SparseCPU, SparseCUDA, SparseMPS: sign_sparse_out
|
||||
SparseCPU, SparseCUDA: sign_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -9749,7 +9747,7 @@
|
||||
variants: function, method
|
||||
structured_delegate: signbit.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: signbit_sparse
|
||||
SparseCPU, SparseCUDA: signbit_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr
|
||||
tags: pointwise
|
||||
|
||||
@ -9760,7 +9758,7 @@
|
||||
CPU: signbit_out
|
||||
CUDA: signbit_out
|
||||
MPS: signbit_out_mps
|
||||
SparseCPU, SparseCUDA, SparseMPS: signbit_sparse_out
|
||||
SparseCPU, SparseCUDA: signbit_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -13264,7 +13262,7 @@
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: isinf
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isinf
|
||||
SparseCPU, SparseCUDA, SparseMPS: isinf_sparse
|
||||
SparseCPU, SparseCUDA: isinf_sparse
|
||||
SparseMeta: isinf_sparse_meta
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isinf_sparse_csr
|
||||
autogen: isinf.out
|
||||
@ -13280,7 +13278,7 @@
|
||||
structured_delegate: isposinf.out
|
||||
dispatch:
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isposinf
|
||||
SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse
|
||||
SparseCPU, SparseCUDA: isposinf_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr
|
||||
tags: pointwise
|
||||
|
||||
@ -13289,7 +13287,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: isposinf_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse_out
|
||||
SparseCPU, SparseCUDA: isposinf_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
@ -13298,7 +13296,7 @@
|
||||
structured_delegate: isneginf.out
|
||||
dispatch:
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isneginf
|
||||
SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse
|
||||
SparseCPU, SparseCUDA: isneginf_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr
|
||||
tags: pointwise
|
||||
|
||||
@ -13307,7 +13305,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: isneginf_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse_out
|
||||
SparseCPU, SparseCUDA: isneginf_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
|
||||
@ -1,73 +0,0 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/native/SparseTensorUtils.h>
|
||||
#include <ATen/native/mps/OperationUtils.h>
|
||||
#include <ATen/native/sparse/SparseStubs.h>
|
||||
#include <ATen/native/sparse/FlattenIndicesCommon.h>
|
||||
#include <ATen/ExpandUtils.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/_coalesce_native.h>
|
||||
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
|
||||
#include <ATen/ops/empty_native.h>
|
||||
#include <ATen/ops/zeros_native.h>
|
||||
#endif
|
||||
|
||||
namespace at::native {
|
||||
namespace {
|
||||
|
||||
using namespace mps;
|
||||
using namespace at::sparse;
|
||||
|
||||
#ifndef PYTORCH_JIT_COMPILE_SHADERS
|
||||
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
|
||||
#else
|
||||
#include <ATen/native/mps/FlattenIndices_metallib.h>
|
||||
#endif
|
||||
|
||||
Tensor flatten_indices_mps(const Tensor& indices, IntArrayRef size) {
|
||||
TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D");
|
||||
TORCH_CHECK(static_cast<size_t>(indices.size(0)) == size.size(),
|
||||
"flatten_indices: indices.size(0) must equal size.size()");
|
||||
|
||||
const int64_t sparse_dim = indices.size(0);
|
||||
const int64_t nnz = indices.size(1);
|
||||
|
||||
if (nnz == 0) {
|
||||
return at::empty({0}, indices.options().dtype(kLong));
|
||||
}
|
||||
|
||||
// Row-major multipliers for flattening: mul[d] = prod_{j>d}(size[j])
|
||||
std::vector<int64_t> row_muls(sparse_dim);
|
||||
row_muls[sparse_dim - 1] = 1;
|
||||
for (int64_t i = sparse_dim - 2; i >= 0; --i) {
|
||||
row_muls[i] = row_muls[i + 1] * size[i + 1];
|
||||
}
|
||||
|
||||
auto flat_indices = at::empty({nnz}, indices.options().dtype(kLong));
|
||||
|
||||
auto stream = getCurrentMPSStream();
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel");
|
||||
auto encoder = stream->commandEncoder();
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
mtl_setArgs(encoder,
|
||||
indices,
|
||||
row_muls,
|
||||
flat_indices,
|
||||
static_cast<uint>(sparse_dim),
|
||||
indices.strides()
|
||||
);
|
||||
|
||||
mtl_dispatch1DJob(encoder, pipeline, nnz);
|
||||
}
|
||||
});
|
||||
return flat_indices;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
REGISTER_MPS_DISPATCH(flatten_indices_stub, &flatten_indices_mps)
|
||||
} // namespace at::native
|
||||
@ -20,9 +20,46 @@ using namespace at::sparse;
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/Coalesce_metallib.h>
#include <ATen/native/mps/Sparse_metallib.h>
#endif

static Tensor flatten_indices(const Tensor& indices, IntArrayRef size) {
  TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D");
  TORCH_CHECK(static_cast<size_t>(indices.size(0)) == size.size(),
              "flatten_indices: indices.size(0) must equal size.size()");

  int64_t sparse_dim = indices.size(0);
  int64_t nnz = indices.size(1);

  if (nnz == 0) {
    return at::empty({0}, indices.options().dtype(kLong));
  }

  std::vector<int64_t> strides(sparse_dim);
  strides[sparse_dim - 1] = 1;
  for (int64_t i = sparse_dim - 2; i >= 0; i--) {
    strides[i] = strides[i + 1] * size[i + 1];
  }

  Tensor flat_indices = at::empty({nnz}, indices.options().dtype(kLong));

  auto stream = getCurrentMPSStream();
  dispatch_sync_with_rethrow(stream->queue(), ^() {
    @autoreleasepool {
      auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel");
      auto encoder = stream->commandEncoder();
      [encoder setComputePipelineState:pipeline];

      mtl_setArgs(encoder, indices, strides, flat_indices, sparse_dim, nnz);
      mtl_dispatch1DJob(encoder, pipeline, nnz);
    }
  });

  return flat_indices;
}

static Tensor compute_output_positions(const Tensor& is_unique) {
  int64_t nnz = is_unique.size(0);
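For reference, the flattening performed by the `flatten_indices` path above can be sketched with plain tensor ops. This is an illustrative equivalent only, not code from the diff; the helper name and the example values are made up.

```python
import torch

def flatten_indices_reference(indices: torch.Tensor, size: list) -> torch.Tensor:
    # indices: (sparse_dim, nnz) COO index matrix; size: the sparse tensor's shape.
    sparse_dim = indices.size(0)
    strides = torch.ones(sparse_dim, dtype=torch.long)
    for d in range(sparse_dim - 2, -1, -1):
        strides[d] = strides[d + 1] * size[d + 1]
    # Same reduction the Metal kernel does per thread: sum_d indices[d, gid] * strides[d]
    return (indices * strides.unsqueeze(1)).sum(dim=0)

# e.g. indices [[0, 1], [2, 0]] with size [2, 3] flatten to [2, 3]
```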
@ -1,169 +0,0 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/native/SparseTensorUtils.h>
|
||||
#include <ATen/native/mps/OperationUtils.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/_coalesce_native.h>
|
||||
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
|
||||
#include <ATen/ops/cat.h>
|
||||
#include <ATen/ops/add_native.h>
|
||||
#include <ATen/ops/empty_native.h>
|
||||
#include <ATen/ops/zeros_native.h>
|
||||
#include <ATen/ops/result_type.h>
|
||||
#include <ATen/ops/copy_sparse_to_sparse.h>
|
||||
#include <ATen/ops/mul.h>
|
||||
#endif
|
||||
|
||||
namespace at::native {
|
||||
|
||||
using namespace at::sparse;
|
||||
|
||||
Tensor& add_out_dense_sparse_mps(Tensor& out, const Tensor& dense, const SparseTensor& sparse, const Scalar& alpha);
|
||||
|
||||
Tensor& add_out_dense_sparse_mps(
|
||||
Tensor& out,
|
||||
const Tensor& dense,
|
||||
const SparseTensor& sparse,
|
||||
const Scalar& alpha) {
|
||||
TORCH_CHECK(dense.is_mps(), "add: expected 'self' to be an MPS tensor, got ", dense.device());
|
||||
TORCH_CHECK(sparse.is_mps(), "add: expected 'other' to be an MPS tensor, got ", sparse.device());
|
||||
TORCH_CHECK(out.is_mps(), "add: expected 'out' to be an MPS tensor, got ", out.device());
|
||||
TORCH_CHECK(dense.sizes().equals(sparse.sizes()),
|
||||
"add: expected 'self' and 'other' to have same size, but self has size ",
|
||||
dense.sizes(), " while other has size ", sparse.sizes(),
|
||||
" (FYI: dense-sparse addition does not currently support broadcasting)");
|
||||
|
||||
const int64_t nnz = sparse._nnz();
|
||||
if (nnz == 0) {
|
||||
out.resize_as_(dense);
|
||||
out.copy_(dense);
|
||||
return out;
|
||||
}
|
||||
|
||||
auto commonDtype = at::result_type(dense, sparse);
|
||||
TORCH_CHECK(canCast(commonDtype, out.scalar_type()),
|
||||
"Can't convert result type ", commonDtype, " to output ", out.scalar_type());
|
||||
|
||||
Tensor r;
|
||||
const bool need_separate_buffer = out.is_same(dense) || (out.scalar_type() != commonDtype);
|
||||
if (need_separate_buffer) {
|
||||
r = at::empty(dense.sizes(), out.options().dtype(commonDtype));
|
||||
} else {
|
||||
r = out;
|
||||
r.resize_as_(dense);
|
||||
}
|
||||
|
||||
Tensor dense_buffer = dense.to(commonDtype);
|
||||
if (!r.is_same(dense_buffer)) {
|
||||
r.copy_(dense_buffer);
|
||||
}
|
||||
|
||||
Tensor indices = sparse._indices();
|
||||
Tensor values = sparse._values().to(commonDtype);
|
||||
if (values.numel() == 0) {
|
||||
if (!out.is_same(r)) {
|
||||
out.resize_as_(dense);
|
||||
out.copy_(r);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
const int64_t nDim = r.dim();
|
||||
const int64_t nDimI = sparse.sparse_dim();
|
||||
TORCH_CHECK(nDimI >= 0 && nDimI <= nDim,
|
||||
"Invalid sparse_dim=", nDimI, " for dense tensor of dim ", nDim);
|
||||
|
||||
Tensor indices1D = at::sparse::flatten_indices(indices, sparse.sizes()).contiguous();
|
||||
|
||||
int64_t view_rows = 1;
|
||||
int64_t view_cols = 1;
|
||||
for (int64_t i = 0; i < nDimI; i++) {
|
||||
view_rows *= r.size(i);
|
||||
}
|
||||
for (int64_t i = nDimI; i < nDim; i++) {
|
||||
view_cols *= r.size(i);
|
||||
}
|
||||
|
||||
if (view_cols == 1) {
|
||||
Tensor r_flat = r.reshape({view_rows});
|
||||
Tensor values_1d = values.reshape({nnz});
|
||||
r_flat.index_add_(0, indices1D, values_1d, alpha);
|
||||
} else {
|
||||
Tensor r_view = r.view({view_rows, view_cols});
|
||||
Tensor values_2d = values.reshape({nnz, view_cols});
|
||||
r_view.index_add_(0, indices1D, values_2d, alpha);
|
||||
}
|
||||
|
||||
if (!out.is_same(r)) {
|
||||
out.resize_as_(dense);
|
||||
out.copy_(r);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
SparseTensor& add_out_sparse_mps(const SparseTensor& self,
|
||||
const SparseTensor& other,
|
||||
const Scalar& alpha,
|
||||
SparseTensor& out) {
|
||||
TORCH_CHECK(other.is_sparse(), "add(sparse, dense) is not supported. Use add(dense, sparse) instead.");
|
||||
TORCH_CHECK(self.is_mps(), "add: expected 'self' to be MPS, but got ", self.device());
|
||||
TORCH_CHECK(other.is_mps(), "add: expected 'other' to be MPS, but got ", other.device());
|
||||
TORCH_CHECK(out.is_mps(), "add: expected 'out' to be MPS, but got ", out.device());
|
||||
if (!self.is_sparse()) {
|
||||
return add_out_dense_sparse_mps(out, self, other, alpha);
|
||||
}
|
||||
auto commonDtype = at::result_type(self, other);
|
||||
TORCH_CHECK(canCast(commonDtype, out.scalar_type()),
|
||||
"Can't convert result type ", commonDtype, " to output ", out.scalar_type());
|
||||
|
||||
TORCH_CHECK(self.sizes().equals(other.sizes()),
|
||||
"add: expected 'self' and 'other' to have same size, but ", self.sizes(), " != ", other.sizes());
|
||||
|
||||
TORCH_CHECK(is_same_density(self, other),
|
||||
"add: expected 'self' and 'other' to have same density, but 'self' has ",
|
||||
self.sparse_dim(), " sparse dimensions while 'other' has ", other.sparse_dim(), " sparse dimensions");
|
||||
|
||||
if (other._nnz() == 0) {
|
||||
out.resize_as_(self);
|
||||
Tensor vals = self._values();
|
||||
if (vals.scalar_type() != out.scalar_type()) {
|
||||
vals = vals.to(out.scalar_type());
|
||||
}
|
||||
alias_into_sparse(out, self._indices(), vals);
|
||||
out._coalesced_(self.is_coalesced());
|
||||
return out;
|
||||
}
|
||||
|
||||
Tensor t_indices_ = self._indices();
|
||||
Tensor s_indices_ = other._indices();
|
||||
|
||||
Tensor t_values_ = self._values().to(commonDtype);
|
||||
Tensor s_values_ = other._values().to(commonDtype);
|
||||
if (!alpha.isIntegral(false) || alpha.to<double>() != 1.0) {
|
||||
s_values_ = at::mul(s_values_, alpha);
|
||||
}
|
||||
|
||||
Tensor r_indices_ = at::cat({t_indices_, s_indices_}, 1);
|
||||
Tensor r_values_ = at::cat({t_values_, s_values_ }, 0);
|
||||
|
||||
SparseTensor tmp = empty({0}, out.options().dtype(commonDtype));
|
||||
tmp.resize_as_(other);
|
||||
alias_into_sparse(tmp, r_indices_, r_values_);
|
||||
tmp = _coalesce_sparse_mps(tmp);
|
||||
|
||||
out.resize_as_(other);
|
||||
Tensor out_vals = tmp._values();
|
||||
if (out.scalar_type() != commonDtype) {
|
||||
out_vals = out_vals.to(out.scalar_type());
|
||||
}
|
||||
alias_into_sparse(out, tmp._indices(), out_vals);
|
||||
out._coalesced_(tmp.is_coalesced());
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
@ -1,19 +0,0 @@
#include <metal_stdlib>
using namespace metal;

kernel void flatten_indices_kernel(
    device const long* indices [[ buffer(0) ]],
    device const long* row_muls [[ buffer(1) ]],
    device long* flat_indices [[ buffer(2) ]],
    constant uint& sparse_dim [[ buffer(3) ]],
    constant long2& idx_strides [[ buffer(4) ]],
    uint gid [[ thread_position_in_grid ]]) {
  long flat = 0;
  for (uint d = 0; d < sparse_dim; ++d) {
    long off = (long)d * idx_strides.x + (long)gid * idx_strides.y;
    long v = indices[off];
    flat += v * row_muls[d];
  }
  flat_indices[gid] = flat;
}
@ -2,6 +2,19 @@
#include <metal_stdlib>
using namespace metal;

kernel void flatten_indices_kernel(
    device const int64_t* indices [[buffer(0)]],
    device const int64_t* strides [[buffer(1)]],
    device int64_t* flat_indices [[buffer(2)]],
    constant uint& sparse_dim [[buffer(3)]],
    constant uint& nnz [[buffer(4)]],
    uint gid [[thread_position_in_grid]]) {
  int64_t flat_idx = 0;
  for (uint d = 0; d < sparse_dim; d++) {
    flat_idx += indices[d * nnz + gid] * strides[d];
  }
  flat_indices[gid] = flat_idx;
}

kernel void compute_output_positions_kernel(
    device const bool* is_unique [[buffer(0)]],
@ -10,13 +10,8 @@ using namespace at::native::memory;

constexpr int buffer_size = 1024;

#if defined(CUDA_VERSION) && CUDA_VERSION < 13000
__managed__ double4 buffer1[buffer_size];
__managed__ double4 buffer2[buffer_size];
#else
__managed__ double4_16a buffer1[buffer_size];
__managed__ double4_16a buffer2[buffer_size];
#endif

void reset_buffers() {
  for (int i = 0; i < buffer_size; i++) {
@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4

@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4

@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4

@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4

@ -106,7 +106,7 @@ dlrm,pass,0
doctr_det_predictor,pass,3
doctr_det_predictor,pass,4
@ -1427,13 +1427,23 @@ class AOTInductorModelCache:
inductor_configs = {}
if mode == "max-autotune":
    inductor_configs["max_autotune"] = True
ep = torch.export.export(
    model_clone,
    example_args,
    example_kwargs,
    dynamic_shapes=dynamic_shapes,
    strict=False,
)
# We can't support this in non-strict
if hasattr(model_clone, "name") and model.name == "levit_128":
    ep = torch.export.export(
        model_clone,
        example_args,
        example_kwargs,
        dynamic_shapes=dynamic_shapes,
        strict=True,
    )
else:
    ep = torch.export.export(
        model_clone,
        example_args,
        example_kwargs,
        dynamic_shapes=dynamic_shapes,
        strict=True,
    )
with torch.no_grad():
    package_path = torch._inductor.aoti_compile_and_package(
        ep, inductor_configs=inductor_configs
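The hunk above switches the benchmark's export call between strict and non-strict tracing before AOTI packaging. A minimal, self-contained sketch of that flow follows; the toy module, shapes, and config are illustrative and not part of the benchmark harness.

```python
import torch

class Toy(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x)

example_args = (torch.randn(4, 8),)
ep = torch.export.export(Toy(), example_args, strict=False)  # non-strict trace
with torch.no_grad():
    pkg = torch._inductor.aoti_compile_and_package(
        ep, inductor_configs={"max_autotune": True}
    )
compiled = torch._inductor.aoti_load_package(pkg)
out = compiled(*example_args)
```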
@ -2317,6 +2327,7 @@ class BenchmarkRunner:
# no need for n iterations
# the logic should be the same to self.model_iter_fn (forward_pass)
with self.autocast(**self.autocast_arg):
    model_copy.name = name
    optimized_model_iter_fn = optimize_ctx(
        model_copy, example_inputs
    )
@ -759,13 +759,6 @@ libtorch_cuda_distributed_extra_sources = [
    "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
]

libtorch_nvshmem_sources = [
    "torch/csrc/distributed/c10d/cuda/utils.cpp",
    "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp",
    "torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu",
    "torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu",
]

libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources

libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_sources + [
@ -61,14 +61,11 @@ void* get_symbol(const char* name, int version) {
}
#endif

  // As of CUDA 13, this API is deprecated.
#if defined(CUDA_VERSION) && (CUDA_VERSION < 13000)
  // This fallback to the old API to try getting the symbol again.
  if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres);
      st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
    return out;
  }
#endif

  // If the symbol cannot be resolved, report and return nullptr;
  // the caller is responsible for checking the pointer.
@ -540,9 +540,11 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
|
||||
${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
|
||||
)
|
||||
|
||||
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
|
||||
if(NOT WIN32)
|
||||
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
|
||||
if(USE_DISTRIBUTED)
|
||||
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
|
||||
if(NOT WIN32)
|
||||
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@ -566,30 +568,32 @@ if(USE_CUDA)
|
||||
list(APPEND Caffe2_GPU_SRCS
|
||||
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
|
||||
endif()
|
||||
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
|
||||
if(NOT WIN32)
|
||||
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
|
||||
set_source_files_properties(
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
|
||||
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
|
||||
)
|
||||
endif()
|
||||
if(USE_DISTRIBUTED)
|
||||
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
|
||||
if(NOT WIN32)
|
||||
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
|
||||
set_source_files_properties(
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
|
||||
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
|
||||
)
|
||||
endif()
|
||||
|
||||
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
|
||||
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
|
||||
endif()
|
||||
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
|
||||
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
|
||||
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
|
||||
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
|
||||
endif()
|
||||
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
|
||||
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
|
||||
endif()
|
||||
endif()
|
||||
set_source_files_properties(
|
||||
${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
|
||||
@ -622,9 +626,11 @@ if(USE_ROCM)
|
||||
list(APPEND Caffe2_HIP_SRCS
|
||||
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
|
||||
endif()
|
||||
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
|
||||
if(NOT WIN32)
|
||||
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
|
||||
if(USE_DISTRIBUTED)
|
||||
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
|
||||
if(NOT WIN32)
|
||||
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
|
||||
endif()
|
||||
endif()
|
||||
# caffe2_nvrtc's stubs to driver APIs are useful for HIP.
|
||||
# See NOTE [ ATen NVRTC Stub and HIP ]
|
||||
@ -1057,7 +1063,7 @@ elseif(USE_CUDA)
|
||||
UNFUSE_FMA # Addressing issue #121558
|
||||
)
|
||||
target_sources(torch_cuda PRIVATE $<TARGET_OBJECTS:flash_attention>)
|
||||
target_include_directories(torch_cuda SYSTEM PUBLIC
|
||||
target_include_directories(torch_cuda PUBLIC
|
||||
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc>
|
||||
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/flash-attention/include>
|
||||
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/cutlass/include>
|
||||
@ -1345,10 +1351,12 @@ if(BUILD_TEST)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
|
||||
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
|
||||
if(NOT WIN32)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
|
||||
if(USE_DISTRIBUTED)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
|
||||
if(NOT WIN32)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
|
||||
endif()
|
||||
endif()
|
||||
if(NOT NO_API)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api)
|
||||
@ -1453,40 +1461,46 @@ if(BUILD_LITE_INTERPRETER)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(USE_GLOO AND USE_C10D_GLOO)
|
||||
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
|
||||
endif()
|
||||
if(USE_UCC AND USE_C10D_UCC)
|
||||
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
|
||||
if(USE_CUDA)
|
||||
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
|
||||
|
||||
# Pass USE_DISTRIBUTED to torch_cpu, as some codes in jit/pickler.cpp and
|
||||
# jit/unpickler.cpp need to be compiled only when USE_DISTRIBUTED is set
|
||||
if(USE_DISTRIBUTED)
|
||||
target_compile_definitions(torch_cpu PUBLIC USE_DISTRIBUTED)
|
||||
if(USE_GLOO AND USE_C10D_GLOO)
|
||||
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
|
||||
endif()
|
||||
endif()
|
||||
if(USE_NCCL AND USE_C10D_NCCL)
|
||||
if(USE_ROCM)
|
||||
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
|
||||
else()
|
||||
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
|
||||
if(USE_UCC AND USE_C10D_UCC)
|
||||
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
|
||||
if(USE_CUDA)
|
||||
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if(USE_MPI AND USE_C10D_MPI)
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
|
||||
set_source_files_properties(
|
||||
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
|
||||
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
|
||||
if(USE_NCCL AND USE_C10D_NCCL)
|
||||
if(USE_ROCM)
|
||||
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
|
||||
else()
|
||||
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
|
||||
endif()
|
||||
endif()
|
||||
if(USE_MPI AND USE_C10D_MPI)
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
|
||||
set_source_files_properties(
|
||||
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
|
||||
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
|
||||
endif()
|
||||
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
|
||||
endif()
|
||||
# Pass USE_RPC in order to reduce use of
|
||||
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
|
||||
# need to be removed when RPC is supported
|
||||
if(NOT WIN32)
|
||||
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
|
||||
endif()
|
||||
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
|
||||
# can only be compiled with USE_TENSORPIPE is set.
|
||||
if(USE_TENSORPIPE)
|
||||
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
|
||||
endif()
|
||||
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
|
||||
endif()
|
||||
# Pass USE_RPC in order to reduce use of
|
||||
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
|
||||
# need to be removed when RPC is supported
|
||||
if(NOT WIN32)
|
||||
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
|
||||
endif()
|
||||
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
|
||||
# can only be compiled with USE_TENSORPIPE is set.
|
||||
if(USE_TENSORPIPE)
|
||||
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
|
||||
endif()
|
||||
|
||||
if(NOT INTERN_BUILD_MOBILE)
|
||||
|
||||
@ -1126,7 +1126,7 @@ if(USE_CUDA AND CUDA_VERSION VERSION_LESS 13.0)
  include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
endif()

if(USE_TENSORPIPE)
if(USE_DISTRIBUTED AND USE_TENSORPIPE)
  if(MSVC)
    message(WARNING "Tensorpipe cannot be used on Windows.")
  else()
@ -191,11 +191,13 @@ function(caffe2_print_configuration_summary)
  message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}")
  message(STATUS " USE_XNNPACK : ${USE_XNNPACK}")
  message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}")
  message(STATUS " USE_MPI : ${USE_MPI}")
  message(STATUS " USE_GLOO : ${USE_GLOO}")
  message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
  message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
  message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
  if(${USE_DISTRIBUTED})
    message(STATUS " USE_MPI : ${USE_MPI}")
    message(STATUS " USE_GLOO : ${USE_GLOO}")
    message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
    message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
    message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
  endif()
  if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
    message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}")
  endif()
@ -3331,6 +3331,13 @@ def coverage_post_process(app, exception):
    if not isinstance(app.builder, CoverageBuilder):
        return

    if not torch.distributed.is_available():
        raise RuntimeError(
            "The coverage tool cannot run with a version "
            "of PyTorch that was built with USE_DISTRIBUTED=0 "
            "as this module's API changes."
        )

    # These are all the modules that have "automodule" in an rst file
    # These modules are the ones for which coverage is checked
    # Here, we make sure that no module is missing from that list
@ -24,12 +24,17 @@ For Intel Client GPU
+-------------------------------------+----------------------------------------------------------------------------------------------------+
| Supported OS                        | Validated Hardware                                                                                 |
+=====================================+====================================================================================================+
|| Windows 11 & Ubuntu 24.04/25.04    || Intel® Arc A-Series Graphics (CodeName: Alchemist)                                                |
|| Windows 11 & Ubuntu 24.10          || Intel® Arc A-Series Graphics (CodeName: Alchemist)                                                |
||                                    || Intel® Arc B-Series Graphics (CodeName: Battlemage)                                               |
||                                    || Intel® Core™ Ultra Processors with Intel® Arc™ Graphics (CodeName: Meteor Lake-H)                 |
||                                    || Intel® Core™ Ultra Desktop Processors (Series 2) with Intel® Arc™ Graphics (CodeName: Lunar Lake) |
||                                    || Intel® Core™ Ultra Mobile Processors (Series 2) with Intel® Arc™ Graphics (CodeName: Arrow Lake-H)|
+-------------------------------------+----------------------------------------------------------------------------------------------------+
|| Ubuntu 24.04 & WSL2 (Ubuntu 24.04) || Intel® Arc A-Series Graphics (CodeName: Alchemist)                                                 |
||                                    || Intel® Core™ Ultra Processors with Intel® Arc™ Graphics (CodeName: Meteor Lake-H)                 |
||                                    || Intel® Core™ Ultra Desktop Processors (Series 2) with Intel® Arc™ Graphics (CodeName: Lunar Lake) |
||                                    || Intel® Core™ Ultra Mobile Processors (Series 2) with Intel® Arc™ Graphics (CodeName: Arrow Lake-H)|
+-------------------------------------+----------------------------------------------------------------------------------------------------+

Intel GPUs support (Prototype) is ready from PyTorch* 2.5 for Intel® Client GPUs and Intel® Data Center GPU Max Series on both Linux and Windows, which brings Intel GPUs and the SYCL* software stack into the official PyTorch stack with consistent user experience to embrace more AI application scenarios.
@ -125,10 +125,6 @@ deterministic implementation will be used::
        [[ 0.1509, 1.8027],
         [ 0.0333, -1.1444]]], device='cuda:0')

Furthermore, if you are using CUDA tensors, and your CUDA version is 10.2 or greater, you
should set the environment variable `CUBLAS_WORKSPACE_CONFIG` according to CUDA documentation:
`<https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility>`_

CUDA convolution determinism
----------------------------
While disabling CUDA convolution benchmarking (discussed above) ensures that
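The paragraph removed above pointed at the cuBLAS workspace setting. For context, a hedged sketch of how that recommendation is typically applied; the workspace value is taken from the cuBLAS reproducibility notes, not from this diff, and the toy matmul is illustrative.

```python
import os
# Must be set before CUDA/cuBLAS is initialized in the process.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

import torch
torch.use_deterministic_algorithms(True)
device = "cuda" if torch.cuda.is_available() else "cpu"
a = torch.randn(64, 64, device=device)
print((a @ a).sum())
```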
@ -32,7 +32,7 @@ torch.cuda.memory._dump_snapshot("my_snapshot.pickle")

## Using the visualizer

Open <https://pytorch.org/memory_viz> and drag/drop the pickled snapshot file into the visualizer.
Open [pytorch.org/memory_viz](https://pytorch.org/memory_viz) and drag/drop the pickled snapshot file into the visualizer.
The visualizer is a javascript application that runs locally on your computer. It does not upload any snapshot data.
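A short, hedged sketch of producing a snapshot like the one referenced above; the workload and the entry limit are illustrative and assume a CUDA-enabled build.

```python
import torch

torch.cuda.memory._record_memory_history(max_entries=100000)
x = torch.randn(1024, 1024, device="cuda")
y = x @ x
torch.cuda.memory._dump_snapshot("my_snapshot.pickle")
torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
```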
@ -1,4 +1,4 @@
if(NOT WIN32)
if(USE_DISTRIBUTED AND NOT WIN32)
  set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd")
  set(DIST_AUTOGRAD_TEST_SOURCES
    ${TORCH_ROOT}/test/cpp/common/main.cpp
@ -15,8 +15,6 @@ set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
set(CMAKE_CXX_VISIBILITY_PRESET hidden)

set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)

if(APPLE)
  set(CMAKE_INSTALL_RPATH "@loader_path/lib;@loader_path")
elseif(UNIX)

@ -36,7 +34,6 @@ else()
  message(FATAL_ERROR "Cannot find Python directory")
endif()

include_directories(${CMAKE_CURRENT_SOURCE_DIR})
include(${PROJECT_SOURCE_DIR}/cmake/TorchPythonTargets.cmake)

add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/openreg)
@ -164,13 +164,13 @@ at::Tensor view(const at::Tensor& self, c10::SymIntArrayRef size) {

// LITERALINCLUDE START: FALLBACK IMPL
void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
  static const std::unordered_set<c10::OperatorName> cpu_fallback_blocklist = {
  static const std::unordered_set<c10::OperatorName> cpu_fallback_blacklist = {
      c10::OperatorName("aten::abs", ""),
      c10::OperatorName("aten::abs", "out"),
  };

  const auto& op_name = op.schema().operator_name();
  if (cpu_fallback_blocklist.count(op_name)) {
  if (cpu_fallback_blacklist.count(op_name)) {
    TORCH_CHECK(
        false,
        "Operator '",
@ -1,146 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <include/openreg.h>
|
||||
|
||||
#include "OpenRegException.h"
|
||||
#include "OpenRegStream.h"
|
||||
|
||||
namespace c10::openreg {
|
||||
|
||||
struct OpenRegEvent {
|
||||
OpenRegEvent(bool enable_timing) noexcept : enable_timing_{enable_timing} {}
|
||||
|
||||
~OpenRegEvent() {
|
||||
if (is_created_) {
|
||||
OPENREG_CHECK(orEventDestroy(event_));
|
||||
}
|
||||
}
|
||||
|
||||
OpenRegEvent(const OpenRegEvent&) = delete;
|
||||
OpenRegEvent& operator=(const OpenRegEvent&) = delete;
|
||||
|
||||
OpenRegEvent(OpenRegEvent&& other) noexcept {
|
||||
moveHelper(std::move(other));
|
||||
}
|
||||
OpenRegEvent& operator=(OpenRegEvent&& other) noexcept {
|
||||
if (this != &other) {
|
||||
moveHelper(std::move(other));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
operator orEvent_t() const {
|
||||
return event();
|
||||
}
|
||||
|
||||
std::optional<at::Device> device() const {
|
||||
if (is_created_) {
|
||||
return at::Device(at::kPrivateUse1, device_index_);
|
||||
} else {
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
bool isCreated() const {
|
||||
return is_created_;
|
||||
}
|
||||
|
||||
DeviceIndex device_index() const {
|
||||
return device_index_;
|
||||
}
|
||||
|
||||
orEvent_t event() const {
|
||||
return event_;
|
||||
}
|
||||
|
||||
bool query() const {
|
||||
if (!is_created_) {
|
||||
return true;
|
||||
}
|
||||
|
||||
orError_t err = orEventQuery(event_);
|
||||
if (err == orSuccess) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void record() {
|
||||
record(getCurrentOpenRegStream());
|
||||
}
|
||||
|
||||
void recordOnce(const OpenRegStream& stream) {
|
||||
if (!was_recorded_)
|
||||
record(stream);
|
||||
}
|
||||
|
||||
void record(const OpenRegStream& stream) {
|
||||
if (!is_created_) {
|
||||
createEvent(stream.device_index());
|
||||
}
|
||||
|
||||
TORCH_CHECK(
|
||||
device_index_ == stream.device_index(),
|
||||
"Event device ",
|
||||
device_index_,
|
||||
" does not match recording stream's device ",
|
||||
stream.device_index(),
|
||||
".");
|
||||
|
||||
OPENREG_CHECK(orEventRecord(event_, stream));
|
||||
was_recorded_ = true;
|
||||
}
|
||||
|
||||
void block(const OpenRegStream& stream) {
|
||||
if (is_created_) {
|
||||
OPENREG_CHECK(orStreamWaitEvent(stream, event_, 0));
|
||||
}
|
||||
}
|
||||
|
||||
float elapsed_time(const OpenRegEvent& other) const {
|
||||
TORCH_CHECK_VALUE(
|
||||
!(enable_timing_ & orEventDisableTiming) &&
|
||||
!(other.enable_timing_ & orEventDisableTiming),
|
||||
"Both events must be created with argument 'enable_timing=True'.");
|
||||
TORCH_CHECK_VALUE(
|
||||
is_created_ && other.isCreated(),
|
||||
"Both events must be recorded before calculating elapsed time.");
|
||||
TORCH_CHECK(
|
||||
query() && other.query(),
|
||||
"Both events must be completed before calculating elapsed time.");
|
||||
|
||||
float time_ms = 0;
|
||||
OPENREG_CHECK(orEventElapsedTime(&time_ms, event_, other.event_));
|
||||
return time_ms;
|
||||
}
|
||||
|
||||
void synchronize() const {
|
||||
if (is_created_) {
|
||||
OPENREG_CHECK(orEventSynchronize(event_));
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
unsigned int enable_timing_{orEventDisableTiming};
|
||||
bool is_created_{false};
|
||||
bool was_recorded_{false};
|
||||
DeviceIndex device_index_{-1};
|
||||
orEvent_t event_{};
|
||||
|
||||
void createEvent(DeviceIndex device_index) {
|
||||
device_index_ = device_index;
|
||||
OPENREG_CHECK(orEventCreateWithFlags(&event_, enable_timing_));
|
||||
is_created_ = true;
|
||||
}
|
||||
|
||||
void moveHelper(OpenRegEvent&& other) {
|
||||
std::swap(enable_timing_, other.enable_timing_);
|
||||
std::swap(is_created_, other.is_created_);
|
||||
std::swap(was_recorded_, other.was_recorded_);
|
||||
std::swap(device_index_, other.device_index_);
|
||||
std::swap(event_, other.event_);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace c10::openreg
|
||||
@ -1,9 +0,0 @@
|
||||
#include "OpenRegException.h"
|
||||
|
||||
void orCheckFail(
|
||||
const char* func,
|
||||
const char* file,
|
||||
uint32_t line,
|
||||
const char* msg) {
|
||||
throw ::c10::Error({func, file, line}, msg);
|
||||
}
|
||||
@ -1,20 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <include/openreg.h>
|
||||
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
void orCheckFail(
|
||||
const char* func,
|
||||
const char* file,
|
||||
uint32_t line,
|
||||
const char* msg = "");
|
||||
|
||||
#define OPENREG_CHECK(EXPR, ...) \
|
||||
do { \
|
||||
const orError_t __err = EXPR; \
|
||||
if (__err != orSuccess) { \
|
||||
orCheckFail( \
|
||||
__func__, __FILE__, static_cast<uint32_t>(__LINE__), ##__VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
@ -1,6 +1,5 @@
|
||||
#include <include/openreg.h>
|
||||
|
||||
#include "OpenRegException.h"
|
||||
#include "OpenRegFunctions.h"
|
||||
|
||||
namespace c10::openreg {
|
||||
|
||||
@ -1,10 +1,14 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef _WIN32
|
||||
#define OPENREG_EXPORT __declspec(dllexport)
|
||||
#else
|
||||
#define OPENREG_EXPORT __attribute__((visibility("default")))
|
||||
#endif
|
||||
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
|
||||
#include <include/Macros.h>
|
||||
|
||||
#include <limits>
|
||||
|
||||
namespace c10::openreg {
|
||||
|
||||
@ -1,253 +0,0 @@
|
||||
#include "OpenRegStream.h"
|
||||
|
||||
#include <c10/util/CallOnce.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <deque>
|
||||
|
||||
namespace c10::openreg {
|
||||
|
||||
namespace {
|
||||
|
||||
// Global stream state and constants
|
||||
static c10::once_flag init_flag;
|
||||
|
||||
static DeviceIndex num_devices = -1;
|
||||
static constexpr int kStreamsPerPoolBits = 5;
|
||||
static constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
|
||||
static constexpr int kStreamTypeBits = 2;
|
||||
|
||||
/*
|
||||
* The stream pools are lazily initialized when the first queue is requested
|
||||
* for a device. The device flags track the initialization of each device. When
|
||||
* a queue is requested, the next queue in the pool to be returned in a
|
||||
* round-robin fashion, see Note [Stream Management].
|
||||
*/
|
||||
static std::deque<c10::once_flag> device_flags;
|
||||
static std::vector<std::array<
|
||||
std::array<orStream_t, kStreamsPerPool>,
|
||||
c10::openreg::max_compile_time_stream_priorities>>
|
||||
streams;
|
||||
static std::deque<
|
||||
std::array<std::atomic<uint32_t>, max_compile_time_stream_priorities>>
|
||||
priority_counters;
|
||||
|
||||
static thread_local std::unique_ptr<StreamId[]> current_streams = nullptr;
|
||||
|
||||
/*
|
||||
* Note [StreamId assignment]
|
||||
* ~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
* How do we assign stream IDs?
|
||||
*
|
||||
* -- 56 bits -- -- 5 bits -- -- 2 bits -- -- 1 bit --
|
||||
* zeros StreamIdIndex StreamIdType Ext/native stream
|
||||
* ignored for ext ignored for ext
|
||||
*
|
||||
* Where StreamIdType:
|
||||
* 00 = default stream
|
||||
* 01 = normal stream
|
||||
* 11 = external stream
|
||||
*
|
||||
* For external stream, StreamID is a orStream_t pointer. This means that last
|
||||
* bit will always be 0. So when constructing StreamId for a native stream we
|
||||
* set last bit to 1 to distinguish between native and external streams.
|
||||
*
|
||||
* StreamId is 64-bit, so we can just rely on regular promotion rules.
|
||||
* We rely on StreamIdIndex and StreamIdType being non-negative;
|
||||
*/
|
||||
using StreamIdIndex = uint8_t;
|
||||
enum class StreamIdType : uint8_t {
|
||||
DEFAULT = 0x0,
|
||||
NORMAL = 0x1,
|
||||
EXT = 0x3,
|
||||
};
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& stream, StreamIdType s) {
|
||||
switch (s) {
|
||||
case StreamIdType::DEFAULT:
|
||||
return stream << "DEFAULT";
|
||||
case StreamIdType::NORMAL:
|
||||
return stream << "NORMAL";
|
||||
case StreamIdType::EXT:
|
||||
return stream << "EXT";
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return stream << static_cast<int16_t>(s);
|
||||
}
|
||||
|
||||
static inline StreamIdType streamIdType(StreamId s) {
|
||||
// Externally allocated streams have their id being the orStream_ptr
|
||||
// so the last bit will be 0
|
||||
if (!(s & 1)) {
|
||||
return StreamIdType(StreamIdType::EXT);
|
||||
}
|
||||
|
||||
int mask_for_type = (1 << kStreamTypeBits) - 1;
|
||||
auto st = static_cast<StreamIdType>((s >> 1) & mask_for_type);
|
||||
TORCH_CHECK(
|
||||
st == StreamIdType::DEFAULT || st == StreamIdType::NORMAL,
|
||||
"invalid StreamId: ",
|
||||
s);
|
||||
return st;
|
||||
}
|
||||
|
||||
static inline size_t streamIdIndex(StreamId s) {
|
||||
return static_cast<size_t>(
|
||||
(s >> (kStreamTypeBits + 1)) & ((1 << kStreamsPerPoolBits) - 1));
|
||||
}
|
||||
|
||||
StreamId makeStreamId(StreamIdType st, size_t si) {
|
||||
if (st == StreamIdType::EXT) {
|
||||
return static_cast<StreamId>(0);
|
||||
}
|
||||
|
||||
return (static_cast<StreamId>(si) << (kStreamTypeBits + 1)) |
|
||||
(static_cast<StreamId>(st) << 1) | 1;
|
||||
}
|
||||
|
||||
static void initGlobalStreamState() {
|
||||
num_devices = device_count();
|
||||
device_flags.resize(num_devices);
|
||||
streams.resize(num_devices);
|
||||
priority_counters.resize(num_devices);
|
||||
}
|
||||
|
||||
static void initSingleDeviceStream(
|
||||
int priority,
|
||||
DeviceIndex device_index,
|
||||
int i) {
|
||||
auto& stream = streams[device_index][priority][i];
|
||||
|
||||
OPENREG_CHECK(orStreamCreateWithPriority(&stream, 0, priority));
|
||||
priority_counters[device_index][priority] = 0;
|
||||
}
|
||||
|
||||
// Creates stream pools for the specified device. It should be call only once.
|
||||
static void initDeviceStreamState(DeviceIndex device_index) {
|
||||
for (const auto i : c10::irange(kStreamsPerPool)) {
|
||||
for (const auto p : c10::irange(max_compile_time_stream_priorities)) {
|
||||
initSingleDeviceStream(p, device_index, i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void initOpenRegStreamsOnce() {
|
||||
c10::call_once(init_flag, initGlobalStreamState);
|
||||
|
||||
if (current_streams) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Inits current streams (thread local) to the last queue in the "normal
|
||||
// priority" queue pool. Note: the queue pool have not been initialized yet.
|
||||
// It will be initialized in initDeviceStreamState for the specified device.
|
||||
current_streams = std::make_unique<StreamId[]>(num_devices);
|
||||
for (const auto i : c10::irange(num_devices)) {
|
||||
current_streams[i] = makeStreamId(StreamIdType::DEFAULT, 0);
|
||||
}
|
||||
}
|
||||
|
||||
static uint32_t get_idx(std::atomic<uint32_t>& counter) {
|
||||
auto raw_idx = counter++;
|
||||
return raw_idx % kStreamsPerPool;
|
||||
}
|
||||
|
||||
OpenRegStream OpenRegStreamForId(DeviceIndex device_index, StreamId stream_id) {
|
||||
return OpenRegStream(
|
||||
OpenRegStream::UNCHECKED,
|
||||
Stream(
|
||||
Stream::UNSAFE,
|
||||
c10::Device(DeviceType::PrivateUse1, device_index),
|
||||
stream_id));
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
// See Note [StreamId assignment]
|
||||
orStream_t OpenRegStream::stream() const {
|
||||
c10::DeviceIndex device_index = stream_.device_index();
|
||||
StreamId stream_id = stream_.id();
|
||||
StreamIdType st = streamIdType(stream_id);
|
||||
size_t si = streamIdIndex(stream_id);
|
||||
switch (st) {
|
||||
// The index 0 stream is default as well.
|
||||
case StreamIdType::DEFAULT:
|
||||
case StreamIdType::NORMAL:
|
||||
return streams[device_index][static_cast<uint8_t>(st)][si];
|
||||
case StreamIdType::EXT:
|
||||
return reinterpret_cast<orStream_t>(stream_id);
|
||||
default:
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Unrecognized stream ",
|
||||
stream_,
|
||||
" (I didn't recognize the stream type, ",
|
||||
st,
|
||||
").",
|
||||
" Did you manufacture the StreamId yourself? Don't do that;");
|
||||
}
|
||||
}
|
||||
|
||||
// Returns a stream from the requested pool
|
||||
// Note: when called the first time on a device, this will create the
|
||||
// stream pools for that device.
|
||||
OpenRegStream getStreamFromPool(const int priority, DeviceIndex device_index) {
|
||||
initOpenRegStreamsOnce();
|
||||
if (device_index == -1) {
|
||||
device_index = current_device();
|
||||
}
|
||||
c10::call_once(
|
||||
device_flags[device_index], initDeviceStreamState, device_index);
|
||||
auto pri_idx =
|
||||
std::clamp(priority, 0, max_compile_time_stream_priorities - 1);
|
||||
const auto idx = get_idx(priority_counters[device_index][pri_idx]);
|
||||
auto id_type = static_cast<StreamIdType>(pri_idx);
|
||||
return OpenRegStreamForId(device_index, makeStreamId(id_type, idx));
|
||||
}
|
||||
|
||||
OpenRegStream getStreamFromPool(const bool isHighPriority, DeviceIndex device) {
|
||||
initOpenRegStreamsOnce();
|
||||
int priority = 0;
|
||||
return getStreamFromPool(priority, device);
|
||||
}
|
||||
|
||||
OpenRegStream getStreamFromExternal(
|
||||
orStream_t ext_stream,
|
||||
DeviceIndex device_index) {
|
||||
return OpenRegStreamForId(
|
||||
device_index, reinterpret_cast<int64_t>(ext_stream));
|
||||
}
|
||||
|
||||
OpenRegStream getDefaultOpenRegStream(DeviceIndex device_index) {
|
||||
initOpenRegStreamsOnce();
|
||||
if (device_index == -1) {
|
||||
device_index = current_device();
|
||||
}
|
||||
return OpenRegStreamForId(
|
||||
device_index, makeStreamId(StreamIdType::DEFAULT, 0));
|
||||
}
|
||||
|
||||
OpenRegStream getCurrentOpenRegStream(DeviceIndex device_index) {
|
||||
initOpenRegStreamsOnce();
|
||||
if (device_index == -1) {
|
||||
device_index = current_device();
|
||||
}
|
||||
return OpenRegStreamForId(device_index, current_streams[device_index]);
|
||||
}
|
||||
|
||||
void setCurrentOpenRegStream(OpenRegStream stream) {
|
||||
initOpenRegStreamsOnce();
|
||||
current_streams[stream.device_index()] = stream.id();
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& stream, const OpenRegStream& s) {
|
||||
return stream << s.unwrap();
|
||||
}
|
||||
|
||||
} // namespace c10::openreg
|
||||
@ -1,162 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <include/openreg.h>
|
||||
|
||||
#include "OpenRegException.h"
|
||||
#include "OpenRegFunctions.h"
|
||||
|
||||
#include <c10/core/DeviceGuard.h>
|
||||
#include <c10/core/Stream.h>
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
namespace c10::openreg {
|
||||
|
||||
static constexpr int max_compile_time_stream_priorities = 1;
|
||||
|
||||
class OpenRegStream {
|
||||
public:
|
||||
enum Unchecked { UNCHECKED };
|
||||
|
||||
explicit OpenRegStream(Stream stream) : stream_(stream) {
|
||||
TORCH_CHECK(stream_.device_type() == DeviceType::PrivateUse1);
|
||||
}
|
||||
|
||||
explicit OpenRegStream(Unchecked, Stream stream) : stream_(stream) {}
|
||||
|
||||
bool operator==(const OpenRegStream& other) const noexcept {
|
||||
return unwrap() == other.unwrap();
|
||||
}
|
||||
|
||||
bool operator!=(const OpenRegStream& other) const noexcept {
|
||||
return unwrap() != other.unwrap();
|
||||
}
|
||||
|
||||
operator orStream_t() const {
|
||||
return stream();
|
||||
}
|
||||
|
||||
operator Stream() const {
|
||||
return unwrap();
|
||||
}
|
||||
|
||||
DeviceType device_type() const {
|
||||
return DeviceType::PrivateUse1;
|
||||
}
|
||||
|
||||
DeviceIndex device_index() const {
|
||||
return stream_.device_index();
|
||||
}
|
||||
|
||||
Device device() const {
|
||||
return Device(DeviceType::PrivateUse1, device_index());
|
||||
}
|
||||
|
||||
StreamId id() const {
|
||||
return stream_.id();
|
||||
}
|
||||
|
||||
bool query() const {
|
||||
DeviceGuard guard{stream_.device()};
|
||||
|
||||
if (orStreamQuery(stream()) == orSuccess) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void synchronize() const {
|
||||
DeviceGuard guard{stream_.device()};
|
||||
OPENREG_CHECK(orStreamSynchronize(stream()));
|
||||
}
|
||||
|
||||
int priority() const {
|
||||
DeviceGuard guard{stream_.device()};
|
||||
int priority = 0;
|
||||
OPENREG_CHECK(orStreamGetPriority(stream(), &priority));
|
||||
return priority;
|
||||
}
|
||||
|
||||
orStream_t stream() const;
|
||||
|
||||
Stream unwrap() const {
|
||||
return stream_;
|
||||
}
|
||||
|
||||
struct c10::StreamData3 pack3() const {
|
||||
return stream_.pack3();
|
||||
}
|
||||
|
||||
static OpenRegStream unpack3(
|
||||
StreamId stream_id,
|
||||
DeviceIndex device_index,
|
||||
DeviceType device_type) {
|
||||
return OpenRegStream(Stream::unpack3(stream_id, device_index, device_type));
|
||||
}
|
||||
|
||||
private:
|
||||
Stream stream_;
|
||||
};
|
||||
|
||||
/*
|
||||
* Get a stream from the pool in a round-robin fashion.
|
||||
*
|
||||
* You can request a stream from the highest priority pool by setting
|
||||
* isHighPriority to true for a specific device.
|
||||
*/
|
||||
OPENREG_EXPORT OpenRegStream
|
||||
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
|
||||
|
||||
/*
|
||||
* Get a stream from the pool in a round-robin fashion.
|
||||
*
|
||||
* You can request a stream by setting a priority value for a specific device.
|
||||
* The lower the priority number, the higher the priority.
*/
|
||||
OPENREG_EXPORT OpenRegStream
|
||||
getStreamFromPool(const int priority, DeviceIndex device = -1);
|
||||
|
||||
/*
|
||||
* Get an OpenRegStream from an externally allocated one.
*
|
||||
* This is mainly for interoperability with different libraries where we
|
||||
* want to operate on a non-torch allocated stream for data exchange or similar
|
||||
* purposes
|
||||
*/
|
||||
OPENREG_EXPORT OpenRegStream
|
||||
getStreamFromExternal(orStream_t ext_stream, DeviceIndex device_index);
|
||||
|
||||
/*
|
||||
* Get the default OpenReg stream, for the passed OpenReg device, or for the
|
||||
* current device if no device index is passed.
|
||||
*/
|
||||
OPENREG_EXPORT OpenRegStream
|
||||
getDefaultOpenRegStream(DeviceIndex device_index = -1);
|
||||
|
||||
/*
|
||||
* Get the current OpenReg stream, for the passed OpenReg device, or for the
|
||||
* current device if no device index is passed.
|
||||
*/
|
||||
OPENREG_EXPORT OpenRegStream
|
||||
getCurrentOpenRegStream(DeviceIndex device_index = -1);
|
||||
|
||||
/*
|
||||
* Set the current stream on the device of the passed in stream to be the passed
|
||||
* in stream.
|
||||
*/
|
||||
OPENREG_EXPORT void setCurrentOpenRegStream(OpenRegStream stream);
|
||||
|
||||
OPENREG_EXPORT std::ostream& operator<<(
|
||||
std::ostream& stream,
|
||||
const OpenRegStream& s);
|
||||
|
||||
} // namespace c10::openreg
|
||||
|
||||
namespace std {
|
||||
template <>
|
||||
struct hash<c10::openreg::OpenRegStream> {
|
||||
size_t operator()(c10::openreg::OpenRegStream s) const noexcept {
|
||||
return std::hash<c10::Stream>{}(s.unwrap());
|
||||
}
|
||||
};
|
||||
} // namespace std
|
||||
@ -1,7 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef _WIN32
|
||||
#define OPENREG_EXPORT __declspec(dllexport)
|
||||
#else
|
||||
#define OPENREG_EXPORT __attribute__((visibility("default")))
|
||||
#endif
|
||||
@ -53,7 +53,7 @@ def build_deps():
|
||||
".",
|
||||
"--target",
|
||||
"install",
|
||||
"--config", # For multi-config generators
|
||||
"--config",
|
||||
"Release",
|
||||
"--",
|
||||
]
|
||||
|
||||
@ -1,14 +1,7 @@
|
||||
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
|
||||
|
||||
project(TORCH_OPENREG CXX C)
|
||||
|
||||
option(USE_TEST "Build and run unit tests" ON)
|
||||
|
||||
set(LIBRARY_NAME openreg)
|
||||
set(LIBRARY_TEST ortests)
|
||||
|
||||
file(GLOB_RECURSE SOURCE_FILES
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/*.cpp"
|
||||
)
|
||||
|
||||
add_library(${LIBRARY_NAME} SHARED ${SOURCE_FILES})
|
||||
@ -20,26 +13,3 @@ install(TARGETS ${LIBRARY_NAME}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
)
|
||||
|
||||
if(USE_TEST)
|
||||
enable_testing()
|
||||
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/cmake/GTestTargets.cmake)
|
||||
|
||||
file(GLOB_RECURSE TEST_FILES
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/tests/*.cpp"
|
||||
)
|
||||
|
||||
add_executable(${LIBRARY_TEST} ${TEST_FILES})
|
||||
target_link_libraries(${LIBRARY_TEST}
|
||||
PRIVATE
|
||||
${LIBRARY_NAME}
|
||||
gtest
|
||||
gtest_main
|
||||
)
|
||||
|
||||
add_test(NAME alltests COMMAND ${LIBRARY_TEST})
|
||||
add_custom_command(TARGET ${LIBRARY_TEST}
|
||||
POST_BUILD
|
||||
COMMAND ${CMAKE_CTEST_COMMAND} -C Release --output-on-failure --verbose)
|
||||
endif()
|
||||
|
||||
@ -4,9 +4,9 @@
|
||||
|
||||
OpenReg is a C++ backend library that simulates the behavior of a CUDA-like device on a CPU. Its core objective is **not to accelerate computation or improve performance**, but rather to **simulate modern CUDA programming, enabling developers to prototype and test in an environment without actual GPU hardware**. The current design principles are as follows:
|
||||
|
||||
* **API Consistency**: Provide an interface consistent with the CUDA Runtime API, allowing upper-level applications (like PyTorch's `PrivateUse1` backend) to switch and test seamlessly.
|
||||
* **API Consistency**: Provide an interface consistent with the CUDA Runtime API, allowing upper-level applications (like PyTorch's PrivateUse1 backend) to switch and test seamlessly.
|
||||
* **Functional Consistency**: Provide behavior consistent with the CUDA Runtime, such as memory isolation, device context management, etc.
|
||||
* **Completeness**: Aim to support `PrivateUse1` device integration and safeguard the third-party device integration mechanism, without striving to cover all capabilities of the CUDA Runtime.
|
||||
* **Completeness**: Aim to support PrivateUse1 device integration and safeguard the third-party device integration mechanism, without striving to cover all capabilities of the CUDA Runtime.
|
||||
|
||||
## Directory Structure
|
||||
|
||||
@ -14,34 +14,19 @@ The project's code is organized with a clear structure and separation of respons
|
||||
|
||||
```text
|
||||
openreg/
|
||||
├── README.md # Comprehensive introduction of OpenReg.
|
||||
├── CMakeLists.txt # Top-level CMake build script, used to compile and generate libopenreg.so
|
||||
├── cmake/
|
||||
│ └── GTestTargets.cmake # Utils of fetching GoogleTest.
|
||||
├── CMakeLists.txt # Top-level CMake build script, used to compile and generate libopenreg.so
|
||||
├── include/
|
||||
│ ├── openreg.h # Public API header file, external users only need to include this file
|
||||
│ └── openreg.inl # Public API header file, as an extension of openreg.h, cannot be included separately.
|
||||
├── example/
|
||||
│ └── example.cpp # Example for OpenReg.
|
||||
├── tests/
|
||||
│ ├── event_tests.cpp # Testcases about OpenReg Event.
|
||||
│ ├── stream_tests.cpp # Testcases about OpenReg Stream.
|
||||
│ ├── device_tests.cpp # Testcases about OpenReg Device.
|
||||
│ └── memory_tests.cpp # Testcases about OpenReg Memory.
|
||||
│ └── openreg.h # Public API header file, external users only need to include this file
|
||||
└── csrc/
|
||||
├── device.cpp # Implementation of device management APIs
|
||||
├── memory.cpp # Implementation of memory management APIs
|
||||
└── stream.cpp # Implementation of stream and event APIs.
|
||||
├── device.cpp # Implementation of device management-related APIs
|
||||
└── memory.cpp # Implementation of APIs for memory management, copying, and protection
|
||||
```
|
||||
|
||||
* `CMakeLists.txt`: Responsible for compiling and linking all source files under the `csrc/` directory to generate the final `libopenreg.so` shared library.
|
||||
* `include`: Defines all externally exposed APIs, data structures, and enums.
|
||||
* `openreg.h`: Defines all externally exposed C-style APIs.
|
||||
* `openreg.inl`: Defines all externally exposed C++ APIs.
|
||||
* `include/openreg.h`: Defines all externally exposed C-style APIs, data structures, and enums. It is the "public face" of this library.
|
||||
* `csrc/`: Contains the C++ implementation source code for all core functionalities.
|
||||
* `device.cpp`: Implements the core functions of device management: device discovery and context management.
|
||||
* `memory.cpp`: Implements the core functions of memory management: allocation, freeing, copying, and memory protection.
* `stream.cpp`: Implements the core functions of streams and events: creation, destruction, recording, synchronization, and so on.
* `device.cpp`: Implements device discovery (`orGetDeviceCount`) and thread context management (`orSetDevice`/`orGetDevice`).
|
||||
* `memory.cpp`: Implements the core functions of memory allocation (`orMalloc`/`orMallocHost`), deallocation, copying, and memory protection (`orMemoryProtect`, `orMemoryUnprotect`).
|
||||
* `CMakeLists.txt`: Responsible for compiling and linking all source files under the `csrc/` directory to generate the final `libopenreg.so` shared library.
|
||||
|
||||
## Implemented APIs
|
||||
|
||||
@ -49,49 +34,25 @@ OpenReg currently provides a set of APIs covering basic memory and device manage
|
||||
|
||||
### Device Management APIs
|
||||
|
||||
| OpenReg | CUDA | Feature Description |
|
||||
| :------------------------------- | :--------------------------------- | :--------------------------------- |
|
||||
| `orGetDeviceCount` | `cudaGetDeviceCount` | Get the number of available GPUs |
|
||||
| `orSetDevice` | `cudaSetDevice` | Set the active GPU |
|
||||
| `orGetDevice` | `cudaGetDevice` | Get the current GPU |
|
||||
| `orDeviceSynchronize` | `cudaDeviceSynchronize` | Wait for all GPU tasks to finish |
|
||||
| `orDeviceGetStreamPriorityRange` | `cudaDeviceGetStreamPriorityRange` | Get the range of stream priorities |
|
||||
| OpenReg | CUDA | Feature Description |
|
||||
| :------------------- | :------------------- | :------------------------------------------------ |
|
||||
| `orGetDeviceCount` | `cudaGetDeviceCount` | Get the number of devices |
|
||||
| `orSetDevice` | `cudaSetDevice` | Set the current device for the current thread |
|
||||
| `orGetDevice` | `cudaGetDevice` | Get the current device for the current thread |
|
||||
|
||||
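
As a quick orientation, the following minimal snippet exercises the device-management entry points listed above (illustrative only: the include path and the lack of error checking are assumptions, not part of OpenReg's shipped examples):

```cpp
#include "openreg.h"

#include <cstdio>

int main() {
  int count = 0;
  int device = -1;

  orGetDeviceCount(&count);   // reports the number of simulated devices
  orSetDevice(count - 1);     // make the last device current for this thread
  orGetDevice(&device);
  std::printf("devices=%d, current=%d\n", count, device);

  orDeviceSynchronize();      // wait for all streams on the current device
  return 0;
}
```
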
### Memory Management APIs
|
||||
|
||||
| OpenReg | CUDA | Feature Description |
|
||||
| :----------------------- | :------------------------- | :---------------------------------------- |
|
||||
| `orMalloc` | `cudaMalloc` | Allocate device memory |
|
||||
| `orFree` | `cudaFree` | Free device memory |
|
||||
| `orMallocHost` | `cudaMallocHost` | Allocate page-locked (Pinned) host memory |
|
||||
| `orFreeHost` | `cudaFreeHost` | Free page-locked host memory |
|
||||
| `orMemcpy` | `cudaMemcpy` | Synchronous memory copy |
|
||||
| `orMemcpyAsync` | `cudaMemcpyAsync` | Asynchronous memory copy |
| `orPointerGetAttributes` | `cudaPointerGetAttributes` | Get pointer attributes |
|
||||
|
||||
### Stream APIs
|
||||
|
||||
| OpenReg | CUDA | Feature Description |
|
||||
| :--------------------------- | :----------------------------- | :------------------------------------- |
|
||||
| `orStreamCreate` | `cudaStreamCreate` | Create a default-priority stream |
|
||||
| `orStreamCreateWithPriority` | `cudaStreamCreateWithPriority` | Create a stream with a given priority |
|
||||
| `orStreamDestroy` | `cudaStreamDestroy` | Destroy a stream |
|
||||
| `orStreamQuery` | `cudaStreamQuery` | Check if a stream has completed |
|
||||
| `orStreamSynchronize` | `cudaStreamSynchronize` | Wait for a stream to complete |
|
||||
| `orStreamWaitEvent` | `cudaStreamWaitEvent` | Make a stream wait for an event |
|
||||
| `orStreamGetPriority` | `cudaStreamGetPriority` | Get a stream’s priority |
|
||||
|
||||
### Event APIs
|
||||
|
||||
| OpenReg | CUDA | Feature Description |
|
||||
| :----------------------- | :------------------------- | :---------------------------------- |
|
||||
| `orEventCreate` | `cudaEventCreate` | Create an event with default flag |
|
||||
| `orEventCreateWithFlags` | `cudaEventCreateWithFlags` | Create an event with specific flag |
|
||||
| `orEventDestroy` | `cudaEventDestroy` | Destroy an event |
|
||||
| `orEventRecord` | `cudaEventRecord` | Record an event in a stream |
|
||||
| `orEventSynchronize` | `cudaEventSynchronize` | Wait for an event to complete |
|
||||
| `orEventQuery` | `cudaEventQuery` | Check if an event has completed |
|
||||
| `orEventElapsedTime` | `cudaEventElapsedTime` | Get time elapsed between two events |
|
||||
| OpenReg | CUDA | Feature Description |
|
||||
| :----------------------- | :--------------------------- | :----------------------------------------- |
|
||||
| `orMalloc` | `cudaMalloc` | Allocate device memory |
|
||||
| `orFree` | `cudaFree` | Free device memory |
|
||||
| `orMallocHost` | `cudaMallocHost` | Allocate page-locked (Pinned) host memory |
|
||||
| `orFreeHost` | `cudaFreeHost` | Free page-locked host memory |
|
||||
| `orMemcpy` | `cudaMemcpy` | Synchronous memory copy |
|
||||
| `orMemcpyAsync` | `cudaMemcpyAsync` | Asynchronous memory copy |
|
||||
| `orPointerGetAttributes` | `cudaPointerGetAttributes` | Get pointer attributes |
|
||||
| `orMemoryUnprotect` | - | (Internal use) Unprotect memory |
|
||||
| `orMemoryProtect` | - | (Internal use) Restore memory protection |
|
||||
|
||||
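
For a sense of how the stream and event entry points in these tables fit together, here is a minimal sketch (error checking omitted; the include path is an assumption, and a full program lives in `example/example.cpp`):

```cpp
#include "openreg.h"

int main() {
  orStream_t stream;
  orEvent_t start, stop;

  orStreamCreate(&stream);
  orEventCreateWithFlags(&start, orEventEnableTiming);
  orEventCreateWithFlags(&stop, orEventEnableTiming);

  orEventRecord(start, stream);
  // ... enqueue asynchronous work here, e.g. orMemcpyAsync or orLaunchKernel ...
  orEventRecord(stop, stream);
  orStreamSynchronize(stream);   // both events are complete after this returns

  float ms = 0.0f;
  orEventElapsedTime(&ms, start, stop);

  orEventDestroy(start);
  orEventDestroy(stop);
  orStreamDestroy(stream);
  return 0;
}
```
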
## Implementation Principles
|
||||
|
||||
@ -110,42 +71,67 @@ Simulating device memory, host memory, and memory copies:
|
||||
2. **Deallocation**: Memory is freed using `munmap`.
|
||||
3. **Authorization**: When a legitimate memory access is required, an RAII guard restores the memory permissions to `PROT_READ | PROT_WRITE`. The permissions are automatically reverted to `PROT_NONE` when the scope is exited.
|
||||
|
||||
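
The protection scheme in item 3 can be pictured with the following minimal sketch. It uses POSIX `mmap`/`mprotect` directly, whereas the actual implementation goes through an `openreg::` portability layer and a registry of allocated blocks, so the names below are illustrative:

```cpp
#include <sys/mman.h>

#include <cstddef>

// "Device" memory is ordinary anonymous pages mapped with PROT_NONE, so any
// direct CPU dereference faults until a guard temporarily authorizes access.
void* simulated_device_alloc(std::size_t size) {
  return mmap(nullptr, size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}

class ScopedAccessGuard {
 public:
  ScopedAccessGuard(void* ptr, std::size_t size) : ptr_(ptr), size_(size) {
    mprotect(ptr_, size_, PROT_READ | PROT_WRITE);  // authorize for this scope
  }
  ~ScopedAccessGuard() {
    mprotect(ptr_, size_, PROT_NONE);               // re-protect on scope exit
  }
  ScopedAccessGuard(const ScopedAccessGuard&) = delete;
  ScopedAccessGuard& operator=(const ScopedAccessGuard&) = delete;

 private:
  void* ptr_;
  std::size_t size_;
};
```
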
### Stream&Event Principles
|
||||
|
||||
Simulating creation, release, and synchronization for events and streams:
|
||||
1. **Event**: Each event is encapsulated as a task function and placed into a stream, which acts as a thread. Upon completion of the task, a flag within the event is modified to simulate the event's status.
|
||||
2. **Stream**: When each stream is requested, a new thread is created, which sequentially processes each task in the task queue within the stream structure. Tasks can be wrappers around kernel functions or events.
|
||||
3. **Synchronization**: Synchronization between streams and events is achieved using multithreading, condition variables, and mutexes.
|
||||
|
||||
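
The stream model described above boils down to one worker thread per stream draining a FIFO of tasks; an event is just a task that flips a flag and notifies waiters. A condensed sketch of that idea (the names are illustrative, not the library's actual types):

```cpp
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>

class SimStream {
 public:
  SimStream() : worker_([this] { run(); }) {}
  ~SimStream() {
    {
      std::lock_guard<std::mutex> lock(mtx_);
      stop_ = true;
    }
    cv_.notify_one();
    worker_.join();
  }
  // Kernels, async copies, and event records are all enqueued as tasks.
  void enqueue(std::function<void()> task) {
    {
      std::lock_guard<std::mutex> lock(mtx_);
      tasks_.push(std::move(task));
    }
    cv_.notify_one();
  }

 private:
  void run() {
    for (;;) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lock(mtx_);
        cv_.wait(lock, [this] { return stop_ || !tasks_.empty(); });
        if (stop_ && tasks_.empty()) {
          return;
        }
        task = std::move(tasks_.front());
        tasks_.pop();
      }
      task();  // executed in FIFO order, off the caller's thread
    }
  }

  std::queue<std::function<void()>> tasks_;
  std::mutex mtx_;
  std::condition_variable cv_;
  bool stop_ = false;
  std::thread worker_;
};
```
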
## Usage Example
|
||||
|
||||
Please refer to [example](example/example.cpp) for a complete usage example.
The following is a simple code snippet demonstrating how to use the core features of the OpenReg library.
|
||||
|
||||
The commands to compile example.cpp are as follows:
```cpp
|
||||
#include "openreg.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
|
||||
```Shell
|
||||
mkdir build
|
||||
#define OR_CHECK(call) do { \
|
||||
orError_t err = call; \
|
||||
if (err != orSuccess) { \
|
||||
fprintf(stderr, "OR Error code %d in %s at line %d\n", err, __FILE__, __LINE__); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
pushd build
|
||||
cmake ..
|
||||
make -j 32
|
||||
popd
|
||||
int main() {
|
||||
int device_count = 0;
|
||||
OR_CHECK(orGetDeviceCount(&device_count));
|
||||
std::cout << "Found " << device_count << " simulated devices." << std::endl;
|
||||
|
||||
g++ -o out example/example.cpp -L ./build -lopenreg
|
||||
LD_LIBRARY_PATH=./build ./out
|
||||
```
|
||||
int current_device = -1;
|
||||
OR_CHECK(orSetDevice(1));
|
||||
OR_CHECK(orGetDevice(¤t_device));
|
||||
std::cout << "Set current device to " << current_device << "." << std::endl;
|
||||
|
||||
The output is as follows:
const int n = 1024;
|
||||
const size_t size = n * sizeof(int);
|
||||
int *h_a, *d_a;
|
||||
OR_CHECK(orMallocHost((void**)&h_a, size));
|
||||
OR_CHECK(orMalloc((void**)&d_a, size));
|
||||
|
||||
```Shell
|
||||
Current environment have 2 devices
|
||||
Current is 0 device
|
||||
All tasks have been submitted.
|
||||
Kernel execution time: 0.238168 ms
|
||||
Verification PASSED!
|
||||
orPointerAttributes attr;
|
||||
OR_CHECK(orPointerGetAttributes(&attr, d_a));
|
||||
std::cout << "Pointer " << (void*)d_a << " is of type " << attr.type
|
||||
<< " on device " << attr.device << std::endl;
|
||||
|
||||
for (int i = 0; i < n; ++i) {
|
||||
h_a[i] = i;
|
||||
}
|
||||
OR_CHECK(orMemcpy(d_a, h_a, size, orMemcpyHostToDevice));
|
||||
std::cout << "Data copied from Host to Device." << std::endl;
|
||||
|
||||
// std::cout << "Trying to access device memory directly from CPU..." << std::endl;
|
||||
// int val = d_a[0]; // CRASH!
|
||||
|
||||
// Clean up resources
|
||||
OR_CHECK(orFree(d_a));
|
||||
OR_CHECK(orFreeHost(h_a));
|
||||
std::cout << "Resources freed." << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
The most basic functions of the OpenReg backend are currently supported, and will be dynamically optimized and expanded based on the needs of PyTorch integration.
|
||||
To better support PrivateUse1 device integration, the following capabilities are planned for the future:
|
||||
|
||||
* **Stream Support**: Provide the ability to simulate CUDA Streams.
|
||||
* **Event Support**: Provide the ability to simulate CUDA Events.
|
||||
* **Cross-Platform Support**: Add support for Windows and macOS (low priority).
|
||||
|
||||
@ -1,12 +0,0 @@
|
||||
set(GTest_REL_PATH "../../../../../../../third_party/googletest")
|
||||
get_filename_component(GTest_DIR "${CMAKE_CURRENT_LIST_DIR}/${GTest_REL_PATH}" ABSOLUTE)
|
||||
|
||||
if(EXISTS "${GTest_DIR}/CMakeLists.txt")
|
||||
message(STATUS "Found GTest: ${GTest_DIR}")
|
||||
|
||||
set(BUILD_GMOCK OFF CACHE BOOL "Disable GMock build")
|
||||
set(INSTALL_GTEST OFF CACHE BOOL "Disable GTest install")
|
||||
add_subdirectory(${GTest_DIR} "${CMAKE_BINARY_DIR}/gtest")
|
||||
else()
|
||||
message(FATAL_ERROR "GTest Not Found")
|
||||
endif()
|
||||
@ -1,12 +1,10 @@
|
||||
#include <include/openreg.h>
|
||||
|
||||
namespace {
|
||||
|
||||
// Total number of devices
constexpr int DEVICE_COUNT = 2;
|
||||
// Current device index
|
||||
thread_local int gCurrentDevice = 0;
|
||||
|
||||
} // namespace
|
||||
|
||||
orError_t orGetDeviceCount(int* count) {
|
||||
|
||||
@ -1,20 +1,10 @@
|
||||
#include "memory.h"
|
||||
|
||||
#include <include/openreg.h>
|
||||
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
|
||||
namespace {
|
||||
|
||||
struct Block {
|
||||
orMemoryType type = orMemoryType::orMemoryTypeUnmanaged;
|
||||
int device = -1;
|
||||
void* pointer = nullptr;
|
||||
size_t size = 0;
|
||||
int refcount{0};
|
||||
};
|
||||
|
||||
class MemoryManager {
|
||||
public:
|
||||
static MemoryManager& getInstance() {
|
||||
@ -48,7 +38,7 @@ class MemoryManager {
|
||||
}
|
||||
}
|
||||
|
||||
m_registry[mem] = {type, current_device, mem, aligned_size, 0};
|
||||
m_registry[mem] = {type, current_device, mem, aligned_size};
|
||||
*ptr = mem;
|
||||
return orSuccess;
|
||||
}
|
||||
@ -61,15 +51,14 @@ class MemoryManager {
|
||||
auto it = m_registry.find(ptr);
|
||||
if (it == m_registry.end())
|
||||
return orErrorUnknown;
|
||||
|
||||
const auto& info = it->second;
|
||||
|
||||
if (info.type == orMemoryType::orMemoryTypeDevice) {
|
||||
openreg::mprotect(info.pointer, info.size, F_PROT_READ | F_PROT_WRITE);
|
||||
openreg::munmap(info.pointer, info.size);
|
||||
} else {
|
||||
openreg::free(info.pointer);
|
||||
}
|
||||
|
||||
m_registry.erase(it);
|
||||
return orSuccess;
|
||||
}
|
||||
@ -81,39 +70,36 @@ class MemoryManager {
|
||||
orMemcpyKind kind) {
|
||||
if (!dst || !src || count == 0)
|
||||
return orErrorUnknown;
|
||||
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
Block* dst_info = getBlockInfoNoLock(dst);
|
||||
Block* src_info = getBlockInfoNoLock(src);
|
||||
|
||||
orPointerAttributes dst_info = getPointerInfo(dst);
|
||||
orPointerAttributes src_info = getPointerInfo(src);
|
||||
switch (kind) {
|
||||
case orMemcpyHostToDevice:
|
||||
if ((!dst_info || dst_info->type != orMemoryType::orMemoryTypeDevice) ||
|
||||
(src_info && src_info->type == orMemoryType::orMemoryTypeDevice))
|
||||
if (dst_info.type != orMemoryType::orMemoryTypeDevice ||
|
||||
src_info.type == orMemoryType::orMemoryTypeDevice)
|
||||
return orErrorUnknown;
|
||||
break;
|
||||
case orMemcpyDeviceToHost:
|
||||
if ((dst_info && dst_info->type == orMemoryType::orMemoryTypeDevice) ||
|
||||
(!src_info || src_info->type != orMemoryType::orMemoryTypeDevice))
|
||||
if (dst_info.type == orMemoryType::orMemoryTypeDevice ||
|
||||
src_info.type != orMemoryType::orMemoryTypeDevice)
|
||||
return orErrorUnknown;
|
||||
break;
|
||||
case orMemcpyDeviceToDevice:
|
||||
if ((!dst_info || dst_info->type != orMemoryType::orMemoryTypeDevice) ||
|
||||
(!src_info || src_info->type != orMemoryType::orMemoryTypeDevice))
|
||||
if (dst_info.type != orMemoryType::orMemoryTypeDevice ||
|
||||
src_info.type != orMemoryType::orMemoryTypeDevice)
|
||||
return orErrorUnknown;
|
||||
break;
|
||||
case orMemcpyHostToHost:
|
||||
if ((dst_info && dst_info->type == orMemoryType::orMemoryTypeDevice) ||
|
||||
(src_info && src_info->type == orMemoryType::orMemoryTypeDevice))
|
||||
if (dst_info.type == orMemoryType::orMemoryTypeDevice ||
|
||||
src_info.type == orMemoryType::orMemoryTypeDevice)
|
||||
return orErrorUnknown;
|
||||
break;
|
||||
}
|
||||
|
||||
unprotectNoLock(dst_info);
|
||||
unprotectNoLock(src_info);
|
||||
::memcpy(dst, src, count);
|
||||
protectNoLock(dst_info);
|
||||
protectNoLock(src_info);
|
||||
{
|
||||
ScopedMemoryProtector dst_protector(dst_info);
|
||||
ScopedMemoryProtector src_protector(src_info);
|
||||
::memcpy(dst, src, count);
|
||||
}
|
||||
|
||||
return orSuccess;
|
||||
}
|
||||
@ -125,16 +111,17 @@ class MemoryManager {
|
||||
return orErrorUnknown;
|
||||
|
||||
std ::lock_guard<std::mutex> lock(m_mutex);
|
||||
Block* info = getBlockInfoNoLock(ptr);
|
||||
orPointerAttributes info = getPointerInfo(ptr);
|
||||
|
||||
if (!info) {
|
||||
attributes->type = orMemoryType::orMemoryTypeUnmanaged;
|
||||
attributes->type = info.type;
|
||||
if (info.type == orMemoryType::orMemoryTypeUnmanaged) {
|
||||
attributes->device = -1;
|
||||
attributes->pointer = const_cast<void*>(ptr);
|
||||
attributes->size = 0;
|
||||
} else {
|
||||
attributes->type = info->type;
|
||||
attributes->device = info->device;
|
||||
attributes->pointer = info->pointer;
|
||||
attributes->device = info.device;
|
||||
attributes->pointer = info.pointer;
|
||||
attributes->size = info.size;
|
||||
}
|
||||
|
||||
return orSuccess;
|
||||
@ -142,61 +129,71 @@ class MemoryManager {
|
||||
|
||||
orError_t unprotect(void* ptr) {
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
return unprotectNoLock(getBlockInfoNoLock(ptr));
|
||||
orPointerAttributes info = getPointerInfo(ptr);
|
||||
if (info.type != orMemoryType::orMemoryTypeDevice) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
if (openreg::mprotect(
|
||||
info.pointer, info.size, F_PROT_READ | F_PROT_WRITE) != 0) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
return orSuccess;
|
||||
}
|
||||
|
||||
orError_t protect(void* ptr) {
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
return protectNoLock(getBlockInfoNoLock(ptr));
|
||||
orPointerAttributes info = getPointerInfo(ptr);
|
||||
if (info.type != orMemoryType::orMemoryTypeDevice) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
if (openreg::mprotect(info.pointer, info.size, F_PROT_NONE) != 0) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
return orSuccess;
|
||||
}
|
||||
|
||||
private:
|
||||
class ScopedMemoryProtector {
|
||||
public:
|
||||
ScopedMemoryProtector(const orPointerAttributes& info)
|
||||
: m_info(info), m_protected(false) {
|
||||
if (m_info.type == orMemoryType::orMemoryTypeDevice) {
|
||||
if (openreg::mprotect(
|
||||
m_info.pointer, m_info.size, F_PROT_READ | F_PROT_WRITE) == 0) {
|
||||
m_protected = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
~ScopedMemoryProtector() {
|
||||
if (m_protected) {
|
||||
openreg::mprotect(m_info.pointer, m_info.size, F_PROT_NONE);
|
||||
}
|
||||
}
|
||||
ScopedMemoryProtector(const ScopedMemoryProtector&) = delete;
|
||||
ScopedMemoryProtector& operator=(const ScopedMemoryProtector&) = delete;
|
||||
|
||||
private:
|
||||
orPointerAttributes m_info;
|
||||
bool m_protected;
|
||||
};
|
||||
|
||||
MemoryManager() = default;
|
||||
|
||||
orError_t unprotectNoLock(Block* info) {
|
||||
if (info && info->type == orMemoryType::orMemoryTypeDevice) {
|
||||
if (info->refcount == 0) {
|
||||
if (openreg::mprotect(
|
||||
info->pointer, info->size, F_PROT_READ | F_PROT_WRITE) != 0) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
}
|
||||
|
||||
info->refcount++;
|
||||
}
|
||||
|
||||
return orSuccess;
|
||||
}
|
||||
|
||||
orError_t protectNoLock(Block* info) {
|
||||
if (info && info->type == orMemoryType::orMemoryTypeDevice) {
|
||||
if (info->refcount == 1) {
|
||||
if (openreg::mprotect(info->pointer, info->size, F_PROT_NONE) != 0) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
}
|
||||
|
||||
info->refcount--;
|
||||
}
|
||||
|
||||
return orSuccess;
|
||||
}
|
||||
|
||||
Block* getBlockInfoNoLock(const void* ptr) {
|
||||
orPointerAttributes getPointerInfo(const void* ptr) {
|
||||
auto it = m_registry.upper_bound(const_cast<void*>(ptr));
|
||||
if (it != m_registry.begin()) {
|
||||
--it;
|
||||
const char* p_char = static_cast<const char*>(ptr);
|
||||
const char* base_char = static_cast<const char*>(it->first);
|
||||
if (p_char >= base_char && p_char < (base_char + it->second.size)) {
|
||||
return &it->second;
|
||||
return it->second;
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
return {};
|
||||
}
|
||||
|
||||
std::map<void*, Block> m_registry;
|
||||
std::map<void*, orPointerAttributes> m_registry;
|
||||
std::mutex m_mutex;
|
||||
};
|
||||
|
||||
@ -228,22 +225,6 @@ orError_t orMemcpy(
|
||||
return MemoryManager::getInstance().memcpy(dst, src, count, kind);
|
||||
}
|
||||
|
||||
orError_t orMemcpyAsync(
|
||||
void* dst,
|
||||
const void* src,
|
||||
size_t count,
|
||||
orMemcpyKind kind,
|
||||
orStream_t stream) {
|
||||
if (!stream) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
|
||||
auto& mm = MemoryManager::getInstance();
|
||||
|
||||
return orLaunchKernel(
|
||||
stream, &MemoryManager::memcpy, &mm, dst, src, count, kind);
|
||||
}
|
||||
|
||||
orError_t orPointerGetAttributes(
|
||||
orPointerAttributes* attributes,
|
||||
const void* ptr) {
|
||||
|
||||
@ -4,6 +4,8 @@
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
#include <include/openreg.h>
|
||||
|
||||
#if defined(_WIN32)
|
||||
#include <windows.h>
|
||||
#else
|
||||
|
||||
@ -1,313 +0,0 @@
|
||||
#include <include/openreg.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <condition_variable>
|
||||
#include <mutex>
|
||||
#include <queue>
|
||||
#include <set>
|
||||
#include <thread>
|
||||
|
||||
static std::mutex g_mutex;
|
||||
static std::once_flag g_flag;
|
||||
static std::vector<std::set<orStream_t>> g_streams_per_device;
|
||||
|
||||
static void initialize_registries() {
|
||||
int device_count = 0;
|
||||
orGetDeviceCount(&device_count);
|
||||
g_streams_per_device.resize(device_count);
|
||||
}
|
||||
|
||||
struct orEventImpl {
|
||||
std::mutex mtx;
|
||||
std::condition_variable cv;
|
||||
std::atomic<bool> completed{true};
|
||||
int device_index = -1;
|
||||
bool timing_enabled{false};
|
||||
std::chrono::high_resolution_clock::time_point completion_time;
|
||||
};
|
||||
|
||||
struct orEvent {
|
||||
std::shared_ptr<orEventImpl> impl;
|
||||
};
|
||||
|
||||
struct orStream {
|
||||
std::queue<std::function<void()>> tasks;
|
||||
std::mutex mtx;
|
||||
std::condition_variable cv;
|
||||
std::thread worker;
|
||||
std::atomic<bool> stop_flag{false};
|
||||
int device_index = -1;
|
||||
|
||||
orStream() {
|
||||
worker = std::thread([this] {
|
||||
while (true) {
|
||||
std::function<void()> task;
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(this->mtx);
|
||||
this->cv.wait(lock, [this] {
|
||||
return this->stop_flag.load() || !this->tasks.empty();
|
||||
});
|
||||
if (this->stop_flag.load() && this->tasks.empty()) {
|
||||
return;
|
||||
}
|
||||
task = std::move(this->tasks.front());
|
||||
this->tasks.pop();
|
||||
}
|
||||
task();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
~orStream() {
|
||||
stop_flag.store(true);
|
||||
cv.notify_one();
|
||||
worker.join();
|
||||
}
|
||||
};
|
||||
|
||||
orError_t openreg::addTaskToStream(
|
||||
orStream_t stream,
|
||||
std::function<void()> task) {
|
||||
if (!stream)
|
||||
return orErrorUnknown;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(stream->mtx);
|
||||
stream->tasks.push(std::move(task));
|
||||
}
|
||||
|
||||
stream->cv.notify_one();
|
||||
return orSuccess;
|
||||
}
|
||||
|
||||
orError_t orEventCreateWithFlags(orEvent_t* event, unsigned int flags) {
|
||||
if (!event)
|
||||
return orErrorUnknown;
|
||||
|
||||
auto impl = std::make_shared<orEventImpl>();
|
||||
orGetDevice(&(impl->device_index));
|
||||
if (flags & orEventEnableTiming) {
|
||||
impl->timing_enabled = true;
|
||||
}
|
||||
|
||||
*event = new orEvent{std::move(impl)};
|
||||
return orSuccess;
|
||||
}
|
||||
|
||||
orError_t orEventCreate(orEvent_t* event) {
|
||||
return orEventCreateWithFlags(event, orEventDisableTiming);
|
||||
}
|
||||
|
||||
orError_t orEventDestroy(orEvent_t event) {
|
||||
if (!event)
|
||||
return orErrorUnknown;
|
||||
|
||||
delete event;
|
||||
return orSuccess;
|
||||
}
|
||||
|
||||
orError_t orEventRecord(orEvent_t event, orStream_t stream) {
|
||||
if (!event || !stream)
|
||||
return orErrorUnknown;
|
||||
|
||||
auto event_impl = event->impl;
|
||||
event_impl->completed.store(false);
|
||||
auto record_task = [event_impl]() {
|
||||
if (event_impl->timing_enabled) {
|
||||
event_impl->completion_time = std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(event_impl->mtx);
|
||||
event_impl->completed.store(true);
|
||||
}
|
||||
|
||||
event_impl->cv.notify_all();
|
||||
};
|
||||
|
||||
return openreg::addTaskToStream(stream, record_task);
|
||||
}
|
||||
|
||||
orError_t orEventSynchronize(orEvent_t event) {
|
||||
if (!event)
|
||||
return orErrorUnknown;
|
||||
|
||||
auto event_impl = event->impl;
|
||||
std::unique_lock<std::mutex> lock(event_impl->mtx);
|
||||
event_impl->cv.wait(lock, [&] { return event_impl->completed.load(); });
|
||||
|
||||
return orSuccess;
|
||||
}
|
||||
|
||||
orError_t orEventQuery(orEvent_t event) {
|
||||
if (!event)
|
||||
return orErrorUnknown;
|
||||
|
||||
return event->impl->completed.load() ? orSuccess : orErrorNotReady;
|
||||
}
|
||||
|
||||
orError_t orEventElapsedTime(float* ms, orEvent_t start, orEvent_t end) {
|
||||
if (!ms || !start || !end)
|
||||
return orErrorUnknown;
|
||||
|
||||
auto start_impl = start->impl;
|
||||
auto end_impl = end->impl;
|
||||
|
||||
if (start_impl->device_index != end_impl->device_index) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
|
||||
if (!start_impl->timing_enabled || !end_impl->timing_enabled) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
|
||||
if (!start_impl->completed.load() || !end_impl->completed.load()) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
|
||||
auto duration = end_impl->completion_time - start_impl->completion_time;
|
||||
*ms = std::chrono::duration_cast<std::chrono::duration<float, std::milli>>(
|
||||
duration)
|
||||
.count();
|
||||
|
||||
return orSuccess;
|
||||
}
|
||||
|
||||
orError_t orStreamCreateWithPriority(
|
||||
orStream_t* stream,
|
||||
[[maybe_unused]] unsigned int flag,
|
||||
int priority) {
|
||||
if (!stream) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
|
||||
int min_p, max_p;
|
||||
orDeviceGetStreamPriorityRange(&min_p, &max_p);
|
||||
if (priority < min_p || priority > max_p) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
|
||||
int current_device = 0;
|
||||
orGetDevice(¤t_device);
|
||||
|
||||
orStream_t new_stream = nullptr;
|
||||
new_stream = new orStream();
|
||||
new_stream->device_index = current_device;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
std::call_once(g_flag, initialize_registries);
|
||||
g_streams_per_device[current_device].insert(new_stream);
|
||||
}
|
||||
|
||||
*stream = new_stream;
|
||||
|
||||
return orSuccess;
|
||||
}
|
||||
|
||||
orError_t orStreamCreate(orStream_t* stream) {
|
||||
int min_p, max_p;
|
||||
orDeviceGetStreamPriorityRange(&min_p, &max_p);
|
||||
|
||||
return orStreamCreateWithPriority(stream, 0, max_p);
|
||||
}
|
||||
|
||||
orError_t orStreamGetPriority(
|
||||
[[maybe_unused]] orStream_t stream,
|
||||
int* priority) {
|
||||
// Since OpenReg has only one priority level, the following code
|
||||
// returns 0 directly for convenience.
|
||||
*priority = 0;
|
||||
|
||||
return orSuccess;
|
||||
}
|
||||
|
||||
orError_t orStreamDestroy(orStream_t stream) {
|
||||
if (!stream)
|
||||
return orErrorUnknown;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
|
||||
int device_idx = stream->device_index;
|
||||
if (device_idx >= 0 && device_idx < g_streams_per_device.size()) {
|
||||
g_streams_per_device[device_idx].erase(stream);
|
||||
}
|
||||
}
|
||||
|
||||
delete stream;
|
||||
return orSuccess;
|
||||
}
|
||||
|
||||
orError_t orStreamQuery(orStream_t stream) {
|
||||
if (!stream) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(stream->mtx);
|
||||
return stream->tasks.empty() ? orSuccess : orErrorNotReady;
|
||||
}
|
||||
|
||||
orError_t orStreamSynchronize(orStream_t stream) {
|
||||
if (!stream)
|
||||
return orErrorUnknown;
|
||||
|
||||
orEvent_t event;
|
||||
orEventCreate(&event);
|
||||
orEventRecord(event, stream);
|
||||
|
||||
orError_t status = orEventSynchronize(event);
|
||||
orEventDestroy(event);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
orError_t orStreamWaitEvent(orStream_t stream, orEvent_t event, unsigned int) {
|
||||
if (!stream || !event)
|
||||
return orErrorUnknown;
|
||||
|
||||
auto event_impl = event->impl;
|
||||
auto wait_task = [event_impl]() {
|
||||
std::unique_lock<std::mutex> lock(event_impl->mtx);
|
||||
event_impl->cv.wait(lock, [&] { return event_impl->completed.load(); });
|
||||
};
|
||||
|
||||
return openreg::addTaskToStream(stream, wait_task);
|
||||
}
|
||||
|
||||
orError_t orDeviceGetStreamPriorityRange(
|
||||
int* leastPriority,
|
||||
int* greatestPriority) {
|
||||
if (!leastPriority || !greatestPriority) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
|
||||
// OpenReg has only one priority level for now.
*leastPriority = 0;
|
||||
*greatestPriority = 0;
|
||||
return orSuccess;
|
||||
}
|
||||
|
||||
orError_t orDeviceSynchronize(void) {
|
||||
int current_device = 0;
|
||||
orGetDevice(¤t_device);
|
||||
|
||||
std::vector<orStream_t> streams;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_mutex);
|
||||
std::call_once(g_flag, initialize_registries);
|
||||
|
||||
auto& streams_on_device = g_streams_per_device[current_device];
|
||||
streams.assign(streams_on_device.begin(), streams_on_device.end());
|
||||
}
|
||||
|
||||
for (orStream_t stream : streams) {
|
||||
orError_t status = orStreamSynchronize(stream);
|
||||
if (status != orSuccess) {
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
return orSuccess;
|
||||
}
|
||||
@ -1,112 +0,0 @@
|
||||
#include "include/openreg.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
struct MemoryGuard {
|
||||
MemoryGuard(void* ptr) : ptr_(ptr) {
|
||||
orMemoryUnprotect(ptr_);
|
||||
}
|
||||
~MemoryGuard() {
|
||||
orMemoryProtect(ptr_);
|
||||
}
|
||||
|
||||
private:
|
||||
void* ptr_{};
|
||||
};
|
||||
|
||||
void add_kernel(float* out, float* a, float* b, int num) {
|
||||
for (int i = 0; i < num; ++i) {
|
||||
out[i] = a[i] + b[i];
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
int device_count = 0;
|
||||
orGetDeviceCount(&device_count);
|
||||
|
||||
std::cout << "Current environment have " << device_count << " devices"
|
||||
<< std::endl;
|
||||
|
||||
orSetDevice(0);
|
||||
int current_device = -1;
|
||||
orGetDevice(¤t_device);
|
||||
|
||||
std::cout << "Current is " << current_device << " device" << std::endl;
|
||||
|
||||
constexpr int num = 50000;
|
||||
constexpr size_t size = num * sizeof(float);
|
||||
|
||||
std::vector<float> host_a(num), host_b(num), host_out(num, 0.0f);
|
||||
std::iota(host_a.begin(), host_a.end(), 0.0f);
|
||||
for (int i = 0; i < num; ++i) {
|
||||
host_b[i] = 2.0f;
|
||||
}
|
||||
|
||||
float *dev_a, *dev_b, *dev_out;
|
||||
orMalloc((void**)&dev_a, size);
|
||||
orMalloc((void**)&dev_b, size);
|
||||
orMalloc((void**)&dev_out, size);
|
||||
|
||||
// There will be subsequent memory access operations, so memory protection
|
||||
// needs to be released
|
||||
MemoryGuard a{dev_a};
|
||||
MemoryGuard b{dev_b};
|
||||
MemoryGuard c{dev_out};
|
||||
|
||||
orStream_t stream1, stream2;
|
||||
orEvent_t start_event, stop_event;
|
||||
|
||||
orStreamCreate(&stream1);
|
||||
orStreamCreate(&stream2);
|
||||
orEventCreateWithFlags(&start_event, orEventEnableTiming);
|
||||
orEventCreateWithFlags(&stop_event, orEventEnableTiming);
|
||||
|
||||
// Copy input from host to device
|
||||
orMemcpyAsync(dev_a, host_a.data(), size, orMemcpyHostToDevice, stream1);
|
||||
orMemcpyAsync(dev_b, host_b.data(), size, orMemcpyHostToDevice, stream1);
|
||||
|
||||
// Submit the compute kernel and two events that are used for measuring elapsed time.
orEventRecord(start_event, stream1);
|
||||
orLaunchKernel(stream1, add_kernel, dev_out, dev_a, dev_b, num);
|
||||
orEventRecord(stop_event, stream1);
|
||||
|
||||
// Synchronization between streams.
|
||||
orStreamWaitEvent(stream2, stop_event, 0);
|
||||
orMemcpyAsync(host_out.data(), dev_out, size, orMemcpyDeviceToHost, stream2);
|
||||
orStreamSynchronize(stream2);
|
||||
|
||||
std::cout << "All tasks have been submitted." << std::endl;
|
||||
|
||||
float elapsed_ms = 0.0f;
|
||||
orEventElapsedTime(&elapsed_ms, start_event, stop_event);
|
||||
std::cout << "Kernel execution time: " << elapsed_ms << " ms" << std::endl;
|
||||
|
||||
bool success = true;
|
||||
for (int i = 0; i < num; ++i) {
|
||||
if (std::abs(host_out[i] - (host_a[i] + host_b[i])) > 1e-5) {
|
||||
std::cout << "Verification FAILED at index " << i << "! Expected "
|
||||
<< (host_a[i] + host_b[i]) << ", got " << host_out[i]
|
||||
<< std::endl;
|
||||
success = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (success) {
|
||||
std::cout << "Verification PASSED!" << std::endl;
|
||||
}
|
||||
|
||||
orFree(dev_a);
|
||||
orFree(dev_b);
|
||||
orFree(dev_out);
|
||||
|
||||
orStreamDestroy(stream1);
|
||||
orStreamDestroy(stream2);
|
||||
|
||||
orEventDestroy(start_event);
|
||||
orEventDestroy(stop_event);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -3,20 +3,16 @@
|
||||
#include <cstddef>
|
||||
|
||||
#ifdef _WIN32
|
||||
#define OPENREG_EXPORT __declspec(dllexport)
|
||||
#define OPENREG_EXPORT __declspec(dllexport)
|
||||
#else
|
||||
#define OPENREG_EXPORT __attribute__((visibility("default")))
|
||||
#define OPENREG_EXPORT __attribute__((visibility("default")))
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef enum orError_t {
|
||||
orSuccess = 0,
|
||||
orErrorUnknown = 1,
|
||||
orErrorNotReady = 2
|
||||
} orError_t;
|
||||
typedef enum orError_t { orSuccess = 0, orErrorUnknown = 1 } orError_t;
|
||||
|
||||
typedef enum orMemcpyKind {
|
||||
orMemcpyHostToHost = 0,
|
||||
@ -35,75 +31,25 @@ struct orPointerAttributes {
|
||||
orMemoryType type = orMemoryType::orMemoryTypeUnmanaged;
|
||||
int device;
|
||||
void* pointer;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
typedef enum orEventFlags {
|
||||
orEventDisableTiming = 0x0,
|
||||
orEventEnableTiming = 0x1,
|
||||
} orEventFlags;
|
||||
|
||||
struct orStream;
|
||||
struct orEvent;
|
||||
typedef struct orStream* orStream_t;
|
||||
typedef struct orEvent* orEvent_t;
|
||||
|
||||
// Memory
|
||||
OPENREG_EXPORT orError_t orMalloc(void** devPtr, size_t size);
|
||||
OPENREG_EXPORT orError_t orFree(void* devPtr);
|
||||
OPENREG_EXPORT orError_t orMallocHost(void** hostPtr, size_t size);
|
||||
OPENREG_EXPORT orError_t orFreeHost(void* hostPtr);
|
||||
OPENREG_EXPORT orError_t
|
||||
orMemcpy(void* dst, const void* src, size_t count, orMemcpyKind kind);
|
||||
OPENREG_EXPORT orError_t orMemcpyAsync(
|
||||
void* dst,
|
||||
const void* src,
|
||||
size_t count,
|
||||
orMemcpyKind kind,
|
||||
orStream_t stream);
|
||||
OPENREG_EXPORT orError_t
|
||||
orPointerGetAttributes(orPointerAttributes* attributes, const void* ptr);
|
||||
OPENREG_EXPORT orError_t orMemcpy(void* dst, const void* src, size_t count, orMemcpyKind kind);
|
||||
OPENREG_EXPORT orError_t orMemoryUnprotect(void* devPtr);
|
||||
OPENREG_EXPORT orError_t orMemoryProtect(void* devPtr);
|
||||
|
||||
// Device
|
||||
OPENREG_EXPORT orError_t orGetDeviceCount(int* count);
|
||||
OPENREG_EXPORT orError_t orSetDevice(int device);
|
||||
OPENREG_EXPORT orError_t orGetDevice(int* device);
|
||||
OPENREG_EXPORT orError_t
|
||||
orDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority);
|
||||
OPENREG_EXPORT orError_t orDeviceSynchronize(void);
|
||||
|
||||
// Stream
|
||||
OPENREG_EXPORT orError_t orStreamCreateWithPriority(
|
||||
orStream_t* stream,
|
||||
unsigned int flags,
|
||||
int priority);
|
||||
OPENREG_EXPORT orError_t orStreamCreate(orStream_t* stream);
|
||||
OPENREG_EXPORT orError_t orStreamGetPriority(orStream_t stream, int* priority);
|
||||
OPENREG_EXPORT orError_t orStreamDestroy(orStream_t stream);
|
||||
OPENREG_EXPORT orError_t orStreamQuery(orStream_t stream);
|
||||
OPENREG_EXPORT orError_t orStreamSynchronize(orStream_t stream);
|
||||
OPENREG_EXPORT orError_t
|
||||
orStreamWaitEvent(orStream_t stream, orEvent_t event, unsigned int flags);
|
||||
|
||||
// Event
|
||||
OPENREG_EXPORT orError_t
|
||||
orEventCreateWithFlags(orEvent_t* event, unsigned int flags);
|
||||
OPENREG_EXPORT orError_t orEventCreate(orEvent_t* event);
|
||||
OPENREG_EXPORT orError_t orEventDestroy(orEvent_t event);
|
||||
OPENREG_EXPORT orError_t orEventRecord(orEvent_t event, orStream_t stream);
|
||||
OPENREG_EXPORT orError_t orEventSynchronize(orEvent_t event);
|
||||
OPENREG_EXPORT orError_t orEventQuery(orEvent_t event);
|
||||
OPENREG_EXPORT orError_t
|
||||
orEventElapsedTime(float* ms, orEvent_t start, orEvent_t end);
|
||||
OPENREG_EXPORT orError_t orPointerGetAttributes(
|
||||
orPointerAttributes* attributes,
|
||||
const void* ptr);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
#define OPENREG_H
|
||||
#include "openreg.inl"
|
||||
|
||||
#endif
|
||||
|
||||
@ -1,42 +0,0 @@
|
||||
#ifndef OPENREG_H
|
||||
#error "Don`t include openreg.inl directly, include openreg.h instead."
|
||||
#endif
|
||||
|
||||
#include <functional>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
|
||||
namespace openreg {
|
||||
OPENREG_EXPORT orError_t
|
||||
addTaskToStream(orStream* stream, std::function<void()> task);
|
||||
}
|
||||
|
||||
template <typename Func, typename... Args>
|
||||
OPENREG_EXPORT inline orError_t orLaunchKernel(
|
||||
orStream* stream,
|
||||
Func&& kernel_func,
|
||||
Args&&... args) {
|
||||
if (!stream) {
|
||||
return orErrorUnknown;
|
||||
}
|
||||
|
||||
/*
|
||||
* Some tests in PyTorch still use C++11, so we use a conditional macro to
* select different approaches for different C++ versions.
*
|
||||
* Std::apply is only supported in C++17, so for C++11/14, std::bind is
|
||||
* a more appropriate approach, but the former has better performance.
|
||||
*/
|
||||
#if __cplusplus >= 201703L
|
||||
auto task = [func = std::forward<Func>(kernel_func),
|
||||
args_tuple =
|
||||
std::make_tuple(std::forward<Args>(args)...)]() mutable {
|
||||
std::apply(func, std::move(args_tuple));
|
||||
};
|
||||
#else
|
||||
auto task =
|
||||
std::bind(std::forward<Func>(kernel_func), std::forward<Args>(args)...);
|
||||
#endif
|
||||
|
||||
return openreg::addTaskToStream(stream, std::move(task));
|
||||
}
|
||||
@ -1,41 +0,0 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <include/openreg.h>
|
||||
|
||||
namespace {
|
||||
|
||||
class DeviceTest : public ::testing::Test {
|
||||
protected:
|
||||
void SetUp() override {
|
||||
orSetDevice(0);
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(DeviceTest, GetDeviceCountValid) {
|
||||
int count = -1;
|
||||
EXPECT_EQ(orGetDeviceCount(&count), orSuccess);
|
||||
EXPECT_EQ(count, 2);
|
||||
}
|
||||
|
||||
TEST_F(DeviceTest, GetDeviceValid) {
|
||||
int device = -1;
|
||||
EXPECT_EQ(orGetDevice(&device), orSuccess);
|
||||
EXPECT_EQ(device, 0);
|
||||
}
|
||||
|
||||
TEST_F(DeviceTest, SetDeviceValid) {
|
||||
EXPECT_EQ(orSetDevice(1), orSuccess);
|
||||
|
||||
int device = -1;
|
||||
EXPECT_EQ(orGetDevice(&device), orSuccess);
|
||||
EXPECT_EQ(device, 1);
|
||||
|
||||
EXPECT_EQ(orSetDevice(0), orSuccess);
|
||||
EXPECT_EQ(orGetDevice(&device), orSuccess);
|
||||
EXPECT_EQ(device, 0);
|
||||
}
|
||||
|
||||
TEST_F(DeviceTest, SetDeviceInvalidNegative) {
|
||||
EXPECT_EQ(orSetDevice(-1), orErrorUnknown);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
Some files were not shown because too many files have changed in this diff.