Compare commits

2 Commits

SHA1        Author  Date
2f26446210  tc      2025-10-02 15:32:13 -07:00
f36f3bc69f  tc      2025-10-02 15:27:59 -07:00
571 changed files with 2868 additions and 9752 deletions

View File

@ -37,9 +37,9 @@ case ${DOCKER_TAG_PREFIX} in
rocm*)
BASE_TARGET=rocm
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950, gfx115x conditionally starting in ROCm 7.0
# add gfx950 conditionally starting in ROCm 7.0
if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
fi
EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
;;

View File

@ -19,8 +19,8 @@ pip_install \
transformers==4.36.2
pip_install coloredlogs packaging
pip_install onnxruntime==1.23.0
pip_install onnxscript==0.5.3
pip_install onnxruntime==1.22.1
pip_install onnxscript==0.4.0
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
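The comment above describes warming the Hugging Face cache so later ONNX tests can load the model without re-downloading. A minimal sketch of that step; the model name is illustrative, not taken from this hunk:

    from transformers import AutoModel

    # Downloading once populates ~/.cache/huggingface/hub/ for reuse by the ONNX tests.
    AutoModel.from_pretrained("bert-base-uncased")  # illustrative model name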

View File

@ -120,8 +120,9 @@ ninja==1.11.1.4
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.55.2, 0.60.0
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py
#For numba issue see https://github.com/pytorch/pytorch/issues/51511
#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
#numpy
@ -241,9 +242,10 @@ pygments==2.15.0
#Pinned versions: 14.1.0
#test that import:
scikit-image==0.22.0
scikit-image==0.19.3 ; python_version < "3.10"
scikit-image==0.22.0 ; python_version >= "3.10"
#Description: image processing routines
#Pinned versions: 0.22.0
#Pinned versions:
#test that import: test_nn.py
#scikit-learn
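The "; python_version == ..." suffixes above are PEP 508 environment markers, which pip evaluates against the installing interpreter. A small sketch using the packaging library (which pip vendors for exactly this purpose):

    from packaging.markers import Marker

    # Same check pip performs before honoring the scikit-image==0.19.3 pin above.
    marker = Marker('python_version < "3.10" and platform_machine != "s390x"')
    print(marker.evaluate())  # False on CPython 3.12, so the 0.22.0 pin applies instead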
@ -339,7 +341,7 @@ onnx==1.18.0
#Pinned versions:
#test that import:
onnxscript==0.5.3
onnxscript==0.4.0
#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
#Pinned versions:
#test that import:

View File

@ -5,7 +5,7 @@ DESIRED_ROCM ?= 7.0
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
PACKAGE_NAME = magma-rocm
# inherit this from underlying docker image, do not pass this env var to docker
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
@ -18,6 +18,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
.PHONY: all
all: magma-rocm70
all: magma-rocm64
all: magma-rocm63
.PHONY:
clean:
@ -33,3 +34,8 @@ magma-rocm70:
magma-rocm64: DESIRED_ROCM := 6.4
magma-rocm64:
$(DOCKER_RUN)
.PHONY: magma-rocm63
magma-rocm63: DESIRED_ROCM := 6.3
magma-rocm63:
$(DOCKER_RUN)

View File

@ -34,14 +34,12 @@ fi
# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
if [ -n "$NUMBA_CUDA_DIR" ]; then
NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
pushd "$NUMBA_CUDA_DIR"
patch -p4 <"$NUMBA_PATCH"
popd
fi
NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
if [ -n "$NUMBA_CUDA_DIR" ]; then
NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
pushd "$NUMBA_CUDA_DIR"
patch -p4 <"$NUMBA_PATCH"
popd
fi
echo "Environment variables:"

View File

@ -38,7 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
fi
# TODO: Move this to .ci/docker/requirements-ci.txt
python -m pip install "psutil==5.9.1" nvidia-ml-py "pytest-shard==0.1.2"
python -m pip install "psutil==5.9.1" "pynvml==11.4.1" "pytest-shard==0.1.2"
run_tests() {
# Run nvidia-smi if available

View File

@ -66,7 +66,6 @@ readability-simplify-subscript-expr,
readability-string-compare,
-readability-redundant-access-specifiers,
-readability-redundant-control-flow,
-readability-redundant-inline-specifier,
'
HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
WarningsAsErrors: '*'

View File

@ -1 +1 @@
0ad9951c416d33c5da4f7a504fb162cbe62386f5
78a47f87ce259a48f0391fa9ae15add05ea7432b

View File

@ -1 +1 @@
2a9138a26ee257fef05310ad3fecf7c55fe80d73
0fc62aa26a30ed7ca419d285f285cb5ba02c4394

View File

@ -202,7 +202,7 @@ ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=4
ENV NVCC_THREADS=$nvcc_threads
ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0'
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
ARG USE_SCCACHE
@ -297,28 +297,16 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Install build and runtime dependencies, this is needed for flashinfer install
COPY requirements/build.txt requirements/build.txt
COPY use_existing_torch.py use_existing_torch.py
RUN python3 use_existing_torch.py
RUN cat requirements/build.txt
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \
python3 -m pip install uv==0.8.4; \
fi
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt
# Default mount file as placeholder, this just avoid the mount error
ARG TORCH_WHEELS_PATH="./requirements"
# Install torch, torchaudio and torchvision
@ -344,11 +332,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Install xformers wheel from previous stage
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system /wheels/xformers/*.whl --verbose
# Build flashinfer from source.
ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0'
# install package for build flashinfer
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
RUN pip install build==1.3.0
RUN pip freeze | grep -E 'setuptools|packaging|build'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}

View File

@ -1,14 +1,9 @@
import glob
import os
requires_files = glob.glob("requirements/*.txt")
requires_files += ["pyproject.toml"]
for file in requires_files:
if not os.path.exists(file):
print(f"!!! skipping missing {file}")
continue
print(f">>> cleaning {file}")
with open(file) as f:
lines = f.readlines()

View File

@ -40,15 +40,6 @@ jobs:
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
# See https://github.com/pytorch/pytorch/pull/134215#issuecomment-2332128790
PYI_FILES_TO_ADD=""
for file in ${CHANGED_FILES}; do
if [[ "${file}" == *".pyi.in" ]]; then
PYI_FILES_TO_ADD="${PYI_FILES_TO_ADD} ${file//.in/}"
fi
done
CHANGED_FILES="${CHANGED_FILES}${PYI_FILES_TO_ADD}"
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"

View File

@ -106,16 +106,6 @@ jobs:
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit

View File

@ -73,28 +73,3 @@ jobs:
test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit
# ROCM MI300 runner
opmicrobenchmark-build-rocm:
if: github.repository_owner == 'pytorch'
name: opmicrobenchmark-build-rocm
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
test-matrix: |
{ include: [
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit
opmicrobenchmark-test-rocm:
name: opmicrobenchmark-test-rocm
uses: ./.github/workflows/_rocm-test.yml
needs: opmicrobenchmark-build-rocm
with:
timeout-minutes: 500
build-environment: linux-jammy-rocm-py3_10
docker-image: ${{ needs.opmicrobenchmark-build-rocm.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build-rocm.outputs.test-matrix }}
secrets: inherit

View File

@ -213,9 +213,9 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
]}
secrets: inherit

View File

@ -127,6 +127,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

View File

@ -140,6 +140,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

View File

@ -160,10 +160,9 @@ jobs:
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
{ config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
]}
secrets: inherit
@ -190,6 +189,41 @@ jobs:
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
secrets: inherit
linux-jammy-rocm-py3_10-build:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.4" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
secrets: inherit
inductor-build:
name: inductor-build
uses: ./.github/workflows/_linux-build.yml

View File

@ -42,7 +42,7 @@ jobs:
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
cuda-arch-list: '8.0 8.9 9.0'
cuda-arch-list: '8.0;8.9;9.0'
runner: linux.24xlarge.memory
test-matrix: |
{ include: [

View File

@ -28,7 +28,6 @@ exclude_patterns = [
'torch/lib/**',
'venv/**',
'**/*.pyi',
"tools/experimental/dynamic_shapes/torchfuzz/**",
'tools/test/test_selective_build.py',
]
command = [
@ -1573,7 +1572,6 @@ exclude_patterns = [
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'test/dynamo/cpython/**',
'test/test_torchfuzz_repros.py',
'scripts/**',
'third_party/**',
'fb/**',

View File

@ -50,10 +50,11 @@ RUN git submodule update --init --recursive
FROM conda as conda-installs
ARG PYTHON_VERSION=3.11
ARG CUDA_PATH=cu121
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=whl/nightly
# Automatically set by buildx
# pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574
RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0
RUN /opt/conda/bin/conda update -y -n base -c defaults conda
RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION}
ARG TARGETPLATFORM

View File

@ -40,6 +40,41 @@ namespace {
->conv
->rnn
*/
const std::map<std::string, std::vector<std::string>> _fp32_precisions = {
{"generic", {{"ieee", "tf32", "bf16", "none"}}},
{"mkldnn", {{"ieee", "tf32", "bf16", "none"}}},
{"cuda", {{"ieee", "tf32", "none"}}}};
// Check whether the backend and op are legal
void check_fp32_prec_backend_and_op(
const std::string& backend,
const std::string& op) {
static std::vector<std::string> backends = {"generic", "mkldnn", "cuda"};
static std::vector<std::string> operators = {"conv", "matmul", "rnn", "all"};
TORCH_CHECK(
std::find(backends.begin(), backends.end(), backend) != backends.end(),
"Invalid backend: ",
backend);
TORCH_CHECK(
std::find(operators.begin(), operators.end(), op) != operators.end(),
"Invalid operator: ",
op);
if (backend == "generic") {
TORCH_CHECK(op == "all", "Invalid operation for generic backend: ", op);
}
}
// Return whether the precision is supported by backends
bool validate_fp32_prec(
const std::string& backend,
const std::string& precision) {
auto iterp = _fp32_precisions.find(backend);
TORCH_CHECK(iterp != _fp32_precisions.end());
auto precisions = iterp->second;
bool valid = std::find(precisions.begin(), precisions.end(), precision) !=
precisions.end();
return valid;
}
C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api(){
TORCH_WARN_ONCE(
@ -51,54 +86,6 @@ namespace {
}
} // namespace
Float32Backend str2backend(const std::string& name) {
if (name == "generic")
return Float32Backend::GENERIC;
else if (name == "cuda")
return Float32Backend::CUDA;
else if (name == "mkldnn")
return Float32Backend::MKLDNN;
TORCH_CHECK(false, "Unknown backend: ", name);
}
Float32Op str2op(const std::string& name) {
if (name == "all")
return Float32Op::ALL;
else if (name == "conv")
return Float32Op::CONV;
else if (name == "rnn")
return Float32Op::RNN;
else if (name == "matmul")
return Float32Op::MATMUL;
TORCH_CHECK(false, "Unknown op: ", name);
}
Float32Precision str2precision(const std::string& name) {
if (name == "none")
return Float32Precision::NONE;
else if (name == "ieee")
return Float32Precision::IEEE;
else if (name == "tf32")
return Float32Precision::TF32;
else if (name == "bf16")
return Float32Precision::BF16;
TORCH_CHECK(false, "Unknown precision: ", name);
}
std::string precision2str(Float32Precision prec) {
switch (prec) {
case Float32Precision::NONE:
return "none";
case Float32Precision::IEEE:
return "ieee";
case Float32Precision::TF32:
return "tf32";
case Float32Precision::BF16:
return "bf16";
}
TORCH_CHECK(false, "Invalid enum Float32Precision(", static_cast<int>(prec), ")");
}
Context::Context() = default;
// TODO: This could be bad juju if someone calls globalContext() in the
@ -192,10 +179,10 @@ void Context::setUserEnabledNNPACK(bool e) {
enabled_nnpack = e;
}
bool Context::allowTF32CuDNN(std::optional<Float32Op> op) const {
if (!op.has_value()) {
bool allow_tf32_rnn = float32Precision(Float32Backend::CUDA, Float32Op::RNN) == Float32Precision::TF32;
bool allow_tf32_conv = float32Precision(Float32Backend::CUDA, Float32Op::CONV) == Float32Precision::TF32;
bool Context::allowTF32CuDNN(const std::string& op) const {
if (op.empty()){
bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
TORCH_CHECK(
allow_tf32_rnn == allow_tf32_conv && allow_tf32_rnn == allow_tf32_cudnn,
"PyTorch is checking whether allow_tf32 is enabled for cuDNN without a specific operator name,",
@ -204,15 +191,15 @@ bool Context::allowTF32CuDNN(std::optional<Float32Op> op) const {
"We suggest only using the new API to set the TF32 flag(s). See also: ",
"https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
} else {
return float32Precision(Float32Backend::CUDA, op.value()) == Float32Precision::TF32;
return float32Precision("cuda", op) == "tf32";
}
warn_deprecated_fp32_precision_api();
return allow_tf32_cudnn;
}
void Context::setAllowTF32CuDNN(bool b) {
setFloat32Precision(Float32Backend::CUDA, Float32Op::RNN, b ? Float32Precision::TF32 : Float32Precision::NONE);
setFloat32Precision(Float32Backend::CUDA, Float32Op::CONV, b ? Float32Precision::TF32 : Float32Precision::NONE);
setFloat32Precision("cuda", "rnn", b ? "tf32" : "none");
setFloat32Precision("cuda", "conv", b ? "tf32" : "none");
allow_tf32_cudnn = b;
warn_deprecated_fp32_precision_api();
}
@ -292,6 +279,42 @@ bool Context::userEnabledOverrideableSDP() const {
return enabled_overrideable;
}
static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
bool Context::checkCuBLASConfigDeterministic() {
// If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
// is set to deterministic setting
if (hasCUDART()) {
const auto workspace_config = c10::utils::get_env(cublas_config_var_name);
return (workspace_config == cublas_deterministic_configs[0] || workspace_config == cublas_deterministic_configs[1]);
}
return true;
}
void Context::alertCuBLASConfigNotDeterministic() const {
static const bool cublas_config_deterministic = checkCuBLASConfigDeterministic();
if (C10_LIKELY(!deterministicAlgorithms() || cublas_config_deterministic)) {
return;
}
auto msg = c10::str(
"Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ",
"`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ",
"it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this ",
"case, you must set an environment variable before running your PyTorch application: ",
cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ",
cublas_config_var_name, "=", cublas_deterministic_configs[1], ". For more information, go to ",
"https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility"
);
if (deterministicAlgorithmsWarnOnly()) {
TORCH_WARN(msg);
} else {
TORCH_CHECK(false, msg);
}
}
bool Context::benchmarkCuDNN() const {
return benchmark_cudnn;
}
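checkCuBLASConfigDeterministic above accepts only the two workspace configurations listed in cublas_deterministic_configs. A minimal sketch of the corresponding user-facing setup from Python, assuming a CUDA build:

    import os

    # Must be set before cuBLAS is initialized for the check above to pass.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # or ":16:8"

    import torch

    torch.use_deterministic_algorithms(True)  # cuBLAS-backed ops now require the config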
@ -318,7 +341,7 @@ void Context::setImmediateMiopen(bool b) {
bool Context::allowTF32CuBLAS() const {
bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
bool allow_tf32_new = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32;
bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
TORCH_CHECK(
legacy_allow_tf32 == allow_tf32_new,
"PyTorch is checking whether allow_tf32_new is enabled for cuBlas matmul,",
@ -331,17 +354,17 @@ bool Context::allowTF32CuBLAS() const {
void Context::setAllowTF32CuBLAS(bool b) {
float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, b ? Float32Precision::TF32 : Float32Precision::IEEE);
setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
}
Float32MatmulPrecision Context::float32MatmulPrecision() const {
bool invalid = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32 &&
bool invalid = float32Precision("cuda", "matmul") == "tf32" &&
float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST;
invalid = invalid ||
(float32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL) == Float32Precision::BF16 &&
(float32Precision("mkldnn", "matmul") == "bf16" &&
float32_matmul_precision != at::Float32MatmulPrecision::MEDIUM);
invalid = invalid ||
(float32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL) == Float32Precision::TF32 &&
(float32Precision("mkldnn", "matmul") == "tf32" &&
float32_matmul_precision != at::Float32MatmulPrecision::HIGH);
TORCH_CHECK(
!invalid,
@ -353,26 +376,15 @@ Float32MatmulPrecision Context::float32MatmulPrecision() const {
return float32_matmul_precision;
}
Float32Precision Context::float32Precision(Float32Backend backend, Float32Op op) const {
std::pair<Float32Backend, Float32Op> key{backend, op};
auto it = fp32_precision.find(key);
TORCH_CHECK(it != fp32_precision.end(), "Invalid (backend, op) pair: (", backend, ", ", op, ")");
Float32Precision precision = it->second;
if (precision == Float32Precision::NONE) {
key.second = Float32Op::ALL;
precision = fp32_precision.find(key)->second;
}
if (precision == Float32Precision::NONE) {
key.first = Float32Backend::GENERIC;
precision = fp32_precision.find(key)->second;
}
// "cuda" does not support "bf16"
if (backend == Float32Backend::CUDA && precision == Float32Precision::BF16) {
return Float32Precision::NONE;
}
return precision;
std::string Context::float32Precision(const std::string& backend, const std::string& op) const {
check_fp32_prec_backend_and_op(backend, op);
auto precision = fp32_precision.find(backend)->second.find(op)->second;
if (precision == "none")
precision = fp32_precision.find(backend)->second.find("all")->second;
if (precision == "none")
precision = fp32_precision.find("generic")->second.find("all")->second;
bool valid_prec = validate_fp32_prec(backend, precision);
return valid_prec ? precision : "none";
}
void Context::setFloat32MatmulPrecision(const std::string &s) {
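A hedged Python sketch of the lookup fallback that float32Precision implements above: a per-op entry of "none" falls back to the backend's "all" entry and then to the generic entry (default table values taken from the Context.h hunk later in this diff; the final validation against the supported-precision list is omitted).

    fp32_precision = {
        "generic": {"all": "none"},
        "mkldnn": {"matmul": "none", "conv": "none", "rnn": "none", "all": "none"},
        "cuda": {"matmul": "tf32", "conv": "tf32", "rnn": "tf32", "all": "none"},
    }

    def float32_precision(backend: str, op: str) -> str:
        # Resolution order: (backend, op) -> (backend, "all") -> ("generic", "all").
        precision = fp32_precision[backend][op]
        if precision == "none":
            precision = fp32_precision[backend]["all"]
        if precision == "none":
            precision = fp32_precision["generic"]["all"]
        return precision

    print(float32_precision("mkldnn", "conv"))  # "none": falls through both levels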
@ -381,18 +393,18 @@ void Context::setFloat32MatmulPrecision(const std::string &s) {
// TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention
if (s_ == "highest") {
float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST;
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::IEEE);
setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::IEEE);
setFloat32Precision("cuda", "matmul", "ieee");
setFloat32Precision("mkldnn", "matmul", "ieee");
return true;
} else if (s_ == "high") {
float32_matmul_precision = at::Float32MatmulPrecision::HIGH;
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32);
setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::TF32);
setFloat32Precision("cuda", "matmul", "tf32");
setFloat32Precision("mkldnn", "matmul", "tf32");
return true;
} else if (s_ == "medium") {
float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM;
setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32);
setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::BF16);
setFloat32Precision("cuda", "matmul", "tf32");
setFloat32Precision("mkldnn", "matmul", "bf16");
return true;
}
return false;
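The mapping applied above is reachable from Python via torch.set_float32_matmul_precision; the three accepted strings translate to the backend settings shown in this hunk.

    import torch

    torch.set_float32_matmul_precision("highest")  # cuda matmul -> ieee, mkldnn matmul -> ieee
    torch.set_float32_matmul_precision("high")     # cuda matmul -> tf32, mkldnn matmul -> tf32
    torch.set_float32_matmul_precision("medium")   # cuda matmul -> tf32, mkldnn matmul -> bf16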
@ -406,16 +418,25 @@ void Context::setFloat32MatmulPrecision(const std::string &s) {
"setFloat32MatmulPrecision call has no effect.");
}
void Context::setFloat32Precision(Float32Backend backend, Float32Op op, Float32Precision p) {
auto it = fp32_precision.find(std::make_pair(backend, op));
TORCH_CHECK(
it != fp32_precision.end(),
"Invalid (backend, op) pair: (", backend, ", ", op, ")");
TORCH_CHECK(
!(backend == Float32Backend::CUDA && p == Float32Precision::BF16),
"backend 'cuda' does not support precision 'bf16'");
it->second = p;
void Context::setFloat32Precision(const std::string& backend, const std::string& op, const std::string& p) {
check_fp32_prec_backend_and_op(backend, op);
if (validate_fp32_prec(backend, p)) {
fp32_precision[backend][op] = p;
} else {
std::string msg;
auto iterp = _fp32_precisions.find(backend);
TORCH_CHECK(iterp != _fp32_precisions.end());
for (const auto& p : iterp->second) {
msg += p;
msg += " ";
}
TORCH_WARN(
"you have set wrong precision for backend:",
backend,
" setFloat32Precision call has no effect.",
"Please choose precision from: ",
msg);
}
}
at::LinalgBackend Context::linalgPreferredBackend() const {
@ -483,8 +504,8 @@ at::BlasBackend Context::blasPreferredBackend() {
#if ROCM_VERSION >= 60300
"gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
#endif
#if ROCM_VERSION >= 70000
"gfx950", "gfx1150", "gfx1151"
#if ROCM_VERSION >= 60500
"gfx950"
#endif
};
for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {

View File

@ -25,27 +25,17 @@
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
#include <c10/util/hash.h>
#include <c10/util/irange.h>
#include <cstdint>
#include <map>
#include <mutex>
#include <unordered_map>
namespace at {
class Tensor;
enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };
enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN };
enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL };
enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 };
TORCH_API Float32Backend str2backend(const std::string& name);
TORCH_API Float32Op str2op(const std::string& name);
TORCH_API Float32Precision str2precision(const std::string& name);
TORCH_API std::string precision2str(Float32Precision prec);
class TORCH_API Context {
public:
@ -320,7 +310,13 @@ class TORCH_API Context {
//
// * Throw an error when `Context::deterministicAlgorithms()` is true. Most
// of the time, this should be accomplished by calling
// `at::globalContext().alertNotDeterminstic().
// `at::globalContext().alertNotDeterminstic()`. However, if the
// nondeterministic behavior is caused by the CuBLAS workspace
// configuration in CUDA >= 10.2,
// `at::globalContext().alertCuBLASConfigNotDeterministic()` should be
// called instead (in this case, a comment explaining why the operation is
// nondeterministic is not necessary). See below for details on these
// methods.
//
// * Have an entry in the list of nondeterministic PyTorch operations in the
// docstring of `use_deterministic_algorithms()` in torch/__init__.py
@ -344,19 +340,27 @@ class TORCH_API Context {
// Throws an error if `Context::deterministicAlgorithms()` is true
static void alertNotDeterministic(std::string_view const& caller);
// Throws an error if `Context::deterministicAlgorithms()` is true, CUDA
// >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or
// ":4096:8". For more details:
// https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
void alertCuBLASConfigNotDeterministic() const;
void setFloat32MatmulPrecision(const std::string& s);
void setFloat32Precision(
Float32Backend backend,
Float32Op op,
Float32Precision p);
bool allowTF32CuDNN(std::optional<Float32Op> op = std::nullopt) const;
const std::string& backend,
const std::string& op,
const std::string& s);
bool allowTF32CuDNN(const std::string& op = std::string()) const;
void setAllowTF32CuDNN(bool);
bool allowTF32OneDNN() const;
void setAllowTF32OneDNN(bool);
bool allowTF32CuBLAS() const;
void setAllowTF32CuBLAS(bool);
Float32MatmulPrecision float32MatmulPrecision() const;
Float32Precision float32Precision(Float32Backend backend, Float32Op op) const;
std::string float32Precision(
const std::string& backend,
const std::string& op) const;
bool allowFP16ReductionCuBLAS() const;
void setAllowFP16ReductionCuBLAS(bool);
bool allowBF16ReductionCuBLAS() const;
@ -425,6 +429,7 @@ class TORCH_API Context {
}
private:
static bool checkCuBLASConfigDeterministic();
std::array<c10::once_flag, at::COMPILE_TIME_MAX_DEVICE_TYPES> init_;
bool enabled_cudnn = true;
bool deterministic_cudnn = false;
@ -483,20 +488,21 @@ class TORCH_API Context {
bool enable_sparse_tensor_invariant_checks = false;
bool allow_fp16_reduction_cpu = false;
using Key = std::pair<Float32Backend, Float32Op>;
std::unordered_map<Key, Float32Precision, c10::hash<Key>> fp32_precision = {
{{Float32Backend::GENERIC, Float32Op::ALL}, Float32Precision::NONE},
{{Float32Backend::MKLDNN, Float32Op::ALL}, Float32Precision::NONE},
{{Float32Backend::MKLDNN, Float32Op::CONV}, Float32Precision::NONE},
{{Float32Backend::MKLDNN, Float32Op::RNN}, Float32Precision::NONE},
{{Float32Backend::MKLDNN, Float32Op::MATMUL}, Float32Precision::NONE},
{{Float32Backend::CUDA, Float32Op::ALL}, Float32Precision::NONE},
{{Float32Backend::CUDA, Float32Op::CONV}, Float32Precision::TF32},
{{Float32Backend::CUDA, Float32Op::RNN}, Float32Precision::TF32},
{{Float32Backend::CUDA, Float32Op::MATMUL},
float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST
? Float32Precision::NONE
: Float32Precision::TF32},
std::map<std::string, std::map<std::string, std::string>> fp32_precision = {
{"generic", {{"all", "none"}}},
{"mkldnn",
{{"matmul", "none"},
{"conv", "none"},
{"rnn", "none"},
{"all", "none"}}},
{"cuda",
{{"matmul",
float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST
? "none"
: "tf32"},
{"conv", "tf32"},
{"rnn", "tf32"},
{"all", "none"}}},
};
Allocator* prev_allocator_ptr_{nullptr};
@ -678,4 +684,5 @@ struct TORCH_API ROCmBackwardPassGuard {
~ROCmBackwardPassGuard();
static bool is_backward_pass();
};
} // namespace at

View File

@ -179,7 +179,7 @@ void propagate_names_except(const Tensor& result, const Tensor& src, IntArrayRef
return;
}
const auto src_names = src.names();
const auto result_dim = result.dim();
const auto result_dim = static_cast<int64_t>(result.dim());
const auto src_dim = static_cast<int64_t>(src_names.size());
const auto excluded_dim = static_cast<int64_t>(excluded_idxs.size());
TORCH_INTERNAL_ASSERT(src_dim - excluded_dim == result_dim);

View File

@ -214,7 +214,7 @@ inline Tensor applySlice(
"step must be greater than zero");
// See NOTE [nested tensor size for indexing]
if (self_sizes.has_value() && !self_sizes.value().empty()) {
if (self_sizes.has_value() && self_sizes.value().size() > 0) {
// Skip this optimization if we are tracing, as the trace may be polymorphic
// over the shape of the `self` tensor, and we still want to record
// the slice.

View File

@ -273,11 +273,11 @@ void checkLayout(CheckedFrom c, at::ArrayRef<Tensor> tensors, at::Layout layout)
}
void * maybe_data_ptr(const Tensor& tensor) {
return tensor.defined() ? tensor.data_ptr() : nullptr;
return tensor.defined() ? (void *)tensor.data_ptr() : nullptr;
}
void * maybe_data_ptr(const TensorArg& tensor) {
return tensor->defined() ? tensor->data_ptr() : nullptr;
return tensor->defined() ? (void *)tensor->data_ptr() : nullptr;
}
void check_dim_size(

View File

@ -50,46 +50,6 @@ namespace {
constexpr size_t MAX_SIZE_INDEX = 64;
}
// A large reserved pinned memory segment that is created in advance which is used
// to allocate small pinned memory requests to avoid calling into expensive APIs.
// We never free this memory and move up the pointer as we allocate new blocks
// and when blocks are freed, they are cached in the free lists.
struct PinnedReserveSegment {
PinnedReserveSegment(void *start, size_t size) : start_(start), size_(size),
current_ptr_(start_), initialized_(true) {}
PinnedReserveSegment() : start_(nullptr), size_(0), current_ptr_(nullptr), initialized_(false) {}
bool initialized() {
return initialized_;
}
void* allocate(size_t bytes) {
std::lock_guard<std::mutex> guard(mutex_);
// Round up the requested size to 4KB boundary for all including the small ones.
size_t rounded_bytes = (bytes + 4096 - 1) & ~(4096 - 1);
if (((uint8_t*)current_ptr_ + rounded_bytes) > ((uint8_t*)start_ + size_)) {
return nullptr;
}
void* ptr = current_ptr_;
current_ptr_ = (uint8_t*)current_ptr_ + rounded_bytes;
return ptr;
}
bool owns(void* ptr) {
return ptr >= start_ && ptr < (uint8_t*)start_ + size_;
}
std::mutex mutex_;
void* start_;
size_t size_;
void* current_ptr_;
bool initialized_;
};
// Struct containing memory allocator summary statistics for host.
struct TORCH_API HostStats {
// COUNT: total allocations (active)
@ -243,21 +203,7 @@ struct CachingHostAllocatorImpl {
// background.
if (!pinned_use_background_threads()) {
process_events();
}
// Round up the allocation to the nearest power of two to improve reuse.
// These power of two sizes are also used to index into the free list.
size_t roundSize = c10::llvm::PowerOf2Ceil(size);
// First, try to allocate from the free list
auto* block = get_free_block(roundSize);
if (block) {
return {block->ptr_, reinterpret_cast<void*>(block)};
}
// Check in the recently freed blocks with pending events to see if we
// can reuse them. Call get_free_block again after processing events
if (pinned_use_background_threads()) {
} else {
// Launch the background thread and process events in a loop.
static bool background_thread_flag [[maybe_unused]] = [this] {
getBackgroundThreadPool()->run([&]() {
@ -270,6 +216,16 @@ struct CachingHostAllocatorImpl {
}();
}
// Round up the allocation to the nearest power of two to improve reuse.
// These power of two sizes are also used to index into the free list.
size_t roundSize = c10::llvm::PowerOf2Ceil(size);
// First, try to allocate from the free list
auto* block = get_free_block(roundSize);
if (block) {
return {block->ptr_, reinterpret_cast<void*>(block)};
}
// Slow path: if we can't allocate from the cached free list, we need
// to create a new block.
void* ptr = nullptr;
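Two different rounding rules appear in the allocator code above: PinnedReserveSegment rounds each request up to a 4 KiB boundary, while the free-list path rounds up to the next power of two (c10::llvm::PowerOf2Ceil) to pick a size class. A quick sketch of both for comparison:

    def round_to_4k(nbytes: int) -> int:
        # (bytes + 4096 - 1) & ~(4096 - 1), as in PinnedReserveSegment::allocate.
        return (nbytes + 4096 - 1) & ~(4096 - 1)

    def pow2_ceil(nbytes: int) -> int:
        # Rough equivalent of PowerOf2Ceil for positive sizes (free-list size classes).
        return 1 if nbytes <= 1 else 1 << (nbytes - 1).bit_length()

    print(round_to_4k(10_000))  # 12288
    print(pow2_ceil(10_000))    # 16384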

View File

@ -117,7 +117,7 @@ C10_HOST_DEVICE inline T cauchy(T val, T median, T sigma) {
template <>
C10_HOST_DEVICE inline double cauchy(double val, double median, double sigma) {
// https://en.wikipedia.org/wiki/Cauchy_distribution#Cumulative_distribution_function
return median + sigma * at::tan(c10::pi<double> * (val - 0.5));
return median + sigma * at::tan(c10::pi<double> * (val - static_cast<double>(0.5)));
}
/**
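The double specialization above samples a Cauchy variate by inverting the CDF linked in the comment: for a uniform draw val in (0, 1),

    x = \mathrm{median} + \sigma \tan\bigl(\pi\,(\mathrm{val} - \tfrac{1}{2})\bigr)

which is the standard Cauchy quantile function with location median and scale sigma.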

View File

@ -2,7 +2,7 @@
namespace c10 {
inline BoxedKernel::BoxedKernel() : boxed_kernel_func_(nullptr) {}
inline BoxedKernel::BoxedKernel() : functor_(), boxed_kernel_func_(nullptr) {}
inline BoxedKernel::BoxedKernel(
std::unique_ptr<OperatorKernel> functor,

View File

@ -20,7 +20,9 @@ make_unique_base(Args&&... args) {
} // namespace detail
inline KernelFunction::KernelFunction()
: unboxed_kernel_func_(nullptr), sym_unboxed_kernel_func_(nullptr) {}
: boxed_kernel_func_(),
unboxed_kernel_func_(nullptr),
sym_unboxed_kernel_func_(nullptr) {}
inline KernelFunction::~KernelFunction() {
if (tokens_) {

View File

@ -76,7 +76,13 @@ void _print_dispatch_trace(const std::string& label, const std::string& op_name,
OpRegistrationListener::~OpRegistrationListener()= default;
Dispatcher::Dispatcher(): backendFallbackKernels_(), listeners_(std::make_unique<detail::RegistrationListenerList>()), guard_(std::make_shared<Guard>())
Dispatcher::Dispatcher()
: operators_()
, operatorLookupTable_()
, backendFallbackKernels_()
, listeners_(std::make_unique<detail::RegistrationListenerList>())
, cond_var_()
, guard_(std::make_shared<Guard>())
{}
Dispatcher::~Dispatcher() {

View File

@ -96,7 +96,7 @@ class TORCH_API Dispatcher final {
friend class TypedOperatorHandle;
struct Guard final {
Guard() : alive(true) {}
Guard() : alive(true), mutex() {}
std::atomic<bool> alive;
std::mutex mutex;
};

View File

@ -62,7 +62,17 @@ static const auto& getDispatchTableIndexToKey() {
}
OperatorEntry::OperatorEntry(OperatorName&& operator_name)
: name_(std::move(operator_name)), dispatchTable_(), dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()), is_observed_(ObservedOperators::isObserved(name_))
: name_(std::move(operator_name))
, schema_()
#ifndef C10_MOBILE
, tags_()
#endif
, dispatchTable_()
, dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized())
, kernels_()
, cpp_signature_()
, sym_cpp_signature_()
, is_observed_(ObservedOperators::isObserved(name_))
{
// Pick up any backend fallbacks that were registered prior to this
// OperatorEntry being created.

View File

@ -114,7 +114,7 @@ constexpr bool allowlist_contains(std::string_view allowlist, std::string_view i
}
next++;
} else {
if (allowlist.substr(cur) == item) {
if (allowlist.substr(cur).compare(item) == 0) {
return true;
}
break;

View File

@ -73,7 +73,7 @@ c10::FunctionSchema RegisterOperators::inferSchemaFromKernels_(
std::optional<FunctionSchema> inferred_schema = std::nullopt;
for (const auto& kernel : options.kernels) {
if (nullptr != kernel.inferred_function_schema) {
if (nullptr != kernel.inferred_function_schema.get()) {
if (!inferred_schema.has_value()) {
inferred_schema = *kernel.inferred_function_schema;
break;

View File

@ -411,6 +411,7 @@ public:
Options()
: schemaOrName_(std::nullopt)
, kernels()
, aliasAnalysisKind_(std::nullopt)
{}
@ -419,6 +420,7 @@ public:
struct KernelRegistrationConfig final {
KernelRegistrationConfig()
: dispatch_key(std::nullopt)
, func()
, cpp_signature(std::nullopt)
, inferred_function_schema(nullptr)
{}

View File

@ -905,7 +905,7 @@ class Vectorized8 : public Vectorizedi {
// Because loadu(const void* ptr, T count) requires zero initialization for
// upper 128 bits. However, by using _mm256_castsi128_si256, the upper 128
// bits of the result are undefined.
// TODO<leslie> We can use _mm256_zextsi128_si256 in the future,
// TODO<leslie> We can use _mm256_zextsi128_si256 in the furture,
// since gcc 9.3 doesn't support it now.
__m128i input_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ptr));
return _mm256_castsi128_si256(input_128);
@ -1844,7 +1844,7 @@ Vectorized<int16_t> inline shift_256_16(
c0 = _mm256_srav_epi32(a0, b0);
c0 = _mm256_shuffle_epi8(c0, ctl_1_0);
// Perform shifting the same way for input array elements with
// Peform shifting the same way for input array elements with
// idx%2==1.
__m256i a1 = _mm256_and_si256(a, keep_1);
__m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
@ -2180,7 +2180,7 @@ Vectorized<T> inline shift_256_8(
c0 = _mm256_srlv_epi32(a0, b0);
c0 = _mm256_shuffle_epi8(c0, ctl_3_0);
// Perform shifting the same way for input array elements with
// Peform shifting the same way for input array elements with
// idx%4==1.
__m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3);
__m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
@ -2193,7 +2193,7 @@ Vectorized<T> inline shift_256_8(
c1 = _mm256_srlv_epi32(a1, b1);
c1 = _mm256_shuffle_epi8(c1, ctl_3_1);
// Perform shifting the same way for input array elements with
// Peform shifting the same way for input array elements with
// idx%4==2.
__m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3);
__m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0);
@ -2206,7 +2206,7 @@ Vectorized<T> inline shift_256_8(
c2 = _mm256_srlv_epi32(a2, b2);
c2 = _mm256_shuffle_epi8(c2, ctl_3_2);
// Perform shifting the same way for input array elements with
// Peform shifting the same way for input array elements with
// idx%4==3.
__m256i a3 = _mm256_and_si256(a, keep_3);
__m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0);

View File

@ -1088,7 +1088,7 @@ class Vectorized8 : public Vectorizedi {
// Because loadu(const void* ptr, T count) requires zero initialization for
// upper 384 bits. However, by using _mm512_castsi128_si512, the upper 384
// bits of the result are undefined.
// TODO<leslie> We can use _mm512_zextsi128_si512 in the future,
// TODO<leslie> We can use _mm512_zextsi128_si512 in the furture,
// since gcc 9.3 doesn't support it now.
__m128i input_128 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
return _mm512_castsi128_si512(input_128);
@ -2022,7 +2022,7 @@ Vectorized<T> inline shift_512_8(
c0 = _mm512_srlv_epi16(a0, b0);
c0 = _mm512_shuffle_epi8(c0, ctl_1_0);
// Perform shifting the same way for input array elements with
// Peform shifting the same way for input array elements with
// idx%2==1.
__m512i a1 = _mm512_and_si512(a, keep_1);
__m512i b1 = _mm512_shuffle_epi8(b, ctl_1_0);

View File

@ -323,7 +323,7 @@ class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor<
descriptor_.reset(raw_descriptor);
}
template <typename T>
void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) {
inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) {
// NOLINTNEXTLINE(bugprone-sizeof-expression)
TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(value)));
}
@ -345,7 +345,7 @@ class CuBlasLtMatrixLayout : public CuBlasLtDescriptor<
descriptor_.reset(raw_descriptor);
}
template <typename T>
void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) {
inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) {
TORCH_CUDABLAS_CHECK(::cublasLtMatrixLayoutSetAttribute(descriptor(), attr, &value, sizeof(T)));
}
};
@ -360,7 +360,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
descriptor_.reset(raw_descriptor);
}
template <typename T>
void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) {
inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) {
TORCH_CUDABLAS_CHECK(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T)));
}
};
@ -395,7 +395,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
computeType = CUBLAS_COMPUTE_64F;
scaleType = CUDA_R_64F;
} else if constexpr (std::is_same_v<Dtype, float>) {
if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
}
} else if constexpr (std::is_same_v<Dtype, c10::complex<double>>) {
@ -440,6 +440,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented");
}
globalContext().alertCuBLASConfigNotDeterministic();
cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -573,6 +574,8 @@ inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_D
template <>
void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -584,6 +587,8 @@ void bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGTYPES(double)) {
template <>
void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -595,6 +600,8 @@ void bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGTYPES(float)) {
template <>
void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -608,6 +615,8 @@ void bgemm_internal_cublas<c10::complex<double>>(CUDABLAS_BGEMM_ARGTYPES(c10::co
template <>
void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<float>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -621,6 +630,8 @@ void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::com
template <typename C_Dtype>
inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -692,6 +703,8 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP
template <typename C_Dtype>
inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
BGEMM_CHECK_ARGVALUES(at::BFloat16);
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
@ -1015,6 +1028,8 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dty
template <>
void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1026,6 +1041,8 @@ void gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGTYPES(double)) {
template <>
void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1037,6 +1054,8 @@ void gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGTYPES(float)) {
template <>
void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1050,6 +1069,8 @@ void gemm_internal_cublas<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::comp
template <>
void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<float>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1063,6 +1084,8 @@ void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::compl
template <typename C_Dtype>
inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1171,6 +1194,7 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
template <typename C_Dtype>
inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
cublasOperation_t opb = _cublasOpFromChar(transb);
@ -1270,7 +1294,7 @@ void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
}
#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
if (at::detail::getCUDAHooks().isGPUArch({"gfx11", "gfx12"})) { //no CK GEMM version
if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100
gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
} else{
at::native::gemm_internal_ck<float>(CUDABLAS_GEMM_ARGS(float));
@ -1559,7 +1583,7 @@ bool gemm_and_bias(
computeType = CUBLAS_COMPUTE_64F;
scaleType = CUDA_R_64F;
} else if constexpr (std::is_same_v<Dtype, float>) {
if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
}
} else if constexpr (std::is_same_v<Dtype, at::Half>) {
@ -2384,6 +2408,8 @@ void trsmBatched<c10::complex<double>>(
template <>
void gemv<c10::complex<double>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<double>)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2399,6 +2425,8 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
// gemv is bw bound, and does not benefit from TF32. But the precision
// loss still happens on TF32. So we disable it here.
NoTF32Guard disable_tf32;
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2411,6 +2439,8 @@ void gemv<c10::complex<float>>(CUDABLAS_GEMV_ARGTYPES(c10::complex<float>)) {
template <>
void gemv<double>(CUDABLAS_GEMV_ARGTYPES(double)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);
@ -2424,6 +2454,8 @@ void gemv<float>(CUDABLAS_GEMV_ARGTYPES(float)) {
// gemv is bw bound, and does not benefit from TF32. But the precision
// loss still happens on TF32. So we disable it here.
NoTF32Guard disable_tf32;
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t op = _cublasOpFromChar(trans);
_cublasAdjustLdLevel2(m, n, &lda);

View File

@ -109,7 +109,7 @@ void CUDAGeneratorState::increase(uint64_t increment) {
offset_intragraph_ % 4 == 0, "RNG offset must be a multiple of 4.");
// Ensures the increment does not cause overflow.
TORCH_INTERNAL_ASSERT(
offset_intragraph_ <= std::numeric_limits<uint64_t>::max() - increment,
offset_intragraph_ <= std::numeric_limits<uint32_t>::max() - increment,
"Increment causes overflow in the offset value.");
offset_intragraph_ += increment;
} else {
@ -461,7 +461,7 @@ void CUDAGeneratorImpl::unregister_graph(cuda::CUDAGraph* graph) {
*/
PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) {
if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) {
uint64_t offset = state_->offset_intragraph_;
uint32_t offset = state_->offset_intragraph_;
state_->increase(increment);
return PhiloxCudaState(
state_->seed_extragraph_.data_ptr<int64_t>(),

View File

@ -96,16 +96,16 @@ struct CUDAGraph;
struct CUDAGeneratorState : public c10::intrusive_ptr_target {
uint64_t seed_;
uint64_t philox_offset_per_thread_;
uint64_t offset_intragraph_;
uint32_t offset_intragraph_;
bool capturing_{};
std::unordered_set<cuda::CUDAGraph*> registered_graphs_;
at::TensorBase seed_extragraph_;
at::TensorBase offset_extragraph_;
at::TensorBase seed_extragraph_{};
at::TensorBase offset_extragraph_{};
CUDAGeneratorState(
uint64_t seed = default_rng_seed_val,
uint64_t philox_offset_per_thread = 0,
uint64_t offset_intragraph = 0)
uint32_t offset_intragraph = 0)
: seed_(seed),
philox_offset_per_thread_(philox_offset_per_thread),
offset_intragraph_(offset_intragraph) {}
@ -167,7 +167,7 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
CUDAGeneratorImpl* clone_impl() const override;
c10::intrusive_ptr<CUDAGeneratorState> state_;
std::atomic_flag no_reset_rnn_state_;
std::atomic_flag no_reset_rnn_state_{};
};
namespace cuda::detail {

View File

@ -56,7 +56,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
// the ID assigned by cuda during graph capture,
// used to identify when a stream is participating in capture
CaptureId_t capture_id_ = 0;
CaptureId_t capture_id_ = -1;
// uuid used to request a particular private mempool from CUDACachingAllocator.
// By default, this will be set to {id_, 0}.

View File

@ -6,15 +6,43 @@
#define HIPSPARSE_VERSION ((hipsparseVersionMajor*100000) + (hipsparseVersionMinor*100) + hipsparseVersionPatch)
#endif
// cuSparse Generic API added in CUDA 10.1
// Windows support added in CUDA 11.0
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && ((CUSPARSE_VERSION >= 10300) || (CUSPARSE_VERSION >= 11000 && defined(_WIN32)))
#define AT_USE_CUSPARSE_GENERIC_API() 1
#else
#define AT_USE_CUSPARSE_GENERIC_API() 0
#endif
// cuSparse Generic API descriptor pointers were changed to const in CUDA 12.0
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \
(CUSPARSE_VERSION < 12000)
#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 1
#else
#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 0
#endif
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \
(CUSPARSE_VERSION >= 12000)
#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 1
#else
#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 0
#endif
#if defined(USE_ROCM)
// hipSparse const API added in v2.4.0
#if HIPSPARSE_VERSION >= 200400
#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 1
#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0
#define AT_USE_HIPSPARSE_GENERIC_API() 1
#else
#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0
#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 1
#define AT_USE_HIPSPARSE_GENERIC_API() 1
#endif
#else // USE_ROCM
#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0
#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0
#define AT_USE_HIPSPARSE_GENERIC_API() 0
#endif // USE_ROCM

View File

@ -12,6 +12,8 @@ cusparseStatus_t destroyConstDnMat(const cusparseDnMatDescr* dnMatDescr) {
return cusparseDestroyDnMat(const_cast<cusparseDnMatDescr*>(dnMatDescr));
}
#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
namespace {
// If a specific GPU model does not provide native support for a given data
@ -208,4 +210,6 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6
descriptor_.reset(raw_descriptor);
}
#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
} // namespace at::cuda::sparse

View File

@ -35,6 +35,7 @@ class CuSparseDescriptor {
std::unique_ptr<T, CuSparseDescriptorDeleter<T, destructor>> descriptor_;
};
#if AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()
template <typename T, cusparseStatus_t (*destructor)(const T*)>
struct ConstCuSparseDescriptorDeleter {
void operator()(T* x) {
@ -57,6 +58,7 @@ class ConstCuSparseDescriptor {
protected:
std::unique_ptr<T, ConstCuSparseDescriptorDeleter<T, destructor>> descriptor_;
};
#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS || AT_USE_HIPSPARSE_CONST_DESCRIPTORS
#if defined(USE_ROCM)
using cusparseMatDescr = std::remove_pointer_t<hipsparseMatDescr_t>;
@ -121,8 +123,39 @@ class TORCH_CUDA_CPP_API CuSparseBsrsm2Info
#endif // AT_USE_HIPSPARSE_TRIANGULAR_SOLVE
#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type);
#if AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS()
class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor
: public CuSparseDescriptor<cusparseDnMatDescr, &cusparseDestroyDnMat> {
public:
explicit CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1);
};
class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor
: public CuSparseDescriptor<const cusparseDnMatDescr, &destroyConstDnMat> {
public:
explicit CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1);
cusparseDnMatDescr* unsafe_mutable_descriptor() const {
return const_cast<cusparseDnMatDescr*>(descriptor());
}
cusparseDnMatDescr* unsafe_mutable_descriptor() {
return const_cast<cusparseDnMatDescr*>(descriptor());
}
};
class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor
: public CuSparseDescriptor<cusparseDnVecDescr, &cusparseDestroyDnVec> {
public:
explicit CuSparseDnVecDescriptor(const Tensor& input);
};
class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor
: public CuSparseDescriptor<cusparseSpMatDescr, &cusparseDestroySpMat> {};
#elif AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()
class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor
: public ConstCuSparseDescriptor<
cusparseDnMatDescr,
@ -161,6 +194,7 @@ cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type);
: public ConstCuSparseDescriptor<
cusparseSpMatDescr,
&cusparseDestroySpMat> {};
#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()
class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor
: public CuSparseSpMatDescriptor {
@ -249,4 +283,6 @@ class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor
}
};
#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
} // namespace at::cuda::sparse

View File

@ -9,6 +9,7 @@
#include <cuda_runtime_api.h>
#include <future>
#include <unordered_map>
namespace at::cuda {
namespace {
@ -71,20 +72,9 @@ using Block = HostBlock<CUDAStream>;
struct CUDACachingHostAllocatorImpl
: public CachingHostAllocatorImpl<CUDAStream, EventPool::Event> {
private:
ska::flat_hash_map<void*, bool> use_host_register;
std::unordered_map<void*, bool> use_host_register;
void allocate_host_memory(size_t size, void** ptr) override {
// try allocating from reserve segment first before calling into expensive APIs
if (get_reserve_segment().initialized()) {
*ptr = get_reserve_segment().allocate(size);
if (*ptr != nullptr) {
return;
}
}
allocate_host_memory_slowpath(size, ptr);
}
void allocate_host_memory_slowpath(size_t size, void** ptr) {
// Pinned memory pointers allocated by any device can be directly used by
// any other device, regardless of the current device at the time of
// allocation, since we assume unified addressing. So we grab any existing
@ -123,18 +113,6 @@ struct CUDACachingHostAllocatorImpl
}
void free_block(Block* block) override {
// We never free blocks from the reserve segment
if (get_reserve_segment().initialized()) {
// Check if the block is from the reserve segment
if (get_reserve_segment().owns(block->ptr_)) {
return;
}
}
free_block_slowpath(block);
}
void free_block_slowpath(Block* block) {
auto start = std::chrono::steady_clock::now();
// Users may change the allocator config at will. torch unit tests do this.
// However, allocations using cudaHostRegister should use corresonding
@ -194,20 +172,6 @@ struct CUDACachingHostAllocatorImpl
return event_pool->get(idx);
}
PinnedReserveSegment& get_reserve_segment() {
static auto reserve_segment = [&]() {
if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() > 0) {
void *ptr;
size_t sz = c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() * 1024 * 1024;
allocate_host_memory_slowpath(sz, &ptr);
return PinnedReserveSegment(ptr, sz);
} else {
return PinnedReserveSegment();
}
} ();
return reserve_segment;
}
TaskThreadPool* getThreadPool() {
static TaskThreadPool* pool = new TaskThreadPool(
static_cast<int>(c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
@ -222,15 +186,15 @@ struct CUDACachingHostAllocatorImpl
size_t numThreads,
size_t pageSize) {
uintptr_t start = (uintptr_t)ptr + (size * i / numThreads);
uintptr_t end = start + (size / numThreads);
uintptr_t end = (uintptr_t)start + (size / numThreads);
if (i == (numThreads - 1)) {
end = (uintptr_t)ptr + size;
}
// pre-fault/map the pages by setting the first byte of the page
uintptr_t alignedStart =
((start + pageSize - 1) & ~(pageSize - 1));
for (uintptr_t p = alignedStart; p < (end); p += pageSize) {
(((uintptr_t)start + pageSize - 1) & ~(pageSize - 1));
for (uintptr_t p = alignedStart; p < ((uintptr_t)end); p += pageSize) {
// NOLINTNEXTLINE(performance-no-int-to-ptr)
memset((void*)p, 0, 1);
}

View File

@ -310,7 +310,7 @@ cublasHandle_t getCurrentCUDABlasHandle() {
// FP32 data type calculations based on the value of the allow_tf32 flag.
// To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH.
if (!NoTF32Guard::should_disable_tf32() &&
at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
at::globalContext().float32Precision("cuda", "matmul") == "tf32") {
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH));
} else {
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));

View File

@ -122,7 +122,7 @@ struct DeviceThreadHandlePool : public std::enable_shared_from_this<DeviceThread
// Called by the destructor. Releases this thread's handles back into the pool.
void release() {
if(!my_handles.empty()) {
if(my_handles.size() > 0) {
auto parent = weak_parent.lock();
if (!parent) {
// If this thread exits after atexit handlers have completed, the

View File

@ -19,7 +19,7 @@ struct PhiloxCudaState {
// Called if graph capture is underway
PhiloxCudaState(int64_t* seed,
int64_t* offset_extragraph,
uint64_t offset_intragraph) {
uint32_t offset_intragraph) {
seed_.ptr = seed;
offset_.ptr = offset_extragraph;
offset_intragraph_ = offset_intragraph;
@ -36,7 +36,7 @@ struct PhiloxCudaState {
Payload seed_{};
Payload offset_{};
uint64_t offset_intragraph_ = 0;
uint32_t offset_intragraph_ = 0;
bool captured_ = false;
};

View File

@ -162,7 +162,7 @@ inline std::string ComputeTypeFor() {
// ROCBLAS and hipBLASLt.
template <>
inline std::string ComputeTypeFor<float>() {
if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) != at::Float32Precision::TF32) {
if (at::globalContext().float32Precision("cuda", "matmul") != "tf32") {
return "f32_r";
} else {
return "xf32_r";

View File

@ -506,7 +506,7 @@ class HipblasltGemmOp : public Callable<ParamsT> {
}
hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") {
computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
}
HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F);

View File

@ -141,7 +141,7 @@ class RocblasGemmOp : public Callable<GemmParams<T>> {
TuningStatus Call(const GemmParams<T>* params) override {
auto input_output_type = RocBlasDataTypeFor<T>();
if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r)
if (at::globalContext().float32Precision("cuda", "matmul") == "tf32" && input_output_type == rocblas_datatype_f32_r)
return FAIL; // no support for TF32 in rocBLAS
auto compute_type = RocBlasComputeTypeFor<T>();
auto h_a = DoCastForHalfOrBfloat16(params->alpha);
@ -209,7 +209,7 @@ class RocblasGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>>
TuningStatus Call(const GemmStridedBatchedParams<T>* params) override {
auto input_output_type = RocBlasDataTypeFor<T>();
if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r)
if (at::globalContext().float32Precision("cuda", "matmul") == "tf32" && input_output_type == rocblas_datatype_f32_r)
return FAIL; // no support for TF32 in rocBLAS
auto compute_type = RocBlasComputeTypeFor<T>();
auto h_a = DoCastForHalfOrBfloat16(params->alpha);

View File

@ -404,6 +404,8 @@ TuningContext::TuningContext() :
max_warmup_iterations_{0},
icache_flush_{true},
rotating_buffer_size_{-1},
filename_{},
untuned_file_{},
results_count_from_input_file_{0},
is_shutting_down_{false}
{

View File

@ -141,7 +141,7 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
size[i] = (int) t.size(i);
}
for (const auto i : c10::irange(dim, pad)) {
size[i] = 1;
size[i] = (int) 1;
}
dim = std::max(dim, pad);
cudnnTensorFormat_t filter_format{};

View File

@ -176,7 +176,7 @@ struct LinalgCheckMatrixUnaryRuleHelper;
template <char const *op_name, typename F, F Func, typename A, typename... T>
struct LinalgCheckMatrixUnaryRuleHelper<op_name, F, Func, typelist<A, T...>> {
static Tensor check_and_reshape_input(const Tensor& tensor, std::optional<int64_t> batch_dim) {
static inline Tensor check_and_reshape_input(const Tensor& tensor, std::optional<int64_t> batch_dim) {
TORCH_CHECK(rankWithoutBatchDim(tensor, batch_dim) >= 2, op_name, ": The input tensor A must have at least 2 dimensions.");
return moveBatchDimToFront(tensor, batch_dim);
}
@ -222,7 +222,7 @@ struct LinalgCheckMatrixBinaryRuleHelper;
template <char const *op_name, typename F, F Func, typename A, typename B, typename... T>
struct LinalgCheckMatrixBinaryRuleHelper<op_name, F, Func, typelist<A, B, T...>> {
static std::tuple<Tensor, Tensor> check_inputs_and_reshape_inputs(
static inline std::tuple<Tensor, Tensor> check_inputs_and_reshape_inputs(
const Tensor& first, std::optional<int64_t> first_bdim,
const Tensor& second, std::optional<int64_t> second_bdim) {
TORCH_CHECK(rankWithoutBatchDim(first, first_bdim) >= 2,

View File

@ -58,7 +58,7 @@ scalar_t dot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y,
template<typename scalar_t>
scalar_t vdot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y, int64_t incy);
static constexpr bool lda_cond(int64_t m, int64_t n, int64_t lda) {
static constexpr inline bool lda_cond(int64_t m, int64_t n, int64_t lda) {
return n == 1 || lda >= std::max<int64_t>(1L, m);
}

View File

@ -991,7 +991,7 @@ std::size_t UnsafeUkernelKeyHasher<PackKey>::operator()(const PackKey& key) cons
template <typename key_t, typename value_t>
struct KernelCache {
using kstore_t = std::unordered_map<key_t, std::shared_ptr<value_t>, UnsafeUkernelKeyHasher<key_t>>;
static std::shared_ptr<value_t>&& fetch_or_create(
static inline std::shared_ptr<value_t>&& fetch_or_create(
const key_t& key,
const std::function<std::shared_ptr<value_t>()>& callback) {
auto&& search = get_store().find(key);
@ -1003,7 +1003,7 @@ struct KernelCache {
}
}
static kstore_t& get_store() {
static inline kstore_t& get_store() {
static thread_local kstore_t cache_kernels;
return cache_kernels;
}
@ -1067,7 +1067,7 @@ struct GemmHelper {
struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
// Fetch/create GemmHelper object and execute brgemm with batch size = 1
template <typename scalar_t_a, typename scalar_t_b, typename scalar_t_c>
static void call(
static inline void call(
int64_t M,
int64_t N,
int64_t K,
@ -1118,12 +1118,12 @@ struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
.execute(A, B, (*value).A_B_offsets, C, (*value).scratchpad.data());
}
static std::shared_ptr<GemmHelper>& get_current() {
static inline std::shared_ptr<GemmHelper>& get_current() {
static thread_local std::shared_ptr<GemmHelper> current;
return current;
}
static bool device_check(ScalarType dtype) {
static inline bool device_check(ScalarType dtype) {
if (!at::globalContext().userEnabledMkldnn()) {
return false;
}
@ -1153,7 +1153,7 @@ using pack_t = dnnl::ukernel::brgemm_pack_B;
using pack_t = dnnl::ukernel::transform;
#endif
struct Pack : public KernelCache <PackKey, pack_t> {
static void call(
static inline void call(
int64_t K,
int64_t N,
int64_t ld_in,
@ -1182,7 +1182,7 @@ struct Pack : public KernelCache <PackKey, pack_t> {
}
}
static bool could_pack(ScalarType dtype) {
static inline bool could_pack(ScalarType dtype) {
if (!at::globalContext().userEnabledMkldnn()) {
return false;
}

View File

@ -702,7 +702,7 @@ static void check_shape_forward(const at::Tensor& input,
// If kernel size is incorrect
std::ostringstream input_ss;
std::ostringstream kernel_ss;
std::string separator;
std::string separator = "";
for (int i = 0, len = input_shape.size(); i < len; ++i) {
input_ss << separator << input_shape[i];
@ -1019,7 +1019,7 @@ static Tensor convolution_same(
if (symmetric_padding) {
// All backends handle symmetric padding natively
SymDimVector output_padding(dim);
SymDimVector output_padding(static_cast<size_t>(dim));
return at::convolution_symint(input, weight, bias, stride, padding_l, dilation,
false, output_padding, groups);
}
@ -1039,7 +1039,7 @@ static Tensor convolution_same(
}
}
auto padded_input = at::constant_pad_nd_symint(input, pad_nd, 0);
SymDimVector output_padding(dim);
SymDimVector output_padding(static_cast<size_t>(dim));
return at::convolution_symint(padded_input, weight, bias, stride, padding_l,
dilation, false, output_padding, groups);
}
@ -1174,7 +1174,7 @@ at::Tensor convolution(
bool deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
return at::_convolution(input, weight, bias, stride, padding, dilation,
transposed, output_padding, groups,
ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN(at::Float32Op::CONV));
ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN("conv"));
}
at::Tensor convolution_overrideable(
@ -1319,7 +1319,7 @@ ConvBackend select_conv_backend(
params.benchmark = ctx.benchmarkCuDNN();
params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
params.cudnn_enabled = ctx.userEnabledCuDNN();
params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV);
params.allow_tf32 = ctx.allowTF32CuDNN("conv");
auto input = input_r;
auto weight = weight_r;
@ -1699,7 +1699,7 @@ at::Tensor _convolution(
c10::MaybeOwned<Tensor> bias_r_maybe_owned = at::borrow_from_optional_tensor(bias_r_opt);
const Tensor& bias_r = *bias_r_maybe_owned;
return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN(at::Float32Op::CONV));
return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN("conv"));
}
std::tuple<Tensor, Tensor, Tensor> convolution_backward_overrideable(
@ -1997,7 +1997,7 @@ std::tuple<Tensor, Tensor, Tensor> convolution_backward(
params.benchmark = ctx.benchmarkCuDNN();
params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
params.cudnn_enabled = ctx.userEnabledCuDNN();
params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV);
params.allow_tf32 = ctx.allowTF32CuDNN("conv");
// Validate inputs.
check_shape_backward(input, weight.sizes(), params);

View File

@ -1,5 +1,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/Copy.h>
#include <ATen/native/Copy.h>
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>

View File

@ -70,7 +70,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value)
new_shape.emplace_back(input_sizes[i]);
}
for (const auto i : c10::irange(l_pad)) {
for (const auto i : c10::irange((size_t)l_pad)) {
auto pad_idx = pad.size() - ((i + 1) * 2);
auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",

View File

@ -47,7 +47,7 @@ int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar
int64_t sgn = (xstep > 0) - (xstep < 0);
size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
} else {
size_d = std::ceil((end.to<double>() - start.to<double>())
size_d = std::ceil(static_cast<double>(end.to<double>() - start.to<double>())
/ step.to<double>());
}

View File

@ -107,6 +107,11 @@ void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes) {
storage->set_nbytes(size_bytes);
}
// Call the sparse implementation in SparseTensor.cpp directly.
// A dynamic dispatch here is NOT necessary, so I didn't put
// this function in native_functions.yaml
const Tensor& resize_as_sparse_(const Tensor& self, const Tensor& src);
// TODO(VitalyFedyunin): Move it to HTML docs.
//
// Strides of the output tensor of `resize_as_` operator is defined by input

View File

@ -145,6 +145,12 @@
#include <utility>
#include <vector>
namespace at::native {
AdvancedIndex make_info(Tensor self, IOptTensorListRef orig);
} // namespace at::native
namespace at::meta {
TORCH_META_FUNC(gather)

View File

@ -73,6 +73,7 @@
#include <ATen/ops/where_native.h>
#include <ATen/ops/zeros_like.h>
#include <iostream>
#include <utility>
#endif

View File

@ -124,7 +124,7 @@ struct IsUnique {};
template <typename scalar_t>
struct IsUnique<scalar_t, false> {
bool operator() (scalar_t* data_ptr, int64_t i) {
inline bool operator() (scalar_t* data_ptr, int64_t i) {
if (i == 0) { return true; }
return c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]);
}
@ -132,7 +132,7 @@ struct IsUnique<scalar_t, false> {
template <typename scalar_t>
struct IsUnique<scalar_t, true> {
bool operator() (scalar_t* data_ptr, int64_t i) {
inline bool operator() (scalar_t* data_ptr, int64_t i) {
if (i == 0) { return true; }
return (c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]))
&& !(_isnan(data_ptr[i]) && _isnan(data_ptr[i - 1]));

View File

@ -4,6 +4,7 @@
#include <ATen/OpMathType.h>
#include <ATen/TensorUtils.h>
#include <ATen/OpMathType.h>
#include <ATen/core/Tensor.h>
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>

View File

@ -17,7 +17,7 @@
namespace ao::sparse {
int register_linear_params();
#ifdef USE_FBGEMM

View File

@ -20,7 +20,7 @@
namespace ao::sparse {
int register_linear_params();
#ifdef USE_FBGEMM
namespace {

View File

@ -16,7 +16,7 @@
#endif
namespace ao::sparse {
int register_linear_params();
#ifdef USE_FBGEMM

View File

@ -22,7 +22,7 @@ static inline void cpu_atomic_add_float(float* dst, float fvalue)
old_value.floatV = *dst;
new_value.floatV = old_value.floatV + fvalue;
unsigned* old_intV = &old_value.intV;
unsigned* old_intV = (unsigned*)(&old_value.intV);
while (!std::atomic_compare_exchange_strong(dst_intV, old_intV, new_value.intV)) {
#ifdef __aarch64__
__asm__ __volatile__("yield;" : : : "memory");

View File

@ -118,7 +118,7 @@ gemm_notrans_(
scale_(m, n, beta, c, ldc);
// c += alpha * (a @ b)
const uint64_t unsigned_m = m;
const uint64_t unsigned_m = static_cast<int64_t>(m);
const uint64_t i_m = unsigned_m / 4;
for (const uint64_t l : c10::irange(k)) {
for (const uint64_t j : c10::irange(n)) {

View File

@ -8,6 +8,7 @@
#include <c10/util/irange.h>
#include <ATen/OpMathType.h>
#include <ATen/native/cpu/utils.h>
#include <ATen/OpMathType.h>
namespace at::native {
inline namespace CPU_CAPABILITY {

View File

@ -17,6 +17,7 @@
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <c10/util/irange.h>
#include <ATen/OpMathType.h>
// [Note AVX-SSE transitions] In general we avoid calls into cmath for code
// compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in

View File

@ -240,7 +240,7 @@ static void unfolded2d_copy(
int64_t output_height,
int64_t output_width) {
at::parallel_for(
0, n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) {
0, (int64_t)n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) {
for (const auto k : c10::irange(start, end)) {
int64_t nip = k / (kH * kW);
int64_t rest = k % (kH * kW);
@ -316,7 +316,7 @@ static void unfolded2d_copy(
for (int64_t x = 0; x < output_width; x++)
memcpy(
dst + (size_t)y * output_width + x,
src + (size_t)iy * input_width + ix + x * dW,
src + (size_t)iy * input_width + ix + (int64_t)x * dW,
sizeof(scalar_t) * (1));
}
}

View File

@ -906,7 +906,7 @@ static void ref_dyn_quant_matmul_4bit_channelwise_kernel(
// Round to nearest integer
const int32_t nudged_zero_point0 = lrintf(zero_point0);
int8_t* dst_ptr = lhs_qa8dx + m_idx * dst_stride;
int8_t* dst_ptr = (int8_t*)lhs_qa8dx + m_idx * dst_stride;
// LHS offset at the beginning of the row
*((float*)(dst_ptr)) = recip_scale0;
@ -1048,7 +1048,7 @@ static void ref_dyn_quant_matmul_4bit_groupwise_kernel(
zero_point0 = (std::min)(zero_point0, qmax);
const int32_t nudged_zero_point0 = lrintf(zero_point0);
int8_t* dst_ptr = lhs_qa8dx + row_idx * dst_stride;
int8_t* dst_ptr = (int8_t*)lhs_qa8dx + row_idx * dst_stride;
*((float*)(dst_ptr)) = recip_scale0;
dst_ptr += sizeof(float);

View File

@ -285,8 +285,8 @@ static bool isSupportedHipLtROCmArch(int index) {
#if ROCM_VERSION >= 60300
"gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
#endif
#if ROCM_VERSION >= 70000
"gfx950", "gfx1150", "gfx1151"
#if ROCM_VERSION >= 60500
"gfx950"
#endif
};
return at::detail::getCUDAHooks().isGPUArch(archs, index);
@ -1919,7 +1919,7 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
addmm_out_cuda_impl(out, out, self, mat2, 0, 1);
addmm_out_cuda_impl(const_cast<Tensor&>(out), out, self, mat2, 0, 1);
return out;
}

View File

@ -102,7 +102,13 @@ __host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(const c10::comple
}
void launch_logcumsumexp_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim) {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16,
// Compile time for CUDA-11.4 is 3x slower than with CUDA-11.6+, specifically for complex numbers
#if defined(FBCODE_CAFFE2) || defined(OVRSOURCE)
#define _LCME_DISPATCH AT_DISPATCH_FLOATING_TYPES_AND2
#else
#define _LCME_DISPATCH AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2
#endif
_LCME_DISPATCH(ScalarType::Half, ScalarType::BFloat16,
self.scalar_type(), "logcumsumexp_cuda",
[&]() {
using opmath_t = at::opmath_type<scalar_t>;

View File

@ -127,7 +127,8 @@ void apply_ldl_solve_cusolver(
const Tensor& pivots,
const Tensor& B,
bool upper) {
#if !(defined(CUDART_VERSION) && defined(CUSOLVER_VERSION))
#if !(defined(CUDART_VERSION) && defined(CUSOLVER_VERSION) && \
CUSOLVER_VERSION >= 11102)
TORCH_CHECK(
false,
"Calling torch.linalg.ldl_solve on a CUDA tensor requires compiling ",

View File

@ -169,10 +169,7 @@ std::string repro_from_args(const ConvolutionParams& params) {
ss << "If that doesn't trigger the error, please include your original repro script when reporting this issue.\n\n";
ss << "import torch\n";
ss << "torch.backends.cuda.matmul.allow_tf32 = "
<< pybool(
at::globalContext().float32Precision(
at::Float32Backend::CUDA, at::Float32Op::MATMUL) ==
at::Float32Precision::TF32)
<< pybool(at::globalContext().float32Precision("cuda", "matmul") == "tf32")
<< "\n";
ss << "torch.backends.cudnn.benchmark = "
<< pybool(at::globalContext().benchmarkCuDNN()) << "\n";
@ -729,7 +726,7 @@ Tensor cudnn_convolution_relu(
auto& ctx = at::globalContext();
bool benchmark = ctx.benchmarkCuDNN();
bool allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV);
bool allow_tf32 = ctx.allowTF32CuDNN("conv");
auto _bias = bias_t.has_value()
? bias_t.value()
: at::zeros(
@ -787,7 +784,7 @@ Tensor cudnn_convolution_add_relu(
}
auto& ctx = at::globalContext();
bool allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV);
bool allow_tf32 = ctx.allowTF32CuDNN("conv");
bool benchmark = ctx.benchmarkCuDNN();
auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
auto _bias = bias_t.has_value()

View File

@ -76,6 +76,7 @@ std::tuple<Tensor, Tensor> _cudnn_ctc_loss_tensor(
#else // AT_CUDNN_ENABLED
#include <ATen/cudnn/Descriptors.h>
#include <ATen/cudnn/Types.h>
#include <ATen/cudnn/Utils.h>
@ -283,9 +284,9 @@ std::tuple<Tensor, Tensor> _cudnn_ctc_loss_tensor(
checkBackend(c, {*targets}, Backend::CUDA);
const auto batch_size = log_probs->size(1);
int64_t input_lengths_size =
!input_lengths_.sizes().empty() ? input_lengths_.size(0) : 1;
input_lengths_.sizes().size() ? input_lengths_.size(0) : 1;
int64_t target_lengths_size =
!target_lengths_.sizes().empty() ? target_lengths_.size(0) : 1;
target_lengths_.sizes().size() ? target_lengths_.size(0) : 1;
TORCH_CHECK(
input_lengths_size == batch_size,
"input_lengths needs to have size to match batch_size");

View File

@ -142,6 +142,8 @@ void run_cudnn_SDP_bprop_nestedtensor(
namespace at {
namespace native {
#include <cudnn_frontend.h>
namespace fe = cudnn_frontend;
constexpr uint8_t MAX_MHA_DIM = 4;
@ -1377,7 +1379,7 @@ void run_cudnn_SDP_fprop(
cudnnHandle_t handle = getCudnnHandle();
// NB: The key initialization will round up sequence length, stride data etc.
// if use_ragged_in_dense is enabled (to allow multiple sequence lengths to
// if use_ragged_in_dense is enabled (to allow multiple sequence lenghths to
// reuse the same cached value/graph)
auto key = MHACacheKeyWrapper(
b,

View File

@ -245,7 +245,7 @@ descriptor(cudnnHandle_t handle, DropoutDescriptor&& dropout_desc) const {
datatype,
input_datatype,
algo,
at::globalContext().allowTF32CuDNN(at::Float32Op::RNN));
at::globalContext().allowTF32CuDNN("rnn"));
#else
rnn_desc.set(
handle,
@ -261,7 +261,7 @@ descriptor(cudnnHandle_t handle, DropoutDescriptor&& dropout_desc) const {
datatype,
input_datatype,
algo,
at::globalContext().allowTF32CuDNN(at::Float32Op::RNN));
at::globalContext().allowTF32CuDNN("rnn"));
#endif
return rnn_desc;
}

View File

@ -772,21 +772,13 @@ void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
template <>
void gemm_internal_ck<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
static const std::vector<std::string> wmma_archs = {
"gfx1100", "gfx1101", "gfx1102", "gfx1200", "gfx1201",
#if ROCM_VERSION >= 70000
"gfx1150", "gfx1151"
#endif
};
if (at::detail::getCUDAHooks().isGPUArch(wmma_archs)) {
auto dprops = at::cuda::getCurrentDeviceProperties();
std::string_view arch(dprops->gcnArchName);
if (arch == "gfx1100") {
dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
else if (at::detail::getCUDAHooks().isGPUArch({"gfx9"})) {
} else{
dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
else {
TORCH_CHECK(false, "gemm_internal_ck<at::BFloat16> unsupported gfx arch");
}
}
} // namespace at::native

View File

@ -599,21 +599,11 @@ void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
template <>
void gemm_internal_ck<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
static const std::vector<std::string> wmma_archs = {
"gfx1100", "gfx1101", "gfx1102", "gfx1200", "gfx1201",
#if ROCM_VERSION >= 70000
"gfx1150", "gfx1151"
#endif
};
if (at::detail::getCUDAHooks().isGPUArch(wmma_archs)) {
if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) {
dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGS(at::Half));
}
else if (at::detail::getCUDAHooks().isGPUArch({"gfx9"})) {
} else{
dispatch_half_gemm(CUDABLAS_GEMM_ARGS(at::Half));
}
else {
TORCH_CHECK(false, "gemm_internal_ck<at::Half> unsupported gfx arch");
}
}
} // namespace at::native

View File

@ -38,6 +38,7 @@ REGISTER_NO_CPU_DISPATCH(mkldnn_convolution_transpose_backward_stub)
#include <ATen/native/mkldnn/MKLDNNCommon.h>
#include <ATen/native/mkldnn/Utils.h>
#include <ATen/native/ConvUtils.h>
#include <c10/util/irange.h>
namespace at::native {
@ -104,7 +105,7 @@ static void check_shape_forward(const Tensor& input,
// If kernel size is incorrect
std::ostringstream input_ss;
std::ostringstream kernel_ss;
std::string separator;
std::string separator = "";
for (int i = 0, len = input_shape.size(); i < len; ++i) {
input_ss << separator << input_shape[i];
@ -155,12 +156,12 @@ static void check_shape_forward(const Tensor& input,
//
static bool mkldnn_conv_enabled_fpmath_mode_bf16(){
return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::CONV) == at::Float32Precision::BF16 &&
return at::globalContext().float32Precision("mkldnn", "conv") == "bf16" &&
mkldnn_bf16_device_check();
}
static bool mkldnn_conv_enabled_fpmath_mode_tf32(){
return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::CONV) == at::Float32Precision::TF32 &&
return at::globalContext().float32Precision("mkldnn", "conv") == "tf32" &&
cpuinfo_has_x86_amx_fp16();
}

View File

@ -69,12 +69,12 @@ mkldnn_scaled_mm(const Tensor& mat1, const Tensor& mat2,
namespace at::native {
static bool use_mkldnn_bf32_linear() {
return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::BF16 &&
return at::globalContext().float32Precision("mkldnn", "matmul") == "bf16" &&
mkldnn_bf16_device_check();
}
static bool use_mkldnn_tf32_linear() {
return at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::TF32 &&
return at::globalContext().float32Precision("mkldnn", "matmul") == "tf32" &&
cpuinfo_has_x86_amx_fp16();
}

View File

@ -111,11 +111,11 @@ static bool use_mkldnn_fp16_matmul() {
}
static bool use_mkldnn_bf32_matmul() {
return use_mkldnn_bf16_matmul() && at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::BF16;
return use_mkldnn_bf16_matmul() && at::globalContext().float32Precision("mkldnn", "matmul") == "bf16";
}
static bool use_mkldnn_tf32_matmul() {
return cpuinfo_has_x86_amx_fp16() && at::globalContext().float32Precision(at::Float32Backend::MKLDNN, at::Float32Op::MATMUL) == at::Float32Precision::TF32;
return cpuinfo_has_x86_amx_fp16() && at::globalContext().float32Precision("mkldnn", "matmul") == "tf32";
}
// returns an ideep::tensor

View File

@ -14,7 +14,6 @@ struct EmbeddingBagParams {
::c10::metal::array<idx_type_t, 2> output_strides;
::c10::metal::array<idx_type_t, 2> max_indices_strides;
bool use_per_sample_weights;
idx_type_t per_sample_weights_stride;
idx_type_t num_indices;
@ -24,24 +23,3 @@ struct EmbeddingBagParams {
EmbeddingBagMode mode;
int64_t padding_idx;
};
template <typename idx_type_t = uint32_t>
struct EmbeddingBagBackwardParams {
::c10::metal::array<idx_type_t, 2> weight_grad_strides;
::c10::metal::array<idx_type_t, 2> output_grad_strides;
::c10::metal::array<idx_type_t, 2> max_indices_strides;
bool use_per_sample_weights;
idx_type_t per_sample_weights_stride;
idx_type_t feature_size;
EmbeddingBagMode mode;
int64_t padding_idx;
};
template <typename idx_type_t = uint32_t>
struct EmbeddingBagPerSampleWeightsBackwardParams {
::c10::metal::array<idx_type_t, 2> output_grad_strides;
::c10::metal::array<idx_type_t, 2> weight_strides;
idx_type_t per_sample_weights_grad_stride;
idx_type_t feature_size;
int64_t padding_idx;
};

View File

@ -1,5 +1,4 @@
#include <ATen/native/mps/kernels/EmbeddingBag.h>
#include <c10/metal/atomic.h>
#include <c10/metal/utils.h>
#include <metal_array>
#include <metal_stdlib>
@ -45,7 +44,6 @@ template <EmbeddingBagMode M, typename T>
struct MaybeApplyPerSampleWeight {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
bool /*use_per_sample_weights*/,
uint32_t /*per_sample_weights_index*/,
constant T* /*per_sample_weights*/,
uint32_t /*per_sample_weights_stride*/) {
@ -57,11 +55,10 @@ template <typename T>
struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
bool use_per_sample_weights,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_stride) {
if (use_per_sample_weights) {
if (per_sample_weights_stride) {
T per_sample_weight = per_sample_weights
[per_sample_weights_stride * per_sample_weights_index];
return static_cast<opmath_t<T>>(per_sample_weight) * weight_val;
@ -157,7 +154,6 @@ void embedding_bag_impl(
auto num_bags = params.num_bags;
auto feature_size = params.feature_size;
auto padding_idx = params.padding_idx;
auto use_per_sample_weights = params.use_per_sample_weights;
auto per_sample_weights_stride = params.per_sample_weights_stride;
constant auto& output_strides = params.output_strides;
constant auto& weight_strides = params.weight_strides;
@ -187,11 +183,7 @@ void embedding_bag_impl(
feature_idx * weight_strides[1]]);
weight_val = MaybeApplyPerSampleWeight<M, T>()(
weight_val,
use_per_sample_weights,
indices_idx,
per_sample_weights,
per_sample_weights_stride);
weight_val, indices_idx, per_sample_weights, per_sample_weights_stride);
auto new_out_val = ReductionOp<M, T>()(weight_val, out_val, bag_size_ == 0);
@ -247,208 +239,19 @@ kernel void embedding_bag(
}
}
template <EmbeddingBagMode M, typename T>
struct MaybeDivBagSize {
inline opmath_t<T> operator()(opmath_t<T> val, opmath_t<T> bag_size) {
return val;
}
};
template <typename T>
struct MaybeDivBagSize<EmbeddingBagMode::MEAN, T> {
inline opmath_t<T> operator()(opmath_t<T> val, opmath_t<T> bag_size) {
return val / bag_size;
}
};
template <EmbeddingBagMode M, typename T, typename I>
void embedding_bag_backward_sum_mean_impl(
constant T* output_grad,
constant I* indices,
constant I* offset2bag,
constant I* bag_size,
constant T* per_sample_weights,
device AtomicType_t<T>* weight_grad,
constant EmbeddingBagBackwardParams<uint32_t>& params,
uint tid) {
auto feature_size = params.feature_size;
auto indices_idx = tid / feature_size;
auto bag_idx = static_cast<uint32_t>(offset2bag[indices_idx]);
auto bag_size_val = bag_size[bag_idx];
auto weight_idx = indices[indices_idx];
auto padding_idx = params.padding_idx;
if (bag_size_val && weight_idx != padding_idx) {
auto feature_idx = tid % feature_size;
constant auto& weight_grad_strides = params.weight_grad_strides;
constant auto& output_grad_strides = params.output_grad_strides;
auto use_per_sample_weights = params.use_per_sample_weights;
auto per_sample_weights_stride = params.per_sample_weights_stride;
auto output_grad_val =
static_cast<opmath_t<T>>(output_grad
[bag_idx * output_grad_strides[0] +
feature_idx * output_grad_strides[1]]);
opmath_t<T> weight_grad_val = MaybeDivBagSize<M, T>()(
MaybeApplyPerSampleWeight<M, T>()(
output_grad_val,
use_per_sample_weights,
indices_idx,
per_sample_weights,
per_sample_weights_stride),
static_cast<opmath_t<T>>(bag_size_val));
AtomicType<T>::atomic_add(
weight_grad,
static_cast<int32_t>(weight_idx) * weight_grad_strides[0] +
feature_idx * weight_grad_strides[1],
static_cast<T>(weight_grad_val));
}
}
template <typename T, typename I>
void embedding_bag_backward_max_impl(
constant T* output_grad,
constant I* bag_size,
constant I* max_indices,
device AtomicType_t<T>* weight_grad,
constant EmbeddingBagBackwardParams<uint32_t>& params,
uint tid) {
auto feature_size = params.feature_size;
auto bag_idx = tid / feature_size;
auto bag_size_val = bag_size[bag_idx];
if (bag_size_val) {
auto feature_idx = tid % feature_size;
constant auto& weight_grad_strides = params.weight_grad_strides;
constant auto& output_grad_strides = params.output_grad_strides;
constant auto& max_indices_strides = params.max_indices_strides;
auto output_grad_val = output_grad
[bag_idx * output_grad_strides[0] +
feature_idx * output_grad_strides[1]];
auto max_index =
static_cast<uint32_t>(max_indices
[bag_idx * max_indices_strides[0] +
feature_idx * max_indices_strides[1]]);
AtomicType<T>::atomic_add(
weight_grad,
max_index * weight_grad_strides[0] +
feature_idx * weight_grad_strides[1],
output_grad_val);
}
}
#define DISPATCH_BACKWARD_SUM_MEAN_IMPL(MODE) \
return embedding_bag_backward_sum_mean_impl<MODE>( \
output_grad, \
indices, \
offset2bag, \
bag_size, \
per_sample_weights, \
weight_grad, \
params, \
tid)
template <typename T, typename I>
kernel void embedding_bag_backward(
constant T* output_grad [[buffer(0)]],
constant I* indices [[buffer(1)]],
constant I* offset2bag [[buffer(2)]],
constant I* bag_size [[buffer(3)]],
constant I* max_indices [[buffer(4)]],
constant T* per_sample_weights [[buffer(5)]],
device AtomicType_t<T>* weight_grad [[buffer(6)]],
constant EmbeddingBagBackwardParams<uint32_t>& params [[buffer(7)]],
uint tid [[thread_position_in_grid]]) {
switch (params.mode) {
case EmbeddingBagMode::SUM:
DISPATCH_BACKWARD_SUM_MEAN_IMPL(EmbeddingBagMode::SUM);
case EmbeddingBagMode::MEAN:
DISPATCH_BACKWARD_SUM_MEAN_IMPL(EmbeddingBagMode::MEAN);
case EmbeddingBagMode::MAX:
return embedding_bag_backward_max_impl(
output_grad, bag_size, max_indices, weight_grad, params, tid);
}
}
template <typename T, typename I>
kernel void embedding_bag_per_sample_weights_backward(
constant T* output_grad [[buffer(0)]],
constant T* weight [[buffer(1)]],
constant I* indices [[buffer(2)]],
constant I* offset2bag [[buffer(3)]],
device AtomicType_t<T>* per_sample_weights_grad [[buffer(4)]],
constant EmbeddingBagPerSampleWeightsBackwardParams<uint32_t>& params
[[buffer(5)]],
uint tid [[thread_position_in_grid]]) {
auto feature_size = params.feature_size;
auto padding_idx = params.padding_idx;
auto indices_idx = tid / feature_size;
auto weight_idx = indices[indices_idx];
if (weight_idx != padding_idx) {
auto feature_idx = tid % feature_size;
auto bag_idx = static_cast<uint32_t>(offset2bag[indices_idx]);
constant auto& output_grad_strides = params.output_grad_strides;
constant auto& weight_strides = params.weight_strides;
auto per_sample_weights_grad_stride = params.per_sample_weights_grad_stride;
auto weight_val = weight
[static_cast<uint32_t>(weight_idx) * weight_strides[0] +
feature_idx * weight_strides[1]];
auto output_grad_val = output_grad
[bag_idx * output_grad_strides[0] +
feature_idx * output_grad_strides[1]];
auto per_sample_weights_grad_val = static_cast<opmath_t<T>>(weight_val) *
static_cast<opmath_t<T>>(output_grad_val);
AtomicType<T>::atomic_add(
per_sample_weights_grad,
indices_idx * per_sample_weights_grad_stride,
static_cast<T>(per_sample_weights_grad_val));
}
}
#define REGISTER_EMBEDDING_BAG_OP(T, I) \
template [[host_name("embedding_bag_" #T "_" #I)]] \
kernel void embedding_bag<T, I>( \
constant T * weight [[buffer(0)]], \
constant I * indices [[buffer(1)]], \
constant I * offsets [[buffer(2)]], \
constant T * per_sample_weights [[buffer(3)]], \
device T * output [[buffer(4)]], \
device I * offset2bag [[buffer(5)]], \
device I * bag_size [[buffer(6)]], \
device I * max_indices [[buffer(7)]], \
constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
uint tid [[thread_position_in_grid]]); \
\
template [[host_name("embedding_bag_backward_" #T "_" #I)]] \
kernel void embedding_bag_backward<T, I>( \
constant T * output_grad [[buffer(0)]], \
constant I * indices [[buffer(1)]], \
constant I * offset2bag [[buffer(2)]], \
constant I * bag_size [[buffer(3)]], \
constant I * max_indices [[buffer(4)]], \
constant T * per_sample_weights [[buffer(5)]], \
device AtomicType_t<T> * weight_grad [[buffer(6)]], \
constant EmbeddingBagBackwardParams<uint32_t> & params [[buffer(7)]], \
uint tid [[thread_position_in_grid]]); \
\
template \
[[host_name("embedding_bag_per_sample_weights_backward_" #T "_" #I)]] \
kernel void embedding_bag_per_sample_weights_backward<T, I>( \
constant T * output_grad [[buffer(0)]], \
constant T * weight [[buffer(1)]], \
constant I * indices [[buffer(2)]], \
constant I * offset2bag [[buffer(3)]], \
device AtomicType_t<T> * per_sample_weights_grad [[buffer(4)]], \
constant EmbeddingBagPerSampleWeightsBackwardParams<uint32_t> & \
params [[buffer(5)]], \
uint tid [[thread_position_in_grid]]);
#define REGISTER_EMBEDDING_BAG_OP(T, I) \
template [[host_name("embedding_bag_" #T "_" #I)]] \
kernel void embedding_bag<T, I>( \
constant T * weight [[buffer(0)]], \
constant I * indices [[buffer(1)]], \
constant I * offsets [[buffer(2)]], \
constant T * per_sample_weights [[buffer(3)]], \
device T * output [[buffer(4)]], \
device I * offset2bag [[buffer(5)]], \
device I * bag_size [[buffer(6)]], \
device I * max_indices [[buffer(7)]], \
constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
uint tid [[thread_position_in_grid]]);
REGISTER_EMBEDDING_BAG_OP(float, int);
REGISTER_EMBEDDING_BAG_OP(float, long);

View File

@ -13,10 +13,8 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_embedding_bag_dense_backward_native.h>
#include <ATen/ops/_embedding_bag_forward_only_native.h>
#include <ATen/ops/_embedding_bag_native.h>
#include <ATen/ops/_embedding_bag_per_sample_weights_backward_native.h>
#include <ATen/ops/empty.h>
#endif
@ -97,7 +95,6 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
}
bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
params.use_per_sample_weights = use_per_sample_weights;
params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
params.num_indices = num_indices;
@ -180,117 +177,4 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_forward_only_mps(
padding_idx);
}
Tensor _embedding_bag_dense_backward_mps(const Tensor& output_grad,
const Tensor& indices,
const Tensor& offset2bag,
const Tensor& bag_size,
const Tensor& max_indices,
int64_t num_weights,
bool scale_grad_by_freq,
int64_t mode,
const std::optional<Tensor>& per_sample_weights_opt,
int64_t padding_idx) {
// indices and offset2bag are assumed having correct dtypes and
// contiguous here due to the checks in _embedding_bag_backward in
// EmbeddingBag.cpp.
// Also see NOTE [ embedding_bag Native Functions ] in native_functions.yaml
// for more details.
int64_t feature_size = output_grad.size(1);
auto weight_grad = at::zeros({num_weights, feature_size}, output_grad.options());
EmbeddingBagBackwardParams<uint32_t> params;
for (const auto dim : c10::irange(2)) {
params.output_grad_strides[dim] = output_grad.stride(dim);
params.weight_grad_strides[dim] = weight_grad.stride(dim);
if (mode == EmbeddingBagMode::MAX) {
params.max_indices_strides[dim] = safe_downcast<uint32_t, int64_t>(max_indices.stride(dim));
}
}
bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
params.use_per_sample_weights = use_per_sample_weights;
params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
params.feature_size = output_grad.size(1);
params.mode = static_cast<EmbeddingBagMode>(mode);
params.padding_idx = padding_idx;
auto num_indices = offset2bag.numel();
auto num_threads = (params.mode == EmbeddingBagMode::MAX) ? output_grad.numel() : num_indices * params.feature_size;
MPSStream* stream = getCurrentMPSStream();
mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_backward_{}_{}",
mps::scalarToMetalTypeString(output_grad),
mps::scalarToMetalTypeString(indices)));
getMPSProfiler().beginProfileKernel(
pipeline_state, "embedding_bag", {output_grad, indices, offset2bag, bag_size});
[computeEncoder setComputePipelineState:pipeline_state];
mps::mtl_setArgs(computeEncoder,
output_grad,
indices,
offset2bag,
bag_size,
max_indices,
use_per_sample_weights ? per_sample_weights_opt : std::nullopt,
weight_grad,
params);
mps::mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
getMPSProfiler().endProfileKernel(pipeline_state);
}
});
return std::move(weight_grad);
}
Tensor _embedding_bag_per_sample_weights_backward_mps(const Tensor& output_grad,
const Tensor& weight,
const Tensor& indices,
const Tensor& offsets,
const Tensor& offset2bag,
int64_t mode,
int64_t padding_idx) {
TORCH_INTERNAL_ASSERT(static_cast<EmbeddingBagMode>(mode) == EmbeddingBagMode::SUM);
int64_t num_indices = indices.size(0);
int64_t feature_size = output_grad.size(1);
auto per_sample_weights_grad = at::zeros({num_indices}, output_grad.options());
EmbeddingBagPerSampleWeightsBackwardParams params;
for (const auto dim : c10::irange(2)) {
params.output_grad_strides[dim] = output_grad.stride(dim);
params.weight_strides[dim] = weight.stride(dim);
}
params.per_sample_weights_grad_stride = per_sample_weights_grad.stride(0);
params.feature_size = feature_size;
params.padding_idx = padding_idx;
auto num_threads = num_indices * feature_size;
MPSStream* stream = getCurrentMPSStream();
mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_per_sample_weights_backward_{}_{}",
mps::scalarToMetalTypeString(output_grad),
mps::scalarToMetalTypeString(indices)));
getMPSProfiler().beginProfileKernel(
pipeline_state, "embedding_bag_per_sample_weights_backward", {output_grad, weight, indices, offset2bag});
[computeEncoder setComputePipelineState:pipeline_state];
mps::mtl_setArgs(computeEncoder, output_grad, weight, indices, offset2bag, per_sample_weights_grad, params);
mps::mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
getMPSProfiler().endProfileKernel(pipeline_state);
}
});
return std::move(per_sample_weights_grad);
}
} // namespace at::native

View File

@ -2379,7 +2379,7 @@
- func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
dispatch:
CPU, CUDA, MPS: _embedding_bag_backward_symint
CPU, CUDA: _embedding_bag_backward_symint
- func: _embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
dispatch:
@ -2389,14 +2389,12 @@
dispatch:
CPU: _embedding_bag_dense_backward_cpu
CUDA: _embedding_bag_dense_backward_cuda
MPS: _embedding_bag_dense_backward_mps
autogen: _embedding_bag_dense_backward.out
- func: _embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1) -> Tensor
dispatch:
CPU: _embedding_bag_per_sample_weights_backward_cpu
CUDA: _embedding_bag_per_sample_weights_backward_cuda
MPS: _embedding_bag_per_sample_weights_backward_mps
autogen: _embedding_bag_per_sample_weights_backward.out
- func: empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor

View File

@ -316,7 +316,7 @@ Tensor NestedTensor_to_padded_tensor_generic(
TORCH_CHECK(
(int64_t)output_size_.size() == ret_val.dim(),
"Length of output_size does not match NestedTensor dims. Broadcasting is not supported.");
for (int64_t i = 0; i < ret_val.dim(); i++) {
for (int64_t i = 0; i < (int64_t)ret_val.dim(); i++) {
TORCH_CHECK(
output_size_[i] >= ret_val.size(i),
"Value in output_size is less than NestedTensor padded size. Truncation is not supported.");

View File

@ -146,12 +146,12 @@ inline TensorQuantizationParams ChooseQuantizationParams(
// The arithmetic error on the zero point computed from either pair
// will be roughly machine_epsilon * (sum of absolute values of terms)
// so we want to use the variant that adds the smaller terms.
double zero_point_from_min = qmin - min / scale;
double zero_point_from_max = qmax - max / scale;
double zero_point_from_min = qmin - min / static_cast<double>(scale);
double zero_point_from_max = qmax - max / static_cast<double>(scale);
double zero_point_from_min_error =
std::abs(qmin) - std::abs(min / scale);
std::abs(qmin) - std::abs(min / static_cast<double>(scale));
double zero_point_from_max_error =
std::abs(qmax) - std::abs(max / scale);
std::abs(qmax) - std::abs(max / static_cast<double>(scale));
double initial_zero_point =
zero_point_from_min_error < zero_point_from_max_error
? zero_point_from_min

View File

@ -560,7 +560,7 @@ float hsum_sq(const int32_t* A, int len) {
alignas(64) float temp[8];
_mm256_store_ps(temp, sum_ps);
for (const auto k : c10::irange(8)) {
row_sum += temp[k];
row_sum += static_cast<float>(temp[k]);
}
#elif defined(CPU_CAPABILITY_AVX512)
__m512 sum_ps = _mm512_setzero_ps();
@ -574,7 +574,7 @@ float hsum_sq(const int32_t* A, int len) {
alignas(64) float temp[16];
_mm512_store_ps(temp, sum_ps);
for (const auto k : c10::irange(16)) {
row_sum += temp[k];
row_sum += static_cast<float>(temp[k]);
}
#endif // CPU_CAPABILITY_AVX2 or CPU_CAPABILITY_AVX512
@ -1282,7 +1282,7 @@ template <bool ReLUFused = false>
void qadd_scalar_kernel(Tensor& out, const Tensor& self, const Scalar& other) {
int64_t zero_point = out.q_zero_point();
float scale = static_cast<float>(out.q_scale());
float inv_scale = 1.0f / scale;
float inv_scale = static_cast<float>(1.0f / scale);
int64_t self_zero_point = self.q_zero_point();
float self_scale = static_cast<float>(self.q_scale());
@ -2915,7 +2915,7 @@ void fake_quantize_learnable_channel_grad_kernel_cpu(
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
*dx_output = (*dy_input) * (xqi >= quant_min && xqi <= quant_max);
// Calculate gradients for scale and zero point.
float xfqi = ((std::max(std::min(xqi, quant_max), quant_min) - (*zero_point_input)) * (*scale_input));
float xfqi = static_cast<float>((std::max(std::min(xqi, quant_max), quant_min) - (*zero_point_input)) * (*scale_input));
if (xqi < quant_min || xqi > quant_max) {
*dzero_point_output = (*dy_input) * (-1) * (*scale_input) * grad_factor;
*dscale_output = ((xqi < quant_min) ? ((*dy_input) * dscale_small) : ((*dy_input) * dscale_big)) * grad_factor;
@ -4415,7 +4415,7 @@ void _qmul_tensor_cpu_impl(
uint8_t y_data = *(y_ptr + idx);
int32_t x_val = static_cast<int32_t>(x_data) - x_zero_point;
int32_t y_val = static_cast<int32_t>(y_data) - y_zero_point;
int32_t out_val = x_val * y_val;
int32_t out_val = static_cast<int32_t>(x_val * y_val);
float out_val_f = (float)out_val * multiplier;
if constexpr (std::is_same<T, float>::value) {
*(out_ptr + idx) = out_val_f;

View File

@ -1198,7 +1198,7 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
kSpatialDim == 2 ? ideep::format_tag::nhwc : ideep::format_tag::ndhwc);
ideep::tensor src(src_desc, act_contig.data_ptr());
// weights & bias
ideep::tensor& weights = *(weight_);
ideep::tensor& weights = *(weight_.get());
bool with_bias = bias_.has_value();
const auto& kernel_size = weights.get_dims();
// dst

View File

@ -812,7 +812,7 @@ at::Tensor PackedLinearWeightsOnednn::apply_impl(
auto is_input_qint8 = input.scalar_type() == c10::ScalarType::QInt8;
auto input_contig = input.expect_contiguous();
auto& w = *weight_;
auto& w = *(weight_.get());
auto K = input.size(dim - 1), M = input.numel() / K, N = w.get_dim(1);
auto input_dims = {M, K};
auto input_data_type = is_input_qint8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::u8;

View File

@ -545,7 +545,7 @@ at::Tensor PackedLinearWeightsOnednn::apply_dynamic_impl(
/*reduce_range=*/reduce_range);
const std::vector<int32_t>& src_zero_point = std::vector<int32_t>(1, q_params.zero_point);
// weights, dst
auto w = *weight_;
auto w = *(weight_.get());
auto dst_dims = {x.get_dim(0), w.get_dim(1)};
const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0/q_params.scale);
const ideep::scale_t& weights_scales = w.get_scale();

View File

@ -12,6 +12,7 @@
#include <ATen/quantized/Quantizer.h>
#include <c10/core/QScheme.h>
#include <c10/util/irange.h>
#include <torch/library.h>
#include <utility>

View File

@ -10,6 +10,7 @@
#include <ATen/quantized/Quantizer.h>
#include <c10/core/QScheme.h>
#include <c10/util/irange.h>
#include <torch/library.h>
int register_linear_params();

View File

@ -65,7 +65,7 @@ Tensor& addmv_out_sparse_compressed(
return result.zero_();
} else {
return at::mul_out(
result,
const_cast<Tensor&>(result),
self,
at::native::scalar_tensor(
beta,

Some files were not shown because too many files have changed in this diff.