pytorch/torch/testing/_internal/common_mps.py
Nikita Shulga 9ca080db87 [MPS] Extend atomic operations to all int types (#158179)
That fixes `index_put(..., accumulate=True)` for all dtypes

The int64 operation is not truly atomic, but it is eventually consistent from the `index_put_accumulate` kernel's point of view: by the end of the operation, the results in global memory are indeed the accumulation of the operands at the given indices (see the sketch below).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158179
Approved by: https://github.com/dcci, https://github.com/Skylion007
ghstack dependencies: #158064, #158178
2025-07-14 04:25:05 +00:00
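A minimal sketch of what the fix enables (assuming an MPS-capable build; the tensor sizes and values are illustrative): duplicate indices now accumulate correctly for integer dtypes on MPS, matching the CPU result.

import torch

idx = torch.tensor([0, 1, 1, 3], device="mps")
src = torch.ones(4, dtype=torch.int64, device="mps")
out = torch.zeros(5, dtype=torch.int64, device="mps")
out.index_put_((idx,), src, accumulate=True)
print(out.cpu())  # index 1 gets both contributions: tensor([1, 2, 0, 1, 0])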


import unittest
from collections.abc import Sequence
from typing import Optional
import torch
from .common_utils import MACOS_VERSION
from .opinfo.core import DecorateInfo, OpInfo
if torch.backends.mps.is_available():
def mps_ops_modifier(
ops: Sequence[OpInfo],
device_type: Optional[str] = None,
xfail_exclusion: Optional[list[str]] = None,
) -> Sequence[OpInfo]:
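# Attaches skip / expected-failure decorators to each OpInfo based on the lists
# below: ops in the skip lists are skipped outright, ops in the xfail lists are
# marked expectedFailure for the listed dtypes (several lists are gated on the
# running macOS version), and ops without complex support are marked
# expectedFailure for torch.complex32 / torch.complex64. `device_type`, if given,
# restricts the decorators to that device; `xfail_exclusion` names ops whose
# xfails should not be applied.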
if xfail_exclusion is None:
xfail_exclusion = []
# Supported complex OPS
SUPPORTED_COMPLEX_OPS = {
"__radd__",
"__rmul__",
"__rsub__",
"__getitem__",
"_unsafe_masked_index",
"abs",
"add",
"alias_copy",
"argwhere",
"atleast_1d",
"atleast_2d",
"atleast_3d",
"as_strided",
"as_strided_copy",
"as_strided_scatter",
"asin",
"acos",
"atan",
"broadcast_tensors",
"broadcast_to",
"chalf",
"cfloat",
"chunk",
"clone",
"conj",
"conj_physical",
"contiguous",
"cos",
"cosh",
"diag",
"diag_embed",
"diagflat",
"diagonal",
"diagonal_copy",
"diagonal_scatter",
"divno_rounding_mode",
"dsplit",
"empty",
"empty_permuted",
"empty_strided",
"exp",
"expm1",
"exp2",
"expand",
"expand_as",
"expand_copy",
"flatten",
"fill",
"full",
"full_like",
"H",
"hsplit",
"imag",
"index_copy",
"index_select",
"isfinite",
"isinf",
"isreal",
"item",
"kron",
"linalg.diagonal",
"linalg.svd",
"log10",
"log1p",
"log2",
"log",
"mH",
"mT",
"masked_fill",
"masked_scatter",
"masked_select",
"meshgridlist_of_tensors",
"meshgridvariadic_tensors",
"movedim",
"mul",
"narrow",
"narrow_copy",
"neg",
"new_full",
"new_ones",
"new_zeros",
"nn.functional.conv1d",
"nn.functional.conv2d",
"nn.functional.conv_transpose1d",
"nn.functional.conv_transpose2d",
"nn.functional.conv_transpose3d",
"nn.functional.feature_alpha_dropoutwithout_train",
"nn.functional.padcircular",
"nn.functional.softsign",
"nn.functional.tanhshrink",
"nn.functional.unfold",
"nonzero",
"ones",
"ones_like",
"outer",
"permute",
"permute_copy",
"positive",
"randn",
"ravel",
"real",
"repeat_interleave",
"reshape_as",
"reshape",
"resolve_conj",
"resolve_neg",
"rsqrt",
"rsub",
"scalar_tensor",
"select",
"sgn",
"sigmoid",
"sin",
"sinc",
"sinh",
"slice",
"special.spherical_bessel_j0",
"special.entr",
"special.xlog1py",
"special.zeta",
"split",
"split_with_sizes",
"split_with_sizes_copy",
"splitlist_args",
"sqrt",
"squeeze",
"squeeze_copy",
"squeezemultiple",
"sub",
"svd",
"t",
"t_copy",
"tanh",
"tan",
"tensor_split",
"transpose",
"transpose_copy",
"tril",
"triu",
"true_divide",
"T",
"unbind",
"unbind_copy",
"unflatten",
"unfold",
"unfold_copy",
"unsafe_chunk",
"unsafe_split",
"unsqueeze",
"unsqueeze_copy",
"view_as",
"view_as_real",
"view",
"view_copy",
"vsplit",
"zero_",
"zeros",
"zeros_like",
}
AFTER_MACOS_14_0_SUPPORTED_COMPLEX_OPS = {
"__rdiv__",
"__rmatmul__",
"_chunk_cat",
"acosh",
"all",
"allclose",
"angle",
"any",
"addcdiv",
"addcmul",
"addmmdecomposed",
"addmv",
"atanh",
"bfloat16",
"bmm",
"bool",
"cartesian_prod",
"cat",
"char",
"column_stack",
"combinations",
"corrcoef",
"constant_pad_nd",
"cov",
"count_nonzero",
"diff",
"div",
"dot",
"dstack",
"einsum",
"eq",
"equal",
"eye",
"fft.fft",
"fft.fft2",
"fft.fftn",
"fft.fftshift",
"fft.ifft",
"fft.ifft2",
"fft.ifftn",
"fft.ifftshift",
"fft.irfftn",
"fft.irfft2",
"fft.irfft",
"fft.hfftn",
"fft.hfft2",
"fft.hfft",
"flip",
"fliplr",
"flipud",
"float",
"gradient",
"half",
"hstack",
"inner",
"int",
"isclose",
"isnan",
"ldexp",
"lerp",
"linalg.multi_dot",
"linalg.pinv",
"linspace",
"linspacetensor_overload",
"logical_and",
"logical_not",
"logical_or",
"logical_xor",
"logsumexp",
"long",
"masked.mean",
"masked.prod",
"masked.std",
"masked.sum",
"masked.var",
"masked.logsumexp",
"matmul",
"mean",
"mm",
"mv",
"ne",
"nn.functional.padconstant",
"nn.functional.padreflect",
"nn.functional.padreplicate",
"nn.functional.pixel_shuffle",
"nn.functional.pixel_unshuffle",
"nn.functional.rms_norm",
"pinverse",
"prod",
"reciprocal",
"roll",
"rot90",
"short",
"sinh",
"sqrt",
"square",
"stack",
"stft",
"sum",
"sum_to_size",
"tensordot",
"trace",
"trapz",
"trapezoid",
"vstack",
"where",
"byte",
}
# These ops worked on macOS 12 but are broken on macOS 13, see https://github.com/pytorch/pytorch/issues/85758
MACOS_BEFORE_13_3_XFAILLIST = {
# Failures due to precision issues (due to fast-math). These have been fixed in macOS 13.3+
"cdist": [torch.float32],
# CPU error: the CPU does not give NaN for x/0.0
"atan2": [
torch.bool,
torch.int16,
torch.int32,
torch.int64,
torch.uint8,
torch.int8,
],
# The tests below pass on macOS 12 as they fall back to CPU
# Argsort case using duplicate indices (undefined behaviour):
# - CPU output: tensor([2546, 6917, 3181, ..., 7128, 5133, 30], device='cpu')
# - MPS output: tensor([2546, 6917, 3181, ..., 7128, 30, 5133], device='mps:0')
# The elements at indices 30 and 5133 are equal.
# Since CPU is not using argsort with stable=True, these cases result in undefined behaviour.
"argsort": [torch.float16, torch.int8, torch.uint8, torch.bool],
# Same issue as `argsort` with duplicate indices. This test checks both the sorted values and the indices.
# The values of the sorted tensor match the CPU,
# but for the returned indices this results in undefined behaviour.
"sort": [torch.int8, torch.uint8, torch.bool, torch.float16],
# Unsupported dtypes
"cumsum": [torch.int64],
"cumprod": [torch.int64],
"cumulative_trapezoid": [torch.int64],
"masked.cumsum": [torch.int64],
"masked.cumprod": [torch.int64],
"linalg.vander": [torch.int64],
# Fail with `Expected 1.0 but got nan.` for empty tensors
# Caused by sample input at index 23: SampleInput(
# input=Tensor[size=(), device="mps:0", dtype=torch.float32],
# args=(0),
# kwargs={'mask': 'Tensor[size=(), device="mps:0", dtype=torch.bool]'},
# broadcasts_input=False, name='')
"masked.softmin": [torch.float32, torch.float16],
"masked.softmax": [torch.float32, torch.float16],
"masked.log_softmax": [torch.float32, torch.float16],
}
MACOS_AFTER_13_1_XFAILLIST = {
# Before macOS 13.2 it falls back to CPU and passes the forward pass
"grid_sampler_2d": [
torch.float32,
torch.float16,
torch.bfloat16,
], # Unsupported Border padding mode
}
MACOS_13_3_XFAILLIST = {
# Failure due to a precision issue for fp16:
# on both CPU and MPS there are test cases that might produce an inf result
# 'nn.functional.pairwise_distance': [torch.float16],
# The tests below pass on macOS 12 as they fall back to CPU
# Argsort case using duplicate indices (undefined behaviour):
# - CPU output: tensor([2546, 6917, 3181, ..., 7128, 5133, 30], device='cpu')
# - MPS output: tensor([2546, 6917, 3181, ..., 7128, 30, 5133], device='mps:0')
# The elements at indices 30 and 5133 are equal.
# Since CPU is not using argsort with stable=True, these cases result in undefined behaviour.
"argsort": [
torch.float16,
torch.int8,
torch.uint8,
torch.bool,
torch.bfloat16,
],
# Same issue as `argsort` with duplicate indices. This test checks both the sorted values and the indices.
# The values of the sorted tensor match the CPU,
# but for the returned indices this results in undefined behaviour.
"sort": [
torch.int8,
torch.uint8,
torch.bool,
torch.float16,
torch.bfloat16,
],
}
MACOS_BEFORE_14_4_XFAILLIST = {
# These ops work fine in 14.4 but fail in 14.2 or 13.x
"fft.hfft2": [torch.complex64],
}
# These ops are not expected to work
UNIMPLEMENTED_XFAILLIST = {
# Failures due to lack of op implementation on MPS backend
"logspace": None,
"logspacetensor_overload": None,
"linalg.eig": None,
"linalg.eigvals": None,
"put": None,
"cauchy_": None,
"cauchy": None,
"cholesky_inverse": None,
"cholesky_solve": None,
"frexp": None,
"gcd": None,
"geqrf": None,
"nn.functional.grid_sample": None, # Unsupported Border padding mode
"heaviside": None,
"igamma": None,
"igammac": None,
"index_reduceprod": None,
"index_reducemean": None,
"index_reduceamax": None,
"index_reduceamin": None,
"kthvalue": None,
"lcm": None,
"linalg.cond": None,
"linalg.eigh": None,
"linalg.eigvalsh": None,
"linalg.householder_product": None,
"linalg.ldl_factor": None,
"linalg.ldl_factor_ex": None,
"linalg.ldl_solve": None,
"linalg.lstsq": None,
"linalg.lstsqgrad_oriented": None,
"linalg.lu": None,
"linalg.lu_solve": None,
"linalg.matrix_norm": [torch.float32],
"linalg.norm": [torch.float32],
"linalg.normsubgradients_at_zero": [torch.float32],
"linalg.qr": None,
"linalg.svdvals": None,
"linalg.vecdot": None,
"lu_solve": None,
"masked.median": None,
"matrix_exp": None,
"mode": None,
"native_dropout_backward": None,
"normnuc": None,
"nn.functional.fractional_max_pool2d": None,
"nn.functional.fractional_max_pool3d": None,
"nn.functional.adaptive_avg_pool3d": None,
"nn.functional.adaptive_max_pool3d": None,
"nn.functional.interpolatearea": None,
"nn.functional.interpolatebicubic": [torch.uint8],
"nn.functional.max_unpool1dgrad": None,
"nn.functional.max_unpool2dgrad": None,
"nn.functional.max_unpool3dgrad": None,
"nn.functional.avg_pool3d": None,
"nn.functional.ctc_loss": None,
"nn.functional.embedding_bag": None,
"nn.functional.max_unpool1d": None,
"nn.functional.max_unpool2d": None,
"nn.functional.max_unpool3d": None,
"nn.functional.multi_margin_loss": None,
"nn.functional.multilabel_margin_loss": None,
"nn.functional.pdist": None,
"nn.functional.rrelu": None,
"nn.functional.norm": None,
"ormqr": None,
"pca_lowrank": None,
"qr": None,
"scatter_reduceamax": [torch.int32, torch.int64]
if MACOS_VERSION < 15.0
else [torch.int64],
"scatter_reduceamin": [torch.int32, torch.int64]
if MACOS_VERSION < 15.0
else [torch.int64],
"segment_reduce": None,
"_segment.reduce": None,
"segment.reduce": None,
"segment_reduce_offsets": None,
"_segment_reduce_offsets": None,
"_segment_reduce_lengths": None,
"_segment_reducelengths": None,
"_segment_reduceoffsets": None,
"sparse.mm": None,
"sparse.sampled_addmm": None,
"sparse.mmreduce": None,
"special.airy_ai": None,
"special.erfcx": None,
"special.laguerre_polynomial_l": None,
"special.legendre_polynomial_p": None,
"special.log_ndtr": None,
"special.ndtri": None,
"svd_lowrank": None,
"symeig": None,
"take": None,
"to": None,
"to_sparse": None,
"unique": None,
"vdot": None,
"segment_reduce_": None,
"_upsample_bilinear2d_aa": [torch.uint8], # uint8 is for CPU only
"_upsample_bicubic2d_aa": [torch.uint8], # uint8 is for CPU only
"geometric": None,
"geometric_": None,
"log_normal_": None,
"log_normal": None,
"cdouble": None,
"double": None,
"nn.functional.softminwith_dtype": None,
"log_softmaxwith_dtype": None,
"softmaxwith_dtype": None,
"float_power": None,
"linalg.matrix_rankhermitian": None,
"linalg.pinvhermitian": None,
"nonzero_static": None,
# MPS: input sizes must be divisible by output sizes
"nn.functional.adaptive_avg_pool1d": None,
"nn.functional.adaptive_avg_pool2d": None,
# Convolution for integral types is not supported on MPS
"nn.functional.conv1d": [torch.int64],
"nn.functional.conv2d": [torch.int64],
"nn.functional.conv3d": [torch.int64],
"nn.functional.conv_transpose1d": [torch.int64],
"nn.functional.conv_transpose2d": [torch.int64, torch.bfloat16],
"nn.functional.conv_transpose3d": [
torch.int64,
torch.bfloat16,
torch.float16,
],
# Unsupported dtypes
"dot": [torch.int64] if MACOS_VERSION < 14.0 else [],
"histc": [torch.float16, torch.bfloat16],
"index_add": [torch.int64],
# GEMM on MPS is not supported for integral types
"nn.functional.linear": [
torch.int16,
torch.int32,
torch.int64,
torch.uint8,
torch.int8,
],
"addmmdecomposed": [
torch.int16,
torch.int32,
torch.int64,
torch.uint8,
torch.int8,
],
"addbmm": [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
"addmm": [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
"baddbmm": [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
"mat": [torch.int16, torch.int32, torch.int64, torch.uint8, torch.int8],
"matmul": [torch.int64] if MACOS_VERSION < 14.0 else [],
"__rmatmul__": [torch.int64] if MACOS_VERSION < 14.0 else [],
# returned output on CPU is float64
"bincount": [
torch.int16,
torch.int32,
torch.int64,
torch.uint8,
torch.int8,
],
# round not working properly for float16 and bfloat16
"round": [torch.float16, torch.bfloat16],
"rounddecimals_0": [torch.bfloat16],
}
if MACOS_VERSION < 14.0:
# FFT and BFloat16 support was added in macOS 14
UNIMPLEMENTED_XFAILLIST.update(
{
"bfloat16": None,
"fft.fft": None,
"fft.fft2": None,
"fft.fftn": None,
"fft.hfft": None,
"fft.hfft2": None,
"fft.hfftn": None,
"fft.ifft": None,
"fft.ifft2": None,
"fft.ifftn": None,
"fft.ihfft": None,
"fft.ihfft2": None,
"fft.ihfftn": None,
"fft.irfft": None,
"fft.irfft2": None,
"fft.irfftn": None,
"fft.rfft": None,
"fft.rfft2": None,
"fft.rfftn": None,
"stft": None,
# TestConsistencyCPU.test_output_match_isin_cpu fails for integers;
# not reproducible on later OS versions. An assert was added to the op if used on < 14.0
"isin": [
torch.int64,
torch.int32,
torch.int16,
torch.uint8,
torch.int8,
],
"nn.functional.max_pool2d": [torch.uint8],
}
)
if MACOS_VERSION < 15.0:
UNIMPLEMENTED_XFAILLIST.update(
{
"quantile": None,
"nanquantile": None,
}
)
UNDEFINED_XFAILLIST = {
# Top 60 operators
# topk fails with duplicate indices
"topk": [
torch.int16,
torch.int32,
torch.int64,
torch.uint8,
torch.int8,
],
# Failures due to random output that these ops generate using the
# Philox engine, causing mismatches with CPU results
"multinomial": [
torch.float16,
torch.float32,
torch.bfloat16,
], # random results
"uniform": [torch.float16, torch.float32, torch.bfloat16],
"rand_like": [torch.float16, torch.float32, torch.bfloat16],
"randint": None,
"randint_like": None,
"randn": None,
"randn_like": None,
"bernoulli": [torch.float16, torch.float32, torch.bfloat16],
"exponential": [torch.float16, torch.float32, torch.bfloat16],
"nn.functional.feature_alpha_dropoutwith_train": [
torch.float16,
torch.float32,
torch.bfloat16,
],
"normal": [torch.float16, torch.float32, torch.bfloat16],
"normalin_place": [torch.float16, torch.float32, torch.bfloat16],
"normalnumber_mean": [torch.float16, torch.float32, torch.bfloat16],
"nn.functional.alpha_dropout": [
torch.float16,
torch.float32,
torch.bfloat16,
],
"nn.functional.dropout": [torch.float16, torch.float32, torch.bfloat16],
"nn.functional.dropout2d": [torch.float16, torch.float32, torch.bfloat16],
"nn.functional.dropout3d": [torch.float16, torch.float32, torch.bfloat16],
# See https://github.com/pytorch/pytorch/issues/111479
"nn.functional.multi_head_attention_forward": [
torch.float32,
torch.float16,
torch.bfloat16,
],
# zero to negative integer powers are undefined
"__rpow__": [torch.int8, torch.int16, torch.int32, torch.int64],
"resize_": [torch.float16, torch.float32, torch.bfloat16],
"resize_as_": [torch.float16, torch.float32, torch.bfloat16],
# CPU Errors:
"addr": [
torch.bool,
torch.int16,
torch.int32,
torch.int64,
torch.uint8,
torch.int8,
], # "addmv_impl_cpu" not implemented for 'Half'
"as_stridedpartial_views": None, # cpu result off, showing random values
# random results
# mps vs cpu:
# Mismatched elements: 40 / 96 (41.7%)
# Greatest absolute difference: 17.892311096191406 at index (1, 0, 2) (up to 1e-05 allowed)
# Greatest relative difference: inf at index (1, 0, 0) (up to 1.3e-06 allowed)
# cuda(2.0.0.dev20230301+cu117) vs cpu:
# Mismatched elements: 56 / 96 (58.3%)
# Greatest absolute difference: 17.892311096191406 at index (1, 0, 2) (up to 1e-05 allowed)
# Greatest relative difference: inf at index (1, 0, 0) (up to 1.3e-06 allowed)
"nn.functional.scaled_dot_product_attention": [
torch.float32,
torch.float16,
torch.bfloat16,
],
}
ON_MPS_XFAILLIST = {
# Failures due to lack of implementation of downstream functions on MPS backend
# TODO: remove these once downstream function 'aten::_linalg_svd.U' has been implemented
"linalg.matrix_rank": None,
# Exception: Caused by `torch.arange(-8.001, -4.0, dtype=torch.uint8, device="mps")`
"arange": [torch.uint8],
}
EMPTY_OPS_SKIPLIST = {
# These ops fill tensors with uninitialized data, causing mismatches with CPU.
# They occasionally match, so they are skipped.
# See https://github.com/pytorch/pytorch/issues/100175
"new_empty": None,
"new_empty_strided": None,
"empty_strided": None,
# CPU: empty returns all 0's, and there is a mismatch with the MPS
# allocation (macOS 13). See
# https://pytorch.org/docs/2.0/generated/torch.empty.html
"empty": None,
"empty_like": None,
"empty_permuted": None,
}
SKIPLIST = {
# Unsupported
# This doesn't work on M1, but partially works on M2, with the exception of torch.float16
"nn.functional.conv3d": None,
}
def addDecorator(op: OpInfo, d: DecorateInfo) -> None:
if device_type is not None:
d.device_type = device_type
op.decorators = op.decorators + (d,)
for op in ops:
key = op.name + op.variant_test_name
if key in EMPTY_OPS_SKIPLIST:
addDecorator(
op,
DecorateInfo(
unittest.skip("Skipping empty ops."),
dtypes=EMPTY_OPS_SKIPLIST[key],
),
)
if key in SKIPLIST:
addDecorator(
op, DecorateInfo(unittest.skip("Skipped!"), dtypes=SKIPLIST[key])
)
for xfaillist in [
UNIMPLEMENTED_XFAILLIST,
UNDEFINED_XFAILLIST,
ON_MPS_XFAILLIST,
]:
if key in xfaillist and key not in xfail_exclusion:
addDecorator(
op,
DecorateInfo(unittest.expectedFailure, dtypes=xfaillist[key]),
)
if (
key in MACOS_BEFORE_14_4_XFAILLIST
and key not in xfail_exclusion
and (MACOS_VERSION < 14.4)
):
addDecorator(
op,
DecorateInfo(
unittest.expectedFailure,
dtypes=MACOS_BEFORE_14_4_XFAILLIST[key],
),
)
if (
key in MACOS_BEFORE_13_3_XFAILLIST
and key not in xfail_exclusion
and (torch.backends.mps.is_macos13_or_newer() and MACOS_VERSION < 13.3)
):
addDecorator(
op,
DecorateInfo(
unittest.expectedFailure,
dtypes=MACOS_BEFORE_13_3_XFAILLIST[key],
),
)
if (
key in MACOS_AFTER_13_1_XFAILLIST
and key not in xfail_exclusion
and torch.backends.mps.is_macos13_or_newer(2)
):
addDecorator(
op,
DecorateInfo(
unittest.expectedFailure, dtypes=MACOS_AFTER_13_1_XFAILLIST[key]
),
)
if (
key in MACOS_13_3_XFAILLIST
and key not in xfail_exclusion
and (MACOS_VERSION >= 13.3)
):
addDecorator(
op,
DecorateInfo(
unittest.expectedFailure, dtypes=MACOS_13_3_XFAILLIST[key]
),
)
# If the op is not supported for complex types, expect it to fail
if key not in SUPPORTED_COMPLEX_OPS and (
key not in AFTER_MACOS_14_0_SUPPORTED_COMPLEX_OPS
or MACOS_VERSION < 14.0
):
addDecorator(
op,
DecorateInfo(
unittest.expectedFailure,
dtypes=[torch.complex32, torch.complex64],
),
)
return ops
def mps_ops_grad_modifier(ops: Sequence[OpInfo]) -> Sequence[OpInfo]:
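# Attaches skip / expected-failure decorators to the gradient tests of each
# OpInfo, based on the grad-specific skip and xfail lists below (some of which
# are gated on the running macOS version).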
XFAILLIST_GRAD = {
# Unimplemented ops
"_segment_reduce": [torch.float16, torch.float32],
"_chunk_cat": [torch.float16, torch.float32],
"_upsample_bilinear2d_aa": None, # `_upsample_bilinear2d_aa_backward_out` not implemented for MPS
"_upsample_bicubic2d_aa": None, # `_upsample_bilinear2d_aa_backward_out` not implemented for MPS
"sparse.mmreduce": [torch.float32], # csr not supported
"unique_consecutive": [torch.float16, torch.float32],
"scalar_tensor": [torch.float16, torch.float32],
"cdist": [torch.float32],
"masked.scatter": [torch.float16, torch.float32],
"index_fill": [torch.float16, torch.float32], # missing `aten::_unique`.
"linalg.solve": [torch.float16, torch.float32], # missing `aten::lu_solve`.
"linalg.solve_ex": [
torch.float16,
torch.float32,
], # missing `aten::lu_solve`.
"linalg.tensorsolve": [
torch.float16,
torch.float32,
], # missing `aten::lu_solve`.
"linalg.det": [torch.float16, torch.float32], # missing aten::lu_solve.out
"linalg.slogdet": [
torch.float16,
torch.float32,
], # missing aten::lu_solve.out
"logdet": [torch.float16, torch.float32], # missing aten::lu_solve.out
"aminmax": [torch.float32, torch.float16],
"special.i1": [torch.float16], # "i1_backward" not implemented for 'Half'
"special.i1e": [torch.float16], # "i1e_backward" not implemented for 'Half'
# Correctness issues
"atanh": [torch.float32],
# Random output
"exponential": [torch.float16, torch.float32],
# CPU errors
# derivative for zeta is not implemented
"special.zeta": None,
# derivative for aten::nextafter is not implemented on CPU
"nextafter": None,
# derivative for aten::floor_divide is not implemented on CPU
"floor_divide": [torch.float16, torch.float32],
# derivative for aten::narrow_copy is not implemented on CPU
"narrow_copy": [torch.float16, torch.float32],
# derivative for aten::_histogramdd_from_bin_cts is not implemented on CPU
"histogramdd": [torch.float16, torch.float32],
# derivative for aten::histogram is not implemented
"histogram": [torch.float16, torch.float32],
# 'bool' object is not iterable
"allclose": [torch.float16, torch.float32],
"equal": [torch.float16, torch.float32],
# 'float' object is not iterable
"item": [torch.float16, torch.float32],
# "smooth_l1_backward_cpu_out" not implemented for 'Half'
"nn.functional.smooth_l1_loss": [torch.float16],
# cpu error: grad requires non-empty inputs
"randn": [torch.float16, torch.float32],
"signal.windows.bartlett": [torch.float32],
"signal.windows.blackman": [torch.float32],
"signal.windows.cosine": [torch.float32],
"signal.windows.exponential": [torch.float32],
"signal.windows.gaussian": [torch.float32],
"signal.windows.general_cosine": [torch.float32],
"signal.windows.general_hamming": [torch.float32],
"signal.windows.hamming": [torch.float32],
"signal.windows.hann": [torch.float32],
"signal.windows.kaiser": [torch.float32],
"signal.windows.nuttall": [torch.float32],
"eye": [torch.float16, torch.float32],
# round not working properly for float16
"round": [torch.float16],
# topk fails with duplicate indices
"topk": [torch.float16],
}
MACOS_BEFORE_13_3_XFAILLIST_GRAD = {
# Failures due to precision issues (may be fast-math). These have been fixed in macOS 14
"masked.softmin": [torch.float32, torch.float16],
"masked.softmax": [torch.float32, torch.float16],
"masked.log_softmax": [torch.float32, torch.float16],
"atanh": [torch.float16],
"triangular_solve": [torch.float32],
# Unsupported Border padding mode; the forward pass succeeds as it falls back to CPU
"grid_sampler_2d": [torch.float32, torch.float16, torch.bfloat16],
# Same issue as `argsort` and `sort` with duplicate elements (undefined behaviour).
# The forward pass passes since `msort` doesn't return the indices, just the values, which match the CPU.
# On the backward pass for `sort` both are used (values and indices), thus resulting in a mismatch between CPU and MPS.
# Running `msort` with stable `sort` passes.
"msort": [torch.float16],
}
SKIPLIST_GRAD = {
"nn.functional.pairwise_distance": [torch.float16],
# failed assertion `destination datatype must be fp32'
"nn.functional.conv1d": [torch.float16],
"nn.functional.conv2d": [torch.float16],
"nn.functional.conv3d": [torch.float16],
"nn.functional.conv_transpose1d": [torch.float16],
"nn.functional.conv_transpose2d": [torch.float16],
"nn.functional.conv_transpose3d": [torch.float16],
}
MACOS_13_3_XFAILLIST_GRAD = {
# Same issue as `argsort` and `sort` with duplicate elements (undefined behaviour).
# The forward pass passes since `msort` doesn't return the indices, just the values, which match the CPU.
# On the backward pass for `sort` both are used (values and indices), thus resulting in a mismatch between CPU and MPS.
# Running `msort` with stable `sort` passes.
"msort": [torch.float16],
}
ON_MPS_XFAILLIST = {
# Failures due to lack of implementation of downstream functions on MPS backend
# TODO: remove these once downstream function 'aten::_linalg_svd.U' has been implemented
"linalg.matrix_rank": None,
# Exception: Caused by sample input at index 3 on MPS
"nn.functional.conv3d": [torch.float32],
}
def addDecorator(op: OpInfo, d: DecorateInfo) -> None:
op.decorators = op.decorators + (d,)
for op in ops:
key = op.name + op.variant_test_name
if key in XFAILLIST_GRAD:
addDecorator(
op,
DecorateInfo(unittest.expectedFailure, dtypes=XFAILLIST_GRAD[key]),
)
if key in SKIPLIST_GRAD:
addDecorator(op, DecorateInfo(unittest.skip, dtypes=SKIPLIST_GRAD[key]))
if key in ON_MPS_XFAILLIST:
addDecorator(
op,
DecorateInfo(
unittest.expectedFailure, dtypes=ON_MPS_XFAILLIST[key]
),
)
if key in MACOS_BEFORE_13_3_XFAILLIST_GRAD and (
torch.backends.mps.is_macos13_or_newer() and MACOS_VERSION < 13.3
):
addDecorator(
op,
DecorateInfo(
unittest.expectedFailure,
dtypes=MACOS_BEFORE_13_3_XFAILLIST_GRAD[key],
),
)
if key in MACOS_13_3_XFAILLIST_GRAD and (MACOS_VERSION >= 13.3):
addDecorator(
op,
DecorateInfo(
unittest.expectedFailure, dtypes=MACOS_13_3_XFAILLIST_GRAD[key]
),
)
return ops
def mps_ops_error_inputs_modifier(ops: Sequence[OpInfo]) -> Sequence[OpInfo]:
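# Marks error-input tests as expected failures for ops whose MPS error checking
# does not yet raise the expected exception, or which hit unsupported-dtype or
# dimension limits; no dtype filtering is applied since error-input samples do
# not take a dtype argument.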
# Error input samples do not take a dtype argument.
XFAILLIST = {
# Exceptions are not raised
"__rmod__",
"__rsub__",
"__rpow__",
"bernoulli",
"clamp_max",
"clamp_min",
"masked_scatter",
# unsupported float64 dtype
"cat",
"complex",
"multinomial",
"nn.functional.conv1d",
"nn.functional.conv2d",
"nn.functional.conv3d",
"gather",
"scatter",
"scatter_add",
# MPS does not support tensor dimensions > 16
"amax",
"amin",
"aminmax",
# memory overlapping checks
"index_select",
}
def addDecorator(op: OpInfo, d: DecorateInfo) -> None:
op.decorators = op.decorators + (d,)
for op in ops:
key = op.name + op.variant_test_name
if key in XFAILLIST:
addDecorator(op, DecorateInfo(unittest.expectedFailure))
return ops
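# Usage sketch (illustrative, not part of this module): test files such as
# test/test_mps.py typically wrap an OpInfo database with these helpers before
# instantiating device-type tests, e.g.
#
#   from torch.testing._internal.common_methods_invocations import op_db
#   from torch.testing._internal.common_mps import mps_ops_modifier
#
#   mps_op_db = mps_ops_modifier(op_db, device_type="mps")
#
# This attaches the skip / expected-failure decorators defined above to each
# matching OpInfo before the tests are generated.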