add the torch.float8_e8m0fnu dtype to PyTorch (#147466)

Summary:

Continuing the work from https://github.com/pytorch/pytorch/pull/146427

Adds the `torch.float8_e8m0fnu` dtype to PyTorch, as detailed in
https://github.com/pytorch/pytorch/issues/146414. Please see the issue for a detailed definition of the format. Example of basic functionality:

```python
import torch

# round trip
x0 = torch.randn(4, 4, dtype=torch.float32)
x1 = x0.to(torch.float8_e8m0fnu)  # RNE rounding
x2 = x1.to(torch.float32)  # 2 ** exponent

# creation with empty
x0 = torch.empty(4, 4, dtype=torch.float8_e8m0fnu)

# printing
print(x0)
```
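
Only powers of two in `[2**-127, 2**127]` (plus a single NaN encoding) are representable, so such values round-trip exactly while everything else is rounded to a neighboring power of two. A small illustrative sketch (not part of the PR text; worked out from the format definition in the issue):

```python
import torch

# powers of two round-trip exactly
x = torch.tensor([0.25, 1.0, 16.0], dtype=torch.float32)
assert torch.equal(x, x.to(torch.float8_e8m0fnu).to(torch.float32))

# other values snap to a nearby power of two under RNE rounding:
# 5.0 = 1.25 * 2**2 with a zero guard bit, so it rounds down to 4.0
y = torch.tensor([5.0], dtype=torch.float32)
print(y.to(torch.float8_e8m0fnu).to(torch.float32))  # -> 4.0
```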

Done in this PR:
* numerical correctness
* op coverage (except for `torch._scaled_mm`): create tensor, cast to/from float32
* printing a tensor works

For future PRs:
* performance optimizations for casting
* torch._scaled_mm
* PT2
* various cleanups (detailed in comments with issue numbers)

Test Plan:

```
pytest test/quantization/core/experimental/test_float8.py -s
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147466
Approved by: https://github.com/drisspg
Author: vasiliy
Committed: 2025-02-20 13:55:42 +00:00 by PyTorch MergeBot
Parent: 574371d828
Commit: 382fbcc1e4

25 changed files with 535 additions and 44 deletions

```diff
@@ -63,10 +63,12 @@ DLDataType getDLDataType(const Tensor& t) {
     case ScalarType::BFloat16:
       dtype.code = DLDataTypeCode::kDLBfloat;
       break;
+    // TODO(#146647): use macro here instead of spelling out each shell dtype
     case ScalarType::Float8_e5m2:
     case ScalarType::Float8_e5m2fnuz:
     case ScalarType::Float8_e4m3fn:
     case ScalarType::Float8_e4m3fnuz:
+    case ScalarType::Float8_e8m0fnu:
       TORCH_CHECK(false, "float8 types are not supported by dlpack");
       break;
     case ScalarType::QInt8:
```

```diff
@@ -87,7 +87,7 @@
 #define AT_FLOAT8_TYPES \
   c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \
-      c10::kFloat8_e4m3fnuz
+      c10::kFloat8_e4m3fnuz, c10::kFloat8_e8m0fnu

 #define AT_INTEGRAL_TYPES \
   c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort
```

```diff
@@ -59,8 +59,8 @@ bool copy_transpose_valid(const Tensor& self, const Tensor& src) {
 #if !defined(C10_MOBILE)
 #define _AT_DISPATCH_CP_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_V2( \
-      TYPE, NAME, AT_WRAP(__VA_ARGS__), kComplexHalf, kHalf, kBool, kBFloat16, kFloat8_e5m2, \
-      kFloat8_e4m3fn, kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
+      TYPE, NAME, AT_WRAP(__VA_ARGS__), kComplexHalf, kHalf, kBool, kBFloat16, \
+      AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
 #else
 #define _AT_DISPATCH_CP_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \
```

```diff
@@ -460,7 +460,8 @@ Tensor isinf(const Tensor& self) {
 Tensor isfinite(const Tensor& self) {
   // Note: Integral tensor values are always finite
-  if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true)) {
+  if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true) ||
+      self.scalar_type() == kFloat8_e8m0fnu) {
     return at::ones_like(self, at::kBool, at::MemoryFormat::Preserve);
   }
```
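
Because e8m0 has a NaN encoding but no infinity, `isfinite` now takes the same shortcut as integral dtypes and returns all-true. A minimal usage sketch (illustrative, assuming the CPU casts added elsewhere in this PR):

```python
import torch

x = torch.randn(4).to(torch.float8_e8m0fnu)
# the format cannot represent +/-inf, so the fast path above is always all-true
assert torch.isfinite(x).all()
```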

```diff
@@ -204,12 +204,12 @@ static void reduced_float_copy_kernel(TensorIteratorBase &iter, bool requires_ne
 #define _AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_V2(TYPE, NAME, AT_WRAP(__VA_ARGS__), \
       kComplexHalf, kHalf, kBool, \
-      kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, \
-      kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
+      kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), \
+      AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
 #define _AT_DISPATCH_ALL_TYPES_NO_CF(TYPE, NAME, ...) \
   AT_DISPATCH_V2(TYPE, NAME, AT_WRAP(__VA_ARGS__), \
-      kBool, kHalf, kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, \
-      kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
+      kBool, kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), \
+      AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
 #else
 #define _AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \
```

```diff
@@ -51,6 +51,9 @@ void fill_kernel(TensorIterator& iter, const Scalar& value_scalar) {
     fill_non_native_type<at::Float8_e4m3fnuz>(iter, value_scalar);
   } else if (iter.dtype() == ScalarType::Float8_e5m2fnuz) {
     fill_non_native_type<at::Float8_e5m2fnuz>(iter, value_scalar);
+  } else if (iter.dtype() == ScalarType::Float8_e8m0fnu) {
+    // TODO(#146647): use macro here instead of spelling out each float8 dtype
+    fill_non_native_type<at::Float8_e8m0fnu>(iter, value_scalar);
   } else {
     AT_DISPATCH_V2(
         iter.dtype(), "fill_cpu", AT_WRAP([&]() {
```

```diff
@@ -184,7 +184,13 @@ void index_put_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef
       }
     }),
     AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
-    AT_EXPAND(AT_FLOAT8_TYPES),
+    // AT_EXPAND(AT_FLOAT8_TYPES),
+    // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
+    // should not be supported here, then reenable AT_FLOAT8_DTYPES
+    kFloat8_e4m3fn,
+    kFloat8_e5m2,
+    kFloat8_e4m3fnuz,
+    kFloat8_e5m2fnuz,
     kComplexHalf,
     kHalf,
     kBool,
```

```diff
@@ -144,6 +144,28 @@ void float8_copy_kernel_cuda(TensorIteratorBase &iter) {
         gpu_kernel(iter, [] GPU_LAMBDA(Float8_e5m2fnuz x) { return x; });
         break;
     }
+  } else if (dtype == kFloat8_e8m0fnu) {
+    // TODO(#146647): clean this up, too much copy-pasta
+    switch (other_dtype) {
+      case kFloat:
+        gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) {
+          return Float8_e8m0fnu(value);
+        });
+        break;
+      case kHalf:
+        gpu_kernel_nocast(iter, [] GPU_LAMBDA(Half value) {
+          return Float8_e8m0fnu(value);
+        });
+        break;
+      case kBFloat16:
+        gpu_kernel_nocast(iter, [] GPU_LAMBDA(BFloat16 value) {
+          return Float8_e8m0fnu(value);
+        });
+        break;
+      default:
+        gpu_kernel(iter, [] GPU_LAMBDA(Float8_e8m0fnu x) { return x; });
+        break;
+    }
   } else {
     TORCH_CHECK(false, "This supposed ot be called only for Float8 types");
   }
@@ -157,7 +179,7 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) {
     AT_DISPATCH_QINT_TYPES(dtype, "copy_", [&] {
       gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) { return x; });
     });
-  } else if (dtype == kFloat8_e5m2 || dtype == kFloat8_e4m3fn || dtype == kFloat8_e5m2fnuz || dtype == kFloat8_e4m3fnuz) {
+  } else if (isFloat8Type(dtype)) {
     float8_copy_kernel_cuda(iter);
   } else if (iter.dtype(1) == kFloat && (dtype == kBFloat16 || dtype == kHalf)) {
     if (dtype == kBFloat16) {
```

```diff
@@ -582,7 +582,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     }),
     AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
-    AT_EXPAND(AT_FLOAT8_TYPES),
+    // AT_EXPAND(AT_FLOAT8_TYPES),
+    // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
+    // should not be supported here, then reenable AT_FLOAT8_DTYPES
+    kFloat8_e4m3fn,
+    kFloat8_e5m2,
+    kFloat8_e4m3fnuz,
+    kFloat8_e5m2fnuz,
     kComplexHalf,
     kHalf,
     kBool,
@@ -606,7 +612,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     }),
     AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
-    AT_EXPAND(AT_FLOAT8_TYPES),
+    // AT_EXPAND(AT_FLOAT8_TYPES),
+    // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
+    // should not be supported here, then reenable AT_FLOAT8_DTYPES
+    kFloat8_e4m3fn,
+    kFloat8_e5m2,
+    kFloat8_e4m3fnuz,
+    kFloat8_e5m2fnuz,
     kComplexHalf,
     kHalf,
     kBool,
@@ -630,7 +642,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     }),
     AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
-    AT_EXPAND(AT_FLOAT8_TYPES),
+    // AT_EXPAND(AT_FLOAT8_TYPES),
+    // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
+    // should not be supported here, then reenable AT_FLOAT8_DTYPES
+    kFloat8_e4m3fn,
+    kFloat8_e5m2,
+    kFloat8_e4m3fnuz,
+    kFloat8_e5m2fnuz,
     kComplexHalf,
     kHalf,
     kBool,
@@ -652,7 +670,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     }),
     AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
-    AT_EXPAND(AT_FLOAT8_TYPES),
+    // AT_EXPAND(AT_FLOAT8_TYPES),
+    // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
+    // should not be supported here, then reenable AT_FLOAT8_DTYPES
+    kFloat8_e4m3fn,
+    kFloat8_e5m2,
+    kFloat8_e4m3fnuz,
+    kFloat8_e5m2fnuz,
     kComplexHalf,
     kHalf,
     kBool,
@@ -677,7 +701,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
      C10_CUDA_KERNEL_LAUNCH_CHECK();
     }),
     AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
-    AT_EXPAND(AT_FLOAT8_TYPES),
+    // AT_EXPAND(AT_FLOAT8_TYPES),
+    // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
+    // should not be supported here, then reenable AT_FLOAT8_DTYPES
+    kFloat8_e4m3fn,
+    kFloat8_e5m2,
+    kFloat8_e4m3fnuz,
+    kFloat8_e5m2fnuz,
     kComplexHalf,
     kHalf,
     kBool,
```

```diff
@@ -228,6 +228,10 @@ template <> inline std::string typeName<at::Float8_e5m2fnuz>() {
 template <> inline std::string typeName<at::Float8_e4m3fnuz>() {
   return "at::Float8_e4m3fnuz";
 }
+template <> inline std::string typeName<at::Float8_e8m0fnu>() {
+  // TODO(#146647): Can the code here be made generic for any scalartype?
+  return "at::Float8_e8m0fnu";
+}

 #define TYPE_NAME_CASE(ctype, scalartype) \
   case ScalarType::scalartype: return typeName<ctype>();
```

```diff
@@ -49,16 +49,9 @@ class C10_API Scalar {
 #define DEFINE_IMPLICIT_CTOR(type, name) \
   Scalar(type vv) : Scalar(vv, true) {}

-  AT_FORALL_SCALAR_TYPES_AND7(
-      Half,
-      BFloat16,
-      Float8_e5m2,
-      Float8_e4m3fn,
-      Float8_e5m2fnuz,
-      Float8_e4m3fnuz,
-      ComplexHalf,
-      DEFINE_IMPLICIT_CTOR)
+  AT_FORALL_SCALAR_TYPES_AND3(Half, BFloat16, ComplexHalf, DEFINE_IMPLICIT_CTOR)
   AT_FORALL_COMPLEX_TYPES(DEFINE_IMPLICIT_CTOR)
+  AT_FORALL_FLOAT8_TYPES(DEFINE_IMPLICIT_CTOR)

 // Helper constructors to allow Scalar creation from long and long long types
 // As std::is_same_v<long, long long> is false(except Android), one needs to
```

```diff
@@ -222,6 +222,9 @@ std::pair<std::string, std::string> getDtypeNames(c10::ScalarType scalarType) {
       return std::make_pair("float8_e5m2fnuz", "");
     case c10::ScalarType::Float8_e4m3fnuz:
       return std::make_pair("float8_e4m3fnuz", "");
+    case c10::ScalarType::Float8_e8m0fnu:
+      // TODO(#146647): macroify all of this
+      return std::make_pair("float8_e8m0fnu", "");
     default:
       throw std::runtime_error("Unimplemented scalar type");
   }
```

```diff
@@ -7,6 +7,7 @@
 #include <c10/util/Float8_e4m3fnuz.h>
 #include <c10/util/Float8_e5m2.h>
 #include <c10/util/Float8_e5m2fnuz.h>
+#include <c10/util/Float8_e8m0fnu.h>
 #include <c10/util/Half.h>
 #include <c10/util/bits.h>
 #include <c10/util/complex.h>
@@ -102,7 +103,8 @@ struct dummy_int1_7_t {};
   _(c10::dummy_int1_7_t<4>, Int4) /* 40 */ \
   _(c10::dummy_int1_7_t<5>, Int5) /* 41 */ \
   _(c10::dummy_int1_7_t<6>, Int6) /* 42 */ \
-  _(c10::dummy_int1_7_t<7>, Int7) /* 43 */
+  _(c10::dummy_int1_7_t<7>, Int7) /* 43 */ \
+  _(c10::Float8_e8m0fnu, Float8_e8m0fnu) /* 44 */

 // If you want to support ComplexHalf for real, add ComplexHalf
 // into this macro (and change the name). But beware: convert()
@@ -146,7 +148,8 @@
   _(at::Float8_e5m2, Float8_e5m2) \
   _(at::Float8_e4m3fn, Float8_e4m3fn) \
   _(at::Float8_e5m2fnuz, Float8_e5m2fnuz) \
-  _(at::Float8_e4m3fnuz, Float8_e4m3fnuz)
+  _(at::Float8_e4m3fnuz, Float8_e4m3fnuz) \
+  _(at::Float8_e8m0fnu, Float8_e8m0fnu)

 enum class ScalarType : int8_t {
 #define DEFINE_ST_ENUM_VAL_(_1, n) n,
@@ -317,6 +320,13 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
   _(c10::quint4x2, QUInt4x2) \
   _(c10::quint2x4, QUInt2x4)

+#define AT_FORALL_FLOAT8_TYPES(_)         \
+  _(at::Float8_e5m2, Float8_e5m2)         \
+  _(at::Float8_e4m3fn, Float8_e4m3fn)     \
+  _(at::Float8_e5m2fnuz, Float8_e5m2fnuz) \
+  _(at::Float8_e4m3fnuz, Float8_e4m3fnuz) \
+  _(at::Float8_e8m0fnu, Float8_e8m0fnu)
+
 #define AT_FORALL_COMPLEX_TYPES(_)     \
   _(c10::complex<float>, ComplexFloat) \
   _(c10::complex<double>, ComplexDouble)
@@ -372,7 +382,8 @@ inline bool isIntegralType(ScalarType t) {
 inline bool isFloat8Type(ScalarType t) {
   return t == ScalarType::Float8_e5m2 || t == ScalarType::Float8_e5m2fnuz ||
-      t == ScalarType::Float8_e4m3fn || t == ScalarType::Float8_e4m3fnuz;
+      t == ScalarType::Float8_e4m3fn || t == ScalarType::Float8_e4m3fnuz ||
+      t == ScalarType::Float8_e8m0fnu;
 }

 inline bool isReducedFloatingType(ScalarType t) {
@@ -446,6 +457,10 @@ inline bool isSignedType(ScalarType t) {
   return std::numeric_limits<            \
       ::c10::impl::ScalarTypeToCPPTypeT<ScalarType::name>>::is_signed;

+  // TODO(#146647): If we expect to have numeric_limits for everything,
+  // let's just have a big macro for the whole thing.
+  // If we're hardcoding it, let's just use the macro and a "true"/"false"
+  // below?
   switch (t) {
     case ScalarType::QInt8:
     case ScalarType::QUInt8:
@@ -467,6 +482,7 @@ inline bool isSignedType(ScalarType t) {
       CASE_ISSIGNED(Float8_e5m2fnuz);
       CASE_ISSIGNED(Float8_e4m3fn);
       CASE_ISSIGNED(Float8_e4m3fnuz);
+      CASE_ISSIGNED(Float8_e8m0fnu);
       CASE_ISSIGNED(Byte);
       CASE_ISSIGNED(Char);
       CASE_ISSIGNED(Short);
```

@@ -0,0 +1,112 @@
```cpp
#pragma once

#include <c10/macros/Macros.h>
#include <c10/util/floating_point_utils.h>

#include <cstring>
#include <limits>

// TODO(#146647): Can we remove the below warning?
C10_CLANG_DIAGNOSTIC_PUSH()
#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion")
C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
#endif

namespace c10 {

/// Constructors
inline C10_HOST_DEVICE Float8_e8m0fnu::Float8_e8m0fnu(float value)
    : x(detail::fp8e8m0fnu_from_fp32_value(value)) {}

/// Implicit conversions
inline C10_HOST_DEVICE Float8_e8m0fnu::operator float() const {
  // TODO(#146647): maybe rewrite without control flow

  // if exponent is zero, need to special case to return 2^-127 instead of zero
  if (x == 0) {
    return c10::detail::fp32_from_bits(0x00400000);
  }

  // if exponent is NaN, need to special case to return properly encoded NaN
  if (isnan()) {
    return c10::detail::fp32_from_bits(0x7f800001);
  }

  // leave sign at 0, set the exponent bits, leave stored mantissa at 0
  uint32_t res = x << 23;
  return c10::detail::fp32_from_bits(res);
}

/// Special values helper
inline C10_HOST_DEVICE bool Float8_e8m0fnu::isnan() const {
  return x == 0b11111111;
}

/// NOTE: we do not define comparisons directly and instead rely on the implicit
/// conversion from c10::Float8_e8m0fnu to float.

} // namespace c10

namespace std {

template <>
class numeric_limits<c10::Float8_e8m0fnu> {
 public:
  static constexpr bool is_specialized = true;
  static constexpr bool is_signed = false;
  static constexpr bool is_integer = false;
  static constexpr bool is_exact = false;
  static constexpr bool has_infinity = false;
  static constexpr bool has_quiet_NaN = true;
  static constexpr bool has_signaling_NaN = false;
  static constexpr auto has_denorm = false;
  static constexpr auto has_denorm_loss = false;
  static constexpr auto round_style = numeric_limits<float>::round_style;
  static constexpr bool is_iec559 = false;
  static constexpr bool is_bounded = true;
  static constexpr bool is_modulo = false;
  static constexpr int digits = 1;
  static constexpr int digits10 = 0;
  static constexpr int max_digits10 = 1; // just a 2!
  static constexpr int radix = 2;
  static constexpr int min_exponent = -126;
  static constexpr int min_exponent10 = -38;
  static constexpr int max_exponent = 128;
  static constexpr int max_exponent10 = 38;
  static constexpr auto traps = numeric_limits<float>::traps;
  static constexpr auto tinyness_before = false;

  static constexpr c10::Float8_e8m0fnu min() {
    // 2^-127
    return c10::Float8_e8m0fnu(0b00000000, c10::Float8_e8m0fnu::from_bits());
  }
  static constexpr c10::Float8_e8m0fnu lowest() {
    // 2^-127
    return c10::Float8_e8m0fnu(0b00000000, c10::Float8_e8m0fnu::from_bits());
  }
  static constexpr c10::Float8_e8m0fnu max() {
    // 254 biased, which is 127 unbiased, so 2^127
    return c10::Float8_e8m0fnu(0b11111110, c10::Float8_e8m0fnu::from_bits());
  }
  static constexpr c10::Float8_e8m0fnu epsilon() {
    // according to https://en.cppreference.com/w/cpp/types/numeric_limits, this
    // is "the difference between 1.0 and the next representable value of the
    // given floating-point type". The next representable value is 2.0, so the
    // difference is 1.0 which is 2^0. 0 unbiased is 127 biased.
    return c10::Float8_e8m0fnu(0b01111111, c10::Float8_e8m0fnu::from_bits());
  }
  static constexpr c10::Float8_e8m0fnu round_error() {
    // 0.5 in float, which is 2^-1, and -1 + 127 = 126
    return c10::Float8_e8m0fnu(0b01111110, c10::Float8_e8m0fnu::from_bits());
  }
  static constexpr c10::Float8_e8m0fnu quiet_NaN() {
    return c10::Float8_e8m0fnu(0b11111111, c10::Float8_e8m0fnu::from_bits());
  }
};

} // namespace std

C10_CLANG_DIAGNOSTIC_POP()
```
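
For reference, a pure-Python mirror of the decode path and the `numeric_limits` values above (illustrative; `e8m0_bits_to_float` is a made-up helper name, not part of the PR):

```python
import math
import struct


def e8m0_bits_to_float(x: int) -> float:
    """Decode an e8m0 byte (0..255) the same way the header above does."""
    if x == 0b11111111:
        return float("nan")  # the single NaN encoding
    if x == 0:
        return 2.0**-127  # fp32 bit pattern 0x00400000, a denormal
    # place the byte into the float32 exponent field; sign and mantissa stay 0
    return struct.unpack("!f", struct.pack("!I", x << 23))[0]


assert e8m0_bits_to_float(0b00000000) == 2.0**-127  # min() / lowest()
assert e8m0_bits_to_float(0b01111111) == 1.0  # epsilon(), i.e. 2**0
assert e8m0_bits_to_float(0b11111110) == 2.0**127  # max()
assert math.isnan(e8m0_bits_to_float(0b11111111))  # quiet_NaN()
```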

@@ -0,0 +1,12 @@
```cpp
#include <c10/macros/Macros.h>
#include <c10/util/Float8_e8m0fnu.h>

namespace c10 {

// TODO(#146647): Can we have these in a single shared cpp file
// built with macro to remove the need for a new cpp file?
static_assert(
    std::is_standard_layout_v<Float8_e8m0fnu>,
    "c10::Float8_e8m0fnu must be standard layout.");

} // namespace c10
```

c10/util/Float8_e8m0fnu.h (new file)

@@ -0,0 +1,120 @@
```cpp
#pragma once

/// Defines the Float8_e8m0fnu type (8-bit floating-point) including
/// conversions to standard C types
/// Binary configuration :
/// eeeeeeee
/// no sign bits
/// 8 exponent bits
/// no mantissa bits
///
/// This is the E8M0 dtype from the OCP MX format spec
/// (https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf,
/// Section 5.4.1)

#include <c10/macros/Export.h>
#include <c10/macros/Macros.h>
#include <c10/util/floating_point_utils.h>

#include <type_traits>

// TODO(#146647): do we need to special case OPENCL?
#if defined(__cplusplus)
#include <cstdint>
#elif !defined(__OPENCL_VERSION__)
#include <math.h>
#include <stdint.h>
#endif

#include <iosfwd>
#include <ostream>

namespace c10 {

namespace detail {

/*
 * Convert a 32-bit floating-point number in IEEE single-precision format to a
 * 8-bit floating-point number in fp8 e8m0fnu format, in bit representation.
 */
inline C10_HOST_DEVICE uint8_t fp8e8m0fnu_from_fp32_value(float f) {
  // TODO(#146647): maybe rewrite without control flow

  uint32_t f_bits = c10::detail::fp32_to_bits(f);

  // extract the exponent
  uint32_t exponent = (f_bits >> 23) & 0b11111111;

  // special case float32 NaN and +-inf to map to e8m0 nan
  if (exponent == 0b11111111) {
    return exponent;
  }

  // next, we use guard, round, sticky bits and the LSB to implement round to
  // nearest, with ties to even

  // guard bit - bit 23, or 22 zero-indexed
  uint8_t g = (f_bits & 0x400000) > 0;
  // round bit - bit 22, or 21 zero-indexed
  uint8_t r = (f_bits & 0x200000) > 0;
  // sticky bit - bits 21 to 1, or 20 to 0 zero-indexed
  uint8_t s = (f_bits & 0x1FFFFF) > 0;
  // in casting to e8m0, LSB is the implied mantissa bit. It equals to 0 if the
  // original float32 is denormal, and to 1 if the original float32 is normal.
  uint8_t lsb = exponent > 0;

  // implement the RNE logic
  bool round_up = false;

  // if g == 0, round down (no-op)
  if (g == 1) {
    if ((r == 1) || (s == 1)) {
      // round up
      round_up = true;
    } else {
      if (lsb == 1) {
        // round up
        round_up = true;
      }
      // if lsb == 0, round down (no-op)
    }
  }

  if (round_up) {
    // adjust exponent
    // note that if exponent was 255 we would have already returned earlier, so
    // we know we can add one safely without running out of bounds
    exponent++;
  }

  return exponent;
}

} // namespace detail

struct alignas(1) Float8_e8m0fnu {
  uint8_t x;

  struct from_bits_t {};
  C10_HOST_DEVICE static constexpr from_bits_t from_bits() {
    return from_bits_t();
  }

  Float8_e8m0fnu() = default;

  constexpr C10_HOST_DEVICE Float8_e8m0fnu(uint8_t bits, from_bits_t)
      : x(bits) {}
  inline C10_HOST_DEVICE Float8_e8m0fnu(float value);
  inline C10_HOST_DEVICE operator float() const;
  inline C10_HOST_DEVICE bool isnan() const;
};

C10_API inline std::ostream& operator<<(
    std::ostream& out,
    const Float8_e8m0fnu& value) {
  out << (float)value;
  return out;
}

} // namespace c10

#include <c10/util/Float8_e8m0fnu-inl.h> // IWYU pragma: keep
```
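```

A pure-Python sketch of the encode path, mirroring the guard/round/sticky RNE logic in `fp8e8m0fnu_from_fp32_value` above (`fp32_to_e8m0_bits` is a made-up helper name, not part of the PR):

```python
import struct


def fp32_to_e8m0_bits(f: float) -> int:
    """Round a float32 value to an e8m0 biased exponent (RNE), mirroring the C++ above."""
    f_bits = struct.unpack("!I", struct.pack("!f", f))[0]
    exponent = (f_bits >> 23) & 0xFF
    if exponent == 0xFF:  # NaN and +/-inf map to the e8m0 NaN encoding
        return exponent
    g = (f_bits >> 22) & 0x1  # guard bit
    r = (f_bits >> 21) & 0x1  # round bit
    s = int((f_bits & 0x1FFFFF) != 0)  # sticky bit
    lsb = int(exponent > 0)  # implied mantissa bit; 0 for float32 denormals
    round_up = g == 1 and (r == 1 or s == 1 or lsb == 1)
    return exponent + 1 if round_up else exponent


assert fp32_to_e8m0_bits(2.0) == 128  # exact: decodes back to 2 ** 1
assert fp32_to_e8m0_bits(3.0) == 129  # guard bit set, rounds up: decodes to 2 ** 2
assert fp32_to_e8m0_bits(5.0) == 129  # guard bit 0, rounds down to 2 ** 2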

```diff
@@ -5,6 +5,7 @@
 #include <c10/util/Float8_e4m3fnuz.h>
 #include <c10/util/Float8_e5m2.h>
 #include <c10/util/Float8_e5m2fnuz.h>
+#include <c10/util/Float8_e8m0fnu.h>
 #include <c10/util/Half.h>
 #include <c10/util/complex.h>
 #include <c10/util/overflows.h>
@@ -151,6 +152,19 @@ struct static_cast_with_inter_type<
   }
 };

+// TODO(#146647): Can we make all these template specialization happen
+// based off our apply macros?
+template <>
+struct static_cast_with_inter_type<
+    c10::complex<c10::Half>,
+    c10::Float8_e8m0fnu> {
+  C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex<
+      c10::Half>
+  apply(c10::Float8_e8m0fnu src) {
+    return static_cast<c10::complex<c10::Half>>(c10::complex<float>{src});
+  }
+};
+
 template <>
 struct static_cast_with_inter_type<c10::complex<c10::Half>, c10::Half> {
   C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex<
```

```diff
@@ -1,5 +1,6 @@
 # Owner(s): ["oncall: quantization"]

+import struct
 import unittest

 import torch
@@ -14,6 +15,7 @@ from torch.testing._internal.common_utils import (
     parametrize,
     run_tests,
     subtest,
+    TemporaryFileName,
     TestCase,
 )
@@ -23,11 +25,13 @@ FLOAT8_DTYPES = [
     torch.float8_e5m2fnuz,
     torch.float8_e4m3fn,
     torch.float8_e4m3fnuz,
+    torch.float8_e8m0fnu,
 ]

 CUDA_FLOAT8_DTYPES = [
     torch.float8_e5m2,
     torch.float8_e4m3fn,
+    torch.float8_e8m0fnu,
 ]

 # The following information are not yet provided by torch.finfo.
@@ -37,6 +41,7 @@ MANTISSA_BITS = {
     torch.float8_e5m2fnuz: 2,
     torch.float8_e4m3fn: 3,
     torch.float8_e4m3fnuz: 3,
+    torch.float8_e8m0fnu: 0,
 }

 # As in np.finfo(dtype).minexp
@@ -45,6 +50,7 @@ MINEXP = {
     torch.float8_e5m2fnuz: -15,
     torch.float8_e4m3fn: -6,
     torch.float8_e4m3fnuz: -7,
+    torch.float8_e8m0fnu: -127,
 }

 SPECIAL_NUMBERS = {
@@ -108,11 +114,24 @@ SPECIAL_NUMBERS = {
         ("00000001", 0.125 * (2**-7), "min_subnorm"),
         ("10000001", -0.125 * (2**-7), "neg_min_subnorm"),
     ],
+    torch.float8_e8m0fnu: [
+        ("00000000", float(2**-127), "smallest_number"),
+        ("11111110", float(2**127), "largest_number"),
+        ("01111110", 0.5, "zero_point_five"),
+        ("01111111", 1.0, "one"),
+        ("10000000", 2.0, "two"),
+        ("11111111", float("nan"), "nan"),
+    ],
 }

 FLOAT8_DTYPES_WITH_INF = [torch.float8_e5m2]


+def _int_bits_to_float(x):
+    y = struct.unpack("!f", struct.pack("!I", x))[0]
+    return y
+
+
 def simulate_fp8_precision(input, variant):
     """Round input (as float32) to the given float8 datatype variant."""
@@ -165,6 +184,24 @@ def simulate_fp8_precision(input, variant):
     return vals * signs


+def _round_e8m0_rne(biased_exponent, lsb, g, r, s):
+    round_up = False
+
+    # apply g,r,s rounding rules for RNE rounding
+    if g == 1:
+        if (r == 1) or (s == 1):
+            round_up = True
+        else:
+            if lsb:
+                round_up = True
+
+    # round up if necessary
+    if round_up:
+        biased_exponent += 1
+
+    return biased_exponent
+
+
 ROUND_TRIP_TEST_CASES = (
     # A general 'soak test'.
     subtest(
@@ -198,17 +235,19 @@ ROUND_TRIP_TEST_CASES = (
 class TestFloat8Dtype(TestCase):
-    """
-    Sanity test for zeros comparison
-    """
-
     @dtypes(*FLOAT8_DTYPES)
     @dtypesIfCUDA(*CUDA_FLOAT8_DTYPES)
     def test_creation_with_zeros(self, dtype, device):
         """Sanity test, round-trip casting of zeros."""
-        x = torch.zeros(8, dtype=torch.float, device=device)
         x8 = torch.zeros(8, dtype=dtype, device=device)
-        self.assertEqual(x, x8.float(), atol=0, rtol=0)
+        if dtype is torch.float8_e8m0fnu:
+            # zeros are not supported for this dtype, values get clamped
+            # to 2 ^ -127
+            x = torch.full((8,), 2**-127, dtype=torch.float, device=device)
+            self.assertEqual(x, x8.float(), atol=0, rtol=0)
+        else:
+            x = torch.zeros(8, dtype=torch.float, device=device)
+            self.assertEqual(x, x8.float(), atol=0, rtol=0)

     @dtypes(*FLOAT8_DTYPES)
     @dtypesIfCUDA(*CUDA_FLOAT8_DTYPES)
@@ -217,12 +256,69 @@ class TestFloat8Dtype(TestCase):
         """Numerical test of float8 conversion, by performing a round-trip cast
         to the float8 dtype and back to float32, comparing against simulated
         lower precision."""
+        if dtype is torch.float8_e8m0fnu:
+            return unittest.skip("numerics for e8m0fnu are tested elsewhere")
+
         x = get_input(dtype, device)
         x = torch.cat((x, -x))
         x8 = x.to(dtype)
         x8_simulated = simulate_fp8_precision(x, dtype)
         self.assertEqual(x8_simulated, x8.float())

+    def test_float8_e8m0fnu_rne_rounding(self, device):
+        """
+        For every possible e8m0 exponent (256 options) and for every possible
+        g, r, s bits of the float32 mantissa, verify that RNE rounding is
+        correctly applied when casting from float32 to e8m0
+
+        Note: this code is morally similar to `test_cast_round_trip`, but
+        IMO simpler to special case e8m0 here.
+        """
+        for biased_exponent in range(0, 256):
+            # iterate through all the possible options of guard, round, sticky bits
+            # for the current exponent
+            for grs in range(8):
+                # create a positive floating point number with the specified exponent
+                # and mantissa guard, round, sticky bits
+                uint32_t_start = (biased_exponent << 23) + (grs << 20)
+                fp32_start = _int_bits_to_float(uint32_t_start)
+
+                # create an RNE rounded version of the exponent
+                if biased_exponent == 255:
+                    new_biased_exponent = biased_exponent
+                else:
+                    lsb = biased_exponent > 0
+                    g = grs >> 2
+                    r = (grs >> 1) & 0b1
+                    s = grs & 0b1
+                    new_biased_exponent = _round_e8m0_rne(biased_exponent, lsb, g, r, s)
+
+                # create an RNE rounded version of the float
+                fp32_e8m0_fp32_emulated = _int_bits_to_float(new_biased_exponent << 23)
+
+                # now, do the same in PyTorch and see if results match
+                fp32_pt_start = torch.full(
+                    (1,), fp32_start, device=device, dtype=torch.float
+                )
+                fp32_pt_e8m0 = fp32_pt_start.to(torch.float8_e8m0fnu)
+                fp32_pt_e8m0_fp32 = fp32_pt_e8m0.to(torch.float)
+
+                expected = fp32_e8m0_fp32_emulated
+                if biased_exponent == 254 and grs >= 4:
+                    # special case rounding up from the largest representable float32 exponent, which
+                    # saturates to nan
+                    expected = float("nan")
+                elif biased_exponent == 255:
+                    # special case inf and nan, which becomes nan
+                    expected = float("nan")
+                actual = fp32_pt_e8m0_fp32.item()
+                self.assertEqual(
+                    expected, actual, f"expected: {expected}, actual: {actual}"
+                )
+
     @dtypes(*FLOAT8_DTYPES)
     @dtypesIfCUDA(*CUDA_FLOAT8_DTYPES)
     def test_special_numbers(self, dtype, device):
@@ -269,6 +365,32 @@
             torch.use_deterministic_algorithms(use_deterministic)
             torch.empty(4, 4, device=device, dtype=dtype)

+    @dtypes(*FLOAT8_DTYPES)
+    @dtypesIfCUDA(*CUDA_FLOAT8_DTYPES)
+    def test_to_string(self, dtype, device):
+        x = torch.empty(4, 4, device=device, dtype=dtype)
+        str(x)
+
+    @dtypes(*FLOAT8_DTYPES)
+    def test_finfo(self, dtype, device):
+        torch.finfo(dtype)
+
+    @dtypes(*FLOAT8_DTYPES)
+    @dtypesIfCUDA(*CUDA_FLOAT8_DTYPES)
+    def test_cat(self, dtype, device):
+        x1 = torch.empty(4, 4, device=device, dtype=dtype)
+        x2 = torch.empty(4, 4, device=device, dtype=dtype)
+        torch.cat([x1, x2])
+
+    @dtypes(*FLOAT8_DTYPES)
+    @dtypesIfCUDA(*CUDA_FLOAT8_DTYPES)
+    def test_save_load(self, dtype, device):
+        x1 = torch.randint(0, 10, (4, 4), device=device, dtype=torch.uint8).view(dtype)
+        with TemporaryFileName() as fname:
+            torch.save(x1, fname)
+            x1_save_load = torch.load(fname)
+            torch.testing.assert_close(x1, x1_save_load, atol=0, rtol=0)
+

 instantiate_device_type_tests(TestFloat8Dtype, globals())
@@ -285,6 +407,9 @@ class TestFloat8DtypeCPUOnly(TestCase):
     @dtypes(*CUDA_FLOAT8_DTYPES)
     def test_mul(self, dtype):
+        # TODO(#113663): remove arithmetic support from all float8 dtypes
+        if dtype is torch.float8_e8m0fnu:
+            return unittest.skip("arithmetic not supported for torch.float8_e8m0fnu")
         shape = (10, 10)
         a = torch.randn(shape)
         a8_simulated = simulate_fp8_precision(a, dtype)
@@ -299,6 +424,11 @@
     @unittest.skipIf(IS_WINDOWS, "torch.compile not supported on Windows yet")
     @dtypes(*CUDA_FLOAT8_DTYPES)
     def test_pt2_traceable_aot_eager(self, dtype):
+        if dtype is torch.float8_e8m0fnu:
+            return unittest.skip(
+                "PT2 support for torch.float8_e8m0fnu is not implemented yet"
+            )
+
         @torch.compile(backend="aot_eager", fullgraph=True)
         def f(x):
             x = x.to(dtype)
```

```diff
@@ -1362,7 +1362,7 @@ def gen_pyi(
     # Generate type signatures for dtype classes
     # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    # TODO: don't explicitly list dtypes here; get it from canonical
+    # TODO(#146647): don't explicitly list dtypes here; get it from canonical
     # source
     dtype_class_hints = [
         f"{n}: dtype = ..."
@@ -1377,6 +1377,7 @@
         "float8_e4m3fnuz",
         "float8_e5m2",
         "float8_e5m2fnuz",
+        "float8_e8m0fnu",
         "half",
         "uint8",
         "uint16",
```

```diff
@@ -150,7 +150,17 @@ class _Formatter:
                 # no valid number, do nothing
                 return

+            if tensor.dtype == torch.float8_e8m0fnu:  # type: ignore[attr-defined]
+                # float8_e8m0fnu is special and does not define arithmetic ops,
+                # and printing code further in this file assumes the existence
+                # of various arithmetic ops to figure out what to print. We hack
+                # and convert to float here to make printing work correctly.
+                # TODO(#113663): also add the other float8 dtypes here after arithmetic
+                # support for them is removed
+                nonzero_finite_vals = nonzero_finite_vals.float()
+
             # Convert to double for easy calculation. HalfTensor overflows with 1e8, and there's no div() on CPU.
             nonzero_finite_abs = tensor_totype(nonzero_finite_vals.abs())
             nonzero_finite_min = tensor_totype(nonzero_finite_abs.min())
             nonzero_finite_max = tensor_totype(nonzero_finite_abs.max())
```
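
A quick way to exercise this printing path (illustrative; the exact rendering is not asserted here):

```python
import torch

# printing routes through the float conversion added above, since the dtype
# defines no arithmetic ops of its own
x = torch.full((2, 2), 4.0).to(torch.float8_e8m0fnu)
print(x)
```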

```diff
@@ -123,16 +123,15 @@ static PyObject* THPDTypeInfo_bits(THPDTypeInfo* self, void*) {
 }

 #define _AT_DISPATCH_FINFO_TYPES(TYPE, NAME, ...) \
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND6( \
-      at::kHalf, \
-      at::ScalarType::BFloat16, \
-      at::ScalarType::Float8_e5m2, \
-      at::ScalarType::Float8_e5m2fnuz, \
-      at::ScalarType::Float8_e4m3fn, \
-      at::ScalarType::Float8_e4m3fnuz, \
+  AT_DISPATCH_V2( \
       TYPE, \
       NAME, \
-      __VA_ARGS__)
+      AT_WRAP(__VA_ARGS__), \
+      AT_EXPAND(AT_FLOATING_TYPES), \
+      AT_EXPAND(AT_COMPLEX_TYPES), \
+      at::kHalf, \
+      at::ScalarType::BFloat16, \
+      AT_EXPAND(AT_FLOAT8_TYPES))

 static PyObject* THPFInfo_eps(THPFInfo* self, void*) {
   HANDLE_TH_ERRORS
```

```diff
@@ -79,6 +79,7 @@ inline void store_scalar(void* data, at::ScalarType scalarType, PyObject* obj) {
       *(at::BFloat16*)data =
           at::convert<at::BFloat16, double>(THPUtils_unpackDouble(obj));
       break;
+    // TODO(#146647): simplify below with macros
     case at::kFloat8_e5m2:
       *(at::Float8_e5m2*)data =
           at::convert<at::Float8_e5m2, double>(THPUtils_unpackDouble(obj));
@@ -95,8 +96,12 @@ inline void store_scalar(void* data, at::ScalarType scalarType, PyObject* obj) {
       *(at::Float8_e4m3fnuz*)data =
           at::convert<at::Float8_e4m3fnuz, double>(THPUtils_unpackDouble(obj));
       break;
+    case at::kFloat8_e8m0fnu:
+      *(at::Float8_e8m0fnu*)data =
+          at::convert<at::Float8_e8m0fnu, double>(THPUtils_unpackDouble(obj));
+      break;
     default:
-      throw std::runtime_error("invalid type");
+      throw std::runtime_error("store_scalar: invalid type");
   }
 }
@@ -143,6 +148,7 @@ inline PyObject* load_scalar(const void* data, at::ScalarType scalarType) {
     case at::kBFloat16:
       return PyFloat_FromDouble(
           at::convert<double, at::BFloat16>(*(at::BFloat16*)data));
+    // TODO(#146647): simplify below with macros
     case at::kFloat8_e5m2:
       return PyFloat_FromDouble(
           at::convert<double, at::Float8_e5m2>(*(at::Float8_e5m2*)data));
@@ -155,8 +161,11 @@ inline PyObject* load_scalar(const void* data, at::ScalarType scalarType) {
     case at::kFloat8_e4m3fnuz:
       return PyFloat_FromDouble(at::convert<double, at::Float8_e4m3fnuz>(
           *(at::Float8_e4m3fnuz*)data));
+    case at::kFloat8_e8m0fnu:
+      return PyFloat_FromDouble(
+          at::convert<double, at::Float8_e8m0fnu>(*(at::Float8_e8m0fnu*)data));
     default:
-      throw std::runtime_error("invalid type");
+      throw std::runtime_error("load_scalar: invalid type");
   }
 }
```

```diff
@@ -535,6 +535,7 @@ def _new_dtypes():
         torch.float8_e4m3fn,
         torch.float8_e5m2fnuz,
         torch.float8_e4m3fnuz,
+        torch.float8_e8m0fnu,
         torch.bits8,
         torch.bits16,
         torch.bits1x8,
```

```diff
@@ -51,6 +51,7 @@ float8_e5m2T = BaseCppType("at", "Float8_e5m2")
 float8_e5m2fnuzT = BaseCppType("at", "Float8_e5m2fnuz")
 float8_e4m3fnT = BaseCppType("at", "Float8_e4m3fn")
 float8_e4m3fnuzT = BaseCppType("at", "Float8_e4m3fnuz")
+float8_e8m0fnuT = BaseCppType("at", "Float8_e8m0fnu")
 stringT = BaseCppType("c10", "string_view")
 generatorT = BaseCppType("at", "Generator")
 scalarTypeT = BaseCppType("at", "ScalarType")
@@ -102,6 +103,7 @@ ScalarTypeToCppMapping: dict[ScalarType, BaseCppType] = {
     ScalarType.Float8_e5m2fnuz: float8_e5m2fnuzT,
     ScalarType.Float8_e4m3fn: float8_e4m3fnT,
     ScalarType.Float8_e4m3fnuz: float8_e4m3fnuzT,
+    ScalarType.Float8_e8m0fnu: float8_e8m0fnuT,
 }

 BaseTypeToCppMapping: dict[BaseTy, BaseCppType] = {
```

```diff
@@ -374,6 +374,7 @@ class ScalarType(Enum):
     Float8_e5m2fnuz = auto()
     Float8_e4m3fn = auto()
     Float8_e4m3fnuz = auto()
+    Float8_e8m0fnu = auto()

     def __str__(self) -> str:
         return self.name
```