fix missing-prototypes warnings in torch_cpu (Part 6) (#101845)

This PR fixes more missing-prototypes violations in the torch_cpu source, following PRs #100053, #100147, #100245, #100849, and #101788.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/101845
Approved by: https://github.com/albanD
Author: cyy
Date: 2023-06-15 16:48:25 +00:00
Committed by: PyTorch MergeBot
parent e75f7994e1
commit f2900420da
26 changed files with 132 additions and 100 deletions
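As background for the diffs below: clang's -Wmissing-prototypes fires whenever a function with external linkage is defined without a prior declaration. A minimal sketch of the two standard fixes applied throughout this PR (the names are illustrative, not taken from the diff):

// example.cpp (hypothetical translation unit)

// Fix 1: mark a file-local helper static so it has internal linkage and
// no prototype is expected.
static int double_it(int x) {
  return x * 2;
}

// Fix 2: keep external linkage but make the prototype visible first,
// normally by including the header that declares the function.
int exported_helper(int x);  // in real code this comes from a header
int exported_helper(int x) {
  return double_it(x) + 1;
}

A third option, used where neither is possible, is to suppress the warning locally with clang diagnostic pragmas; the first torch_cpu hunk below does exactly that for macro-generated cast ops.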

View File

@ -64,6 +64,11 @@ bool operator==(const ivalue::Tuple& lhs, const ivalue::Tuple& rhs) {
_fastEqualsForContainer);
}
std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) {
out << v.qualifiedClassName() << "." << v.name();
return out;
}
bool operator==(const ivalue::EnumHolder& lhs, const ivalue::EnumHolder& rhs) {
return lhs.name() == rhs.name() && *rhs.type() == *lhs.type();
}
@ -763,11 +768,6 @@ IValueComparator getGreaterThanComparator(const IValue& v) {
};
}
std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) {
out << v.qualifiedClassName() << "." << v.name();
return out;
}
std::ostream& operator<<(std::ostream & out, const IValue & v) {
auto formatter = [&](std::ostream& out, const IValue& v) {
out << v;

View File

@ -171,7 +171,7 @@ Tensor arange(
return at::arange_out(result, start, end, step);
}
Tensor& arange_start_out(const Scalar& start, const Scalar& end, Tensor& result) {
static Tensor& arange_start_out(const Scalar& start, const Scalar& end, Tensor& result) {
return at::arange_out(result, start, end, /*step=*/1);
}
@ -179,7 +179,7 @@ Tensor& arange_out(const Scalar& end, Tensor& result) {
return at::arange_out(result, /*start=*/0, end, /*step=*/1);
}
Tensor& arange_out(Tensor& result, const Scalar& start, const Scalar& end) {
static Tensor& arange_out(Tensor& result, const Scalar& start, const Scalar& end) {
return at::arange_out(result, start, end, /*step=*/1);
}
@ -189,14 +189,14 @@ Tensor _dim_arange(const Tensor& like, int64_t dim) {
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ complex / polar ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
void complex_check_floating(const Tensor& a, const Tensor& b) {
static void complex_check_floating(const Tensor& a, const Tensor& b) {
TORCH_CHECK((a.scalar_type() == kFloat || a.scalar_type() == kDouble || a.scalar_type() == kHalf) &&
(b.scalar_type() == kFloat || b.scalar_type() == kDouble || b.scalar_type() == kHalf),
"Expected both inputs to be Half, Float or Double tensors but got ",
a.scalar_type(), " and ", b.scalar_type());
}
void complex_check_dtype(
static void complex_check_dtype(
const Tensor& result,
const Tensor& a,
const Tensor& b) {
@ -352,7 +352,12 @@ Tensor& empty_out(IntArrayRef size,
return self.to(ScalarType::n, non_blocking); \
}
// The cast ops generated here for some scalar types have no declarations; they may be unused in PyTorch.
// We keep them and suppress the warning here until this is verified in the future.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wmissing-prototypes"
AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DEFINE_CAST_OP)
#pragma clang diagnostic pop
#undef DEFINE_CAST_OP
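As a generic illustration of the pattern above (this is not the real DEFINE_CAST_OP macro, just a stand-in): when a macro expands to free-function definitions and some instantiations legitimately have no declaration, the warning can be silenced around that expansion only:

// Hypothetical macro that stamps out free functions; some of them have no
// declaration in any header, so clang would warn on each expansion.
#define DEFINE_UNIT_OP(T) \
  T unit_##T() {          \
    return T(1);          \
  }

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wmissing-prototypes"
DEFINE_UNIT_OP(int)
DEFINE_UNIT_OP(float)
#pragma clang diagnostic pop
#undef DEFINE_UNIT_OP

Keeping the push/pop tight means any genuinely missing prototype elsewhere in the file is still reported.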

View File

@ -691,7 +691,7 @@ static void modified_bessel_k1_kernel(TensorIteratorBase& iterator) {
#define IMPLEMENT_FLOAT_KERNEL(op) \
inline namespace CPU_CAPABILITY { \
void op##_kernel(TensorIteratorBase& iter) { \
static void op##_kernel(TensorIteratorBase& iter) { \
TORCH_INTERNAL_ASSERT(iter.ntensors() == 2); \
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), #op "_vml_cpu", [&]() { \
constexpr int64_t grain_size = 2048; \
@ -715,6 +715,19 @@ static void modified_bessel_k1_kernel(TensorIteratorBase& iterator) {
} \
REGISTER_DISPATCH(op##_stub, &CPU_CAPABILITY::op##_kernel)
#define STATIC_IMPLEMENT_COMPLEX_KERNEL(op) \
inline namespace CPU_CAPABILITY { \
static void op##_kernel(TensorIteratorBase& iter) { \
TORCH_INTERNAL_ASSERT(iter.ntensors() == 2); \
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), #op "_vml_cpu", [&]() { \
constexpr int64_t grain_size = 2048; \
iter.for_each(IMPLEMENT_ITERATOR_LAMBDA(op), grain_size); \
}); \
iter.cast_outputs(); \
} \
} \
REGISTER_DISPATCH(op##_stub, &CPU_CAPABILITY::op##_kernel)
} // CPU_CAPABILITY namespace
REGISTER_DISPATCH(rsqrt_stub, &CPU_CAPABILITY::rsqrt_kernel);
@ -761,51 +774,28 @@ REGISTER_DISPATCH(special_modified_bessel_i1_stub, &CPU_CAPABILITY::modified_bes
REGISTER_DISPATCH(special_modified_bessel_k0_stub, &CPU_CAPABILITY::modified_bessel_k0_kernel);
REGISTER_DISPATCH(special_modified_bessel_k1_stub, &CPU_CAPABILITY::modified_bessel_k1_kernel);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_COMPLEX_KERNEL(acos)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_COMPLEX_KERNEL(asin)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_COMPLEX_KERNEL(atan)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
STATIC_IMPLEMENT_COMPLEX_KERNEL(acos)
STATIC_IMPLEMENT_COMPLEX_KERNEL(asin)
STATIC_IMPLEMENT_COMPLEX_KERNEL(atan)
IMPLEMENT_FLOAT_KERNEL(ceil)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_COMPLEX_KERNEL(cos)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
STATIC_IMPLEMENT_COMPLEX_KERNEL(cos)
IMPLEMENT_FLOAT_KERNEL(erf)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_FLOAT_KERNEL(erfc)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_FLOAT_KERNEL(erfinv)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_COMPLEX_KERNEL(exp)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_COMPLEX_KERNEL(expm1)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
STATIC_IMPLEMENT_COMPLEX_KERNEL(exp)
STATIC_IMPLEMENT_COMPLEX_KERNEL(expm1)
IMPLEMENT_FLOAT_KERNEL(floor)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_COMPLEX_KERNEL(log)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_COMPLEX_KERNEL(log10)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_COMPLEX_KERNEL(log1p)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_COMPLEX_KERNEL(log2)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
STATIC_IMPLEMENT_COMPLEX_KERNEL(log)
STATIC_IMPLEMENT_COMPLEX_KERNEL(log10)
STATIC_IMPLEMENT_COMPLEX_KERNEL(log1p)
STATIC_IMPLEMENT_COMPLEX_KERNEL(log2)
IMPLEMENT_FLOAT_KERNEL(i0)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_FLOAT_KERNEL(round)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_COMPLEX_KERNEL(sin)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
STATIC_IMPLEMENT_COMPLEX_KERNEL(sin)
IMPLEMENT_COMPLEX_KERNEL(sqrt)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_COMPLEX_KERNEL(tan)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_COMPLEX_KERNEL(tanh)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
STATIC_IMPLEMENT_COMPLEX_KERNEL(tan)
STATIC_IMPLEMENT_COMPLEX_KERNEL(tanh)
IMPLEMENT_FLOAT_KERNEL(trunc)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
IMPLEMENT_FLOAT_KERNEL(lgamma)
} // namespace at::native
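A note on why the STATIC_ variant of the kernel macro is safe: marking the kernels static gives them internal linkage, which silences the warning, and the REGISTER_DISPATCH calls still work because they take the kernel's address in the same translation unit. A simplified sketch of that relationship, using a made-up registration macro rather than the real ATen dispatch machinery:

// Simplified stand-in for a dispatch stub and its registration macro.
using Kernel = void (*)(int);
struct Stub { Kernel fn = nullptr; };
static Stub sqrt_stub;

#define REGISTER_KERNEL(stub, f) \
  static const bool stub##_registered = ((stub).fn = (f), true);

// Internal linkage is fine: the only place the address is needed is the
// registration below, inside this same translation unit.
static void sqrt_kernel(int n) {
  (void)n;  // elementwise work would go here
}

REGISTER_KERNEL(sqrt_stub, &sqrt_kernel)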

View File

@ -27,19 +27,19 @@ Tensor mkldnn_convolution(
TORCH_CHECK(false, "mkldnn_convolution_forward: ATen not compiled with MKLDNN support");
}
Tensor mkldnn_convolution_backward_input(
static Tensor mkldnn_convolution_backward_input(
IntArrayRef input_size, const Tensor& grad_output, const Tensor& weight,
IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) {
TORCH_CHECK(false, "mkldnn_convolution_backward_input: ATen not compiled with MKLDNN support");
}
std::tuple<Tensor, Tensor> mkldnn_convolution_backward_weights(
static std::tuple<Tensor, Tensor> mkldnn_convolution_backward_weights(
IntArrayRef weight_size, const Tensor& grad_output, const Tensor& input,
IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) {
TORCH_CHECK(false, "mkldnn_convolution_backward_weights: ATen not compiled with MKLDNN support");
}
std::tuple<Tensor, Tensor, Tensor> mkldnn_convolution_backward(
static std::tuple<Tensor, Tensor, Tensor> mkldnn_convolution_backward(
const Tensor& input, const Tensor& grad_output_t, const Tensor& weight,
IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, std::array<bool,3> output_mask) {
TORCH_CHECK(false, "mkldnn_convolution_backward: ATen not compiled with MKLDNN support");
@ -47,27 +47,27 @@ std::tuple<Tensor, Tensor, Tensor> mkldnn_convolution_backward(
REGISTER_NO_CPU_DISPATCH(mkldnn_convolution_backward_stub);
Tensor mkldnn_convolution_transpose(
static Tensor mkldnn_convolution_transpose(
const Tensor& input, const Tensor& weight, const c10::optional<Tensor>& bias_opt,
IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) {
TORCH_CHECK(false, "mkldnn_convolution_transpose: ATen not compiled with MKLDNN support");
}
Tensor mkldnn_convolution_transpose_backward_input(
static Tensor mkldnn_convolution_transpose_backward_input(
IntArrayRef input_size, const Tensor& grad_output, const Tensor& weight,
IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation,
int64_t groups, bool bias_defined) {
TORCH_CHECK(false, "mkldnn_convolution_transpose_backward_input: ATen not compiled with MKLDNN support");
}
std::tuple<Tensor, Tensor> mkldnn_convolution_transpose_backward_weights(
static std::tuple<Tensor, Tensor> mkldnn_convolution_transpose_backward_weights(
IntArrayRef weight_size, const Tensor& grad_output, const Tensor& input,
IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation,
int64_t groups, bool bias_defined) {
TORCH_CHECK(false, "mkldnn_convolution_transpose_backward_weights: ATen not compiled with MKLDNN support");
}
std::tuple<Tensor, Tensor, Tensor> mkldnn_convolution_transpose_backward(
static std::tuple<Tensor, Tensor, Tensor> mkldnn_convolution_transpose_backward(
const Tensor& input, const Tensor& grad_output_t, const Tensor& weight,
IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation,
int64_t groups, std::array<bool,3> output_mask) {

View File

@ -6,6 +6,7 @@
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_native_batch_norm_legit_native.h>
#include <ATen/ops/_to_dense_native.h>
#include <ATen/ops/empty_native.h>
#include <ATen/ops/native_batch_norm_backward_native.h>
@ -34,7 +35,7 @@ std::tuple<Tensor, Tensor, Tensor> mkldnn_batch_norm_backward(
TORCH_CHECK(false, "mkldnn_batch_norm_backward: ATen not compiled with MKLDNN support");
}
std::tuple<Tensor, Tensor, Tensor> mkldnn_layer_norm_last_index_weight_bias_f32(
static std::tuple<Tensor, Tensor, Tensor> mkldnn_layer_norm_last_index_weight_bias_f32(
const Tensor& input,
IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias,
double eps, bool inplace) {

View File

@ -106,7 +106,7 @@ namespace at {
namespace native {
Tensor mkldnn_view_symint(const Tensor& self, c10::SymIntArrayRef size) {
static Tensor mkldnn_view_symint(const Tensor& self, c10::SymIntArrayRef size) {
return mkldnn_view(self, C10_AS_INTARRAYREF_SLOW(size));
}

View File

@ -5,6 +5,7 @@
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/is_nonzero_native.h>
#include <ATen/ops/_foobar_native.h>
#include <ATen/ops/_test_functorch_fallback_native.h>
#endif

View File

@ -21,6 +21,7 @@
#include <ATen/ops/quantized_max_pool1d_native.h>
#include <ATen/ops/quantized_max_pool2d.h>
#include <ATen/ops/quantized_max_pool2d_native.h>
#include <ATen/ops/quantized_max_pool3d_native.h>
#endif
#include <algorithm>

View File

@ -32,6 +32,7 @@
#include <ATen/ops/_sparse_csr_tensor_unsafe_native.h>
#include <ATen/ops/_sparse_mm_reduce_impl_backward_native.h>
#include <ATen/ops/_sparse_mm_reduce_impl_backward_native.h>
#include <ATen/ops/_sparse_mm_reduce_impl_native.h>
#include <ATen/ops/_unique.h>
#include <ATen/ops/abs.h>
#include <ATen/ops/abs_native.h>
@ -464,7 +465,7 @@ CREATE_UNARY_UFUNC(tan);
CREATE_UNARY_UFUNC(tanh);
CREATE_UNARY_UFUNC(trunc);
CREATE_UNARY_UFUNC(conj_physical);
CREATE_UNARY_UFUNC(relu);
static CREATE_UNARY_UFUNC(relu);
// With addition of `round.decimals` overload, using CREATE_UNARY_UFUNC leads
// to unresolved overload.
@ -776,18 +777,6 @@ Tensor _sparse_csr_mm(const Tensor& mat1, const Tensor& mat2) {
1.0);
}
Tensor _sparse_csr_addmm(
const Tensor& t,
const SparseCsrTensor& sparse,
const Tensor& dense,
const Scalar& beta,
const Scalar& alpha) {
// _sparse_addmm forward is functionally equivalent to addmm; it's
// just the backward that is different. This technically does an
// unnecessary redispatch, I was too lazy to make it not do that
return at::addmm(t, sparse, dense, beta, alpha);
}
// Functions for element-wise addition.
Tensor add_sparse_csr(
const Tensor& self,

View File

@ -188,7 +188,12 @@ COALESCED_UNARY_UFUNC(sqrt);
COALESCED_UNARY_UFUNC(tan);
COALESCED_UNARY_UFUNC(tanh);
COALESCED_UNARY_UFUNC(trunc);
// The relu ufunc has no declaration; it may be unused in PyTorch.
// We keep it and suppress the warning here until this is verified in the future.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wmissing-prototypes"
COALESCED_UNARY_UFUNC(relu);
#pragma clang diagnostic pop
COALESCED_UNARY_UFUNC_NO_INPLACE(signbit);
COALESCED_UNARY_UFUNC_NO_INPLACE(isneginf);

View File

@ -1,6 +1,7 @@
#ifdef USE_XNNPACK
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/Engine.h>
#include <ATen/native/utils/Factory.h>
namespace at {
@ -18,7 +19,7 @@ bool use_hardswish(
true;
}
Tensor& hardswish_impl(Tensor& input, Tensor& output) {
static Tensor& hardswish_impl(Tensor& input, Tensor& output) {
using namespace internal;
xnn_operator_t hardswish_op{};

View File

@ -1,7 +1,8 @@
#ifdef USE_XNNPACK
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/utils/Factory.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/Engine.h>
#include <ATen/native/xnnpack/Pooling.h>
namespace at {

View File

@ -1,6 +1,7 @@
#ifdef USE_XNNPACK
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/Engine.h>
#include <ATen/native/utils/Factory.h>
namespace at {

View File

@ -2,11 +2,12 @@
#include <vector>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/native/utils/Factory.h>
#include <ATen/native/utils/ParamUtils.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/Convolution.h>
#include <ATen/native/xnnpack/Engine.h>
#include <c10/util/irange.h>
namespace at {

View File

@ -62,6 +62,15 @@ Tensor run(ContextConv2D& context, const Tensor& input);
} // namespace convolution2d
} // namespace internal
Tensor convolution2d(
const Tensor& input,
const Tensor& weight,
const Tensor& bias,
const IntArrayRef padding,
const IntArrayRef stride,
const IntArrayRef dilation,
const int64_t groups);
} // namespace xnnpack
} // namespace native
} // namespace at

View File

@ -32,6 +32,16 @@ ContextLinear create(
Tensor run(const ContextLinear& context, const Tensor& input);
} // namespace linear
} // namespace internal
bool use_linear(
const Tensor& input,
const Tensor& weight,
const Tensor& bias);
Tensor linear(
const Tensor& input,
const Tensor& weight,
const Tensor& bias);
} // namespace xnnpack
} // namespace native
} // namespace at
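These header additions take the other route: use_linear and linear are called from other translation units, so instead of marking the definitions static the prototypes are declared in the header that their .cpp file already includes. A generic sketch of that arrangement, with a hypothetical helper name:

// fast_path.h (hypothetical header)
namespace example {
bool use_fast_path(int size);  // prototype visible to every includer
}  // namespace example

// fast_path.cpp would #include "fast_path.h", so the definition below is
// preceded by a declaration and -Wmissing-prototypes stays quiet.
namespace example {
bool use_fast_path(int size) {
  return size > 64;
}
}  // namespace example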

View File

@ -1,8 +1,9 @@
#ifdef USE_XNNPACK
#include <ATen/native/Pool.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/utils/Factory.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/Engine.h>
#include <ATen/native/xnnpack/Pooling.h>
namespace at {

View File

@ -18,6 +18,7 @@ namespace native {
// This file contains a number of kernels for aten functions that are fully code-generated.
// TODO: rename this file to something more generic.
namespace {
at::Tensor clone_arg(const at::Tensor& t) {
return t.clone();
}
@ -59,6 +60,7 @@ void resize_out_helper(const at::TensorList& dst, const at::TensorList& src) {
at::native::resize_output(dst[i], src[i].sizes());
}
}
}
${CompositeViewCopyKernel_Definitions}
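Wrapping the generated helpers in an anonymous namespace is a third way to satisfy the warning: like static, it gives every enclosed function internal linkage, and it makes the file-local intent explicit for code produced by this template. A minimal sketch with illustrative names:

namespace {
// Everything here has internal linkage, so no prototypes are expected.
int clone_count(int n) {
  return n + 1;
}
}  // namespace

int use_helpers();  // exported entry point, declared before its definition
int use_helpers() {
  return clone_count(41);
}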

View File

@ -22,9 +22,10 @@ class VulkanImplRegistrar {
};
at::Tensor& vulkan_copy_(at::Tensor& self, const at::Tensor& src);
} // namespace vulkan
namespace native {
bool is_vulkan_available();
}// namespace native
} // namespace vulkan
} // namespace at

View File

@ -62,6 +62,7 @@ class TORCH_API /*alignas(kCacheLineSize)*/ ThreadPool {
size_t minWorkSize_;
};
size_t getDefaultNumThreads();
} // namespace caffe2
#endif // CAFFE2_UTILS_THREADPOOL_H_

View File

@ -8,6 +8,7 @@
#include <torch/csrc/autograd/VariableTypeUtils.h>
#include <torch/csrc/autograd/autograd.h>
#include <torch/csrc/autograd/functions/utils.h>
#include <torch/csrc/autograd/generated/VariableType.h>
#include <torch/csrc/utils/memory.h>
#include <torch/library.h>
@ -22,7 +23,7 @@ namespace torch {
namespace autograd {
namespace VariableType {
std::vector<at::DeprecatedTypeProperties*> allTypesForBackends(
static std::vector<at::DeprecatedTypeProperties*> allTypesForBackends(
at::ArrayRef<at::Backend> backends) {
std::vector<DeprecatedTypeProperties*> res;
res.reserve(backends.size());
@ -37,16 +38,16 @@ std::vector<at::DeprecatedTypeProperties*> allTypesForBackends(
return res;
}
C10_EXPORT std::vector<at::DeprecatedTypeProperties*> allCPUTypes() {
std::vector<at::DeprecatedTypeProperties*> allCPUTypes() {
return allTypesForBackends({Backend::CPU, Backend::SparseCPU});
}
C10_EXPORT std::vector<at::DeprecatedTypeProperties*> allCUDATypes() {
std::vector<at::DeprecatedTypeProperties*> allCUDATypes() {
at::globalContext().lazyInitCUDA();
return allTypesForBackends({Backend::CUDA, Backend::SparseCUDA});
}
C10_EXPORT std::vector<at::DeprecatedTypeProperties*> allXPUTypes() {
std::vector<at::DeprecatedTypeProperties*> allXPUTypes() {
return allTypesForBackends({Backend::XPU, Backend::SparseXPU});
}
@ -375,7 +376,7 @@ namespace ADInplaceOrView {
: (at::GradMode::is_enabled() ? CreationMeta::DEFAULT \
: CreationMeta::NO_GRAD_MODE)
Tensor& copy_(
static Tensor& copy_(
c10::DispatchKeySet ks,
Tensor& self,
const Tensor& src,
@ -389,7 +390,7 @@ Tensor& copy_(
return self;
}
const Tensor& resize_(
static const Tensor& resize_(
c10::DispatchKeySet ks,
const Tensor& self,
SymIntArrayRef size,
@ -413,7 +414,7 @@ const Tensor& resize_(
return self;
}
const Tensor& resize_as_(
static const Tensor& resize_as_(
c10::DispatchKeySet ks,
const Tensor& self,
const Tensor& the_template,
@ -438,7 +439,7 @@ const Tensor& resize_as_(
return self;
}
Tensor detach(c10::DispatchKeySet ks, const Tensor& self) {
static Tensor detach(c10::DispatchKeySet ks, const Tensor& self) {
auto out = ([&]() {
at::AutoDispatchBelowADInplaceOrView guard;
return at::_ops::detach::redispatch(
@ -460,7 +461,10 @@ Tensor detach(c10::DispatchKeySet ks, const Tensor& self) {
return result;
}
Tensor _fw_primal(c10::DispatchKeySet ks, const Tensor& self, int64_t level) {
static Tensor _fw_primal(
c10::DispatchKeySet ks,
const Tensor& self,
int64_t level) {
auto tmp = ([&]() {
at::AutoDispatchBelowADInplaceOrView guard;
return at::alias(self);
@ -484,7 +488,7 @@ Tensor _fw_primal(c10::DispatchKeySet ks, const Tensor& self, int64_t level) {
}
// NB: This does not redispatch any further
Tensor _make_dual(
static Tensor _make_dual(
c10::DispatchKeySet ks,
const Tensor& primal,
const Tensor& tangent,

View File

@ -40,6 +40,7 @@ extern "C" {
// This function is needed to avoid superfluous dependency on GNU OpenMP library
// when cuPTI is linked statically. For more details, see
// https://github.com/pytorch/pytorch/issues/51026
__attribute__((weak)) int acc_get_device_type();
__attribute__((weak)) int acc_get_device_type() {
throw std::runtime_error(
"Dummy implementation of acc_get_device_type is not supposed to be called!");

View File

@ -62,6 +62,9 @@ namespace ops {
// Below are ProcessGroup's corresponding ops for each backend. Ops are simply
// routed through the dispatcher to be dispatched to the appropriate backend.
// Currently a no-op as the process group does not have a list of backends.
namespace {
#define IMPL_SEND(DEV) \
c10::intrusive_ptr<Work> send##DEV( \
at::TensorList tensors, \
@ -425,6 +428,7 @@ void monitored_barrier_CPU(
BarrierOptions{device_ids, std::chrono::milliseconds(timeout)},
wait_all_ranks);
}
} // namespace
// register functions to dispatcher
namespace {

View File

@ -30,7 +30,7 @@ namespace jit {
namespace mobile {
namespace nnc {
std::vector<int64_t> getConstSizes(const BufPtr b) {
static std::vector<int64_t> getConstSizes(const BufPtr b) {
std::vector<int64_t> r;
for (const auto& dim : b->dims()) {
LongImmPtr imm_dim = to<LongImm>(dim);
@ -42,7 +42,7 @@ std::vector<int64_t> getConstSizes(const BufPtr b) {
}
// Construct input-specs vector from the inputs of the original graph
std::vector<mobile::nnc::InputSpec> toInputSpecs(
static std::vector<mobile::nnc::InputSpec> toInputSpecs(
const std::shared_ptr<tensorexpr::TensorExprKernel>& kernel) {
const std::shared_ptr<Graph>& g = kernel->graph();
std::vector<mobile::nnc::InputSpec> specs;
@ -89,7 +89,7 @@ std::vector<mobile::nnc::InputSpec> toInputSpecs(
// If a symbolic shape can be found in several different positions, we
// return the first one we find (TODO: maybe we should return all and
// verify that they all match at runtime).
std::vector<SymbolicShapePosition> findSymbolicShapePositions(
static std::vector<SymbolicShapePosition> findSymbolicShapePositions(
std::shared_ptr<tensorexpr::TensorExprKernel> kernel) {
std::vector<SymbolicShapePosition> res;
for (int64_t sym_idx : kernel->getSymbolicShapeInputs()) {
@ -122,7 +122,7 @@ std::vector<SymbolicShapePosition> findSymbolicShapePositions(
return res;
}
std::unique_ptr<Function> compileMethod(
static std::unique_ptr<Function> compileMethod(
std::shared_ptr<tensorexpr::TensorExprKernel> kernel,
const std::string& method_name,
const std::vector<std::vector<int64_t>>& sizes,
@ -181,7 +181,7 @@ std::unique_ptr<Function> compileMethod(
return func;
}
std::pair<std::unique_ptr<Function>, const std::string> aotCompile(
static std::pair<std::unique_ptr<Function>, const std::string> aotCompile(
const std::string& method_name,
std::shared_ptr<Graph>& g,
const std::vector<std::vector<int64_t>>& sizes,
@ -217,7 +217,7 @@ std::pair<std::unique_ptr<Function>, const std::string> aotCompile(
return std::make_pair(std::move(func), compiled_assembly);
}
void writeOutputLlvmAssembly(
static void writeOutputLlvmAssembly(
const std::string& asm_code,
const std::string& output_llvm_file_name) {
std::ofstream output(output_llvm_file_name);
@ -226,7 +226,7 @@ void writeOutputLlvmAssembly(
"The compiled llvm assembly code was saved to ", output_llvm_file_name);
}
std::vector<std::string> split(
static std::vector<std::string> split(
char separator,
const std::string& string,
bool ignore_empty = true) {
@ -241,7 +241,7 @@ std::vector<std::string> split(
return pieces;
}
std::vector<std::vector<int64_t>> parseInputShapes(
static std::vector<std::vector<int64_t>> parseInputShapes(
const std::string& input_dims_s) {
std::vector<std::string> input_dims_list = split(';', input_dims_s);
std::vector<std::vector<int64_t>> inputs;
@ -257,7 +257,7 @@ std::vector<std::vector<int64_t>> parseInputShapes(
return inputs;
}
std::vector<at::ScalarType> parseInputTypes(
static std::vector<at::ScalarType> parseInputTypes(
const std::string& input_types_str) {
std::vector<std::string> inputTypes = split(';', input_types_str);
std::vector<at::ScalarType> scalarTypes;
@ -277,7 +277,7 @@ std::vector<at::ScalarType> parseInputTypes(
return scalarTypes;
}
std::vector<at::MemoryFormat> parseInputMemoryFormats(
static std::vector<at::MemoryFormat> parseInputMemoryFormats(
const std::string& input_memory_format_str) {
std::vector<std::string> memFormatsStr = split(';', input_memory_format_str);
std::vector<at::MemoryFormat> memFormats;
@ -295,7 +295,7 @@ std::vector<at::MemoryFormat> parseInputMemoryFormats(
return memFormats;
}
std::vector<int64_t> parseInputDynamicShapes(
static std::vector<int64_t> parseInputDynamicShapes(
const std::string& dynamic_dims_s) {
std::vector<std::string> dynamic_dims_list = split(',', dynamic_dims_s);
std::vector<int64_t> dynamic_dims;
@ -306,7 +306,7 @@ std::vector<int64_t> parseInputDynamicShapes(
return dynamic_dims;
}
std::string getNncKernelId(
static std::string getNncKernelId(
const std::string& model_name,
const std::string& model_version,
const std::string& method_name) {
@ -316,7 +316,7 @@ std::string getNncKernelId(
version_token;
}
std::string getNncKernelFuncName(
static std::string getNncKernelFuncName(
const std::string& model_name,
const std::string& model_version,
const std::string& method_name) {
@ -325,7 +325,8 @@ std::string getNncKernelFuncName(
// Preprocess the graph and returns the processed graph and
// symbolic values if dynamic input shapes are specified
std::pair<std::shared_ptr<Graph>, std::vector<int64_t>> preprocessGraphPasses(
static std::pair<std::shared_ptr<Graph>, std::vector<int64_t>>
preprocessGraphPasses(
std::shared_ptr<Graph>& graph,
const std::vector<c10::optional<at::Tensor>>& example_inputs,
const std::vector<int64_t>& dynamic_sizes) {
@ -367,7 +368,7 @@ std::pair<std::shared_ptr<Graph>, std::vector<int64_t>> preprocessGraphPasses(
return std::make_pair(graph, sym_val);
}
std::vector<c10::optional<at::Tensor>> generateExampleInputs(
static std::vector<c10::optional<at::Tensor>> generateExampleInputs(
const std::vector<std::vector<int64_t>>& inputShapes,
const std::vector<at::ScalarType>& inputTypes,
const std::vector<at::MemoryFormat>& inputMemoryFormats) {
@ -382,7 +383,7 @@ std::vector<c10::optional<at::Tensor>> generateExampleInputs(
return example_inputs;
}
c10::IValue preprocess(
static c10::IValue preprocess(
const torch::jit::Module& mod,
const c10::Dict<c10::IValue, c10::IValue>& compile_spec,
const torch::jit::BackendDebugHandleGenerator& generate_debug_handles) {

View File

@ -95,7 +95,7 @@ struct LibraryInfo {
EHFrameHdr eh_frame_hdr_;
};
const char* process_name() {
static const char* process_name() {
static char name[PATH_MAX + 1] = "";
if (*name == '\0') {
ssize_t len = readlink("/proc/self/exe", name, PATH_MAX);
@ -267,6 +267,7 @@ struct UnwindCache {
static UnwindCache unwind_cache;
static std::shared_timed_mutex cache_mutex_;
extern "C" void unwind_c(std::vector<void*>* result, int64_t rsp, int64_t rbp);
extern "C" void unwind_c(std::vector<void*>* result, int64_t rsp, int64_t rbp) {
std::shared_lock lock(cache_mutex_);
UnwindState state;
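Both this hunk and the acc_get_device_type one above use the same declare-then-define pattern: a function that must keep C linkage and external linkage gets an explicit declaration immediately before its definition. A minimal sketch with an illustrative name:

extern "C" void report_frames(int depth);
extern "C" void report_frames(int depth) {
  // The declaration on the line above provides the prototype, so
  // -Wmissing-prototypes is satisfied while the C symbol stays visible to
  // other translation units (and, if needed, __attribute__((weak)) can be
  // applied to both the declaration and the definition, as above).
  (void)depth;
}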

View File

@ -467,6 +467,7 @@ TORCH_LIBRARY_IMPL(aten, $dispatch_key, m) {
else:
deferred_template = CodeTemplate(
"""\
TORCH_API void Register${backend_name}${dispatch_key}NativeFunctions();
TORCH_API void Register${backend_name}${dispatch_key}NativeFunctions() {
static auto m = MAKE_TORCH_LIBRARY_IMPL(aten, $dispatch_key);
$dispatch_registrations_body