mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)

commit fddabc6e0b (parent 2f6a70bfea), committed by PyTorch MergeBot

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/6357
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138364
Approved by: https://github.com/Skylion007, https://github.com/eqy
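Every hunk below makes the same mechanical change: the compiler-specific C10_UNUSED macro is replaced with the standard C++17 [[maybe_unused]] attribute, and clang-format re-wraps some of the touched declarations. As orientation, here is a minimal, self-contained sketch (not taken from the PyTorch sources; all names are illustrative) of where the attribute goes for the three declaration kinds this diff touches: function parameters, type aliases, and local variables.

#include <cstdio>

// Unused function parameter: the attribute precedes the parameter, as in
// `[[maybe_unused]] DeviceIndex device_index = -1` below.
int scaled([[maybe_unused]] int verbosity, int x) {
  return 2 * x; // verbosity is intentionally ignored in this sketch
}

// Unused type alias: the attribute follows the alias name, as in
// `using HINT [[maybe_unused]] = ...` in the dispatch macros below.
template <typename T>
int use_alias(int x) {
  using hint_t [[maybe_unused]] = T; // silences -Wunused-local-typedef
  return scaled(0, x);
}

int main() {
  // Unused local variable, as in `[[maybe_unused]] int bit_width = bitwidth`.
  [[maybe_unused]] int bit_width = 8;
  std::printf("%d\n", use_alias<double>(21));
  return 0;
}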
@@ -68,7 +68,7 @@ struct strided_tensor_iter_fixed {
   strided_tensor_iter_fixed(strided_tensor_iter_fixed&&) = default;
   strided_tensor_iter_fixed(
       Tensor& tensor,
-      C10_UNUSED bool sort_strides = false)
+      [[maybe_unused]] bool sort_strides = false)
       : data_(tensor.data_ptr<T>()) {
     std::memset(counter_, 0, sizeof(int64_t) * N);
     if (tensor.dim() > 0) {
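Most of the remaining hunks apply the attribute to a loop index that exists only to run a c10::irange loop a fixed number of times. A stand-alone illustration of that pattern (irange here is a trivial hypothetical stand-in, not the c10 helper):

#include <cstdio>
#include <numeric>
#include <vector>

// Hypothetical stand-in for c10::irange: a materialized list of indices.
static std::vector<int> irange(int n) {
  std::vector<int> v(n);
  std::iota(v.begin(), v.end(), 0);
  return v;
}

int main() {
  int emitted = 0;
  // The index is never read; [[maybe_unused]] documents that and keeps
  // -Wunused-variable quiet without a project-specific macro.
  for ([[maybe_unused]] const auto i : irange(4)) {
    ++emitted;
  }
  std::printf("emitted %d\n", emitted);
  return 0;
}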
@@ -63,38 +63,38 @@ TORCH_API void record_kernel_function_dtype(std::string name);
     } \
   } while (0)
 
 #define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \
   case enum_type: { \
     AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \
-    using HINT C10_UNUSED = c10::impl::ScalarTypeToCPPTypeT<enum_type>; \
+    using HINT [[maybe_unused]] = c10::impl::ScalarTypeToCPPTypeT<enum_type>; \
     return __VA_ARGS__(); \
   }
 
 #define AT_DISPATCH_CASE(enum_type, ...) \
   AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
 
 #define AT_DISPATCH_CASE_QINT(enum_type, scalar_type, ...) \
   case enum_type: { \
     AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \
     using scalar_t = scalar_type; \
-    using underlying_t C10_UNUSED = typename scalar_t::underlying; \
-    C10_UNUSED const auto& SCALAR_TYPE = enum_type; \
-    C10_UNUSED const auto& UNDERLYING_TYPE = toUnderlying(enum_type); \
+    using underlying_t [[maybe_unused]] = typename scalar_t::underlying; \
+    [[maybe_unused]] const auto& SCALAR_TYPE = enum_type; \
+    [[maybe_unused]] const auto& UNDERLYING_TYPE = toUnderlying(enum_type); \
     return __VA_ARGS__(); \
   }
 
 #define AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \
     enum_type, scalar_type, bitwidth, qmin, qmax, ...) \
   case enum_type: { \
     AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \
     using scalar_t = scalar_type; \
-    using underlying_t C10_UNUSED = typename scalar_t::underlying; \
-    C10_UNUSED const auto& SCALAR_TYPE = enum_type; \
-    C10_UNUSED const auto& UNDERLYING_TYPE = toUnderlying(enum_type); \
-    C10_UNUSED int bit_width = bitwidth; \
-    C10_UNUSED int64_t quant_min = qmin; \
-    C10_UNUSED int64_t quant_max = qmax; \
+    using underlying_t [[maybe_unused]] = typename scalar_t::underlying; \
+    [[maybe_unused]] const auto& SCALAR_TYPE = enum_type; \
+    [[maybe_unused]] const auto& UNDERLYING_TYPE = toUnderlying(enum_type); \
+    [[maybe_unused]] int bit_width = bitwidth; \
+    [[maybe_unused]] int64_t quant_min = qmin; \
+    [[maybe_unused]] int64_t quant_max = qmax; \
     return __VA_ARGS__(); \
   }
 
 namespace detail {
@@ -638,7 +638,7 @@ void replace_(const ITensorListRef functional_tensor, ITensorListRef other) {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(functional_tensor.size() == other.size());
   auto functional_tensor_it = functional_tensor.begin();
   auto other_it = other.begin();
-  for (C10_UNUSED const auto i : c10::irange(functional_tensor.size())) {
+  for ([[maybe_unused]] const auto i : c10::irange(functional_tensor.size())) {
     replace_(*functional_tensor_it++, *other_it++);
   }
 }
@@ -655,7 +655,7 @@ void propagate_xla_data(const ITensorListRef functional_tensor, ITensorListRef o
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(functional_tensor.size() == other.size());
   auto functional_tensor_it = functional_tensor.begin();
   auto other_it = other.begin();
-  for (C10_UNUSED const auto i : c10::irange(functional_tensor.size())) {
+  for ([[maybe_unused]] const auto i : c10::irange(functional_tensor.size())) {
     propagate_xla_data(*functional_tensor_it++, *other_it++);
   }
 }
@@ -670,7 +670,7 @@ void propagate_xla_data_direct(const ITensorListRef tensor,
     ITensorListRef other) {
   auto tensor_it = tensor.begin();
   auto other_it = other.begin();
-  for (C10_UNUSED const auto i : c10::irange(tensor.size())) {
+  for ([[maybe_unused]] const auto i : c10::irange(tensor.size())) {
     propagate_xla_data_direct(*tensor_it++, *other_it++);
   }
 }
@@ -205,7 +205,7 @@ struct CodeTemplate {
   // or trailing newlines. It's the responsibility of the calling function
   // to indent correctly in the context.
   void emitIndent(std::ostream& out, size_t indent) const {
-    for (C10_UNUSED const auto i : c10::irange(indent)) {
+    for ([[maybe_unused]] const auto i : c10::irange(indent)) {
       out << " ";
     }
   }
@@ -153,7 +153,7 @@ static std::tuple<double, int> __printFormat(std::ostream& stream, const Tensor&
 
 static void __printIndent(std::ostream &stream, int64_t indent)
 {
-  for (C10_UNUSED const auto i : c10::irange(indent)) {
+  for ([[maybe_unused]] const auto i : c10::irange(indent)) {
     stream << " ";
   }
 }
@@ -390,7 +390,8 @@ struct TORCH_API ClassType : public NamedType {
       std::string doc_string = "",
       std::vector<std::string> unresolved_class_attributes = {});
 
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     const auto& n = name().value();
     return n.qualifiedName();
   }
@@ -376,8 +376,8 @@ DynamicTypePtr ivalue::TupleTypeFactory<c10::DynamicType>::fallback(
   return nullptr;
 }
 
-TORCH_API TupleTypePtr
-ivalue::TupleTypeFactory<TupleType>::fallback(C10_UNUSED const Type& type) {
+TORCH_API TupleTypePtr ivalue::TupleTypeFactory<TupleType>::fallback(
+    [[maybe_unused]] const Type& type) {
 #ifdef C10_MOBILE
   return nullptr;
 #else
@@ -398,5 +398,4 @@ ivalue::TupleTypeFactory<TupleType>::fallback(C10_UNUSED const Type& type) {
 #endif
 }
 
-
 } // namespace c10
@@ -88,7 +88,7 @@ struct TORCH_API EnumType : public NamedType {
         cu_(std::move(cu)) {}
 
   std::string annotation_str_impl(
-      C10_UNUSED const TypePrinter& printer = nullptr) const override {
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     const auto& n = name().value();
     return n.qualifiedName();
   }
@@ -56,7 +56,7 @@ struct TORCH_API Function {
   virtual c10::intrusive_ptr<c10::ivalue::Future> runAsync(
       Stack& /*stack*/,
       // NOLINTNEXTLINE(performance-unnecessary-value-param)
-      C10_UNUSED TaskLauncher taskLauncher = at::launch) {
+      [[maybe_unused]] TaskLauncher taskLauncher = at::launch) {
     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
     return {};
   }
@@ -1278,7 +1278,8 @@ struct TORCH_API NumberType : public Type {
  protected:
   NumberType(TypeKind kind = TypeKind::NumberType) : Type(kind) {}
 
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     return "number"; // technically not a valid python type, but
                      // we need to use it when parsing back in annotations
                      // for implicit conversions
@@ -1305,7 +1306,8 @@ struct TORCH_API FloatType : public NumberType {
 
  private:
   FloatType() : NumberType(TypeKind::FloatType) {}
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     return "float";
   }
 };
@@ -1330,7 +1332,8 @@ struct TORCH_API ComplexType : public NumberType {
 
  private:
   ComplexType() : NumberType(TypeKind::ComplexType) {}
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     return "complex";
   }
 };
@@ -1419,7 +1422,8 @@ struct TORCH_API IntType : public NumberType {
 
  private:
   IntType() : NumberType(TypeKind::IntType) {}
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     return "int";
   }
 };
@@ -1453,7 +1457,8 @@ struct TORCH_API StringType : public Type {
     // we only use "str" (not "string") in both FunctionSchema and script
     return annotation_str();
   }
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     return "str";
   }
   static const TypeKind Kind = TypeKind::StringType;
@@ -1473,7 +1478,8 @@ struct TORCH_API StorageType : public Type {
   std::string str() const override {
     return annotation_str();
   }
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     return "Storage";
   }
   static const TypeKind Kind = TypeKind::StorageType;
@@ -1508,7 +1514,8 @@ struct TORCH_API FunctionType : public NamedType {
 
  private:
   FunctionType(torch::jit::Function* function);
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     const auto& n = name().value();
     return n.qualifiedName();
   }
@@ -2199,7 +2206,8 @@ struct TORCH_API InterfaceType : public NamedType {
       const InterfaceType& rhs,
       std::ostream* why_not);
 
-  std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override {
+  std::string annotation_str_impl(
+      [[maybe_unused]] const TypePrinter& printer = nullptr) const override {
     return name()->qualifiedName();
   }
 
@@ -1121,7 +1121,7 @@ inline void convert(const src_T *src, dst_T *dst, int64_t n) {
 #ifndef _MSC_VER
 # pragma unroll
 #endif
-  for (C10_UNUSED const auto i : c10::irange(n)) {
+  for ([[maybe_unused]] const auto i : c10::irange(n)) {
     *dst = c10::convert<dst_T>(c10::load(src));
     src++;
     dst++;
@@ -157,18 +157,19 @@ constexpr const char* _cusolver_backend_suggestion = \
 // See NOTE [ USE OF NVRTC AND DRIVER API ].
 #if !defined(USE_ROCM)
 
-#define AT_CUDA_DRIVER_CHECK(EXPR) \
-  do { \
-    CUresult __err = EXPR; \
-    if (__err != CUDA_SUCCESS) { \
-      const char* err_str; \
-      C10_UNUSED CUresult get_error_str_err = at::globalContext().getNVRTC().cuGetErrorString(__err, &err_str); \
-      if (get_error_str_err != CUDA_SUCCESS) { \
-        AT_ERROR("CUDA driver error: unknown error"); \
-      } else { \
-        AT_ERROR("CUDA driver error: ", err_str); \
-      } \
-    } \
+#define AT_CUDA_DRIVER_CHECK(EXPR) \
+  do { \
+    CUresult __err = EXPR; \
+    if (__err != CUDA_SUCCESS) { \
+      const char* err_str; \
+      [[maybe_unused]] CUresult get_error_str_err = \
+          at::globalContext().getNVRTC().cuGetErrorString(__err, &err_str); \
+      if (get_error_str_err != CUDA_SUCCESS) { \
+        AT_ERROR("CUDA driver error: unknown error"); \
+      } else { \
+        AT_ERROR("CUDA driver error: ", err_str); \
+      } \
+    } \
   } while (0)
 
 #else
@@ -69,8 +69,12 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
     TORCH_CHECK(false, "Cannot initialize CUDA without ATen_cuda library. ", CUDA_HELP);
   }
 
-  virtual const Generator& getDefaultCUDAGenerator(C10_UNUSED DeviceIndex device_index = -1) const {
-    TORCH_CHECK(false, "Cannot get default CUDA generator without ATen_cuda library. ", CUDA_HELP);
+  virtual const Generator& getDefaultCUDAGenerator(
+      [[maybe_unused]] DeviceIndex device_index = -1) const {
+    TORCH_CHECK(
+        false,
+        "Cannot get default CUDA generator without ATen_cuda library. ",
+        CUDA_HELP);
   }
 
   Device getDeviceFromPtr(void* /*data*/) const override {
@@ -32,12 +32,15 @@ struct TORCH_API XPUHooksInterface : AcceleratorHooksInterface{
     TORCH_CHECK(false, "Cannot get XPU global device index without ATen_xpu library.");
   }
 
-  virtual Generator getXPUGenerator(C10_UNUSED DeviceIndex device_index = -1) const {
+  virtual Generator getXPUGenerator(
+      [[maybe_unused]] DeviceIndex device_index = -1) const {
     TORCH_CHECK(false, "Cannot get XPU generator without ATen_xpu library.");
   }
 
-  virtual const Generator& getDefaultXPUGenerator(C10_UNUSED DeviceIndex device_index = -1) const {
-    TORCH_CHECK(false, "Cannot get default XPU generator without ATen_xpu library.");
+  virtual const Generator& getDefaultXPUGenerator(
+      [[maybe_unused]] DeviceIndex device_index = -1) const {
+    TORCH_CHECK(
+        false, "Cannot get default XPU generator without ATen_xpu library.");
   }
 
   virtual DeviceIndex getNumGPUs() const {
@@ -135,7 +135,7 @@ static Tensor make_feature_noise(const Tensor& input) {
   sizes.reserve(input.dim());
   sizes.push_back(input_sizes[0]);
   sizes.push_back(input_sizes[1]);
-  for (C10_UNUSED const auto i : c10::irange(2, input.dim())) {
+  for ([[maybe_unused]] const auto i : c10::irange(2, input.dim())) {
    sizes.push_back(1);
   }
   // NB: THIS WAS CHANGED FROM THE ORIGINAL
@@ -1109,7 +1109,7 @@ void unpack_pivots_cpu_kernel(TensorIterator& iter, const int64_t dim_size, cons
   auto* perm_ptr = data[0];
   const auto* pivots_ptr = data[1];
 
-  for (C10_UNUSED const auto elem : c10::irange(nelems)) {
+  for ([[maybe_unused]] const auto elem : c10::irange(nelems)) {
     // WARNING: linalg.lu_factor returns int32 pivots,
     // this behavior could change in the future.
     const auto perm_data = reinterpret_cast<int64_t*>(perm_ptr);
@@ -133,30 +133,50 @@ float bf16_dot_with_fp32_arith(
 #endif
 
 template <typename scalar_t>
-bool scal_use_fast_path(C10_UNUSED int64_t n, C10_UNUSED int64_t incx) {
+bool scal_use_fast_path(
+    [[maybe_unused]] int64_t n,
+    [[maybe_unused]] int64_t incx) {
   return false;
 }
 
 template <typename scalar_t>
-bool gemv_use_fast_path(C10_UNUSED char trans, C10_UNUSED int64_t m,
-                        C10_UNUSED int64_t n, C10_UNUSED scalar_t alpha,
-                        C10_UNUSED int64_t lda,
-                        C10_UNUSED int64_t incx, C10_UNUSED scalar_t beta,
-                        C10_UNUSED int64_t incy) {
+bool gemv_use_fast_path(
+    [[maybe_unused]] char trans,
+    [[maybe_unused]] int64_t m,
+    [[maybe_unused]] int64_t n,
+    [[maybe_unused]] scalar_t alpha,
+    [[maybe_unused]] int64_t lda,
+    [[maybe_unused]] int64_t incx,
+    [[maybe_unused]] scalar_t beta,
+    [[maybe_unused]] int64_t incy) {
   return false;
 }
 
 template <typename scalar_t>
-void scal_fast_path(C10_UNUSED int *n, C10_UNUSED scalar_t *a, C10_UNUSED scalar_t *x, C10_UNUSED int *incx) {
-  TORCH_INTERNAL_ASSERT(false, "scal_fast_path shouldn't be called for this configuration");
+void scal_fast_path(
+    [[maybe_unused]] int* n,
+    [[maybe_unused]] scalar_t* a,
+    [[maybe_unused]] scalar_t* x,
+    [[maybe_unused]] int* incx) {
+  TORCH_INTERNAL_ASSERT(
+      false, "scal_fast_path shouldn't be called for this configuration");
 }
 
 template <typename scalar_t>
-void gemv_fast_path(C10_UNUSED const char *trans, C10_UNUSED const int *m, C10_UNUSED const int *n,
-                    C10_UNUSED const scalar_t *alpha, C10_UNUSED const scalar_t *a, C10_UNUSED const int *lda,
-                    C10_UNUSED const scalar_t *x, C10_UNUSED const int *incx, C10_UNUSED const scalar_t *beta,
-                    C10_UNUSED scalar_t *y, C10_UNUSED const int *incy) {
-  TORCH_INTERNAL_ASSERT(false, "gemv_fast_path shouldn't be called for this configuration");
+void gemv_fast_path(
+    [[maybe_unused]] const char* trans,
+    [[maybe_unused]] const int* m,
+    [[maybe_unused]] const int* n,
+    [[maybe_unused]] const scalar_t* alpha,
+    [[maybe_unused]] const scalar_t* a,
+    [[maybe_unused]] const int* lda,
+    [[maybe_unused]] const scalar_t* x,
+    [[maybe_unused]] const int* incx,
+    [[maybe_unused]] const scalar_t* beta,
+    [[maybe_unused]] scalar_t* y,
+    [[maybe_unused]] const int* incy) {
+  TORCH_INTERNAL_ASSERT(
+      false, "gemv_fast_path shouldn't be called for this configuration");
 }
 
 #define INSTANTIATE(scalar_t) \
@ -188,15 +208,32 @@ void scal_fast_path<float>(int *n, float *a, float *x, int *incx) {
|
||||
}
|
||||
|
||||
template <>
|
||||
bool gemv_use_fast_path<float>(C10_UNUSED char trans, int64_t m, int64_t n, C10_UNUSED float alpha, int64_t lda, int64_t incx, C10_UNUSED float beta, int64_t incy) {
|
||||
bool gemv_use_fast_path<float>(
|
||||
[[maybe_unused]] char trans,
|
||||
int64_t m,
|
||||
int64_t n,
|
||||
[[maybe_unused]] float alpha,
|
||||
int64_t lda,
|
||||
int64_t incx,
|
||||
[[maybe_unused]] float beta,
|
||||
int64_t incy) {
|
||||
auto intmax = std::numeric_limits<int>::max();
|
||||
return (m <= intmax) && (n <= intmax) && (lda <= intmax) &&
|
||||
(incx > 0) && (incx <= intmax) && (incy > 0) && (incy <= intmax);
|
||||
}
|
||||
|
||||
template <>
|
||||
bool gemv_use_fast_path<double>(C10_UNUSED char trans, int64_t m, int64_t n, C10_UNUSED double alpha, int64_t lda, int64_t incx, C10_UNUSED double beta, int64_t incy) {
|
||||
return gemv_use_fast_path<float>(trans, m, n, (float)alpha, lda, incx, (float)beta, incy);
|
||||
bool gemv_use_fast_path<double>(
|
||||
[[maybe_unused]] char trans,
|
||||
int64_t m,
|
||||
int64_t n,
|
||||
[[maybe_unused]] double alpha,
|
||||
int64_t lda,
|
||||
int64_t incx,
|
||||
[[maybe_unused]] double beta,
|
||||
int64_t incy) {
|
||||
return gemv_use_fast_path<float>(
|
||||
trans, m, n, (float)alpha, lda, incx, (float)beta, incy);
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -220,38 +257,40 @@ INSTANTIATE(int);
|
||||
INSTANTIATE(int64_t);
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE)
|
||||
template <>
|
||||
bool scal_use_fast_path<at::Half>(C10_UNUSED int64_t n, C10_UNUSED int64_t incx) {
|
||||
bool scal_use_fast_path<at::Half>(
|
||||
[[maybe_unused]] int64_t n,
|
||||
[[maybe_unused]] int64_t incx) {
|
||||
return false;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool gemv_use_fast_path<at::Half>(
|
||||
C10_UNUSED char trans,
|
||||
C10_UNUSED int64_t m,
|
||||
C10_UNUSED int64_t n,
|
||||
[[maybe_unused]] char trans,
|
||||
[[maybe_unused]] int64_t m,
|
||||
[[maybe_unused]] int64_t n,
|
||||
at::Half alpha,
|
||||
C10_UNUSED int64_t lda,
|
||||
C10_UNUSED int64_t incx,
|
||||
[[maybe_unused]] int64_t lda,
|
||||
[[maybe_unused]] int64_t incx,
|
||||
at::Half beta,
|
||||
C10_UNUSED int64_t incy) {
|
||||
[[maybe_unused]] int64_t incy) {
|
||||
return incx == 1 && c10::detail::fp16_from_bits(alpha.x) == 1.0f &&
|
||||
c10::detail::fp16_from_bits(beta.x) == 0.0f;
|
||||
c10::detail::fp16_from_bits(beta.x) == 0.0f;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool gemv_use_fast_path<at::BFloat16>(
|
||||
C10_UNUSED char trans,
|
||||
C10_UNUSED int64_t m,
|
||||
C10_UNUSED int64_t n,
|
||||
[[maybe_unused]] char trans,
|
||||
[[maybe_unused]] int64_t m,
|
||||
[[maybe_unused]] int64_t n,
|
||||
at::BFloat16 alpha,
|
||||
C10_UNUSED int64_t lda,
|
||||
C10_UNUSED int64_t incx,
|
||||
[[maybe_unused]] int64_t lda,
|
||||
[[maybe_unused]] int64_t incx,
|
||||
at::BFloat16 beta,
|
||||
C10_UNUSED int64_t incy) {
|
||||
return (trans == 'T' || trans == 't') && incx == 1 && alpha == 1.0 && beta == 0.0;
|
||||
[[maybe_unused]] int64_t incy) {
|
||||
return (trans == 'T' || trans == 't') && incx == 1 && alpha == 1.0 &&
|
||||
beta == 0.0;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
|
||||
static inline float16_t reduce(float16x4_t x) {
|
||||
auto sum = vpadd_f16(x, x);
|
||||
|
@ -34,7 +34,7 @@ Tensor make_feature_noise(const Tensor& input) {
|
||||
sizes.reserve(input.dim());
|
||||
sizes.push_back(input_sizes[0]);
|
||||
sizes.push_back(input_sizes[1]);
|
||||
for (C10_UNUSED const auto i : c10::irange(2, input.dim())) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(2, input.dim())) {
|
||||
sizes.push_back(1);
|
||||
}
|
||||
return input.new_empty_symint(sizes);
|
||||
|
@ -13,9 +13,11 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask,
|
||||
" does not match the shape of the indexed tensor ", self.sizes(), " at index ", idx);
|
||||
}
|
||||
|
||||
|
||||
C10_UNUSED static std::vector<Tensor> expandTensors(const Tensor & self, IOptTensorListRef indices) {
|
||||
// If indices come in as ByteTensor or BoolTensor (masks), expand them into the equivalent indexing by LongTensors
|
||||
[[maybe_unused]] static std::vector<Tensor> expandTensors(
|
||||
const Tensor& self,
|
||||
IOptTensorListRef indices) {
|
||||
// If indices come in as ByteTensor or BoolTensor (masks), expand them into
|
||||
// the equivalent indexing by LongTensors
|
||||
std::vector<Tensor> result;
|
||||
for (const auto& index_opt : indices) {
|
||||
if (!index_opt.has_value()) {
|
||||
@ -48,7 +50,9 @@ C10_UNUSED static std::vector<Tensor> expandTensors(const Tensor & self, IOptTen
|
||||
return result;
|
||||
}
|
||||
|
||||
C10_UNUSED static void checkIndexTensorTypes(IOptTensorListRef indices, bool allow_int=false) {
|
||||
[[maybe_unused]] static void checkIndexTensorTypes(
|
||||
IOptTensorListRef indices,
|
||||
bool allow_int = false) {
|
||||
for (const auto& tensor : indices) {
|
||||
if (tensor.has_value() && tensor->defined()) {
|
||||
auto scalarType = tensor->scalar_type();
|
||||
@ -83,7 +87,7 @@ inline torch::List<std::optional<Tensor>> toListOfOptionalTensors(ArrayRef<IValu
|
||||
return result;
|
||||
}
|
||||
|
||||
C10_UNUSED static bool hasContiguousSubspace(TensorList tl) {
|
||||
[[maybe_unused]] static bool hasContiguousSubspace(TensorList tl) {
|
||||
// true if all the non-null tensors are adjacent
|
||||
auto isDefined = [](const Tensor & tensor){ return tensor.defined(); };
|
||||
auto isNull = [](const Tensor & tensor){ return !tensor.defined(); };
|
||||
@ -93,15 +97,15 @@ C10_UNUSED static bool hasContiguousSubspace(TensorList tl) {
|
||||
return it == stop.base();
|
||||
}
|
||||
|
||||
|
||||
// Transposes the tensor and indices together so that all the non-null indices
|
||||
// index the first k dimensions of the tensor. Returns the transposed tensor
|
||||
// and the reordered indices. For example:
|
||||
// transposeToFront(tensor, {nullptr, a, nullptr, b})
|
||||
// returns
|
||||
// tensor.permute([1, 3, 0, 2]), {a, b, nullptr, nullptr}
|
||||
C10_UNUSED static std::tuple<Tensor, std::vector<Tensor>>
|
||||
transposeToFront(const Tensor& self, TensorList indices) {
|
||||
[[maybe_unused]] static std::tuple<Tensor, std::vector<Tensor>> transposeToFront(
|
||||
const Tensor& self,
|
||||
TensorList indices) {
|
||||
std::vector<int64_t> dims;
|
||||
std::vector<Tensor> transposedIndices;
|
||||
dims.reserve(self.dim());
|
||||
|
@ -241,8 +241,9 @@ void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const fu
|
||||
auto* b_batch_idx_ptr = data[0];
|
||||
auto* a_batch_idx_ptr = data[1];
|
||||
|
||||
for (C10_UNUSED const auto elem : c10::irange(nelems)) {
|
||||
auto b_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(b_batch_idx_ptr);
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(nelems)) {
|
||||
auto b_curr_linear_batch_idx =
|
||||
*reinterpret_cast<int64_t*>(b_batch_idx_ptr);
|
||||
auto a_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(a_batch_idx_ptr);
|
||||
|
||||
check_if_copy_needed_for_a(a_curr_linear_batch_idx);
|
||||
|
@ -76,7 +76,7 @@ static void multilabel_margin_loss_forward_out_frame(
|
||||
|
||||
accscalar_t sum = 0;
|
||||
|
||||
for (C10_UNUSED const auto t : c10::irange(nframe)) {
|
||||
for ([[maybe_unused]] const auto t : c10::irange(nframe)) {
|
||||
sum += multilabel_margin_loss_forward_inner_sum_cpu(
|
||||
input_data, target_data, is_target_data, dim);
|
||||
|
||||
@ -180,7 +180,7 @@ static void multilabel_margin_loss_backward_out_frame(
|
||||
reduction == Reduction::Mean ? 1. / (nframe * dim) : 1. / dim);
|
||||
|
||||
scalar_t* grad_input_row_data = grad_input.mutable_data_ptr<scalar_t>();
|
||||
for (C10_UNUSED const auto t : c10::irange(nframe)) {
|
||||
for ([[maybe_unused]] const auto t : c10::irange(nframe)) {
|
||||
for (const auto dt : c10::irange(dim)) {
|
||||
int64_t target_idx = target_data[dt];
|
||||
if (target_idx < 0) {
|
||||
|
@ -1204,22 +1204,30 @@ scalar_t calc_igamma(scalar_t a, scalar_t x) {
|
||||
}
|
||||
|
||||
template <>
|
||||
C10_UNUSED inline c10::BFloat16 calc_igamma<c10::BFloat16>(c10::BFloat16 a, c10::BFloat16 x) {
|
||||
[[maybe_unused]] inline c10::BFloat16 calc_igamma<c10::BFloat16>(
|
||||
c10::BFloat16 a,
|
||||
c10::BFloat16 x) {
|
||||
return calc_igamma<float>(float(a), float(x));
|
||||
}
|
||||
|
||||
template <>
|
||||
C10_UNUSED inline c10::Half calc_igamma<c10::Half>(c10::Half a, c10::Half x) {
|
||||
[[maybe_unused]] inline c10::Half calc_igamma<c10::Half>(
|
||||
c10::Half a,
|
||||
c10::Half x) {
|
||||
return calc_igamma<float>(float(a), float(x));
|
||||
}
|
||||
|
||||
template <>
|
||||
C10_UNUSED inline c10::BFloat16 calc_igammac<c10::BFloat16>(c10::BFloat16 a, c10::BFloat16 x) {
|
||||
[[maybe_unused]] inline c10::BFloat16 calc_igammac<c10::BFloat16>(
|
||||
c10::BFloat16 a,
|
||||
c10::BFloat16 x) {
|
||||
return calc_igammac<float>(float(a), float(x));
|
||||
}
|
||||
|
||||
template <>
|
||||
C10_UNUSED inline c10::Half calc_igammac<c10::Half>(c10::Half a, c10::Half x) {
|
||||
[[maybe_unused]] inline c10::Half calc_igammac<c10::Half>(
|
||||
c10::Half a,
|
||||
c10::Half x) {
|
||||
return calc_igammac<float>(float(a), float(x));
|
||||
}
|
||||
|
||||
@ -1231,7 +1239,7 @@ inline T abs_impl(T v) {
|
||||
}
|
||||
|
||||
template <>
|
||||
C10_UNUSED inline uint8_t abs_impl(uint8_t v) {
|
||||
[[maybe_unused]] inline uint8_t abs_impl(uint8_t v) {
|
||||
return v;
|
||||
}
|
||||
|
||||
|
@ -188,7 +188,7 @@ std::tuple<Tensor, Tensor> _pad_packed_sequence(const Tensor& data, const Tensor
|
||||
}
|
||||
int64_t dec = prev_batch_size - batch_size;
|
||||
if (dec > 0) {
|
||||
for (C10_UNUSED const auto j : c10::irange(dec)) {
|
||||
for ([[maybe_unused]] const auto j : c10::irange(dec)) {
|
||||
(*lengths--) = i;
|
||||
}
|
||||
}
|
||||
|
@ -1889,7 +1889,8 @@ static DEFINE_QUANTIZED_RNN_CELL_DYNAMIC(quantized_rnn_tanh_cell_dynamic, simple
|
||||
|
||||
namespace {
|
||||
|
||||
C10_UNUSED static auto ensure_linear_params_registered = register_linear_params();
|
||||
[[maybe_unused]] static auto ensure_linear_params_registered =
|
||||
register_linear_params();
|
||||
|
||||
static auto cell_params_base_registry =
|
||||
torch::selective_class_<CellParamsBase>("rnn", TORCH_SELECTIVE_CLASS("CellParamsBase"))
|
||||
|
@ -931,7 +931,7 @@ static inline Tensor diff_helper(const Tensor& self, int64_t n, int64_t dim) {
|
||||
bool is_kBool = (self.dtype() == at::kBool);
|
||||
n = n > self.sym_size(dim) ? self.sym_size(dim).guard_int(__FILE__, __LINE__) : n;
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(n)) {
|
||||
if (is_kBool) {
|
||||
result = at::logical_xor(
|
||||
at::narrow_symint(result, dim, 1, out_len),
|
||||
@ -2255,7 +2255,7 @@ bool cpu_equal(const Tensor& self, const Tensor& other) {
|
||||
return;
|
||||
}
|
||||
char* self_data = data[0];
|
||||
for (C10_UNUSED const auto i : c10::irange(dim_size)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(dim_size)) {
|
||||
if (isnan_(c10::load<scalar_t>(self_data))) {
|
||||
result = false;
|
||||
return;
|
||||
@ -2282,7 +2282,7 @@ bool cpu_equal(const Tensor& self, const Tensor& other) {
|
||||
}
|
||||
char* self_data = data[0];
|
||||
char* other_data = data[1];
|
||||
for (C10_UNUSED const auto i : c10::irange(dim_size)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(dim_size)) {
|
||||
if (c10::load<scalar_t>(self_data) != c10::load<scalar_t>(other_data)) {
|
||||
result = false;
|
||||
return;
|
||||
|
@ -207,9 +207,13 @@ inline TensorIterator make_reduction(
|
||||
return TensorIterator::reduce_op(viewed_result, self.to(in_dtype));
|
||||
}
|
||||
|
||||
C10_UNUSED inline TensorIterator make_reduction(
|
||||
const char* name, Tensor& result, const Tensor& self,
|
||||
at::OptionalIntArrayRef dim, bool keepdim, ScalarType out_dtype) {
|
||||
[[maybe_unused]] inline TensorIterator make_reduction(
|
||||
const char* name,
|
||||
Tensor& result,
|
||||
const Tensor& self,
|
||||
at::OptionalIntArrayRef dim,
|
||||
bool keepdim,
|
||||
ScalarType out_dtype) {
|
||||
// special case for type promotion in mixed precision, improves computational
|
||||
// efficiency.
|
||||
// not generalize this to common mismatched input/output types to avoid cross
|
||||
@ -259,9 +263,14 @@ inline TensorIterator make_reduction(
|
||||
return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype1));
|
||||
}
|
||||
|
||||
C10_UNUSED inline TensorIterator make_reduction(
|
||||
const char* name, Tensor& result1, Tensor& result2, const Tensor& self,
|
||||
at::OptionalIntArrayRef dim, bool keepdim, ScalarType dtype) {
|
||||
[[maybe_unused]] inline TensorIterator make_reduction(
|
||||
const char* name,
|
||||
Tensor& result1,
|
||||
Tensor& result2,
|
||||
const Tensor& self,
|
||||
at::OptionalIntArrayRef dim,
|
||||
bool keepdim,
|
||||
ScalarType dtype) {
|
||||
return make_reduction(name, result1, result2, self, dim, keepdim, dtype, dtype);
|
||||
}
|
||||
|
||||
@ -313,9 +322,13 @@ inline std::vector<int64_t> get_zero_numel_tensor_size(
|
||||
// This function should be called when you are reducing a zero-numel tensor and want to
|
||||
// resize the output and return it. This function exists for resizing zero-numel
|
||||
// tensors when the size of the reduction dimension is non-zero.
|
||||
C10_UNUSED inline void zero_numel_tensor_resize(Tensor& result, Tensor& result_indices,
|
||||
const Tensor& self, const int64_t dim,
|
||||
const bool keepdim, const char *fn_name) {
|
||||
[[maybe_unused]] inline void zero_numel_tensor_resize(
|
||||
Tensor& result,
|
||||
Tensor& result_indices,
|
||||
const Tensor& self,
|
||||
const int64_t dim,
|
||||
const bool keepdim,
|
||||
const char* fn_name) {
|
||||
auto sizes = get_zero_numel_tensor_size(self, dim, keepdim, fn_name);
|
||||
at::native::resize_output(result, sizes);
|
||||
at::native::resize_output(result_indices, sizes);
|
||||
@ -349,11 +362,11 @@ inline ScalarType get_dtype_from_result(Tensor& result, std::optional<ScalarType
|
||||
|
||||
namespace at::meta {
|
||||
|
||||
C10_UNUSED inline DimVector get_reduction_shape(
|
||||
[[maybe_unused]] inline DimVector get_reduction_shape(
|
||||
const Tensor& self,
|
||||
IntArrayRef dims,
|
||||
bool keepdim,
|
||||
bool allow_empty_dims=false) {
|
||||
bool allow_empty_dims = false) {
|
||||
auto mask = native::make_dim_mask(dims, self.dim(), allow_empty_dims);
|
||||
return native::shape_from_dim_mask(self, mask, keepdim);
|
||||
}
|
||||
@ -434,7 +447,7 @@ inline TensorIterator make_reduction(
|
||||
return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype1));
|
||||
}
|
||||
|
||||
C10_UNUSED inline TensorIterator make_reduction_from_out_ty(
|
||||
[[maybe_unused]] inline TensorIterator make_reduction_from_out_ty(
|
||||
const Tensor& self,
|
||||
const Tensor& result,
|
||||
OptionalIntArrayRef opt_dims,
|
||||
|
@ -2409,7 +2409,7 @@ Tensor& nonzero_out_cpu(const Tensor& self, Tensor& result) {
|
||||
|
||||
for (const auto i : c10::irange(n2)) {
|
||||
const char* ptr = data[0] + i * strides[1];
|
||||
for (C10_UNUSED const auto j : c10::irange(n1)) {
|
||||
for ([[maybe_unused]] const auto j : c10::irange(n1)) {
|
||||
const auto& val = c10::load<scalar_t>(ptr);
|
||||
// If nonzero, write index
|
||||
if (val != scalar_t(0)) {
|
||||
|
@ -50,7 +50,8 @@ const Tensor& value){
|
||||
}
|
||||
}
|
||||
}
|
||||
for (C10_UNUSED const auto i : c10::irange(num_ind, self.ndimension())) {
|
||||
for ([[maybe_unused]] const auto i :
|
||||
c10::irange(num_ind, self.ndimension())) {
|
||||
mask = mask.unsqueeze(-1);
|
||||
}
|
||||
return std::make_tuple(true, mask);
|
||||
|
@ -1945,7 +1945,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in
|
||||
at::parallel_for(0, index_len, at::internal::GRAIN_SIZE, [&](int64_t start, int64_t end) {
|
||||
const auto* src = ptr_index + start;
|
||||
auto* dst = ptr_nneg_index + start;
|
||||
for (C10_UNUSED const auto _ : c10::irange(start, end)) {
|
||||
for ([[maybe_unused]] const auto _ : c10::irange(start, end)) {
|
||||
auto idx = *src++;
|
||||
if (idx < -size || idx >= size) {
|
||||
// Mark self and dim as used if code is compiled with STRIP_ERROR_MESSAGES
|
||||
@ -2051,36 +2051,42 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in
|
||||
const auto* ptr_sorted_start = ptr_sorted;
|
||||
const auto* ptr_sorted_end = ptr_sorted + sorted_len;
|
||||
|
||||
at::parallel_for(0, n_threads_src, 1, [&](int64_t tid, C10_UNUSED int64_t _) {
|
||||
const auto start = tid * chunk_size_src;
|
||||
const auto end = std::min(start + chunk_size_src, src_len);
|
||||
auto* ptr_tid_src_int_idx = src_int_idx.select(0, tid).data_ptr<int64_t>();
|
||||
auto* ptr_tid_sorted_int_idx = sorted_int_idx.select(0, tid).data_ptr<int64_t>();
|
||||
auto* ptr_tid_int_counts = int_counts.select(0, tid).data_ptr<int64_t>();
|
||||
const auto* ptr_src = src.const_data_ptr<int64_t>() + start;
|
||||
at::parallel_for(
|
||||
0, n_threads_src, 1, [&](int64_t tid, [[maybe_unused]] int64_t _) {
|
||||
const auto start = tid * chunk_size_src;
|
||||
const auto end = std::min(start + chunk_size_src, src_len);
|
||||
auto* ptr_tid_src_int_idx =
|
||||
src_int_idx.select(0, tid).data_ptr<int64_t>();
|
||||
auto* ptr_tid_sorted_int_idx =
|
||||
sorted_int_idx.select(0, tid).data_ptr<int64_t>();
|
||||
auto* ptr_tid_int_counts =
|
||||
int_counts.select(0, tid).data_ptr<int64_t>();
|
||||
const auto* ptr_src = src.const_data_ptr<int64_t>() + start;
|
||||
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
const auto src_val = *ptr_src++;
|
||||
const auto src_val_lb = std::lower_bound(ptr_sorted_start, ptr_sorted_end, src_val);
|
||||
// We cannot just use *src_val_lb != src_val because when
|
||||
// src_val_lb == ptr_sorted_end, dereferencing past-the-end value
|
||||
// is not well-defined.
|
||||
if (src_val_lb == ptr_sorted_end || *src_val_lb != src_val) {
|
||||
++ptr_tid_src_int_idx;
|
||||
++ptr_tid_sorted_int_idx;
|
||||
++ptr_tid_int_counts;
|
||||
continue;
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
const auto src_val = *ptr_src++;
|
||||
const auto src_val_lb =
|
||||
std::lower_bound(ptr_sorted_start, ptr_sorted_end, src_val);
|
||||
// We cannot just use *src_val_lb != src_val because when
|
||||
// src_val_lb == ptr_sorted_end, dereferencing past-the-end
|
||||
// value is not well-defined.
|
||||
if (src_val_lb == ptr_sorted_end || *src_val_lb != src_val) {
|
||||
++ptr_tid_src_int_idx;
|
||||
++ptr_tid_sorted_int_idx;
|
||||
++ptr_tid_int_counts;
|
||||
continue;
|
||||
}
|
||||
const auto src_val_ub =
|
||||
std::upper_bound(ptr_sorted_start, ptr_sorted_end, src_val);
|
||||
|
||||
const int64_t count = src_val_ub - src_val_lb;
|
||||
const int64_t j = src_val_lb - ptr_sorted_start;
|
||||
|
||||
*ptr_tid_src_int_idx++ = i;
|
||||
*ptr_tid_sorted_int_idx++ = j;
|
||||
*ptr_tid_int_counts++ = count;
|
||||
}
|
||||
const auto src_val_ub = std::upper_bound(ptr_sorted_start, ptr_sorted_end, src_val);
|
||||
|
||||
const int64_t count = src_val_ub - src_val_lb;
|
||||
const int64_t j = src_val_lb - ptr_sorted_start;
|
||||
|
||||
*ptr_tid_src_int_idx++ = i;
|
||||
*ptr_tid_sorted_int_idx++ = j;
|
||||
*ptr_tid_int_counts++ = count;
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
const auto compressed_int_counts = int_counts.sum(-1);
|
||||
@ -2111,29 +2117,35 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in
|
||||
|
||||
const auto thread_offsets = compressed_int_counts.cumsum(0).sub_(compressed_int_counts);
|
||||
const auto* ptr_sorted_idx = sorted_idx.const_data_ptr<int64_t>();
|
||||
at::parallel_for(0, n_threads_src, 1, [&](int64_t tid, C10_UNUSED int64_t _) {
|
||||
const auto start = tid * chunk_size_src;
|
||||
const auto end = std::min(start + chunk_size_src, src_len);
|
||||
const auto tid_offset = thread_offsets.const_data_ptr<int64_t>()[tid];
|
||||
const auto* ptr_tid_src_int_idx = src_int_idx.select(0, tid).const_data_ptr<int64_t>();
|
||||
const auto* ptr_tid_sorted_int_idx = sorted_int_idx.select(0, tid).const_data_ptr<int64_t>();
|
||||
const auto* ptr_tid_int_counts = int_counts.select(0, tid).const_data_ptr<int64_t>();
|
||||
auto* ptr_tid_selected_sorted = ptr_selected_sorted + tid_offset;
|
||||
auto* ptr_tid_selected_src = ptr_selected_src + tid_offset;
|
||||
at::parallel_for(
|
||||
0, n_threads_src, 1, [&](int64_t tid, [[maybe_unused]] int64_t _) {
|
||||
const auto start = tid * chunk_size_src;
|
||||
const auto end = std::min(start + chunk_size_src, src_len);
|
||||
const auto tid_offset =
|
||||
thread_offsets.const_data_ptr<int64_t>()[tid];
|
||||
const auto* ptr_tid_src_int_idx =
|
||||
src_int_idx.select(0, tid).const_data_ptr<int64_t>();
|
||||
const auto* ptr_tid_sorted_int_idx =
|
||||
sorted_int_idx.select(0, tid).const_data_ptr<int64_t>();
|
||||
const auto* ptr_tid_int_counts =
|
||||
int_counts.select(0, tid).const_data_ptr<int64_t>();
|
||||
auto* ptr_tid_selected_sorted = ptr_selected_sorted + tid_offset;
|
||||
auto* ptr_tid_selected_src = ptr_selected_src + tid_offset;
|
||||
|
||||
for (C10_UNUSED const auto _ : c10::irange(start, end)) {
|
||||
const auto count = *ptr_tid_int_counts++;
|
||||
const auto i = *ptr_tid_src_int_idx++;
|
||||
const auto j = *ptr_tid_sorted_int_idx++;
|
||||
if (!count) continue;
|
||||
for ([[maybe_unused]] const auto _ : c10::irange(start, end)) {
|
||||
const auto count = *ptr_tid_int_counts++;
|
||||
const auto i = *ptr_tid_src_int_idx++;
|
||||
const auto j = *ptr_tid_sorted_int_idx++;
|
||||
if (!count)
|
||||
continue;
|
||||
|
||||
std::fill_n(ptr_tid_selected_src, count, i);
|
||||
std::copy_n(ptr_sorted_idx + j, count, ptr_tid_selected_sorted);
|
||||
std::fill_n(ptr_tid_selected_src, count, i);
|
||||
std::copy_n(ptr_sorted_idx + j, count, ptr_tid_selected_sorted);
|
||||
|
||||
ptr_tid_selected_sorted += count;
|
||||
ptr_tid_selected_src += count;
|
||||
}
|
||||
});
|
||||
ptr_tid_selected_sorted += count;
|
||||
ptr_tid_selected_src += count;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return search_in_dim_indices
|
||||
@ -2192,7 +2204,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in
|
||||
else {
|
||||
auto* ptr_counts = counts.data_ptr<int64_t>();
|
||||
const auto* ptr_vals = t.const_data_ptr<int64_t>();
|
||||
for (C10_UNUSED const auto _ : c10::irange(t.numel())) {
|
||||
for ([[maybe_unused]] const auto _ : c10::irange(t.numel())) {
|
||||
++ptr_counts[*ptr_vals++];
|
||||
}
|
||||
}
|
||||
@ -2212,14 +2224,19 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in
|
||||
const auto run_in_parallel = (n_threads == 1);
|
||||
|
||||
auto counts_per_thread = at::zeros({n_threads, size}, idx.options());
|
||||
at::parallel_for(0, n_threads, 1, [&](int64_t tid, C10_UNUSED int64_t _) {
|
||||
const auto start = tid * chunk_size;
|
||||
const auto end = std::min(start + chunk_size, idx_len);
|
||||
const auto tid_idx = idx.slice(0, start, end);
|
||||
auto tid_counts = counts_per_thread.select(0, tid);
|
||||
get_counts(tid_counts, tid_idx, /*bins=*/size,
|
||||
/*is_sorted=*/is_sorted, /*run_in_parallel=*/run_in_parallel);
|
||||
});
|
||||
at::parallel_for(
|
||||
0, n_threads, 1, [&](int64_t tid, [[maybe_unused]] int64_t _) {
|
||||
const auto start = tid * chunk_size;
|
||||
const auto end = std::min(start + chunk_size, idx_len);
|
||||
const auto tid_idx = idx.slice(0, start, end);
|
||||
auto tid_counts = counts_per_thread.select(0, tid);
|
||||
get_counts(
|
||||
tid_counts,
|
||||
tid_idx,
|
||||
/*bins=*/size,
|
||||
/*is_sorted=*/is_sorted,
|
||||
/*run_in_parallel=*/run_in_parallel);
|
||||
});
|
||||
|
||||
return counts_per_thread;
|
||||
};
|
||||
@ -2310,32 +2327,38 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in
|
||||
1, std::min<int64_t>((src_len + grain_size - 1) / grain_size, at::get_num_threads())
|
||||
);
|
||||
const auto chunk_size = (src_len + n_threads_src - 1) / n_threads_src;
|
||||
at::parallel_for(0, n_threads_src, 1, [&](int64_t tid, C10_UNUSED int64_t _) {
|
||||
const auto start = tid * chunk_size;
|
||||
const auto end = std::min(start + chunk_size, src_len);
|
||||
auto* ptr_src_tid = ptr_src + start;
|
||||
const auto* ptr_src_counts_per_thread
|
||||
= src_counts_per_thread.select(0, tid).const_data_ptr<int64_t>();
|
||||
const auto* ptr_src_offset_counts_per_thread
|
||||
= src_offset_counts_per_thread.select(0, tid).const_data_ptr<int64_t>();
|
||||
auto tid_counts = at::zeros({size}, src.options());
|
||||
auto* ptr_tid_counts = tid_counts.data_ptr<int64_t>();
|
||||
at::parallel_for(
|
||||
0, n_threads_src, 1, [&](int64_t tid, [[maybe_unused]] int64_t _) {
|
||||
const auto start = tid * chunk_size;
|
||||
const auto end = std::min(start + chunk_size, src_len);
|
||||
auto* ptr_src_tid = ptr_src + start;
|
||||
const auto* ptr_src_counts_per_thread =
|
||||
src_counts_per_thread.select(0, tid)
|
||||
.const_data_ptr<int64_t>();
|
||||
const auto* ptr_src_offset_counts_per_thread =
|
||||
src_offset_counts_per_thread.select(0, tid)
|
||||
.const_data_ptr<int64_t>();
|
||||
auto tid_counts = at::zeros({size}, src.options());
|
||||
auto* ptr_tid_counts = tid_counts.data_ptr<int64_t>();
|
||||
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
const auto idx_val = *ptr_src_tid++;
|
||||
// skip idx value if not in the intersection
|
||||
if (!ptr_intersection_counts[idx_val]) continue;
|
||||
const auto idx_val_offset
|
||||
= ptr_src_intersection_offsets[idx_val]
|
||||
- ptr_src_intersection_counts[idx_val];
|
||||
const auto idx_val_tid_offset
|
||||
= ptr_src_offset_counts_per_thread[idx_val]
|
||||
- ptr_src_counts_per_thread[idx_val];
|
||||
auto& idx_val_local_tid_count = ptr_tid_counts[idx_val];
|
||||
ptr_src_idx[idx_val_offset + idx_val_tid_offset + idx_val_local_tid_count] = i;
|
||||
++idx_val_local_tid_count;
|
||||
}
|
||||
});
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
const auto idx_val = *ptr_src_tid++;
|
||||
// skip idx value if not in the intersection
|
||||
if (!ptr_intersection_counts[idx_val])
|
||||
continue;
|
||||
const auto idx_val_offset =
|
||||
ptr_src_intersection_offsets[idx_val] -
|
||||
ptr_src_intersection_counts[idx_val];
|
||||
const auto idx_val_tid_offset =
|
||||
ptr_src_offset_counts_per_thread[idx_val] -
|
||||
ptr_src_counts_per_thread[idx_val];
|
||||
auto& idx_val_local_tid_count = ptr_tid_counts[idx_val];
|
||||
ptr_src_idx
|
||||
[idx_val_offset + idx_val_tid_offset +
|
||||
idx_val_local_tid_count] = i;
|
||||
++idx_val_local_tid_count;
|
||||
}
|
||||
});
|
||||
|
||||
const auto src_idx_offsets = src_intersection_offsets.sub_(src_intersection_counts);
|
||||
|
||||
@ -2369,26 +2392,28 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in
|
||||
1, std::min<int64_t>((idx_len + grain_size - 1) / grain_size, at::get_num_threads())
|
||||
);
|
||||
const auto chunk_size = (idx_len + n_threads_idx - 1) / n_threads_idx;
|
||||
at::parallel_for(0, n_threads_idx, 1, [&](int64_t tid, C10_UNUSED int64_t _) {
|
||||
const auto start = tid * chunk_size;
|
||||
const auto end = std::min(start + chunk_size, idx_len);
|
||||
const auto tid_offset = ptr_thread_offset[tid];
|
||||
const auto* ptr_idx_tid = ptr_idx + start;
|
||||
auto* ptr_idx_selected_tid = ptr_idx_selected + tid_offset;
|
||||
auto* ptr_src_selected_tid = ptr_src_selected + tid_offset;
|
||||
at::parallel_for(
|
||||
0, n_threads_idx, 1, [&](int64_t tid, [[maybe_unused]] int64_t _) {
|
||||
const auto start = tid * chunk_size;
|
||||
const auto end = std::min(start + chunk_size, idx_len);
|
||||
const auto tid_offset = ptr_thread_offset[tid];
|
||||
const auto* ptr_idx_tid = ptr_idx + start;
|
||||
auto* ptr_idx_selected_tid = ptr_idx_selected + tid_offset;
|
||||
auto* ptr_src_selected_tid = ptr_src_selected + tid_offset;
|
||||
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
const auto idx_val = *ptr_idx_tid++;
|
||||
// skip if idx_val is not in the intersection
|
||||
if (!ptr_intersection_counts[idx_val]) continue;
|
||||
const auto count = ptr_src_counts[idx_val];
|
||||
const auto j = ptr_src_idx_offsets[idx_val];
|
||||
std::fill_n(ptr_idx_selected_tid, count, i);
|
||||
std::copy_n(ptr_src_idx + j, count, ptr_src_selected_tid);
|
||||
ptr_idx_selected_tid += count;
|
||||
ptr_src_selected_tid += count;
|
||||
}
|
||||
});
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
const auto idx_val = *ptr_idx_tid++;
|
||||
// skip if idx_val is not in the intersection
|
||||
if (!ptr_intersection_counts[idx_val])
|
||||
continue;
|
||||
const auto count = ptr_src_counts[idx_val];
|
||||
const auto j = ptr_src_idx_offsets[idx_val];
|
||||
std::fill_n(ptr_idx_selected_tid, count, i);
|
||||
std::copy_n(ptr_src_idx + j, count, ptr_src_selected_tid);
|
||||
ptr_idx_selected_tid += count;
|
||||
ptr_src_selected_tid += count;
|
||||
}
|
||||
});
|
||||
|
||||
return std::make_tuple(idx_selected, src_selected);
|
||||
}();
|
||||
|
@ -29,13 +29,12 @@ namespace {
|
||||
// grad_in does not mean that it is a gradient wrt to input,
|
||||
// grad_in/grad_out is just an input/output of unfold_backward kernel.
|
||||
|
||||
C10_UNUSED static TensorIterator _make_unfold_backward_iter_over_grad_out(
|
||||
Tensor& grad_out,
|
||||
const Tensor& grad_in,
|
||||
int64_t dim,
|
||||
int64_t size,
|
||||
int64_t step
|
||||
) {
|
||||
[[maybe_unused]] static TensorIterator _make_unfold_backward_iter_over_grad_out(
|
||||
Tensor& grad_out,
|
||||
const Tensor& grad_in,
|
||||
int64_t dim,
|
||||
int64_t size,
|
||||
int64_t step) {
|
||||
dim = maybe_wrap_dim(dim, grad_out.dim());
|
||||
// last dim stores the folds
|
||||
|
||||
@ -106,7 +105,6 @@ C10_UNUSED static TensorIterator _make_unfold_backward_iter_over_grad_out(
|
||||
|
||||
return iter;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
@ -103,7 +103,9 @@ DECLARE_DISPATCH(upsampling_bicubic2d, upsample_bicubic2d_kernel);
|
||||
DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_kernel);
|
||||
DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_backward_kernel);
|
||||
|
||||
C10_UNUSED inline std::array<int64_t, 3> upsample_1d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
|
||||
[[maybe_unused]] inline std::array<int64_t, 3> upsample_1d_common_check(
|
||||
IntArrayRef input_size,
|
||||
IntArrayRef output_size) {
|
||||
TORCH_CHECK(
|
||||
output_size.size() == 1,
|
||||
"It is expected output_size equals to 1, but got size ",
|
||||
@ -131,7 +133,9 @@ C10_UNUSED inline std::array<int64_t, 3> upsample_1d_common_check(IntArrayRef in
|
||||
return {nbatch, channels, output_width};
|
||||
}
|
||||
|
||||
C10_UNUSED inline std::array<int64_t, 4> upsample_2d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
|
||||
[[maybe_unused]] inline std::array<int64_t, 4> upsample_2d_common_check(
|
||||
IntArrayRef input_size,
|
||||
IntArrayRef output_size) {
|
||||
TORCH_CHECK(
|
||||
output_size.size() == 2,
|
||||
"It is expected output_size equals to 2, but got size ",
|
||||
@ -167,8 +171,9 @@ C10_UNUSED inline std::array<int64_t, 4> upsample_2d_common_check(IntArrayRef in
|
||||
return {nbatch, channels, output_height, output_width};
|
||||
}
|
||||
|
||||
C10_UNUSED inline
|
||||
std::array<int64_t, 5> upsample_3d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
|
||||
[[maybe_unused]] inline std::array<int64_t, 5> upsample_3d_common_check(
|
||||
IntArrayRef input_size,
|
||||
IntArrayRef output_size) {
|
||||
TORCH_CHECK(
|
||||
output_size.size() == 3,
|
||||
"It is expected output_size equals to 3, but got size ",
|
||||
|
@ -40,7 +40,6 @@ int register_linear_params() {
|
||||
}
|
||||
|
||||
namespace {
|
||||
C10_UNUSED static auto linear_params = register_linear_params();
|
||||
} // namespace
|
||||
|
||||
[[maybe_unused]] static auto linear_params = register_linear_params();
|
||||
} // namespace
|
||||
}} // namespace ao::sparse
|
||||
|
@ -82,7 +82,7 @@ static void reduced_float_copy_kernel(TensorIteratorBase &iter, bool requires_ne
|
||||
std::copy_n(base, 2, data.data());
|
||||
const int64_t *outer_strides = &strides[2];
|
||||
|
||||
for (C10_UNUSED const auto it : c10::irange(size1)) {
|
||||
for ([[maybe_unused]] const auto it : c10::irange(size1)) {
|
||||
Vecd dst_s;
|
||||
if (strides_in[0] == 0) {
|
||||
dst_s = Vecd(dest_t(*((scalar_t*)data[1])));
|
||||
@ -151,7 +151,7 @@ static void reduced_float_copy_kernel(TensorIteratorBase &iter, bool requires_ne
|
||||
std::copy_n(base, 2, data.data());
|
||||
const int64_t *outer_strides = &strides[2];
|
||||
|
||||
for (C10_UNUSED const auto it : c10::irange(size1)) {
|
||||
for ([[maybe_unused]] const auto it : c10::irange(size1)) {
|
||||
Vecd dst_s;
|
||||
if (strides_in[0] == 0) {
|
||||
dst_s = Vecd(dest_t(*((source_t*)data[1])));
|
||||
|
@ -395,7 +395,7 @@ struct Dist {
|
||||
const scalar_t * t1_end = t1 + l1_size;
|
||||
const scalar_t * t2_end = t2 + l2_size;
|
||||
|
||||
for (C10_UNUSED const auto l : c10::irange(d)) {
|
||||
for ([[maybe_unused]] const auto l : c10::irange(d)) {
|
||||
for (; t1 != t1_end; t1 += m, res += m) {
|
||||
const Vec vec_t1 = Vec::loadu(t1, count);
|
||||
Vec res_vec = Vec::loadu(res, count);
|
||||
|
@ -473,7 +473,7 @@ void cpu_flash_attention(
|
||||
scalar_t* transpose_buffer_ptr = transpose_buffer.get();
|
||||
std::unique_ptr<scalar_t[]> v_copy_buffer = std::make_unique<scalar_t[]>(ekvSplitSize * packb_size);
|
||||
scalar_t* v_copy_buffer_ptr = v_copy_buffer.get();
|
||||
for (C10_UNUSED auto z : c10::irange(begin, end)) {
|
||||
for ([[maybe_unused]] auto z : c10::irange(begin, end)) {
|
||||
n = l * kvSplitSize;
|
||||
int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n);
|
||||
int64_t ekvBlockSize = kvBlockSize % 2 == 0 ? kvBlockSize : kvBlockSize + 1;
|
||||
@ -566,7 +566,7 @@ void cpu_flash_attention(
|
||||
? query_padding_ptr + ompIdx * qSplitSize * eheadSize
|
||||
: nullptr;
|
||||
|
||||
for (C10_UNUSED auto z : c10::irange(begin, end)) {
|
||||
for ([[maybe_unused]] auto z : c10::irange(begin, end)) {
|
||||
int64_t m = k * qSplitSize;
|
||||
int64_t qBlockSize = std::min(qSplitSize, qSize - m);
|
||||
// Initialize max and sum
|
||||
@ -931,7 +931,7 @@ void cpu_flash_attention_backward(
|
||||
|
||||
at::Tensor dsum = at::empty({qSplitSize}, query.options().dtype(accumulate_dtype));
|
||||
accum_t* dsum_data = dsum.data_ptr<accum_t>();
|
||||
for (C10_UNUSED auto z : c10::irange(begin, end)) {
|
||||
for ([[maybe_unused]] auto z : c10::irange(begin, end)) {
|
||||
// rowsum of grad_out * out
|
||||
for (int64_t m = 0; m < qSize; m += qSplitSize) {
|
||||
int64_t qBlockSize = std::min(qSplitSize, qSize - m);
|
||||
|
@ -30,7 +30,7 @@ void _compute_linear_combination_cpu_kernel(
|
||||
auto* RESTRICT in_ptr = data[1];
|
||||
auto* RESTRICT coeff_ptr = data[2];
|
||||
|
||||
for (C10_UNUSED const auto elem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(n)) {
|
||||
auto* RESTRICT out_data = reinterpret_cast<scalar_t*>(out_ptr);
|
||||
auto* RESTRICT in_data = reinterpret_cast<scalar_t*>(in_ptr);
|
||||
using primitive_t = typename scalar_value_type<scalar_t>::type;
|
||||
|
@ -78,7 +78,7 @@ void cpu_take_put_kernel(
|
||||
auto loop = [&](char** data, const int64_t* strides, int64_t n) {
|
||||
auto* iterated_data_bytes = data[0];
|
||||
auto* index_data_bytes = data[1];
|
||||
for (C10_UNUSED const auto elem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(n)) {
|
||||
auto idx = *reinterpret_cast<int64_t*>(index_data_bytes);
|
||||
auto& iterated = *reinterpret_cast<scalar_t*>(iterated_data_bytes);
|
||||
|
||||
@ -203,7 +203,7 @@ void index_fill_kernel(
|
||||
auto handle_nonzero_idx_stride = [&](char** data, const int64_t* strides, int64_t n) {
|
||||
auto* self_data_bytes = data[0];
|
||||
auto* index_data_bytes = data[1];
|
||||
for (C10_UNUSED const auto elem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(n)) {
|
||||
auto* self_data = reinterpret_cast<scalar_t*>(self_data_bytes);
|
||||
auto idx = *reinterpret_cast<int64_t*>(index_data_bytes);
|
||||
TORCH_CHECK_INDEX(idx >= -self_dim_size && idx < self_dim_size,
|
||||
@ -229,7 +229,7 @@ void index_fill_kernel(
|
||||
if (idx < 0) {
|
||||
idx += self_dim_size;
|
||||
}
|
||||
for (C10_UNUSED const auto elem: c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(n)) {
|
||||
auto* self_data = reinterpret_cast<scalar_t*>(self_data_bytes);
|
||||
|
||||
self_data[idx * self_dim_stride] = fill_val;
|
||||
@ -262,7 +262,7 @@ void index_copy_kernel(
|
||||
auto* self_data_bytes = data[0];
|
||||
auto* index_data_bytes = data[1];
|
||||
auto* source_data_bytes = data[2];
|
||||
for (C10_UNUSED const auto elem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(n)) {
|
||||
auto* self_data = reinterpret_cast<scalar_t*>(self_data_bytes);
|
||||
auto idx = *reinterpret_cast<int64_t*>(index_data_bytes);
|
||||
auto* source_data = reinterpret_cast<scalar_t*>(source_data_bytes);
|
||||
@ -285,7 +285,7 @@ void index_copy_kernel(
|
||||
TORCH_CHECK_INDEX(idx >= 0 && idx < self_dim_size,
|
||||
"index_copy_(): index ", idx, " is out of bounds for dimension ",
|
||||
dim, " with size ", self_dim_size);
|
||||
for (C10_UNUSED const auto elem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(n)) {
|
||||
auto* self_data = reinterpret_cast<scalar_t*>(self_data_bytes);
|
||||
auto* source_data = reinterpret_cast<scalar_t*>(source_data_bytes);
|
||||
|
||||
@ -474,8 +474,7 @@ void cpu_hflip_vec(at::TensorIterator& iter) {
constexpr auto stride = sizeof(scalar_t);
TORCH_INTERNAL_ASSERT(stride == -strides[0] && stride == strides[1]);
for (C10_UNUSED const auto j : c10::irange(size1)) {
for ([[maybe_unused]] const auto j : c10::irange(size1)) {
// vectorized loop with negative stride for output
char** C10_RESTRICT data_ = data_arr.data();
int64_t n = size0;

@ -543,8 +542,7 @@ void cpu_vflip_memcpy(at::TensorIterator& iter) {
TORCH_INTERNAL_ASSERT(strides[0] == strides[1]);
const int64_t stride = strides[0];
for (C10_UNUSED const auto j : c10::irange(size1)) {
for ([[maybe_unused]] const auto j : c10::irange(size1)) {
char** C10_RESTRICT data_ = data_arr.data();
int64_t n = size0;
@ -271,7 +271,7 @@ struct VectorizedLoop2d {
const int64_t *outer_strides = &strides[ntensors];
if (is_contiguous<traits>(strides)) {
for (C10_UNUSED const auto i : c10::irange(size1)) {
for ([[maybe_unused]] const auto i : c10::irange(size1)) {
vectorized_loop(data.data(), size0, 0, op, vop);
advance(data, outer_strides);
}

@ -279,12 +279,12 @@ struct VectorizedLoop2d {
using Indices = std::make_index_sequence<traits::arity>;
unroll_contiguous_scalar_checks<traits>(strides, Indices{}, [&](size_t idx) {
if (idx) {
for (C10_UNUSED const auto i : c10::irange(size1)) {
for ([[maybe_unused]] const auto i : c10::irange(size1)) {
vectorized_loop(data.data(), size0, idx, op, vop);
advance(data, outer_strides);
}
} else {
for (C10_UNUSED const auto i : c10::irange(size1)) {
for ([[maybe_unused]] const auto i : c10::irange(size1)) {
basic_loop(data.data(), strides, 0, size0, op);
advance(data, outer_strides);
}
@ -70,7 +70,7 @@ inline void vectorized_reduction(char** data, int64_t n, int64_t stride,
template <typename F>
inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int64_t n, F f) {
for (C10_UNUSED const auto j : c10::irange(n)) {
for ([[maybe_unused]] const auto j : c10::irange(n)) {
f();
data[0] += strides[0];
data[1] += strides[1];
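(Editor's note: the hunks above and below all make the same mechanical substitution, so a single illustrative sketch is given here rather than after every file. The example is not part of the commit; it uses a plain std::array in place of PyTorch's c10::irange helper, and the names `iterations` and `calls` are invented for the illustration only.)

// Standalone illustration (assumed names, not PyTorch code): the loop body
// must run once per element, but the element variable itself is never read.
// Without the attribute, -Wunused-variable-style diagnostics may flag `i`;
// [[maybe_unused]] (C++17) documents the intent without a project macro
// such as C10_UNUSED.
#include <array>
#include <cstdio>

int main() {
  std::array<int, 4> iterations{0, 1, 2, 3};
  int calls = 0;
  for ([[maybe_unused]] const auto i : iterations) {
    ++calls;  // run the body once per element; `i` is deliberately ignored
  }
  std::printf("ran %d times\n", calls);
  return 0;
}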
@ -62,11 +62,12 @@ static inline void cpu_cum_base_kernel(const Tensor& result,
|
||||
auto* result_data_bytes = data[0];
|
||||
const auto* self_data_bytes = data[1];
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(n)) {
|
||||
f(
|
||||
(scalar_t*)result_data_bytes, result_dim_stride,
|
||||
(scalar_t*)self_data_bytes, self_dim_stride, init_val
|
||||
);
|
||||
for ([[maybe_unused]] const auto i : c10::irange(n)) {
|
||||
f((scalar_t*)result_data_bytes,
|
||||
result_dim_stride,
|
||||
(scalar_t*)self_data_bytes,
|
||||
self_dim_stride,
|
||||
init_val);
|
||||
result_data_bytes += strides[0];
|
||||
self_data_bytes += strides[1];
|
||||
}
|
||||
|
@ -215,7 +215,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
// vs dim-TensorIterator loop order depending on
|
||||
// whether dim is the last dimension
|
||||
if (dim== buffer.dim() - 1) {
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
// dim loop is a separate code block
|
||||
// for better performance
|
||||
loop_func.template operator()<scalar_t, func_t>(
|
||||
@ -232,7 +232,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
for (const auto i : c10::irange(index_dim_size)) {
|
||||
auto* self_data = self_data_bytes;
|
||||
auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride);
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
int64_t idx_dim = *(int64_t*)index_data;
|
||||
// we are not putting idx_dim in the error message because it disables
|
||||
// loop optimization in clang-7
|
||||
@ -306,7 +306,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
// vs dim-TensorIterator loop order depending on
|
||||
// whether dim is the last dimension
|
||||
if (dim== buffer.dim() - 1) {
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
// dim loop is a separate code block
|
||||
// for better performance
|
||||
loop_func.template operator()<scalar_t, func_t>(
|
||||
@ -327,7 +327,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
auto* self_data = self_data_bytes;
|
||||
auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride);
|
||||
auto* src_data = src_data_bytes;
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
int64_t idx_dim = *(int64_t*)index_data;
|
||||
// we are not putting idx_dim in the error message because it disables
|
||||
// loop optimization in clang-7
|
||||
@ -402,7 +402,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
// vs dim-TensorIterator loop order depending on
|
||||
// whether dim is the last dimension
|
||||
if (dim== buffer.dim() - 1) {
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
// dim loop is a separate code block
|
||||
// for better performance
|
||||
loop_func.template operator()<scalar_t, ReduceMean>(
|
||||
@ -423,7 +423,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
auto* self_data = self_data_bytes;
|
||||
auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride);
|
||||
auto* src_data = src_data_bytes;
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
int64_t idx_dim = *(int64_t*)index_data;
|
||||
// we are not putting idx_dim in the error message because it disables
|
||||
// loop optimization in clang-7
|
||||
@ -497,7 +497,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
// vs dim-TensorIterator loop order depending on
|
||||
// whether dim is the last dimension
|
||||
if (dim== buffer.dim() - 1) {
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
// dim loop is a separate code block
|
||||
// for better performance
|
||||
loop_func.template operator()<scalar_t, ReduceMaximum>(
|
||||
@ -518,7 +518,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
auto* self_data = self_data_bytes;
|
||||
auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride);
|
||||
auto* src_data = src_data_bytes;
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
int64_t idx_dim = *(int64_t*)index_data;
|
||||
// we are not putting idx_dim in the error message because it disables
|
||||
// loop optimization in clang-7
|
||||
@ -593,7 +593,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
// vs dim-TensorIterator loop order depending on
|
||||
// whether dim is the last dimension
|
||||
if (dim== buffer.dim() - 1) {
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
// dim loop is a separate code block
|
||||
// for better performance
|
||||
loop_func.template operator()<scalar_t, ReduceMinimum>(
|
||||
@ -614,7 +614,7 @@ struct cpu_scatter_gather_base_kernel {
|
||||
auto* self_data = self_data_bytes;
|
||||
auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride);
|
||||
auto* src_data = src_data_bytes;
|
||||
for (C10_UNUSED const auto nelem : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto nelem : c10::irange(n)) {
|
||||
int64_t idx_dim = *(int64_t*)index_data;
|
||||
// we are not putting idx_dim in the error message because it disables
|
||||
// loop optimization in clang-7
|
||||
|
@ -53,14 +53,12 @@ void _dim_apply(
|
||||
return;
|
||||
}
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(n)) {
|
||||
f(
|
||||
reinterpret_cast<scalar_t*>(values_data_bytes),
|
||||
for ([[maybe_unused]] const auto i : c10::irange(n)) {
|
||||
f(reinterpret_cast<scalar_t*>(values_data_bytes),
|
||||
values_dim_stride,
|
||||
reinterpret_cast<int64_t*>(indices_data_bytes),
|
||||
indices_dim_stride,
|
||||
dim_size
|
||||
);
|
||||
dim_size);
|
||||
|
||||
values_data_bytes += strides[0];
|
||||
indices_data_bytes += strides[1];
|
||||
|
@ -83,7 +83,7 @@ static inline void compare_base_kernel(const Tensor& result1, const Tensor& resu
|
||||
auto* result1_data_bytes = data[0];
|
||||
auto* result2_data_bytes = data[1];
|
||||
const auto* self_data_bytes = data[2];
|
||||
for (C10_UNUSED const auto i : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(n)) {
|
||||
f((scalar_t*)result1_data_bytes,
|
||||
(scalar_t_2*)result2_data_bytes,
|
||||
(scalar_t*)self_data_bytes,
|
||||
@ -253,7 +253,7 @@ static void mode_kernel_impl(
|
||||
|
||||
std::vector<std::pair<scalar_t, int64_t>> elements(self_dim_size);
|
||||
|
||||
for (C10_UNUSED const auto k : c10::irange(n)) {
|
||||
for ([[maybe_unused]] const auto k : c10::irange(n)) {
|
||||
scalar_t* values_data = (scalar_t*)values_data_bytes;
|
||||
int64_t* indices_data = (int64_t*)indices_data_bytes;
|
||||
const scalar_t* self_data = (scalar_t*)self_data_bytes;
|
||||
|
@ -353,8 +353,9 @@ static void unfolded2d_copy_channels_last(
|
||||
int64_t x = 0;
|
||||
data_index_init(start, y, output_height, x, output_width);
|
||||
|
||||
for (const auto k C10_UNUSED: c10::irange(start, end)) {
|
||||
scalar_t* dst = finput_data + y * output_width * kH * kW * n_input_plane + x * kH * kW * n_input_plane;
|
||||
for (const auto k [[maybe_unused]] : c10::irange(start, end)) {
|
||||
scalar_t* dst = finput_data + y * output_width * kH * kW * n_input_plane +
|
||||
x * kH * kW * n_input_plane;
|
||||
const scalar_t* src = input_data;
|
||||
|
||||
if (padW > 0 || padH > 0) {
|
||||
|
@ -76,7 +76,7 @@ void _unfold_backward_internal_kernel(
|
||||
auto* RESTRICT grad_in_ptr = data[1];
|
||||
auto* RESTRICT idx_dim_ptr = data[2];
|
||||
|
||||
for (C10_UNUSED const auto elem : c10::irange(nelems)) {
|
||||
for ([[maybe_unused]] const auto elem : c10::irange(nelems)) {
|
||||
auto* RESTRICT grad_out_data = reinterpret_cast<scalar_t*>(grad_out_ptr);
|
||||
auto* RESTRICT grad_in_data = reinterpret_cast<scalar_t*>(grad_in_ptr);
|
||||
|
||||
|
@ -733,8 +733,9 @@ struct HelperInterpBase {
|
||||
auto new_shape = std::vector<int64_t>(ndims, 1);
|
||||
new_shape[reshape_dim] = output_size;
|
||||
|
||||
for (C10_UNUSED const auto j : c10::irange(interp_size)) {
|
||||
output.emplace_back(empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>())));
|
||||
for ([[maybe_unused]] const auto j : c10::irange(interp_size)) {
|
||||
output.emplace_back(
|
||||
empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>())));
|
||||
output.emplace_back(empty(new_shape, CPU(output_type)));
|
||||
}
|
||||
}
|
||||
@ -1047,8 +1048,9 @@ struct HelperInterpNearest : public HelperInterpBase {
|
||||
auto new_shape = std::vector<int64_t>(ndims, 1);
|
||||
new_shape[reshape_dim] = output_size;
|
||||
|
||||
for (C10_UNUSED const auto j : c10::irange(interp_size)) {
|
||||
output.emplace_back(empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>())));
|
||||
for ([[maybe_unused]] const auto j : c10::irange(interp_size)) {
|
||||
output.emplace_back(
|
||||
empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>())));
|
||||
// Defines weights for consistency, but not used
|
||||
output.emplace_back(at::ones(new_shape, CPU(output_type)));
|
||||
}
|
||||
|
@ -102,7 +102,7 @@ void pack_rgb(
|
||||
|
||||
TORCH_INTERNAL_ASSERT(unpacked_increment == 3 || unpacked_increment == 4);
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(num_pixels)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(num_pixels)) {
|
||||
for (const auto j : c10::irange(num_channels)) {
|
||||
packed[j * packed_stride] = unpacked[j];
|
||||
}
|
||||
|
@ -723,7 +723,7 @@ void int4pack_mm_kernel_(
|
||||
int mb{0}, nb{0};
|
||||
data_index_init(begin, mb, MB, nb, NB);
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(begin, end)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(begin, end)) {
|
||||
int mb_start = mb * BLOCK_M;
|
||||
int mb_size = std::min(BLOCK_M, M - mb_start);
|
||||
int nb_start = nb * BLOCK_N;
|
||||
|
@ -177,7 +177,7 @@ struct KthValueLauncher {
|
||||
cuda::detail::TensorInfo<scalar_t, index_t> values_info,
|
||||
int collapse_values_dim,
|
||||
cuda::detail::TensorInfo<int64_t, index_t> indices_info,
|
||||
C10_UNUSED int collapse_indices_dim,
|
||||
[[maybe_unused]] int collapse_indices_dim,
|
||||
cuda::detail::TensorInfo<const scalar_t, index_t> self_info,
|
||||
int collapse_self_dim,
|
||||
int64_t num_slices,
|
||||
@ -212,9 +212,9 @@ struct MedianLauncher {
|
||||
template <typename scalar_t, typename index_t, int all_dims>
|
||||
inline void launch(
|
||||
cuda::detail::TensorInfo<scalar_t, index_t> values_info,
|
||||
C10_UNUSED int collapse_values_dim,
|
||||
[[maybe_unused]] int collapse_values_dim,
|
||||
cuda::detail::TensorInfo<int64_t, index_t> indices_info,
|
||||
C10_UNUSED int collapse_indices_dim,
|
||||
[[maybe_unused]] int collapse_indices_dim,
|
||||
cuda::detail::TensorInfo<const scalar_t, index_t> self_info,
|
||||
int collapse_self_dim,
|
||||
int64_t num_slices,
|
||||
|
@ -1374,7 +1374,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_cuda(
|
||||
for (const auto idx: c10::irange(axis)) {
|
||||
stat_shape.push_back(input_shape[idx]);
|
||||
}
|
||||
for (C10_UNUSED const auto idx: c10::irange(axis, input.dim())) {
|
||||
for ([[maybe_unused]] const auto idx : c10::irange(axis, input.dim())) {
|
||||
stat_shape.push_back(1);
|
||||
}
|
||||
|
||||
|
@ -74,7 +74,7 @@ cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(
|
||||
// Ubuntu-22+ if `libnvrtc.so` is not found on the system, which strictly
|
||||
// speaking is not necessary for usecases below See
|
||||
// https://github.com/pytorch/pytorch/issues/97041
|
||||
C10_UNUSED static auto cudnn_cnn_infer_handler = [] {
|
||||
[[maybe_unused]] static auto cudnn_cnn_infer_handler = [] {
|
||||
void* handle = dlopen("libcudnn_cnn_infer.so.8", RTLD_LAZY);
|
||||
char* err = dlerror();
|
||||
if (!handle) {
|
||||
|
@ -51,7 +51,7 @@ static void layer_norm_with_mean_rstd_out(
|
||||
for (const auto idx : c10::irange(axis)) {
|
||||
stat_shape.emplace_back(input_shape[idx]);
|
||||
}
|
||||
for (C10_UNUSED const auto idx : c10::irange(axis, input.dim())) {
|
||||
for ([[maybe_unused]] const auto idx : c10::irange(axis, input.dim())) {
|
||||
stat_shape.emplace_back(1);
|
||||
}
|
||||
|
||||
@ -256,7 +256,7 @@ std::tuple<Tensor, Tensor, Tensor> math_native_layer_norm(
|
||||
for (const auto idx : c10::irange(axis)) {
|
||||
stat_shape.push_back(input_shape[idx]);
|
||||
}
|
||||
for (C10_UNUSED const auto idx : c10::irange(axis, input.dim())) {
|
||||
for ([[maybe_unused]] const auto idx : c10::irange(axis, input.dim())) {
|
||||
stat_shape.push_back(1);
|
||||
}
|
||||
mean = mean.view(stat_shape);
|
||||
|
@ -163,7 +163,7 @@ static void linalg_lu_factor_out_mps_impl(const Tensor& A, bool pivot, Tensor& L
|
||||
|
||||
status_tensors.reserve(batchSize);
|
||||
pivots_list.reserve(batchSize);
|
||||
for (C10_UNUSED const auto i : c10::irange(batchSize)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(batchSize)) {
|
||||
status_tensors.push_back(at::zeros(1, kInt, std::nullopt, kMPS, std::nullopt));
|
||||
pivots_list.push_back(at::zeros(numPivots, kInt, std::nullopt, kMPS, std::nullopt));
|
||||
}
|
||||
|
@ -922,7 +922,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_mps(const Tensor& input,
|
||||
for (const auto idx : c10::irange(axis)) {
|
||||
stat_shape.push_back(input_shape[idx]);
|
||||
}
|
||||
for (C10_UNUSED auto idx : c10::irange(axis, input.dim())) {
|
||||
for ([[maybe_unused]] auto idx : c10::irange(axis, input.dim())) {
|
||||
stat_shape.push_back(1);
|
||||
}
|
||||
mean = mean.view(stat_shape);
|
||||
|
@ -706,7 +706,7 @@ static ViewCachedGraph* createViewGraph(const Tensor& self,
|
||||
// Self is the input tensor we are creating view of
|
||||
newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, inputType, getMPSShape(base_shape));
|
||||
newCachedGraph->storageOffsetTensor = mpsGraphRankedPlaceHolder(mpsGraph, MPSDataTypeInt32, @[ @1 ]);
|
||||
for (C10_UNUSED const auto i : c10::irange(size.size())) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(size.size())) {
|
||||
newCachedGraph->strideTensors.push_back(mpsGraphRankedPlaceHolder(mpsGraph, MPSDataTypeInt32, @[ @1 ]));
|
||||
}
|
||||
if (needsScatter) {
|
||||
|
@ -444,7 +444,7 @@ Tensor qnnpack_avg_pool2d(
|
||||
} // namespace at
|
||||
|
||||
namespace {
|
||||
C10_UNUSED std::vector<float> generate_requantization_scales(
|
||||
[[maybe_unused]] std::vector<float> generate_requantization_scales(
|
||||
const at::Tensor& weight_scales,
|
||||
const float input_scale,
|
||||
const float output_scale,
|
||||
@ -468,11 +468,11 @@ C10_UNUSED std::vector<float> generate_requantization_scales(
|
||||
return requant_scales;
|
||||
}
|
||||
|
||||
C10_UNUSED std::pair<std::vector<uint8_t>, at::Tensor> make_zero_points_and_scales_tensor(
|
||||
[[maybe_unused]] std::pair<std::vector<uint8_t>, at::Tensor>
|
||||
make_zero_points_and_scales_tensor(
|
||||
const at::Tensor& weight_contig,
|
||||
bool transpose = false,
|
||||
uint32_t groups = 1
|
||||
) {
|
||||
uint32_t groups = 1) {
|
||||
const int out_ch_idx = transpose ? 1 : 0;
|
||||
const auto num_output_channels = weight_contig.size(out_ch_idx) * (transpose ? groups : 1);
|
||||
// Add 8 to account for bufferring needed by QNNPACK.
|
||||
|
@ -186,8 +186,9 @@ inline TensorQuantizationParams ChooseQuantizationParams(
|
||||
|
||||
// This function helps to convert the Conv1D dimensions usable by the Conv2d op.
|
||||
constexpr int64_t kConv1dSqueezeDim = 0;
|
||||
C10_UNUSED static torch::List<int64_t> MakeArgForConv1d(const torch::List<int64_t>& arg,
|
||||
int64_t base_value) {
|
||||
[[maybe_unused]] static torch::List<int64_t> MakeArgForConv1d(
|
||||
const torch::List<int64_t>& arg,
|
||||
int64_t base_value) {
|
||||
TORCH_CHECK(!arg.empty(), "Argument must have elements.");
|
||||
torch::List<int64_t> result({arg.get(0), base_value});
|
||||
if (arg.size() == 1) {
|
||||
|
@ -71,7 +71,7 @@ static void upsample_nearest3d_out_frame(
|
||||
const auto* pos1 = &i_p[d1 * input_height * input_width + h1 * input_width + w1];
|
||||
auto* pos2 = &o_p[d2 * output_height * output_width + h2 * output_width + w2];
|
||||
|
||||
for (C10_UNUSED const auto c : c10::irange(channels)) {
|
||||
for ([[maybe_unused]] const auto c : c10::irange(channels)) {
|
||||
pos2[0] = pos1[0];
|
||||
pos1 += input_depth * input_height * input_width;
|
||||
pos2 += output_depth * output_height * output_width;
|
||||
|
@ -143,7 +143,7 @@ ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) {
|
||||
config_vals.push_back(dilation[0].item<int16_t>());
|
||||
}
|
||||
// output_padding does not exist in v1, so we fill in a default value
|
||||
for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) {
|
||||
config_vals.push_back(0);
|
||||
}
|
||||
config_vals.push_back(groups[0].item<int16_t>());
|
||||
@ -294,21 +294,24 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> deserialize_conv(
|
||||
torch::List<int64_t> stride, padding, output_padding, dilation;
|
||||
// skip kSpatialDim
|
||||
int idx = 1;
|
||||
for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) {
|
||||
stride.emplace_back(config_vals.at(idx));
|
||||
idx++;
|
||||
}
|
||||
for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) {
|
||||
padding.emplace_back(config_vals.at(idx));
|
||||
idx++;
|
||||
}
|
||||
for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) {
|
||||
dilation.emplace_back(config_vals.at(idx));
|
||||
idx++;
|
||||
}
|
||||
for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
|
||||
TORCH_INTERNAL_ASSERT(idx < static_cast<int64_t>(config_vals.size()),
|
||||
"Unexpected index = ", idx, " for config_vals of size ",
|
||||
for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) {
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
idx < static_cast<int64_t>(config_vals.size()),
|
||||
"Unexpected index = ",
|
||||
idx,
|
||||
" for config_vals of size ",
|
||||
config_vals.size());
|
||||
output_padding.emplace_back(config_vals.at(idx));
|
||||
idx++;
|
||||
|
@ -554,9 +554,9 @@ int register_embedding_params() {
|
||||
|
||||
namespace {
|
||||
|
||||
C10_UNUSED static auto conv2d_params = register_conv_params<2>();
|
||||
C10_UNUSED static auto conv3d_params = register_conv_params<3>();
|
||||
C10_UNUSED static auto linear_params = register_linear_params();
|
||||
C10_UNUSED static auto embedding_params = register_embedding_params();
|
||||
[[maybe_unused]] static auto conv2d_params = register_conv_params<2>();
|
||||
[[maybe_unused]] static auto conv3d_params = register_conv_params<3>();
|
||||
[[maybe_unused]] static auto linear_params = register_linear_params();
|
||||
[[maybe_unused]] static auto embedding_params = register_embedding_params();
|
||||
|
||||
} // namespace
|
||||
|
@ -2293,7 +2293,7 @@ void qupsample_bilinear2d_nhwc_kernel(
|
||||
int64_t b{0}, h2{0}, w2{0};
|
||||
data_index_init(begin, b, nbatch, h2, output_height, w2, output_width);
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(begin, end)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(begin, end)) {
|
||||
auto* i_p = reinterpret_cast<typename scalar_t::underlying*>(
|
||||
idata + b * input_height * input_width * channels);
|
||||
auto* o_p = reinterpret_cast<typename scalar_t::underlying*>(
|
||||
@ -3818,8 +3818,8 @@ void quantize_tensor_per_channel_impl<c10::quint8>(
|
||||
// channels_last contig.
|
||||
// If axis = 0 and channels_last contig, implementation for channels
|
||||
// first (NCHW) works.
|
||||
for (C10_UNUSED const auto b : c10::irange(batches)) {
|
||||
for (C10_UNUSED const auto e : c10::irange(elements_per_channel)) {
|
||||
for ([[maybe_unused]] const auto b : c10::irange(batches)) {
|
||||
for ([[maybe_unused]] const auto e : c10::irange(elements_per_channel)) {
|
||||
uint32_t c = 0;
|
||||
while (c + 8 < channels) {
|
||||
const int32x4_t voffset0123 = vld1q_s32(&zero_points_int32t[c]);
|
||||
@ -3853,7 +3853,7 @@ void quantize_tensor_per_channel_impl<c10::quint8>(
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (C10_UNUSED const auto b : c10::irange(batches)) {
|
||||
for ([[maybe_unused]] const auto b : c10::irange(batches)) {
|
||||
for (const auto c : c10::irange(channels)) {
|
||||
uint32_t e = 0;
|
||||
const int32x4_t voffset = vdupq_n_s32(zero_points_int32t[c]);
|
||||
@ -3900,8 +3900,8 @@ void quantize_tensor_per_channel_impl<c10::quint8>(
|
||||
// channels_last contig.
|
||||
// If axis = 0 and channels_last contig, implementation for channels
|
||||
// first (NCHW) works.
|
||||
for (C10_UNUSED const auto b : c10::irange(batches)) {
|
||||
for (C10_UNUSED const auto e : c10::irange(elements_per_channel)) {
|
||||
for ([[maybe_unused]] const auto b : c10::irange(batches)) {
|
||||
for ([[maybe_unused]] const auto e : c10::irange(elements_per_channel)) {
|
||||
uint32_t c = 0;
|
||||
while (c + 8 < channels) {
|
||||
const int16x8_t vzero_point = vld1q_s16(&zero_points_int16t[c]);
|
||||
@ -3931,8 +3931,8 @@ void quantize_tensor_per_channel_impl<c10::quint8>(
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (C10_UNUSED const auto b : c10::irange(batches)) {
|
||||
for (C10_UNUSED const auto c : c10::irange(channels)) {
|
||||
for ([[maybe_unused]] const auto b : c10::irange(batches)) {
|
||||
for ([[maybe_unused]] const auto c : c10::irange(channels)) {
|
||||
uint32_t e = 0;
|
||||
const int16x8_t vzero_point = vdupq_n_s16(zero_points_int16t[c]);
|
||||
const float32x4_t vinv_scale = vdupq_n_f32(inv_scales[c]);
|
||||
|
@ -634,7 +634,7 @@ class QConvPackWeightInt8 final {
|
||||
int64_t groups) {
|
||||
torch::List<int64_t> output_padding;
|
||||
output_padding.reserve(kSpatialDim);
|
||||
for (C10_UNUSED const auto idx : c10::irange(kSpatialDim)) {
|
||||
for ([[maybe_unused]] const auto idx : c10::irange(kSpatialDim)) {
|
||||
output_padding.push_back((int64_t)0);
|
||||
}
|
||||
return _run(weight, bias, stride, padding, output_padding, dilation, groups,
|
||||
|
@ -139,7 +139,7 @@ class QConvPackWeightInt8Cudnn final {
|
||||
int64_t groups) {
|
||||
torch::List<int64_t> output_padding;
|
||||
output_padding.reserve(kSpatialDim);
|
||||
for (C10_UNUSED const auto idx : c10::irange(kSpatialDim)) {
|
||||
for ([[maybe_unused]] const auto idx : c10::irange(kSpatialDim)) {
|
||||
output_padding.push_back((int64_t)0);
|
||||
}
|
||||
return _run(weight, bias, stride, padding, output_padding, dilation, groups,
|
||||
|
@ -159,7 +159,7 @@ void _csr_matmult(
|
||||
}
|
||||
}
|
||||
|
||||
for (C10_UNUSED const auto jj : c10::irange(length)) {
|
||||
for ([[maybe_unused]] const auto jj : c10::irange(length)) {
|
||||
// NOTE: the linked list that encodes col indices
|
||||
// is not guaranteed to be sorted.
|
||||
Cj[nnz] = head;
|
||||
|
@ -11,7 +11,7 @@
|
||||
|
||||
// Compiler Macros
|
||||
|
||||
// Suppress an unused variable. Copied from C10_UNUSED
|
||||
// Suppress an unused variable. Copied from [[maybe_unused]]
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#define VK_UNUSED __pragma(warning(suppress : 4100 4101))
|
||||
#else
|
||||
|
@ -31,7 +31,7 @@ bool initialize() {
|
||||
return is_initialized_;
|
||||
}
|
||||
|
||||
C10_UNUSED bool deinitialize() {
|
||||
[[maybe_unused]] bool deinitialize() {
|
||||
using namespace internal;
|
||||
|
||||
// This implementation allows for retries.
|
||||
|
@ -89,7 +89,7 @@ void TestAdd(DeprecatedTypeProperties& type) {
|
||||
void TestZeros(DeprecatedTypeProperties& type) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
Tensor a = zeros({1024, 1024}, type);
|
||||
for (C10_UNUSED const auto i : c10::irange(1, 1000)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(1, 1000)) {
|
||||
a = zeros({128, 128}, type);
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
@ -107,7 +107,7 @@ void TestLoadsOfAdds(DeprecatedTypeProperties& type) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
Tensor d = ones({3, 4}, type);
|
||||
Tensor r = zeros({3, 4}, type);
|
||||
for (C10_UNUSED const auto i : c10::irange(1000)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(1000)) {
|
||||
add_out(r, r, d);
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
@ -124,7 +124,7 @@ void TestLoadOfAddsWithCopy(DeprecatedTypeProperties& type) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
Tensor d = ones({3, 4}, type);
|
||||
Tensor r = zeros({3, 4}, type);
|
||||
for (C10_UNUSED const auto i : c10::irange(1000)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(1000)) {
|
||||
r = add(r, d);
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
@ -161,7 +161,7 @@ TEST(CPUGeneratorImpl, TestPhiloxEngineOffset1) {
|
||||
// So if you want to skip 8 values, offset would
|
||||
// be 2, since 2*4=8.
|
||||
at::Philox4_32 engine2(123, 1, 2);
|
||||
for (C10_UNUSED const auto i : c10::irange(8)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(8)) {
|
||||
// Note: instead of using the engine() call 8 times
|
||||
// we could have achieved the same functionality by
|
||||
// calling the incr() function twice.
|
||||
@ -222,14 +222,14 @@ TEST(CPUGeneratorImpl, TestMT19937EngineReproducibility) {
|
||||
// test with zero seed
|
||||
at::mt19937 engine1(0);
|
||||
std::mt19937 engine2(0);
|
||||
for (C10_UNUSED const auto i : c10::irange(10000)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(10000)) {
|
||||
ASSERT_EQ(engine1(), engine2());
|
||||
}
|
||||
|
||||
// test with large seed
|
||||
engine1 = at::mt19937(2147483647);
|
||||
engine2 = std::mt19937(2147483647);
|
||||
for (C10_UNUSED const auto i : c10::irange(10000)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(10000)) {
|
||||
ASSERT_EQ(engine1(), engine2());
|
||||
}
|
||||
|
||||
@ -238,10 +238,9 @@ TEST(CPUGeneratorImpl, TestMT19937EngineReproducibility) {
|
||||
auto seed = rd();
|
||||
engine1 = at::mt19937(seed);
|
||||
engine2 = std::mt19937(seed);
|
||||
for (C10_UNUSED const auto i : c10::irange(10000)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(10000)) {
|
||||
ASSERT_EQ(engine1(), engine2());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
TEST(CPUGeneratorImpl, TestPhiloxEngineReproducibilityRandN) {
|
||||
|
@ -170,7 +170,7 @@ TEST(VmapTest, TestBatchedTensorActualDim) {
|
||||
{
|
||||
// ActualDim on kVmapMaxTensorDims sized underlying tensor
|
||||
auto tensor = ones({});
|
||||
for (C10_UNUSED const auto i : c10::irange(kVmapMaxTensorDims)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(kVmapMaxTensorDims)) {
|
||||
tensor = tensor.unsqueeze(0);
|
||||
}
|
||||
ASSERT_EQ(tensor.dim(), kVmapMaxTensorDims);
|
||||
|
@ -14,7 +14,7 @@ void test(int given_num_threads) {
|
||||
ASSERT_TRUE(given_num_threads >= 0);
|
||||
ASSERT_EQ(at::get_num_threads(), given_num_threads);
|
||||
auto t_sum = t.sum();
|
||||
for (C10_UNUSED const auto i : c10::irange(1000)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(1000)) {
|
||||
t_sum = t_sum + t.sum();
|
||||
}
|
||||
}
|
||||
|
@ -1122,24 +1122,28 @@ namespace {
|
||||
float minv = static_cast<float>(static_cast<double>(min_val) * 2.0);
|
||||
float maxv = static_cast<float>(static_cast<double>(max_val) * 2.0);
|
||||
ValueGen<float> gen(minv, maxv, seed.add(2));
|
||||
for (C10_UNUSED const auto i : c10::irange(trials)) {
|
||||
float scale = generator_sc.get();
|
||||
float inv_scale = 1.0f / static_cast<float>(scale);
|
||||
auto zero_point_val = generator_zp.get();
|
||||
int index = 0;
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
//generate vals
|
||||
for (auto& v : unit_float_vec) {
|
||||
v = gen.get();
|
||||
expected_qint_vals[index] = quantize_val<underlying>(scale, zero_point_val, v);
|
||||
index++;
|
||||
}
|
||||
float_ret[j] = vfloat::loadu(unit_float_vec);
|
||||
for ([[maybe_unused]] const auto i : c10::irange(trials)) {
|
||||
float scale = generator_sc.get();
|
||||
float inv_scale = 1.0f / static_cast<float>(scale);
|
||||
auto zero_point_val = generator_zp.get();
|
||||
int index = 0;
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
// generate vals
|
||||
for (auto& v : unit_float_vec) {
|
||||
v = gen.get();
|
||||
expected_qint_vals[index] =
|
||||
quantize_val<underlying>(scale, zero_point_val, v);
|
||||
index++;
|
||||
}
|
||||
auto expected = vec::loadu(expected_qint_vals);
|
||||
auto actual = vec::quantize(float_ret, scale, zero_point_val, inv_scale);
|
||||
if (AssertVectorized<vec>(NAME_INFO(Quantize), expected, actual).check()) return;
|
||||
} //trials;
|
||||
float_ret[j] = vfloat::loadu(unit_float_vec);
|
||||
}
|
||||
auto expected = vec::loadu(expected_qint_vals);
|
||||
auto actual =
|
||||
vec::quantize(float_ret, scale, zero_point_val, inv_scale);
|
||||
if (AssertVectorized<vec>(NAME_INFO(Quantize), expected, actual)
|
||||
.check())
|
||||
return;
|
||||
} // trials;
|
||||
}
|
||||
#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER)
|
||||
// This test case aims to test at::vec::QuantizeAvx512 and
|
||||
@ -1168,7 +1172,7 @@ namespace {
|
||||
float minv = static_cast<float>(static_cast<double>(min_val) * 2.0);
|
||||
float maxv = static_cast<float>(static_cast<double>(max_val) * 2.0);
|
||||
ValueGen<float> gen(minv, maxv, seed.add(2));
|
||||
for (C10_UNUSED const auto i : c10::irange(trials)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(trials)) {
|
||||
float scale = generator_sc.get();
|
||||
float inv_scale = 1.0f / static_cast<float>(scale);
|
||||
auto zero_point_val = generator_zp.get();
|
||||
@ -1227,35 +1231,36 @@ namespace {
|
||||
ValueGen<int> generator(min_val, max_val, seed.add(1));
|
||||
//scale
|
||||
ValueGen<float> generator_sc(1.f, 15.f, seed.add(2));
|
||||
for (C10_UNUSED const auto i : c10::irange(trials)) {
|
||||
float scale = generator_sc.get();
|
||||
int32_t zero_point_val = generator.get();
|
||||
float scale_zp_premul = -(scale * zero_point_val);
|
||||
vfloat vf_scale = vfloat{ scale };
|
||||
vfloat vf_zp = vfloat{ static_cast<float>(zero_point_val) };
|
||||
vfloat vf_scale_zp = vfloat{ scale_zp_premul };
|
||||
//generate vals
|
||||
for (auto& x : qint_vals) {
|
||||
x = generator.get();
|
||||
for ([[maybe_unused]] const auto i : c10::irange(trials)) {
|
||||
float scale = generator_sc.get();
|
||||
int32_t zero_point_val = generator.get();
|
||||
float scale_zp_premul = -(scale * zero_point_val);
|
||||
vfloat vf_scale = vfloat{scale};
|
||||
vfloat vf_zp = vfloat{static_cast<float>(zero_point_val)};
|
||||
vfloat vf_scale_zp = vfloat{scale_zp_premul};
|
||||
// generate vals
|
||||
for (auto& x : qint_vals) {
|
||||
x = generator.get();
|
||||
}
|
||||
// get expected
|
||||
int index = 0;
|
||||
auto qint_vec = vec::loadu(qint_vals);
|
||||
auto actual_float_ret =
|
||||
qint_vec.dequantize(vf_scale, vf_zp, vf_scale_zp);
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
for (auto& v : unit_exp_vals) {
|
||||
v = dequantize_val(scale, zero_point_val, qint_vals[index]);
|
||||
index++;
|
||||
}
|
||||
//get expected
|
||||
int index = 0;
|
||||
auto qint_vec = vec::loadu(qint_vals);
|
||||
auto actual_float_ret = qint_vec.dequantize(vf_scale, vf_zp, vf_scale_zp);
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
for (auto& v : unit_exp_vals) {
|
||||
v = dequantize_val(scale, zero_point_val, qint_vals[index]);
|
||||
index++;
|
||||
}
|
||||
vfloat expected = vfloat::loadu(unit_exp_vals);
|
||||
const auto& actual = actual_float_ret[j];
|
||||
vfloat expected = vfloat::loadu(unit_exp_vals);
|
||||
const auto& actual = actual_float_ret[j];
|
||||
#if defined(CHECK_DEQUANT_WITH_LOW_PRECISION)
|
||||
if (AssertVectorized<vfloat>(NAME_INFO(DeQuantize), seed, expected, actual).check(false, true, 1.e-3f)) return;
|
||||
#else
|
||||
if (AssertVectorized<vfloat>(NAME_INFO(DeQuantize), seed, expected, actual).check()) return;
|
||||
#endif
|
||||
}
|
||||
} //trials;
|
||||
} // trials;
|
||||
}
|
||||
TYPED_TEST(QuantizationTests, ReQuantizeFromInt) {
|
||||
using vec = TypeParam;
|
||||
@ -1274,25 +1279,29 @@ namespace {
|
||||
ValueGen<int32_t> generator(min_val, max_val, seed);
|
||||
//scale
|
||||
ValueGen<float> generator_sc(1.f, 15.f, seed.add(1));
|
||||
for (C10_UNUSED const auto i : c10::irange(trials)) {
|
||||
float multiplier = 1.f / (generator_sc.get());
|
||||
auto zero_point_val = generator.get();
|
||||
int index = 0;
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
//generate vals
|
||||
for (auto& v : unit_int_vec) {
|
||||
v = c10::qint32(generator.get());
|
||||
expected_qint_vals[index] = requantize_from_int<underlying>(multiplier, zero_point_val, v.val_);
|
||||
index++;
|
||||
}
|
||||
int_ret[j] = vqint::loadu(unit_int_vec);
|
||||
for ([[maybe_unused]] const auto i : c10::irange(trials)) {
|
||||
float multiplier = 1.f / (generator_sc.get());
|
||||
auto zero_point_val = generator.get();
|
||||
int index = 0;
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
// generate vals
|
||||
for (auto& v : unit_int_vec) {
|
||||
v = c10::qint32(generator.get());
|
||||
expected_qint_vals[index] = requantize_from_int<underlying>(
|
||||
multiplier, zero_point_val, v.val_);
|
||||
index++;
|
||||
}
|
||||
auto expected = vec::loadu(expected_qint_vals);
|
||||
auto actual = vec::requantize_from_int(int_ret, multiplier, zero_point_val);
|
||||
if (AssertVectorized<vec>(NAME_INFO(ReQuantizeFromInt), seed, expected, actual).check()) {
|
||||
return;
|
||||
}
|
||||
} //trials;
|
||||
int_ret[j] = vqint::loadu(unit_int_vec);
|
||||
}
|
||||
auto expected = vec::loadu(expected_qint_vals);
|
||||
auto actual =
|
||||
vec::requantize_from_int(int_ret, multiplier, zero_point_val);
|
||||
if (AssertVectorized<vec>(
|
||||
NAME_INFO(ReQuantizeFromInt), seed, expected, actual)
|
||||
.check()) {
|
||||
return;
|
||||
}
|
||||
} // trials;
|
||||
}
|
||||
TYPED_TEST(QuantizationTests, WideningSubtract) {
|
||||
using vec = TypeParam;
|
||||
@ -1311,30 +1320,33 @@ namespace {
|
||||
typename vec::int_vec_return_type expected_int_ret;
|
||||
auto seed = TestSeed();
|
||||
ValueGen<underlying> generator(min_val, max_val, seed);
|
||||
for (C10_UNUSED const auto i : c10::irange(trials)) {
|
||||
//generate vals
|
||||
for (int j = 0; j < vec::size(); j++) {
|
||||
qint_vals[j] = generator.get();
|
||||
qint_b[j] = generator.get();
|
||||
if constexpr (std::is_same_v<underlying, int>) {
|
||||
//filter overflow cases
|
||||
filter_sub_overflow(qint_vals[j], qint_b[j]);
|
||||
}
|
||||
for ([[maybe_unused]] const auto i : c10::irange(trials)) {
|
||||
// generate vals
|
||||
for (int j = 0; j < vec::size(); j++) {
|
||||
qint_vals[j] = generator.get();
|
||||
qint_b[j] = generator.get();
|
||||
if constexpr (std::is_same_v<underlying, int>) {
|
||||
// filter overflow cases
|
||||
filter_sub_overflow(qint_vals[j], qint_b[j]);
|
||||
}
|
||||
int index = 0;
|
||||
auto qint_vec = vec::loadu(qint_vals);
|
||||
auto qint_vec_b = vec::loadu(qint_b);
|
||||
auto actual_int_ret = qint_vec.widening_subtract(qint_vec_b);
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
for (auto& v : unit_exp_vals) {
|
||||
v = widening_subtract(qint_vals[index], qint_b[index]);
|
||||
index++;
|
||||
}
|
||||
auto expected = vqint::loadu(unit_exp_vals);
|
||||
const auto& actual = actual_int_ret[j];
|
||||
if (AssertVectorized<vqint>(NAME_INFO(WideningSubtract), seed, expected, actual).check()) return;
|
||||
}
|
||||
int index = 0;
|
||||
auto qint_vec = vec::loadu(qint_vals);
|
||||
auto qint_vec_b = vec::loadu(qint_b);
|
||||
auto actual_int_ret = qint_vec.widening_subtract(qint_vec_b);
|
||||
for (int j = 0; j < vec::float_num_vecs(); j++) {
|
||||
for (auto& v : unit_exp_vals) {
|
||||
v = widening_subtract(qint_vals[index], qint_b[index]);
|
||||
index++;
|
||||
}
|
||||
} //trials;
|
||||
auto expected = vqint::loadu(unit_exp_vals);
|
||||
const auto& actual = actual_int_ret[j];
|
||||
if (AssertVectorized<vqint>(
|
||||
NAME_INFO(WideningSubtract), seed, expected, actual)
|
||||
.check())
|
||||
return;
|
||||
}
|
||||
} // trials;
|
||||
}
|
||||
TYPED_TEST(QuantizationTests, Relu) {
|
||||
using vec = TypeParam;
|
||||
|
@ -943,22 +943,25 @@ void test_unary(
|
||||
UVT start = dmn_argc > 0 ? dmn.ArgsDomain[0].start : default_start;
|
||||
UVT end = dmn_argc > 0 ? dmn.ArgsDomain[0].end : default_end;
|
||||
ValueGen<VT> generator(start, end, seed.add(changeSeedBy));
|
||||
for (C10_UNUSED const auto trial : c10::irange(trialCount)) {
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals[k] = generator.get();
|
||||
call_filter(filter, vals[k]);
|
||||
//map operator
|
||||
expected[k] = expectedFunction(vals[k]);
|
||||
}
|
||||
// test
|
||||
auto input = vec_type::loadu(vals);
|
||||
auto actual = actualFunction(input);
|
||||
auto vec_expected = vec_type::loadu(expected);
|
||||
AssertVectorized<vec_type> vecAssert(testNameInfo, seed, vec_expected, actual, input);
|
||||
if (vecAssert.check(bitwise, dmn.CheckWithTolerance, dmn.ToleranceError)) return;
|
||||
for ([[maybe_unused]] const auto trial : c10::irange(trialCount)) {
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals[k] = generator.get();
|
||||
call_filter(filter, vals[k]);
|
||||
// map operator
|
||||
expected[k] = expectedFunction(vals[k]);
|
||||
}
|
||||
// test
|
||||
auto input = vec_type::loadu(vals);
|
||||
auto actual = actualFunction(input);
|
||||
auto vec_expected = vec_type::loadu(expected);
|
||||
AssertVectorized<vec_type> vecAssert(
|
||||
testNameInfo, seed, vec_expected, actual, input);
|
||||
if (vecAssert.check(
|
||||
bitwise, dmn.CheckWithTolerance, dmn.ToleranceError))
|
||||
return;
|
||||
|
||||
}// trial
|
||||
//inrease Seed
|
||||
} // trial
|
||||
// inrease Seed
|
||||
changeSeedBy += 1;
|
||||
}
|
||||
for (auto& custom : testCase.getCustomChecks()) {
|
||||
@ -1002,22 +1005,25 @@ void test_binary(
|
||||
UVT end1 = dmn_argc > 1 ? dmn.ArgsDomain[1].end : default_end;
|
||||
ValueGen<VT> generator0(start0, end0, seed.add(changeSeedBy));
|
||||
ValueGen<VT> generator1(start1, end1, seed.add(changeSeedBy + 1));
|
||||
for (C10_UNUSED const auto trial : c10::irange(trialCount)) {
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals0[k] = generator0.get();
|
||||
vals1[k] = generator1.get();
|
||||
call_filter(filter, vals0[k], vals1[k]);
|
||||
//map operator
|
||||
expected[k] = expectedFunction(vals0[k], vals1[k]);
|
||||
}
|
||||
// test
|
||||
auto input0 = vec_type::loadu(vals0);
|
||||
auto input1 = vec_type::loadu(vals1);
|
||||
auto actual = actualFunction(input0, input1);
|
||||
auto vec_expected = vec_type::loadu(expected);
|
||||
AssertVectorized<vec_type> vecAssert(testNameInfo, seed, vec_expected, actual, input0, input1);
|
||||
if (vecAssert.check(bitwise, dmn.CheckWithTolerance, dmn.ToleranceError))return;
|
||||
}// trial
|
||||
for ([[maybe_unused]] const auto trial : c10::irange(trialCount)) {
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals0[k] = generator0.get();
|
||||
vals1[k] = generator1.get();
|
||||
call_filter(filter, vals0[k], vals1[k]);
|
||||
// map operator
|
||||
expected[k] = expectedFunction(vals0[k], vals1[k]);
|
||||
}
|
||||
// test
|
||||
auto input0 = vec_type::loadu(vals0);
|
||||
auto input1 = vec_type::loadu(vals1);
|
||||
auto actual = actualFunction(input0, input1);
|
||||
auto vec_expected = vec_type::loadu(expected);
|
||||
AssertVectorized<vec_type> vecAssert(
|
||||
testNameInfo, seed, vec_expected, actual, input0, input1);
|
||||
if (vecAssert.check(
|
||||
bitwise, dmn.CheckWithTolerance, dmn.ToleranceError))
|
||||
return;
|
||||
} // trial
|
||||
changeSeedBy += 1;
|
||||
}
|
||||
for (auto& custom : testCase.getCustomChecks()) {
|
||||
@ -1067,24 +1073,27 @@ void test_ternary(
|
||||
ValueGen<VT> generator1(start1, end1, seed.add(changeSeedBy + 1));
|
||||
ValueGen<VT> generator2(start2, end2, seed.add(changeSeedBy + 2));
|
||||
|
||||
for (C10_UNUSED const auto trial : c10::irange(trialCount)) {
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals0[k] = generator0.get();
|
||||
vals1[k] = generator1.get();
|
||||
vals2[k] = generator2.get();
|
||||
call_filter(filter, vals0[k], vals1[k], vals2[k]);
|
||||
//map operator
|
||||
expected[k] = expectedFunction(vals0[k], vals1[k], vals2[k]);
|
||||
}
|
||||
// test
|
||||
auto input0 = vec_type::loadu(vals0);
|
||||
auto input1 = vec_type::loadu(vals1);
|
||||
auto input2 = vec_type::loadu(vals2);
|
||||
auto actual = actualFunction(input0, input1, input2);
|
||||
auto vec_expected = vec_type::loadu(expected);
|
||||
AssertVectorized<vec_type> vecAssert(testNameInfo, seed, vec_expected, actual, input0, input1, input2);
|
||||
if (vecAssert.check(bitwise, dmn.CheckWithTolerance, dmn.ToleranceError)) return;
|
||||
}// trial
|
||||
for ([[maybe_unused]] const auto trial : c10::irange(trialCount)) {
|
||||
for (const auto k : c10::irange(el_count)) {
|
||||
vals0[k] = generator0.get();
|
||||
vals1[k] = generator1.get();
|
||||
vals2[k] = generator2.get();
|
||||
call_filter(filter, vals0[k], vals1[k], vals2[k]);
|
||||
// map operator
|
||||
expected[k] = expectedFunction(vals0[k], vals1[k], vals2[k]);
|
||||
}
|
||||
// test
|
||||
auto input0 = vec_type::loadu(vals0);
|
||||
auto input1 = vec_type::loadu(vals1);
|
||||
auto input2 = vec_type::loadu(vals2);
|
||||
auto actual = actualFunction(input0, input1, input2);
|
||||
auto vec_expected = vec_type::loadu(expected);
|
||||
AssertVectorized<vec_type> vecAssert(
|
||||
testNameInfo, seed, vec_expected, actual, input0, input1, input2);
|
||||
if (vecAssert.check(
|
||||
bitwise, dmn.CheckWithTolerance, dmn.ToleranceError))
|
||||
return;
|
||||
} // trial
|
||||
changeSeedBy += 1;
|
||||
}
|
||||
}
|
||||
|
@ -72,11 +72,11 @@ inline bool is_thp_alloc(size_t nbytes) {
|
||||
return (is_thp_alloc_enabled() && (nbytes >= gAlloc_threshold_thp));
|
||||
}
|
||||
#elif !defined(__ANDROID__) && !defined(_MSC_VER)
|
||||
constexpr size_t c10_compute_alignment(C10_UNUSED size_t nbytes) {
|
||||
constexpr size_t c10_compute_alignment([[maybe_unused]] size_t nbytes) {
|
||||
return gAlignment;
|
||||
}
|
||||
|
||||
constexpr bool is_thp_alloc(C10_UNUSED size_t nbytes) {
|
||||
constexpr bool is_thp_alloc([[maybe_unused]] size_t nbytes) {
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
@ -196,7 +196,7 @@ CUDAKernelLaunchRegistry::CUDAKernelLaunchRegistry()
|
||||
dsa_check_if_all_devices_support_managed_memory()),
|
||||
gather_launch_stacktrace(check_env_for_enable_launch_stacktracing()),
|
||||
enabled_at_runtime(check_env_for_dsa_enabled()) {
|
||||
for (C10_UNUSED const auto _ : c10::irange(dsa_get_device_count())) {
|
||||
for ([[maybe_unused]] const auto _ : c10::irange(dsa_get_device_count())) {
|
||||
uvm_assertions.emplace_back(nullptr, uvm_deleter);
|
||||
}
|
||||
|
||||
|
@ -23,7 +23,7 @@ void c10_cuda_check_implementation(
|
||||
return;
|
||||
}
|
||||
|
||||
C10_UNUSED auto error_unused = cudaGetLastError();
|
||||
[[maybe_unused]] auto error_unused = cudaGetLastError();
|
||||
(void)error_unused;
|
||||
|
||||
std::string check_message;
|
||||
|
@ -40,7 +40,7 @@ class C10_CUDA_API CUDAError : public c10::Error {
|
||||
do { \
|
||||
const cudaError_t __err = EXPR; \
|
||||
if (C10_UNLIKELY(__err != cudaSuccess)) { \
|
||||
C10_UNUSED auto error_unused = cudaGetLastError(); \
|
||||
[[maybe_unused]] auto error_unused = cudaGetLastError(); \
|
||||
TORCH_WARN("CUDA warning: ", cudaGetErrorString(__err)); \
|
||||
} \
|
||||
} while (0)
|
||||
@ -49,18 +49,18 @@ class C10_CUDA_API CUDAError : public c10::Error {
|
||||
#define C10_CUDA_ERROR_HANDLED(EXPR) EXPR
|
||||
|
||||
// Intentionally ignore a CUDA error
|
||||
#define C10_CUDA_IGNORE_ERROR(EXPR) \
|
||||
do { \
|
||||
const cudaError_t __err = EXPR; \
|
||||
if (C10_UNLIKELY(__err != cudaSuccess)) { \
|
||||
C10_UNUSED cudaError_t error_unused = cudaGetLastError(); \
|
||||
} \
|
||||
#define C10_CUDA_IGNORE_ERROR(EXPR) \
|
||||
do { \
|
||||
const cudaError_t __err = EXPR; \
|
||||
if (C10_UNLIKELY(__err != cudaSuccess)) { \
|
||||
[[maybe_unused]] cudaError_t error_unused = cudaGetLastError(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Clear the last CUDA error
|
||||
#define C10_CUDA_CLEAR_ERROR() \
|
||||
do { \
|
||||
C10_UNUSED cudaError_t error_unused = cudaGetLastError(); \
|
||||
#define C10_CUDA_CLEAR_ERROR() \
|
||||
do { \
|
||||
[[maybe_unused]] cudaError_t error_unused = cudaGetLastError(); \
|
||||
} while (0)
|
||||
|
||||
// This should be used directly after every kernel launch to ensure
|
||||
|
@ -22,7 +22,7 @@ int device_count_impl(bool fail_if_no_driver) {
|
||||
// Clear out the error state, so we don't spuriously trigger someone else.
|
||||
// (This shouldn't really matter, since we won't be running very much CUDA
|
||||
// code in this regime.)
|
||||
C10_UNUSED cudaError_t last_err = cudaGetLastError();
|
||||
[[maybe_unused]] cudaError_t last_err = cudaGetLastError();
|
||||
switch (err) {
|
||||
case cudaErrorNoDevice:
|
||||
// Zero devices is ok here
|
||||
@ -170,7 +170,7 @@ std::optional<DeviceIndex> getDeviceIndexWithPrimaryContext() {
|
||||
}
|
||||
|
||||
namespace _internal {
|
||||
bool dummyHasPrimaryContext(C10_UNUSED DeviceIndex device_index) {
|
||||
bool dummyHasPrimaryContext([[maybe_unused]] DeviceIndex device_index) {
|
||||
TORCH_CHECK(false, "Should never been called");
|
||||
}
|
||||
bool (*hasPrimaryContext)(DeviceIndex) = dummyHasPrimaryContext;
|
||||
|
@ -8,7 +8,7 @@
|
||||
CUresult __err = EXPR; \
|
||||
if (__err != CUDA_SUCCESS) { \
|
||||
const char* err_str; \
|
||||
CUresult get_error_str_err C10_UNUSED = \
|
||||
CUresult get_error_str_err [[maybe_unused]] = \
|
||||
c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \
|
||||
if (get_error_str_err != CUDA_SUCCESS) { \
|
||||
AT_ERROR("CUDA driver error: unknown error"); \
|
||||
|
@ -118,9 +118,6 @@
|
||||
#define C10_HAS_CPP_ATTRIBUTE(x) (0)
|
||||
#endif
|
||||
|
||||
// suppress an unused variable.
|
||||
#define C10_UNUSED [[maybe_unused]]
|
||||
|
||||
#if !defined(__has_attribute)
|
||||
#define __has_attribute(x) 0
|
||||
#endif
|
||||
|
@ -35,12 +35,12 @@ dict_int_int test_dict(dict_int_int& dict) {
|
||||
|
||||
// erase via iterators
|
||||
auto begin = dict.begin();
|
||||
for (C10_UNUSED const auto i : c10::irange(20)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(20)) {
|
||||
begin++;
|
||||
}
|
||||
|
||||
auto end = begin;
|
||||
for (C10_UNUSED const auto i : c10::irange(20)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(20)) {
|
||||
erase_set.insert(end->first);
|
||||
end++;
|
||||
}
|
||||
@ -134,11 +134,11 @@ TEST(OrderedPreservingDictTest, DictCollisions) {
|
||||
|
||||
// erase a few entries via iterator
|
||||
auto begin = dict.begin();
|
||||
for (C10_UNUSED const auto j : c10::irange(10)) {
|
||||
for ([[maybe_unused]] const auto j : c10::irange(10)) {
|
||||
begin++;
|
||||
}
|
||||
auto end = begin;
|
||||
for (C10_UNUSED const auto j : c10::irange(7)) {
|
||||
for ([[maybe_unused]] const auto j : c10::irange(7)) {
|
||||
erase_set.insert(end->first);
|
||||
end++;
|
||||
}
|
||||
|
@ -26,7 +26,7 @@ ApproximateClockToUnixTimeConverter::measurePair() {
|
||||
ApproximateClockToUnixTimeConverter::time_pairs
|
||||
ApproximateClockToUnixTimeConverter::measurePairs() {
|
||||
static constexpr auto n_warmup = 5;
|
||||
for (C10_UNUSED const auto _ : c10::irange(n_warmup)) {
|
||||
for ([[maybe_unused]] const auto _ : c10::irange(n_warmup)) {
|
||||
getApproximateTime();
|
||||
static_cast<void>(steady_clock_t::now());
|
||||
}
|
||||
|
@ -658,12 +658,12 @@ namespace c10::detail {
|
||||
// Report a warning to the user only once. Accepts an arbitrary number of extra
|
||||
// arguments which are concatenated into the warning message using operator<<
|
||||
//
|
||||
#define _TORCH_WARN_ONCE(...) \
|
||||
C10_UNUSED static const auto C10_ANONYMOUS_VARIABLE(torch_warn_once_) = \
|
||||
[&] { \
|
||||
TORCH_WARN(__VA_ARGS__); \
|
||||
return true; \
|
||||
}()
|
||||
#define _TORCH_WARN_ONCE(...) \
|
||||
[[maybe_unused]] static const auto C10_ANONYMOUS_VARIABLE( \
|
||||
torch_warn_once_) = [&] { \
|
||||
TORCH_WARN(__VA_ARGS__); \
|
||||
return true; \
|
||||
}()
|
||||
|
||||
#ifdef DISABLE_WARN
|
||||
#define TORCH_WARN_ONCE(...) ((void)0);
|
||||
|
@ -322,8 +322,8 @@ C10_API const std::unique_ptr<EventSampledHandler>& GetEventSampledHandler(
|
||||
* // Logs caller info with an arbitrary text event, if there is a usage.
|
||||
* C10_LOG_API_USAGE_ONCE("my_api");
|
||||
*/
|
||||
#define C10_LOG_API_USAGE_ONCE(...) \
|
||||
C10_UNUSED static bool C10_ANONYMOUS_VARIABLE(logFlag) = \
|
||||
#define C10_LOG_API_USAGE_ONCE(...) \
|
||||
[[maybe_unused]] static bool C10_ANONYMOUS_VARIABLE(logFlag) = \
|
||||
::c10::detail::LogAPIUsageFakeReturn(__VA_ARGS__);
|
||||
|
||||
// API usage logging capabilities
|
||||
|
@ -115,7 +115,7 @@ TEST(XPUStreamTest, StreamPoolRoundRobinTest) {
|
||||
}
|
||||
|
||||
std::vector<c10::xpu::XPUStream> streams{};
|
||||
for (C10_UNUSED const auto _ : c10::irange(200)) {
|
||||
for ([[maybe_unused]] const auto _ : c10::irange(200)) {
|
||||
streams.emplace_back(c10::xpu::getStreamFromPool());
|
||||
}
|
||||
|
||||
|
@ -2220,7 +2220,7 @@ TEST(DataLoaderTest, ChunkDatasetCrossChunkShuffle) {
|
||||
for (const auto i : c10::irange(
|
||||
(chunk_count + cross_chunk_shuffle_count - 1) /
|
||||
cross_chunk_shuffle_count)) {
|
||||
for (C10_UNUSED const auto j : c10::irange(chunk_size)) {
|
||||
for ([[maybe_unused]] const auto j : c10::irange(chunk_size)) {
|
||||
for (const auto k : c10::irange(cross_chunk_shuffle_count)) {
|
||||
if (i * cross_chunk_shuffle_count + k < chunk_count) {
|
||||
expected_result.push_back(i * cross_chunk_shuffle_count + k);
|
||||
|
@ -1343,7 +1343,7 @@ TEST_F(FunctionalTest, GumbelSoftmax) {
|
||||
|
||||
auto counts = torch::zeros_like(logits);
|
||||
torch::Tensor y_draw;
|
||||
for (C10_UNUSED const auto i : c10::irange(num_draws)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(num_draws)) {
|
||||
y_draw =
|
||||
F::gumbel_softmax(logits, F::GumbelSoftmaxFuncOptions().hard(true));
|
||||
counts += y_draw;
|
||||
|
@ -123,7 +123,7 @@ bool test_mnist(
|
||||
torch::Device device(with_cuda ? torch::kCUDA : torch::kCPU);
|
||||
model->to(device);
|
||||
|
||||
for (C10_UNUSED const auto epoch : c10::irange(number_of_epochs)) {
|
||||
for ([[maybe_unused]] const auto epoch : c10::irange(number_of_epochs)) {
|
||||
// NOLINTNEXTLINE(performance-for-range-copy)
|
||||
for (torch::data::Example<> batch : *data_loader) {
|
||||
auto data = batch.data.to(device);
|
||||
|
@ -3511,7 +3511,7 @@ void _multihead_attn_test_helper(
|
||||
std::uniform_int_distribution<int> d_2_10(2, 10);
|
||||
std::uniform_int_distribution<int> d_3_10(3, 10);
|
||||
bool registration_checked = false;
|
||||
for (C10_UNUSED const auto i : c10::irange(100)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(100)) {
|
||||
const auto batch_sz = d_2_10(generator);
|
||||
const auto seq_len = d_2_10(generator);
|
||||
const auto d_head = d_3_10(generator);
|
||||
|
@ -398,7 +398,8 @@ std::vector<torch::Tensor> PackedSequenceTest_ordered_sequence(
|
||||
torch::ScalarType tensor_type) {
|
||||
std::vector<torch::Tensor> seqs;
|
||||
seqs.reserve(PackedSequenceTest_batch_size);
|
||||
for (C10_UNUSED const auto i : c10::irange(PackedSequenceTest_batch_size)) {
|
||||
for ([[maybe_unused]] const auto i :
|
||||
c10::irange(PackedSequenceTest_batch_size)) {
|
||||
seqs.emplace_back(torch::empty(
|
||||
{torch::randint(1, PackedSequenceTest_max_length, {1}).item<int64_t>()},
|
||||
tensor_type));
|
||||
|
@ -12,7 +12,7 @@ struct OperationTest : torch::test::SeedingFixture {
|
||||
};
|
||||
|
||||
TEST_F(OperationTest, Lerp) {
|
||||
for (C10_UNUSED const auto i : c10::irange(TEST_AMOUNT)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(TEST_AMOUNT)) {
|
||||
// test lerp_kernel_scalar
|
||||
auto start = torch::rand({3, 5});
|
||||
auto end = torch::rand({3, 5});
|
||||
@ -36,7 +36,7 @@ TEST_F(OperationTest, Lerp) {
|
||||
}
|
||||
|
||||
TEST_F(OperationTest, Cross) {
|
||||
for (C10_UNUSED const auto i : c10::irange(TEST_AMOUNT)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(TEST_AMOUNT)) {
|
||||
// input
|
||||
auto a = torch::rand({10, 3});
|
||||
auto b = torch::rand({10, 3});
|
||||
|
@ -157,7 +157,7 @@ void check_exact_values(
|
||||
TEST(OptimTest, OptimizerAccessors) {
|
||||
auto options = AdagradOptions(1.0);
|
||||
std::vector<torch::Tensor> params;
|
||||
for (C10_UNUSED const auto i : c10::irange(3)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(3)) {
|
||||
params.push_back(torch::randn(10));
|
||||
}
|
||||
auto optimizer = Adagrad(params, options);
|
||||
|
@ -99,14 +99,14 @@ void stressTestStore(std::string path, std::string prefix = "") {
|
||||
std::vector<std::thread> threads;
|
||||
c10d::test::Semaphore sem1, sem2;
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(numThreads)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(numThreads)) {
|
||||
threads.emplace_back([&] {
|
||||
auto fileStore =
|
||||
c10::make_intrusive<c10d::FileStore>(path, numThreads + 1);
|
||||
c10d::PrefixStore store(prefix, fileStore);
|
||||
sem1.post();
|
||||
sem2.wait();
|
||||
for (C10_UNUSED const auto j : c10::irange(numIterations)) {
|
||||
for ([[maybe_unused]] const auto j : c10::irange(numIterations)) {
|
||||
store.add("counter", 1);
|
||||
}
|
||||
});
|
||||
|
@ -62,11 +62,11 @@ void stressTestStore(std::string prefix = "") {
|
||||
auto hashStore = c10::make_intrusive<c10d::HashStore>();
|
||||
c10d::PrefixStore store(std::move(prefix), hashStore);
|
||||
|
||||
for (C10_UNUSED const auto i : c10::irange(numThreads)) {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(numThreads)) {
|
||||
threads.emplace_back([&] {
|
||||
sem1.post();
|
||||
sem2.wait();
|
||||
for (C10_UNUSED const auto j : c10::irange(numIterations)) {
|
||||
for ([[maybe_unused]] const auto j : c10::irange(numIterations)) {
|
||||
store.add("counter", 1);
|
||||
}
|
||||
});
|
||||
|
@ -15,12 +15,12 @@ using at::cuda::CUDAStream;
template <typename T, typename... Args>
std::vector<T> initialize(const std::string& path, size_t N, Args&&... args) {
std::vector<T> tests;
for (C10_UNUSED const auto i : c10::irange(N)) {
for ([[maybe_unused]] const auto i : c10::irange(N)) {
tests.push_back(std::move(T(path, std::forward<Args>(args)...)));
}

std::vector<std::thread> threads;
for (C10_UNUSED const auto i : c10::irange(N)) {
for ([[maybe_unused]] const auto i : c10::irange(N)) {
threads.push_back(std::thread([i, N, &tests] { tests[i].start(i, N); }));
}

@ -123,7 +123,7 @@ class CollectiveTest {
int num,
bool delayed = false) {
std::vector<CollectiveTest> tests;
for (C10_UNUSED const auto i : c10::irange(num)) {
for ([[maybe_unused]] const auto i : c10::irange(num)) {
tests.emplace_back(path);
}

@ -102,7 +102,7 @@ void testHelper(bool useLibUV, const std::string& prefix = "") {

for (const auto i : c10::irange(numThreads)) {
threads.emplace_back([=, &sem1, &sem2, &clientStores, &expectedCounterRes] {
for (C10_UNUSED const auto j : c10::irange(numIterations)) {
for ([[maybe_unused]] const auto j : c10::irange(numIterations)) {
clientStores[i]->add("counter", 1);
}
// Let each thread set and get key on its client store
@ -1043,7 +1043,7 @@ TEST(Reductions, ReduceSplitRfactor) {
SimpleIREvaluator cg(s, {b, c});

cg.call({in, out});
for (C10_UNUSED const auto i : c10::irange(M)) {
for ([[maybe_unused]] const auto i : c10::irange(M)) {
ASSERT_EQ(out[0], 4950);
}
}
@ -3884,7 +3884,7 @@ TEST(Simplify, SimplifyEliminateEmptyFor) {
{
// Flatten many layers around an empty block to an empty block.
StmtPtr last = alloc<Block>(std::vector<StmtPtr>({}));
for (C10_UNUSED const auto i : c10::irange(11)) {
for ([[maybe_unused]] const auto i : c10::irange(11)) {
VarHandle loopVar("loopVar", kInt);
last = For::make(loopVar, 0, 10, last);
}
@ -3968,7 +3968,7 @@ TEST(Simplify, SimplifyFlattenBlock) {
{
// Flatten many layers around an empty block to an empty block.
StmtPtr last = alloc<Block>(std::vector<StmtPtr>({}));
for (C10_UNUSED const auto i : c10::irange(11)) {
for ([[maybe_unused]] const auto i : c10::irange(11)) {
last = alloc<Block>(std::vector<StmtPtr>({last}));
}

@ -12,7 +12,7 @@ torch::List<torch::Tensor> custom_op(
int64_t repeat) {
torch::List<torch::Tensor> output;
output.reserve(repeat);
for (C10_UNUSED const auto i : c10::irange(repeat)) {
for ([[maybe_unused]] const auto i : c10::irange(repeat)) {
output.push_back(tensor * scalar);
}
return output;
@ -41,13 +41,13 @@ namespace torch::autograd {

namespace VariableType {
namespace{
C10_UNUSED void reset_grad_accumulator(Variable & self) {
AutogradMeta* meta = torch::autograd::impl::get_autograd_meta(self);
if (meta != nullptr) {
meta->grad_accumulator_.reset();
}
[[maybe_unused]] void reset_grad_accumulator(Variable& self) {
AutogradMeta* meta = torch::autograd::impl::get_autograd_meta(self);
if (meta != nullptr) {
meta->grad_accumulator_.reset();
}
}
}

namespace {
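The hunk above applies the same attribute to a function rather than a variable: marking a helper in an anonymous namespace [[maybe_unused]] keeps -Wunused-function quiet in translation units that never call it. A minimal sketch under the same assumptions (standard C++17 only; the helper name is made up for illustration):

// sketch_function.cpp (illustrative only, not part of this diff); build with -std=c++17
#include <iostream>

namespace {

// The attribute tells the compiler this helper may legitimately go uncalled.
[[maybe_unused]] void log_debug(const char* message) {
  std::cerr << "[debug] " << message << '\n';
}

} // namespace

int main() {
  log_debug("attribute applied at function scope");
  return 0;
}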
Some files were not shown because too many files have changed in this diff.