Compare commits

..

8 Commits

145 changed files with 1668 additions and 2173 deletions

View File

@ -35,11 +35,10 @@ fi
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
# Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then

View File

@ -13,9 +13,13 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd
python -mpip install -r requirements.txt
# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1
python -mpip install --no-input -r requirements.txt
setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to

View File

@ -177,7 +177,8 @@ source ~/${desired_python}-build/bin/activate
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is built as part of tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is built as part of tensorpipe submodule
export USE_DISTRIBUTED=1
export USE_MKLDNN=OFF

View File

@ -127,8 +127,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

View File

@ -140,8 +140,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

.gitignore
View File

@ -82,7 +82,6 @@ torch/return_types.pyi
torch/nn/functional.pyi
torch/utils/data/datapipes/datapipe.pyi
torch/csrc/autograd/generated/*
torch/csrc/functionalization/generated/*
torch/csrc/lazy/generated/*.[!m]*
torch_compile_debug/
# Listed manually because some files in this directory are not generated

View File

@ -22,7 +22,6 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",
@ -91,8 +90,6 @@ generated_cpu_cpp = [
"aten/src/ATen/NativeMetaFunctions.h",
"aten/src/ATen/RegistrationDeclarations.h",
"aten/src/ATen/VmapGeneratedPlumbing.h",
"aten/src/ATen/ViewMetaClasses.h",
"aten/src/ATen/ViewMetaClasses.cpp",
"aten/src/ATen/core/aten_interned_strings.h",
"aten/src/ATen/core/enum_tag.h",
"aten/src/ATen/core/TensorBody.h",
@ -813,7 +810,7 @@ cc_library(
name = "torch_python",
srcs = libtorch_python_core_sources
+ if_cuda(libtorch_python_cuda_sources)
+ if_cuda(libtorch_python_distributed_sources)
+ libtorch_python_distributed_sources
+ GENERATED_AUTOGRAD_PYTHON,
hdrs = glob([
"torch/csrc/generic/*.cpp",
@ -1077,7 +1074,6 @@ test_suite(
"aten/src/ATen/templates/LazyNonNativeIr.h",
"aten/src/ATen/templates/RegisterDispatchKey.cpp",
"aten/src/ATen/templates/RegisterDispatchDefinitions.ini",
"aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp",
"aten/src/ATen/native/native_functions.yaml",
"aten/src/ATen/native/tags.yaml",
"aten/src/ATen/native/ts_native_functions.yaml",

View File

@ -180,8 +180,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF
@ -261,11 +262,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Use distributed" ON)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_XPU;UNIX;NOT APPLE" OFF)
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@ -437,11 +438,10 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
)
else()

View File

@ -9,6 +9,11 @@
namespace at::functionalization {
ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
if (out_idx == this->out_index) return *this;
return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx);
}
// Note [Functionalization: Alias Removal Part 2]
// See Note [Functionalization: Alias Removal] for more details.
// This function applies a single update from one of the views to the StorageImpl.
@ -37,12 +42,12 @@ namespace at::functionalization {
static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) {
at::Tensor t = update.new_val;
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
if (update.view_metas.empty()) { return t; }
if (update.view_metas.empty()) return t;
std::vector<at::Tensor> tmp_values({base});
tmp_values.reserve(update.view_metas.size());
for (size_t i = 0; i < update.view_metas.size() - 1; ++i) {
at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back());
at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index);
// NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided
// All of these ops require additional information to recover the sizes of the original tensor.
// If we need to, we could probably apply this optimization and only bother computing tmp_values
@ -50,8 +55,9 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co
tmp_values.push_back(std::move(next_view));
}
for(int64_t i = static_cast<int64_t>(update.view_metas.size()) - 1; i >= 0; --i) {
int64_t out_idx = update.view_metas[i].out_index;
// Each view inverse is implemented in ViewInverses.cpp.
t = update.view_metas[i]->reverse(tmp_values[i], t);
t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx);
}
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
return t;
@ -105,13 +111,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_));
}
void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<std::shared_ptr<ViewMeta>>& metas) {
void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& metas) {
TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage");
if (metas.size() > 1) {
for (size_t i = 1; i < metas.size(); ++i) {
// Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI
TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided,
TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided,
"During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i,
" was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today,"
"so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you "

View File

@ -8,89 +8,44 @@ namespace at::functionalization {
// See Note [Functionalization Pass In Core]
enum class InverseReturnMode {
/// Specifies that functional inverses should always return a view.
AlwaysView,
/// Specifies that functional inverses should always return a non-view / copy.
NeverView,
/// Specifies that functional inverses should return a view unless a (copying)
/// scatter
/// inverse exists, in which case that will be used instead.
/// This avoids as_strided() calls that can be difficult for subclasses to
/// handle.
ViewOrScatterInverse,
};
#define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \
static const char* name() { \
return #TYPE; \
}
#define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \
using SerializableTuple = std::tuple<__VA_ARGS__>
// ViewMeta is a class used by the functionalization pass to navigate between
// a base tensor and a view tensor.
// For example, if I call `b = a.view1(...)`
// the functionalization pass will generate and store a ViewMeta specialization
// for `view1` operation on b that looks like:
// the functionalization pass will generate and store a ViewMeta on b that looks
// like:
//
// struct TORCH_API view1_ViewMeta : public ViewMeta {
// FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta);
// FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
// bool /* reapply_views */,
// const std::vector<int64_t>&);
//
// view1_ViewMeta(const SerializableTuple& tpl)
// : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
//
// view1_ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
// : ViewMeta(/*has_symbolic_inputs=*/false),
// reapply_views(reapply_views),
// size(size) {}
//
// Tensor forward(const Tensor& base) override {
// return base.view1(...);
// ViewMeta(
// [<captures>](const Tensor& base, int64_t mutated_view_idx) {
// return base.view1(...);
// },
// [<captures>](const at::Tensor& base, const at::Tensor& mutated_view,
// int64_t mutated_view_idx) -> at::Tensor {
// return at::functionalization::impl::view1_inverse(base, mutated_view,
// ...);
// }
//
// Tensor reverse(const Tensor& base, const Tensor& mutated_view) override {
// return at::functionalization::impl::view1_inverse(base, mutated_view,
// ...);
// }
// The forward_fn lambda describes how to replay view1 on a tensor.
//
// SerializableTuple to_serializable_tuple() {
// return std::make_tuple(reapply_views, size);
// }
//
// bool reapply_views;
// std::vector<int64_t> size;
// };
//
// The forward function describes how to replay view1 on a tensor.
//
// The reverse function describes how, given a tensor that is already a view,
// The reverse_fn lambda describes how, given a tensor that is already a view,
// how to get the corresponding base tensor. See Note [Functionalization Pass:
// View Inverses] for details.
//
// `SerializedTuple` is a typedef that defines an `std::tuple<...>` type
// representing the `ViewMeta` instance state. Methods that take in/return such
// a type are used for supporting pickle serialization.
struct ViewMeta {
ViewMeta(
std::function<Tensor(const Tensor&, int64_t)> forward,
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse,
bool has_symbolic_inputs,
bool is_multi_output = false,
bool is_as_strided = false,
int64_t out_idx = 0)
: out_index(out_idx),
: forward_fn(std::move(forward)),
reverse_fn(std::move(reverse)),
out_index(out_idx),
is_multi_output(is_multi_output),
is_as_strided(is_as_strided),
has_symbolic_inputs(has_symbolic_inputs) {}
virtual ~ViewMeta() = default;
virtual Tensor forward(const Tensor& base) = 0;
virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0;
std::function<Tensor(const Tensor&, int64_t)> forward_fn;
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn;
// See Note [out_idx in ViewMeta]
int64_t out_index;
@ -102,17 +57,10 @@ struct ViewMeta {
// Tells us if this view operation has any symbolic inputs
bool has_symbolic_inputs;
// Returns a new ViewMeta with the same forward/reverse
// Returns a copy of the current ViewMeta, if out_idx matches the current
// out_index. Otherwise, returns a new ViewMeta with the same forward/reverse
// functions, but a new out index.
//
// This method should be implemented by those `ViewMeta` that have more than
// one output.
virtual std::shared_ptr<ViewMeta> to_out_index(int64_t out_index) {
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"ViewMeta::to_out_index not implemented. ",
"Likely because there's only one output.");
}
ViewMeta to_out_idx(int64_t out_idx);
};
// FunctionalStorageImpl is a subclass of StorageImpl used by the
@ -145,14 +93,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
// NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
const at::Tensor new_val;
// NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
const std::vector<std::shared_ptr<ViewMeta>> view_metas;
const std::vector<ViewMeta> view_metas;
};
explicit FunctionalStorageImpl(const Tensor& value);
void add_update(
const Tensor& updated_val,
const std::vector<std::shared_ptr<ViewMeta>>& view_metas);
const std::vector<ViewMeta>& view_metas);
bool apply_updates();
const Tensor& base() {
return base_;
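
The hunks above replace ViewMeta's virtual forward/reverse methods with plain std::function members (forward_fn/reverse_fn). A minimal illustrative sketch (not generated code) of building a lambda-based ViewMeta against the constructor shown in this file; transpose(0, 1) is its own inverse, so no separate *_inverse helper is needed here:

#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

at::functionalization::ViewMeta make_transpose_view_meta() {
  return at::functionalization::ViewMeta(
      // forward_fn: how to replay the view on a base tensor.
      [](const at::Tensor& base, int64_t /*mutated_view_idx*/) -> at::Tensor {
        return base.transpose(0, 1);
      },
      // reverse_fn: how to map a mutated view back onto its base.
      [](const at::Tensor& /*base*/, const at::Tensor& mutated_view,
         int64_t /*mutated_view_idx*/) -> at::Tensor {
        return mutated_view.transpose(0, 1);
      },
      /*has_symbolic_inputs=*/false);
}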

View File

@ -129,19 +129,17 @@ void FunctionalTensorWrapper::freeze_storage() const {
// - view_value: The output tensor that we need to wrap.
// - base: The "base" of the view that `view_value` was generated from.
// See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic.
FunctionalTensorWrapper::FunctionalTensorWrapper(
const Tensor& view_value,
const FunctionalTensorWrapper* base,
const std::shared_ptr<functionalization::ViewMeta>& meta)
: c10::TensorImpl(
c10::DispatchKeySet(DispatchKey::Functionalize),
view_value.dtype(),
base->storage().data_ptr().device()),
value_(view_value),
is_multi_output_view_(
base->is_multi_output_view_ || meta->is_multi_output),
was_storage_changed_(base->was_storage_changed_),
is_symbolic_(base->is_symbolic_) {
FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta)
: c10::TensorImpl(
c10::DispatchKeySet(DispatchKey::Functionalize),
view_value.dtype(),
base->storage().data_ptr().device()
),
value_(view_value),
is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
was_storage_changed_(base->was_storage_changed_),
is_symbolic_(base->is_symbolic_)
{
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_));
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
set_constructor_metadata();
@ -150,10 +148,11 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(
view_metas_ = base->view_metas_; // copy
}
view_metas_.push_back(meta);
maybe_mark_symbolic(meta.get());
maybe_mark_symbolic(meta);
storage_ = base->storage_; // alias this tensor's storage with the base tensor's
}
functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const {
return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl());
}
@ -177,18 +176,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const {
}
// See Note [Functionalization Pass - Inplace View Ops]
void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr<at::functionalization::ViewMeta>& meta) {
void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) {
view_metas_.push_back(meta);
// Manually track the fact that this tensor received a metadata mutation!
has_metadata_mutation_ = true;
// Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation.
maybe_mark_symbolic(meta.get());
maybe_mark_symbolic(meta);
// Note [Functionalization Pass - Inplace View Ops]
// So, these ops are special - they're mutation AND view ops. They get special codegen.
// An example is transpose_, e.g. `a.transpose_()`
// Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas.
at::AutoDispatchSkipFunctionalize guard;
value_ = meta->forward(value_);
value_ = meta.forward_fn(value_, meta.out_index);
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
}
@ -369,8 +368,15 @@ void FunctionalTensorWrapper::sync_() {
regenerate_from_base();
}
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& FunctionalTensorWrapper::view_metas() const {
return view_metas_;
Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) {
auto t = base;
// Reapply views to get the viewed tensor from the base in alias_
for (auto& view_meta: view_metas_) {
t = view_meta.forward_fn(t, view_meta.out_index);
}
return t;
}
void FunctionalTensorWrapper::regenerate_from_base() {
@ -379,7 +385,7 @@ void FunctionalTensorWrapper::regenerate_from_base() {
auto t = storage_impl->base();
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_);
t = apply_view_metas(t);
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
replace_(t, /*from_lazy_regenerate=*/true);
@ -721,11 +727,11 @@ bool isFunctionalTensor(const std::optional<Tensor>& t) {
}
bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
if (t_list.empty()) { return false; }
if (t_list.empty()) return false;
auto functional_count = 0;
for (const auto i : c10::irange(t_list.size())) {
auto const & e= t_list[i];
if (!e.has_value() || !e->defined()) { continue; }
if (!e.has_value() || !e->defined()) continue;
if (isFunctionalTensor(e)) {
++functional_count;
}
@ -735,10 +741,10 @@ bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
template <typename T>
static bool isFunctionalTensorIListRef(c10::IListRef<T> list) {
if (list.size() == 0) { return false; }
if (list.size() == 0) return false;
auto functional_count = 0;
for (const auto& tensor : list) {
if (!tensor.defined()) { continue; }
if (!tensor.defined()) continue;
if (isFunctionalTensor(tensor)) {
++functional_count;
}
@ -756,28 +762,20 @@ void freeze_functional_tensor(const Tensor& tensor) {
functional_base_impl->freeze_storage();
}
Tensor create_functional_tensor_with_view_meta(
const at::Tensor& view_to_wrap,
const at::Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta,
int64_t out_idx) {
Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) {
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap));
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base));
auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base);
auto meta_ = meta;
if (out_idx != 0) {
// Note [out_idx in ViewMeta]
// When a view op outputs multiple tensors, each output needs its own separate ViewMeta.
// Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function.
meta_ = meta->to_out_index(out_idx);
meta = meta.to_out_idx(out_idx);
}
return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta_);
return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta);
}
std::vector<Tensor> create_functional_tensor_with_view_meta(
ITensorListRef view_to_wrap,
const at::Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta) {
std::vector<Tensor> create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) {
std::vector<Tensor> outputs(view_to_wrap.size());
int64_t i = 0;
for (const auto& tensor : view_to_wrap) {
@ -787,22 +785,12 @@ std::vector<Tensor> create_functional_tensor_with_view_meta(
return outputs;
}
void mutate_view_meta(const at::Tensor& self, const std::shared_ptr<functionalization::ViewMeta>& meta) {
void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) {
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self));
auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
self_impl->mutate_view_meta(meta);
}
Tensor apply_view_meta_sequence(
const Tensor& base,
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence) {
Tensor r = base;
for (auto& vm : sequence) {
r = vm->forward(r);
}
return r;
}
// Note [Propagating strides in the functionalization pass]
// In order to properly compute stride information, the functionalization pass
// calls each {view} reference implementations with meta tensors.
@ -896,7 +884,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s
const auto& ivalue = returns[idx];
if (ivalue.isTensor()) {
const auto& t = ivalue.toTensor();
if (!t.defined()) { continue; }
if (!t.defined()) continue;
at::functionalization::impl::sync(t);
auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t));
(*stack)[returns_begin + idx] = t_new;
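
For Note [out_idx in ViewMeta] above: when a view op produces several outputs, all outputs share one forward_fn/reverse_fn pair and are distinguished only by out_index, which create_functional_tensor_with_view_meta sets via to_out_idx. A minimal illustrative sketch, assuming an unbind-style multi-output view (not the generated kernel):

#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

at::functionalization::ViewMeta make_unbind_view_meta() {
  return at::functionalization::ViewMeta(
      [](const at::Tensor& base, int64_t mutated_view_idx) -> at::Tensor {
        // Replay: pick the mutated_view_idx-th output of the multi-output view.
        return base.unbind(0)[static_cast<size_t>(mutated_view_idx)];
      },
      [](const at::Tensor& base, const at::Tensor& mutated_view,
         int64_t mutated_view_idx) -> at::Tensor {
        // Inverse: scatter the mutated slice back into its position in the base.
        return base.select_scatter(mutated_view, /*dim=*/0, mutated_view_idx);
      },
      /*has_symbolic_inputs=*/false,
      /*is_multi_output=*/true);
}

// Output i of the view would then carry make_unbind_view_meta().to_out_idx(i).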

View File

@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
explicit FunctionalTensorWrapper(
const Tensor& view_value,
const FunctionalTensorWrapper* base,
const std::shared_ptr<functionalization::ViewMeta>& meta);
const functionalization::ViewMeta& meta);
// Get the underlying, actual tensor, that doesn't know anything about
// functionalization.
@ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
->are_all_mutations_under_no_grad_or_inference_mode();
}
void maybe_mark_symbolic(functionalization::ViewMeta* meta) {
is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs;
void maybe_mark_symbolic(const functionalization::ViewMeta& meta) {
is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs;
}
bool is_symbolic() const {
return is_symbolic_;
}
// Retrieves the ViewMeta sequence of this tensor.
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& view_metas()
const;
// Applies the forward_fn of every ViewMeta collected in the current instance
// to some other base tensor.
Tensor apply_view_metas(const Tensor& base);
// Sync's the underlying tensor with its alias, if it's out of date. This
// involves two steps: 1) Apply any pending updates/mutations to the alias 2)
@ -146,8 +146,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
// from the base tensor. This method is used by inplace-view ops like
// transpose_. It appends a ViewMeta to the existing stack, and refreshes the
// tensor by replaying the views off of the alias.
void mutate_view_meta(
const std::shared_ptr<at::functionalization::ViewMeta>& meta);
void mutate_view_meta(const at::functionalization::ViewMeta& meta);
// Custom implementation of self.set_(src)
void set__impl(const FunctionalTensorWrapper* other);
@ -286,7 +285,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
bool is_symbolic_ = false;
size_t generation_ = 0;
std::vector<std::shared_ptr<at::functionalization::ViewMeta>> view_metas_;
std::vector<at::functionalization::ViewMeta> view_metas_;
protected:
static void copy_tensor_metadata(
@ -378,20 +377,16 @@ TORCH_API void propagate_xla_data_direct(
Tensor create_functional_tensor_with_view_meta(
const Tensor& view_to_wrap,
const Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta,
functionalization::ViewMeta meta,
int64_t out_idx = 0);
std::vector<Tensor> create_functional_tensor_with_view_meta(
ITensorListRef view_to_wrap,
const Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta);
const functionalization::ViewMeta& meta);
void mutate_view_meta(
const Tensor& self,
const std::shared_ptr<functionalization::ViewMeta>& meta);
TORCH_API Tensor apply_view_meta_sequence(
const Tensor& base,
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence);
const functionalization::ViewMeta& meta);
void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
void set_sizes_strides_offset(

View File

@ -1,5 +1,3 @@
#include <ATen/FunctionalizeFallbackKernel.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/EmptyTensor.h>
@ -9,6 +7,7 @@
#include <torch/library.h>
#include <c10/util/irange.h>
#include <c10/util/strides.h>
#include <ATen/EmptyTensor.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/ATen.h>
@ -29,31 +28,6 @@
#include <utility>
#endif
namespace at::functionalization {
Tensor resize__ViewMeta::forward(const Tensor& base) {
if (reapply_views) {
return base.as_strided(size, c10::contiguous_strides(size));
} else {
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
}
}
Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
return base.as_strided_scatter(
mutated_view, size, c10::contiguous_strides(size));
}
Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) {
return at::_unsafe_view_symint(base, size);
}
Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
}
} // namespace at::functionalization
namespace {
void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) {
const auto& schema = op.schema();
@ -132,9 +106,7 @@ namespace {
const auto& ivalue = returns[idx];
if (ivalue.isTensor() && should_wrap_outputs) {
const auto& t = ivalue.toTensor();
if (!t.defined()) {
continue;
}
if (!t.defined()) continue;
auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t));
(*stack)[returns_begin + idx] = t_new;
} else if (ivalue.isTensorList() && should_wrap_outputs) {
@ -197,8 +169,19 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch
// The output of resizing is equivalent to taking a slice of a larger tensor.
// We have to emulate this "slicing" with an as_strided call.
auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS();
auto view_meta = std::make_shared<at::functionalization::resize__ViewMeta>(
reapply_views, size.vec());
at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
[reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
if (reapply_views) {
return base.as_strided(size, c10::contiguous_strides(size));
} else {
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
}
},
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size));
},
/*has_symbolic_inputs=*/false
);
at::functionalization::impl::mutate_view_meta(self, view_meta);
return self;
}
@ -317,11 +300,17 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt
tmp_output = at::_unsafe_view_symint(self_, size);
}
bool has_symbolic_inputs = std::any_of(
size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
auto view_meta =
std::make_shared<at::functionalization::_unsafe_view_ViewMeta>(
has_symbolic_inputs, size.vec());
bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
[size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return at::_unsafe_view_symint(base, size);
},
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
},
/*has_symbolic_inputs=*/has_symbolic_inputs
);
auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta));
// See Note [Propagating strides in the functionalization pass]

View File

@ -1,58 +0,0 @@
#pragma once
#include <ATen/FunctionalStorageImpl.h>
namespace at::functionalization {
// `ViewMeta` implementation for `resize_` operation.
struct TORCH_API resize__ViewMeta : public ViewMeta {
FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta)
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
bool /* reapply_views */,
const std::vector<int64_t>&);
resize__ViewMeta(const SerializableTuple& tpl)
: resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
resize__ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
: ViewMeta(/*has_symbolic_inputs=*/false),
reapply_views(reapply_views),
size(size) {}
Tensor forward(const Tensor& base) override;
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
SerializableTuple to_serializable_tuple() {
return std::make_tuple(reapply_views, size);
}
bool reapply_views;
std::vector<int64_t> size;
};
// `ViewMeta` implementation for `_unsafe_view` operation.
struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta {
FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta)
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
bool /* has_symbolic_inputs */,
const std::vector<c10::SymInt>&);
_unsafe_view_ViewMeta(const SerializableTuple& tpl)
: _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
_unsafe_view_ViewMeta(
bool has_symbolic_inputs,
const std::vector<c10::SymInt>& size)
: ViewMeta(has_symbolic_inputs), size(size) {}
Tensor forward(const Tensor& base) override;
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
SerializableTuple to_serializable_tuple() {
return std::make_tuple(has_symbolic_inputs, size);
}
std::vector<c10::SymInt> size;
};
} // namespace at::functionalization

View File

@ -120,7 +120,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
// buffer (in bytes)
size_t orig_m = sparse_input.size(0);
size_t div = orig_m * sparse_input.itemsize();
size_t new_n = (compressed_size + div - 1) / div; // ceil(s,d) = (s+d-1)/d
size_t new_n = (compressed_size + div - 1) / div; // floor
auto compressed_tensor = sparse_input.new_empty({(int64_t)orig_m, (int64_t)new_n});
auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
@ -155,7 +155,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle));
handle_initialized = true;
}
// cuSPARSELt constructs
// cupsarselt constructs
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulPlan_t plan;
cusparseLtMatmulAlgSelection_t alg_sel;
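
The first hunk in this file touches the comment on the size computation; the expression (compressed_size + div - 1) / div is the standard integer ceiling-division idiom. A minimal standalone check of that identity, with hypothetical values:

#include <cassert>
#include <cstddef>

// Ceiling division for positive integers: ceil(s / d) == (s + d - 1) / d.
static size_t ceil_div(size_t s, size_t d) {
  return (s + d - 1) / d;
}

int main() {
  assert(ceil_div(10, 4) == 3);  // 10 bytes packed into rows of 4 bytes -> 3 rows
  assert(ceil_div(12, 4) == 3);  // exact multiples are unchanged
  return 0;
}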

View File

@ -2,12 +2,22 @@
// ${generated_comment}
#include <ATen/FunctionalStorageImpl.h>
#include <ATen/Tensor.h>
namespace at {
namespace functionalization {
enum class InverseReturnMode {
/// Specifies that functional inverses should always return a view.
AlwaysView,
/// Specifies that functional inverses should always return a non-view / copy.
NeverView,
/// Specifies that functional inverses should return a view unless a (copying) scatter
/// inverse exists, in which case that will be used instead.
/// This avoids as_strided() calls that can be difficult for subclasses to handle.
ViewOrScatterInverse,
};
struct FunctionalInverses {
${view_inverse_declarations}

View File

@ -4,7 +4,7 @@
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/EmptyTensor.h>
#include <ATen/FunctionalTensorWrapper.h>
#include <ATen/ViewMetaClasses.h>
#include <ATen/FunctionalInverses.h>
#include <ATen/MemoryOverlap.h>
#include <torch/library.h>

View File

@ -1,19 +0,0 @@
// ${generated_comment}
#include <ATen/FunctionalInverses.h>
#include <ATen/ViewMetaClasses.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#include <ATen/NativeFunctions.h>
#else
${op_headers}
#endif
namespace at {
namespace functionalization {
${view_meta_implementations}
} // namespace functionalization
} // namespace at

View File

@ -1,12 +0,0 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include <ATen/FunctionalStorageImpl.h>
namespace at {
namespace functionalization {
${view_meta_declarations}
} // namespace functionalization
} // namespace at

View File

@ -1,11 +0,0 @@
#include <ATen/ViewMetaClasses.h>
#include <torch/csrc/functionalization/Module.h>
namespace torch::functionalization {
void initGenerated(PyObject* module) {
auto functionalization = py::handle(module).cast<py::module>();
$view_meta_bindings
}
} // namespace torch::functionalization

View File

@ -156,7 +156,7 @@ ROOT = "//" if IS_OSS else "//xplat/caffe2"
# for targets in subfolders
ROOT_PATH = "//" if IS_OSS else "//xplat/caffe2/"
C10 = "//c10:c10" if IS_OSS else "//xplat/caffe2/c10:c10"
C10 = "//c10:c10" if IS_OSS else ("//xplat/caffe2/c10:c10_ovrsource" if is_arvr_mode() else "//xplat/caffe2/c10:c10")
# a dictionary maps third party library name to fbsource and oss target
THIRD_PARTY_LIBS = {
@ -391,8 +391,6 @@ def get_aten_generated_files(enabled_backends):
"CompositeExplicitAutogradFunctions_inl.h",
"CompositeExplicitAutogradNonFunctionalFunctions.h",
"CompositeExplicitAutogradNonFunctionalFunctions_inl.h",
"ViewMetaClasses.h",
"ViewMetaClasses.cpp",
"VmapGeneratedPlumbing.h",
"core/ATenOpList.cpp",
"core/TensorBody.h",
@ -950,6 +948,7 @@ def define_buck_targets(
[
("torch/csrc/api/include", "torch/**/*.h"),
("", "torch/csrc/**/*.h"),
("", "torch/csrc/**/*.hpp"),
("", "torch/nativert/**/*.h"),
("", "torch/headeronly/**/*.h"),
("", "torch/script.h"),
@ -1194,7 +1193,6 @@ def define_buck_targets(
"NativeMetaFunctions.h": ":gen_aten[NativeMetaFunctions.h]",
"Operators.h": ":gen_aten[Operators.h]",
"RedispatchFunctions.h": ":gen_aten[RedispatchFunctions.h]",
"ViewMetaClasses.h": ":gen_aten[ViewMetaClasses.h]",
"core/TensorBody.h": ":gen_aten[core/TensorBody.h]",
"core/aten_interned_strings.h": ":gen_aten[core/aten_interned_strings.h]",
"core/enum_tag.h": ":gen_aten[core/enum_tag.h]",
@ -2050,6 +2048,7 @@ def define_buck_targets(
("", "caffe2/utils/*.h"),
("", "caffe2/core/*.h"),
("", "torch/csrc/*.h"),
("", "torch/csrc/*.hpp"),
("", "torch/csrc/api/include/torch/*.h"),
("", "torch/csrc/autograd/*.h"),
("", "torch/csrc/autograd/*/*.h"),

View File

@ -118,9 +118,6 @@ def define_targets(rules):
":LazyNonNativeIr.h",
":RegisterDispatchDefinitions.ini",
":RegisterDispatchKey.cpp",
":ViewMetaClassesPythonBinding.cpp",
":ViewMetaClasses.cpp",
":ViewMetaClasses.h",
":native_functions.yaml",
":shape_inference.h",
":tags.yaml",
@ -173,7 +170,6 @@ GENERATED_H = [
"FunctionalInverses.h",
"RedispatchFunctions.h",
"RegistrationDeclarations.h",
"ViewMetaClasses.h",
"VmapGeneratedPlumbing.h",
]
@ -250,7 +246,6 @@ GENERATED_CPP = [
"RegisterFunctionalization_1.cpp",
"RegisterFunctionalization_2.cpp",
"RegisterFunctionalization_3.cpp",
"ViewMetaClasses.cpp",
]
GENERATED_CPP_CORE = [
@ -312,7 +307,6 @@ _GENERATED_AUTOGRAD_PYTHON_CPP = [
"torch/csrc/autograd/generated/python_torch_functions_1.cpp",
"torch/csrc/autograd/generated/python_torch_functions_2.cpp",
"torch/csrc/autograd/generated/python_variable_methods.cpp",
"torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp"
]
GENERATED_AUTOGRAD_PYTHON = _GENERATED_AUTOGRAD_PYTHON_HEADERS + _GENERATED_AUTOGRAD_PYTHON_CPP

View File

@ -1010,7 +1010,6 @@ libtorch_python_core_sources = [
"torch/csrc/utils/disable_torch_function.cpp",
"torch/csrc/utils/verbose.cpp",
"torch/csrc/cpu/Module.cpp",
"torch/csrc/functionalization/Module.cpp",
"torch/csrc/instruction_counter/Module.cpp",
"torch/nativert/python/Bindings.cpp",
] + lazy_tensor_core_python_sources
@ -1053,7 +1052,6 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"):
"torch/csrc/autograd/generated/python_torch_functions_1.cpp",
"torch/csrc/autograd/generated/python_torch_functions_2.cpp",
"torch/csrc/autograd/generated/python_variable_methods.cpp",
"torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp",
]]
_libtorch_python_sources.extend(libtorch_python_core_sources)

View File

@ -3244,7 +3244,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
are_equal<sizeof(autograd_meta_), 4, FieldNameEnum::autograd_meta_>();
are_equal<sizeof(extra_meta_), 4, FieldNameEnum::extra_meta_>();
are_equal<sizeof(version_counter_), 4, FieldNameEnum::version_counter_>();
are_equal<sizeof(pyobj_slot_), 4, FieldNameEnum::pyobj_slot_>();
are_equal<sizeof(pyobj_slot_), 8, FieldNameEnum::pyobj_slot_>();
is_le<sizeof(sizes_and_strides_), 88, FieldNameEnum::sizes_and_strides_>();
are_equal<sizeof(storage_offset_), 8, FieldNameEnum::storage_offset_>();
are_equal<sizeof(numel_), 8, FieldNameEnum::numel_>();
@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
is_le<sizeof(autograd_meta_), 16, FieldNameEnum::autograd_meta_>();
is_le<sizeof(extra_meta_), 16, FieldNameEnum::extra_meta_>();
are_equal<sizeof(version_counter_), 8, FieldNameEnum::version_counter_>();
are_equal<sizeof(pyobj_slot_), 8, FieldNameEnum::pyobj_slot_>();
are_equal<sizeof(pyobj_slot_), 16, FieldNameEnum::pyobj_slot_>();
are_equal<sizeof(sizes_and_strides_), 88, FieldNameEnum::sizes_and_strides_>();
are_equal<sizeof(storage_offset_), 8, FieldNameEnum::storage_offset_>();
are_equal<sizeof(numel_), 8, FieldNameEnum::numel_>();

View File

@ -13,10 +13,11 @@ struct C10_API PyInterpreterHooksInterface {
// Get the PyInterpreter instance
// Stub implementation throws error when Python is not available
// We return nullptr rather than throwing an error since there are bits of c10
// that expect an empty PyObjectSlot when python is not available.
virtual PyInterpreter* getPyInterpreter() const {
return nullptr;
TORCH_CHECK(
false,
"PyTorch was compiled without Python support. "
"Cannot access Python interpreter from C++.");
}
};

View File

@ -2,7 +2,7 @@
namespace c10::impl {
PyObjectSlot::PyObjectSlot() : pyobj_(nullptr) {}
PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
PyObjectSlot::~PyObjectSlot() {
maybe_destroy_pyobj();
@ -10,9 +10,9 @@ PyObjectSlot::~PyObjectSlot() {
void PyObjectSlot::maybe_destroy_pyobj() {
if (owns_pyobj()) {
TORCH_INTERNAL_ASSERT(getGlobalPyInterpreter() != nullptr);
TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
(*getGlobalPyInterpreter())
(*pyobj_interpreter_.load(std::memory_order_acquire))
->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
// NB: this destructor can only be entered when there are no
// references to this C++ object (obviously), NOR any references
@ -25,7 +25,7 @@ void PyObjectSlot::maybe_destroy_pyobj() {
}
PyInterpreter* PyObjectSlot::pyobj_interpreter() {
return getGlobalPyInterpreter();
return pyobj_interpreter_.load(std::memory_order_acquire);
}
PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
@ -35,7 +35,7 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
}
PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
auto interpreter = getGlobalPyInterpreter();
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
if (interpreter) {
return *interpreter;
}

View File

@ -6,17 +6,10 @@
#include <c10/util/python_stub.h>
#include <optional>
#include <atomic>
namespace c10::impl {
// Function pointer type for getting the global interpreter
using GetPyInterpreterFn = PyInterpreter* (*)();
// Global function pointer (set by csrc initialization)
C10_API extern GetPyInterpreterFn g_get_pyinterpreter_fn;
// Helper function to get the global interpreter
C10_API PyInterpreter* getGlobalPyInterpreter();
struct C10_API PyObjectSlot {
public:
PyObjectSlot();
@ -33,6 +26,8 @@ struct C10_API PyObjectSlot {
// NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
// PyObject if necessary!
void init_pyobj(PyObject* pyobj) {
pyobj_interpreter_.store(
getGlobalPyInterpreter(), std::memory_order_relaxed);
pyobj_ = pyobj;
}
@ -60,15 +55,18 @@ struct C10_API PyObjectSlot {
// @todo alban: I'm not too sure what's going on here, we can probably delete
// it but it's worthwhile making sure
std::optional<PyObject*> check_pyobj() const {
impl::PyInterpreter* interpreter = getGlobalPyInterpreter();
if (interpreter == nullptr || pyobj_ == nullptr) {
std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
impl::PyInterpreter* interpreter =
pyobj_interpreter_.load(std::memory_order_acquire);
if (interpreter == nullptr) {
return std::nullopt;
}
if (c10::impl::HermeticPyObjectTLS::get_state()) {
if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
return std::nullopt;
} else {
return _unchecked_untagged_pyobj();
}
return _unchecked_untagged_pyobj();
}
PyInterpreter& load_pyobj_interpreter() const;
@ -78,6 +76,30 @@ struct C10_API PyObjectSlot {
void set_owns_pyobj(bool b);
private:
// This field contains the interpreter tag for this object. See
// Note [Python interpreter tag] for general context
//
// Note [Memory ordering on Python interpreter tag]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// What memory_order do we need when accessing this atomic? We don't
// need a single total modification order (as provided by
// memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
// transition from -1 to some positive integer and never changes afterwards.
// Because there is only one modification, it trivially already has a total
// modification order (e.g., we don't need fences or locked instructions on
// x86)
//
// In fact, one could make a reasonable argument that relaxed reads are OK,
// due to the presence of external locking (GIL) to ensure that interactions
// with other data structures are still correctly synchronized, so that
// we fall in the "Single-Location Data Structures" case as described in
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
// However, on x86, it doesn't matter if I use acquire or relaxed on the load
// as I get the same assembly in both cases. So I just use the more
// conservative acquire (which will impede compiler optimizations but I don't
// care)
std::atomic<PyInterpreter*> pyobj_interpreter_;
// This field contains a reference to a PyObject representing this Tensor.
// If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
// PyObject for it and set this field. This field does not have to be
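
The memory-ordering note above describes a write-once pointer: pyobj_interpreter_ only ever transitions from nullptr to one interpreter, under external locking (the GIL), so a relaxed store paired with acquire loads suffices. A minimal sketch of that pattern with stand-in types (illustrative, not the c10 implementation):

#include <atomic>
#include <cassert>

struct Interpreter {};  // stand-in for c10::impl::PyInterpreter

struct Slot {
  std::atomic<Interpreter*> interpreter{nullptr};

  void init(Interpreter* interp) {
    // Written at most once, under external locking.
    interpreter.store(interp, std::memory_order_relaxed);
  }

  Interpreter* get() const {
    // Acquire is the conservative choice used in the hunk above;
    // relaxed would arguably also be fine for this single-write field.
    return interpreter.load(std::memory_order_acquire);
  }
};

int main() {
  static Interpreter global_interp;
  Slot slot;
  assert(slot.get() == nullptr);
  slot.init(&global_interp);
  assert(slot.get() == &global_interp);
  return 0;
}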

View File

@ -18,9 +18,9 @@ cuda_supported_platforms = [
def define_c10_ovrsource(name, is_mobile):
if is_mobile:
pp_flags = ["-DC10_MOBILE=1"]
pp_flags = ["-DC10_MOBILE=1", "-DC10_USE_GLOG"]
else:
pp_flags = []
pp_flags = ["-DC10_USE_GLOG"]
oxx_static_library(
name = name,

View File

@ -316,7 +316,6 @@ set(GENERATED_CXX_PYTHON
"${TORCH_SRC_DIR}/csrc/autograd/generated/python_special_functions.cpp"
"${TORCH_SRC_DIR}/csrc/autograd/generated/python_return_types.cpp"
"${TORCH_SRC_DIR}/csrc/autograd/generated/python_enum_tag.cpp"
"${TORCH_SRC_DIR}/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp"
)
set(GENERATED_H_PYTHON
@ -380,9 +379,6 @@ add_custom_command(
"${TORCH_ROOT}/aten/src/ATen/templates/LazyIr.h"
"${TORCH_ROOT}/aten/src/ATen/templates/LazyNonNativeIr.h"
"${TORCH_ROOT}/aten/src/ATen/templates/RegisterDispatchKey.cpp"
"${TORCH_ROOT}/aten/src/ATen/templates/ViewMetaClasses.h"
"${TORCH_ROOT}/aten/src/ATen/templates/ViewMetaClasses.cpp"
"${TORCH_ROOT}/aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp"
${autograd_python}
${autograd_yaml}
${autograd_templates}
@ -544,11 +540,9 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
)
if(USE_DISTRIBUTED)
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
if(NOT WIN32)
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
endif()
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
if(NOT WIN32)
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
endif()
endif()
@ -579,32 +573,30 @@ if(USE_CUDA)
list(APPEND Caffe2_GPU_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
if(USE_DISTRIBUTED)
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
)
endif()
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
)
endif()
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
if(CMAKE_COMPILER_IS_GNUCXX)
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
endif()
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
endif()
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
if(CMAKE_COMPILER_IS_GNUCXX)
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
endif()
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
endif()
set_source_files_properties(
${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
@ -637,11 +629,9 @@ if(USE_ROCM)
list(APPEND Caffe2_HIP_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
if(USE_DISTRIBUTED)
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
endif()
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
endif()
# caffe2_nvrtc's stubs to driver APIs are useful for HIP.
# See NOTE [ ATen NVRTC Stub and HIP ]
@ -1362,12 +1352,10 @@ if(BUILD_TEST)
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
if(USE_DISTRIBUTED)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
endif()
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
endif()
if(NOT NO_API)
add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api)
@ -1472,46 +1460,40 @@ if(BUILD_LITE_INTERPRETER)
endif()
endif()
# Pass USE_DISTRIBUTED to torch_cpu, as some codes in jit/pickler.cpp and
# jit/unpickler.cpp need to be compiled only when USE_DISTRIBUTED is set
if(USE_DISTRIBUTED)
target_compile_definitions(torch_cpu PUBLIC USE_DISTRIBUTED)
if(USE_GLOO AND USE_C10D_GLOO)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
if(USE_GLOO AND USE_C10D_GLOO)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
endif()
if(USE_UCC AND USE_C10D_UCC)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
if(USE_CUDA)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
endif()
if(USE_UCC AND USE_C10D_UCC)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
if(USE_CUDA)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
endif()
endif()
if(USE_NCCL AND USE_C10D_NCCL)
if(USE_ROCM)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
endif()
if(USE_NCCL AND USE_C10D_NCCL)
if(USE_ROCM)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
endif()
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
endif()
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
endif()
# Pass USE_RPC in order to reduce use of
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
# need to be removed when RPC is supported
if(NOT WIN32)
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
endif()
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
# can only be compiled with USE_TENSORPIPE is set.
if(USE_TENSORPIPE)
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
endif()
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
endif()
# Pass USE_RPC in order to reduce use of
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
# need to be removed when RPC is supported
if(NOT WIN32)
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
endif()
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
# can only be compiled with USE_TENSORPIPE is set.
if(USE_TENSORPIPE)
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
endif()
if(NOT INTERN_BUILD_MOBILE)

View File

@ -1134,7 +1134,7 @@ if(USE_CUDA AND CUDA_VERSION VERSION_LESS 13.0)
include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
endif()
if(USE_DISTRIBUTED AND USE_TENSORPIPE)
if(USE_TENSORPIPE)
if(MSVC)
message(WARNING "Tensorpipe cannot be used on Windows.")
else()

View File

@ -193,13 +193,11 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}")
message(STATUS " USE_XNNPACK : ${USE_XNNPACK}")
message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}")
if(${USE_DISTRIBUTED})
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
endif()
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}")
endif()

View File

@ -3305,13 +3305,6 @@ def coverage_post_process(app, exception):
if not isinstance(app.builder, CoverageBuilder):
return
if not torch.distributed.is_available():
raise RuntimeError(
"The coverage tool cannot run with a version "
"of PyTorch that was built with USE_DISTRIBUTED=0 "
"as this module's API changes."
)
# These are all the modules that have "automodule" in an rst file
# These modules are the ones for which coverage is checked
# Here, we make sure that no module is missing from that list

View File

@ -1093,9 +1093,6 @@ The set of leaf modules can be customized by overriding
```{eval-rst}
.. autofunction:: torch.fx.replace_pattern
```
```{eval-rst}
.. autofunction:: torch.fx.traceback.annotate
```
<!-- The experimental and passes submodules are missing docs. -->
<!-- Adding it here for coverage but this doesn't add anything to the -->

View File

@ -156,7 +156,6 @@ def get_generate_code_bin_outs():
"autograd/generated/python_torch_functions_1.cpp": ["autograd/generated/python_torch_functions_1.cpp"],
"autograd/generated/python_torch_functions_2.cpp": ["autograd/generated/python_torch_functions_2.cpp"],
"autograd/generated/python_variable_methods.cpp": ["autograd/generated/python_variable_methods.cpp"],
"functionalization/generated/ViewMetaClassesPythonBinding.cpp": ["functionalization/generated/ViewMetaClassesPythonBinding.cpp"],
})
return outs

View File

@ -1704,18 +1704,7 @@ def main() -> None:
package_data = {
"torch": torch_package_data,
}
# some win libraries are excluded
# these are statically linked
exclude_windows_libs = [
"lib/dnnl.lib",
"lib/kineto.lib",
"lib/libprotobuf-lite.lib",
"lib/libprotobuf.lib",
"lib/libprotoc.lib",
]
exclude_package_data = {
"torch": exclude_windows_libs,
}
exclude_package_data = {}
if not BUILD_LIBTORCH_WHL:
package_data["torchgen"] = torchgen_package_data

View File

@ -1,4 +1,4 @@
if(USE_DISTRIBUTED AND NOT WIN32)
if(NOT WIN32)
set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd")
set(DIST_AUTOGRAD_TEST_SOURCES
${TORCH_ROOT}/test/cpp/common/main.cpp

View File

@ -1,7 +1,9 @@
if(WIN32)
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/${CMAKE_IMPORT_LIBRARY_PREFIX}torch_python${CMAKE_IMPORT_LIBRARY_SUFFIX}")
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/torch_python.lib")
elseif(APPLE)
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/libtorch_python.dylib")
else()
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}torch_python${CMAKE_SHARED_LIBRARY_SUFFIX}")
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/libtorch_python.so")
endif()
add_library(torch_python SHARED IMPORTED)

View File

@ -143,19 +143,6 @@ class FlightRecorderEventTest(TestCase):
match_one_event(e11, e12, membership, "0").state,
MatchState.FULLY_MATCHED,
)
e13 = create_one_event(
"gather",
("0", "default"),
[[4, 4]],
[[4, 4]],
"completed",
1,
output_dtypes="",
)
self.assertEqual(
match_one_event(e11, e13, membership, "0").state,
MatchState.FULLY_MATCHED,
)
def test_all_events(self):
for collective in sorted(COLLECTIVES):

View File

@ -202,62 +202,6 @@ class ScheduleTest(TestCase):
torch.distributed.destroy_process_group()
@parametrize(
"ScheduleClass",
[
Schedule1F1B,
ScheduleGPipe,
ScheduleInterleaved1F1B,
ScheduleInterleavedZeroBubble,
ScheduleLoopedBFS,
],
)
def test_schedule_eval_then_train(self, ScheduleClass):
"""
Test that simply runs evaluation followed by training.
"""
store = FakeStore()
torch.distributed.init_process_group(
backend="fake", rank=0, world_size=1, store=store
)
d_hid, batch_size = 512, 256
n_stages = 1
device = "cpu"
full_mod = MultiMLP(d_hid, n_layers=n_stages)
full_mod.to(device)
x = torch.randn(batch_size, d_hid, device=device)
target = torch.randn(batch_size, d_hid, device=device)
def loss_fn(y, target):
return torch.nn.functional.cross_entropy(y, target)
submod_name = "layers.0"
stage_module = full_mod.get_submodule(submod_name)
# Create a pipeline stage to wrap that submodule
num_microbatches = 2
stages = [PipelineStage(stage_module, 0, n_stages, device)]
if issubclass(ScheduleClass, PipelineScheduleSingle):
stages = stages[0]
# Attach to a schedule
schedule = ScheduleClass(stages, num_microbatches, loss_fn=loss_fn)
# Run eval
for _ in range(2):
# Zero gradients
stage_module.zero_grad()
losses = []
schedule.eval(x, target=target, losses=losses)
# Run training
try:
for _ in range(2):
losses = []
schedule.step(x, target=target, losses=losses)
finally:
torch.distributed.destroy_process_group()
def test_zero_bubble_schedule_errors_with_compile(self):
"""
Test that zero bubble schedules raise an error when used with torch.compile.

View File

@ -352,7 +352,7 @@ class MicroPipelineTPTest(TestCase):
@parametrize("scatter_dim", [0, 1, 2])
@fresh_cache()
def test_fuse_scaled_matmul_reduce_scatter(self, A_dims, scatter_dim):
if scatter_dim >= A_dims - 1:
if scatter_dim >= A_dims:
return
group = dist.group.WORLD
@ -402,7 +402,7 @@ class MicroPipelineTPTest(TestCase):
@runOnRocmArch(MI300_ARCH)
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@parametrize("scatter_dim", [0, 1])
@parametrize("scatter_dim", [0, 1, 2])
@fresh_cache()
def test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape(
self, scatter_dim

View File

@ -0,0 +1,41 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
# Owner(s): ["oncall: distributed"]
import torch
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.distributed.tensor import DTensor
from torch.distributed.tensor.placement_types import Shard
from torch.testing._internal.common_utils import run_tests, TestCase
from torch.testing._internal.distributed.fake_pg import FakeStore
class TestFakeDTensor(TestCase):
def test_fake_dtensor_operations(self):
# Use FakeTensorMode to handle CUDA tensors without actual CUDA
fake_mode = FakeTensorMode()
world_size = 4
fake_store = FakeStore()
torch.distributed.init_process_group(
"fake", store=fake_store, rank=0, world_size=world_size
)
device_mesh = torch.distributed.device_mesh.init_device_mesh(
"cuda",
(2, world_size // 2),
)
# Create fake CUDA tensor using FakeTensorMode
with fake_mode:
x = torch.randn(1, 1, device="cuda")
x = DTensor.from_local(x, device_mesh, [Shard(0), Shard(1)])
# Test basic DTensor operations
self.assertIsInstance(x, DTensor)
# Test sum operation
r = x.sum(1)
self.assertIsInstance(r, DTensor)
if __name__ == "__main__":
run_tests()
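The new test above relies on the "fake" process-group backend so DTensor code paths can run single-process, without GPUs or real collectives. Below is a minimal standalone sketch of the same pattern; it assumes the same public APIs the test uses (FakeStore, init_device_mesh, DTensor.from_local) and adds the explicit destroy_process_group cleanup that other tests in this change perform in a try/finally.

# Hedged sketch: exercise DTensor against the fake backend, with explicit cleanup.
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor
from torch.distributed.tensor.placement_types import Shard
from torch.testing._internal.distributed.fake_pg import FakeStore

store = FakeStore()
torch.distributed.init_process_group("fake", store=store, rank=0, world_size=4)
try:
    mesh = init_device_mesh("cpu", (4,))
    local = torch.randn(2, 8)
    # from_local does not issue collectives, so it is safe under the fake backend
    dt = DTensor.from_local(local, mesh, [Shard(0)])
    assert isinstance(dt, DTensor)
    print(dt.placements, dt.shape)
finally:
    torch.distributed.destroy_process_group()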

View File

@ -880,34 +880,6 @@ class DistMathOpsTest(DTensorTestBase):
out_full = out_dt.full_tensor()
self.assertEqual(global_bins, out_full)
@with_comms
def test_logsumexp(self):
mesh = self.build_device_mesh()
comm_mode = CommDebugMode()
inp = torch.rand(3, 5, device=self.device_type)
shard_dim = 0
input_dtensor = distribute_tensor(
inp, device_mesh=mesh, placements=[Shard(shard_dim)]
)
logsumexp_dims = [0, 1]
for dim in logsumexp_dims:
output = torch.logsumexp(inp, dim=dim)
with comm_mode:
output_dtensor = torch.logsumexp(input_dtensor, dim=dim)
if dim == shard_dim:
self.assertEqual(comm_mode.get_total_counts(), 1)
self.assertEqual(
comm_mode.get_comm_counts()[funcol.all_gather_into_tensor],
1,
)
self.assertTrue(output_dtensor.placements[0].is_replicate())
else:
self.assertEqual(comm_mode.get_total_counts(), 0)
self.assertTrue(output_dtensor.placements[0].is_shard(shard_dim))
self.assertEqual(output_dtensor.full_tensor(), output)
if __name__ == "__main__":
run_tests()

View File

@ -505,7 +505,7 @@ class AsyncTPTest(MultiProcContinuousTest):
not PLATFORM_SUPPORTS_SYMM_MEM, "SymmMem is not supported on this ROCm arch"
)
@skip_if_lt_x_gpu(2)
@parametrize("scatter_dim", [0, 1, 2])
@parametrize("scatter_dim", [0, 1])
def test_fused_matmul_reduce_scatter(self, scatter_dim: int) -> None:
self._init_process()

View File

@ -519,7 +519,11 @@ class AOTAutogradCacheTests(InductorTestCase):
@functorch_config.patch(
{"enable_autograd_cache": True, "view_replay_for_aliased_outputs": True}
)
def test_view_replay(self):
def test_view_replay_bypass(self):
"""
Should bypass when view replay is turned on
"""
def fn(a):
tmp = a.detach()
a.mul_(2)
@ -527,25 +531,10 @@ class AOTAutogradCacheTests(InductorTestCase):
with torch.autograd._force_original_view_tracking(True):
compiled_fn = torch.compile(fn)
compiled_fn(torch.rand(2, 3))
def run_and_check(miss, hit, bypass):
self._clear_dynamo_and_codecache()
inp = torch.rand(2, 3)
compiled_inp = inp.clone().detach()
with torch.autograd._force_original_view_tracking(True):
out = fn(inp)
compiled_out = compiled_fn(compiled_inp)
self.assertEqual(out, compiled_out)
self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], miss)
self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], hit)
self.assertEqual(counters["aot_autograd"]["autograd_cache_bypass"], bypass)
run_and_check(miss=1, hit=0, bypass=0)
run_and_check(miss=1, hit=1, bypass=0)
run_and_check(miss=1, hit=2, bypass=0)
self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
self.assertEqual(counters["aot_autograd"]["autograd_cache_bypass"], 1)
@inductor_config.patch("fx_graph_remote_cache", False)
@inductor_config.patch("fx_graph_cache", True)

View File

@ -21,7 +21,6 @@ from unittest.mock import MagicMock, patch
import torch
import torch._dynamo as torchdynamo
import torch.fx.traceback as fx_traceback
import torch.nn.functional as F
import torch.utils._pytree as pytree
from functorch.experimental.control_flow import cond, map
@ -62,10 +61,7 @@ from torch.export.passes import move_to_device_pass
from torch.fx.experimental.proxy_tensor import make_fx
from torch.fx.experimental.symbolic_shapes import ShapeEnv
from torch.testing import FileCheck
from torch.testing._internal.common_cuda import (
PLATFORM_SUPPORTS_FLASH_ATTENTION,
xfailIfDistributedNotSupported,
)
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION
from torch.testing._internal.common_utils import (
find_library_location,
IS_FBCODE,
@ -15072,39 +15068,6 @@ def forward(self, x):
test_serdes=True,
)
# TODO: following tests should be fixed
@testing.expectedFailureTrainingIRToRunDecomp
@testing.expectedFailureTrainingIRToRunDecompNonStrict
def test_preserve_annotation(self):
class M(torch.nn.Module):
def forward(self, x):
with fx_traceback.annotate({"pp_stage": 0}):
with fx_traceback.annotate({"fdsp_bucket": 0}):
x = x + 1
x = x - 2
with fx_traceback.annotate({"cuda_stream": 2, "fsdp_bucket": 1}):
x = x * 2
x = x / 3
return x
m = M()
with fx_traceback.preserve_node_meta():
ep = export(m, (torch.randn(10),))
for node in ep.graph.nodes:
if node.target == torch.ops.aten.add.default:
self.assertTrue(node.meta["custom"], {"pp_stage": 0, "fdsp_bucket": 0})
if node.target == torch.ops.aten.sub.default:
self.assertTrue(node.meta["custom"], {"pp_stage": 0})
if node.target == torch.ops.aten.mul.default:
self.assertTrue(
node.meta["custom"],
{"pp_stage": 0, "cuda_stream": 2, "fsdp_bucket": 1},
)
if node.target == torch.ops.aten.div.default:
self.assertTrue(node.meta["custom"], {})
def test_dynamic_shapes_serdes_generic(self):
from torch._export.serde.dynamic_shapes import (
_dump_dynamic_shapes,
@ -15824,7 +15787,6 @@ class GraphModule(torch.nn.Module):
finally:
torch.distributed.destroy_process_group()
@xfailIfDistributedNotSupported
def test_distributed_all_reduce(self):
class Foo(torch.nn.Module):
def __init__(self):
@ -15842,7 +15804,6 @@ class GraphModule(torch.nn.Module):
inp = (torch.randn(4, 4),)
self.assertTrue(torch.allclose(ep.module()(*inp), m(*inp)))
@xfailIfDistributedNotSupported
def test_distributed_all_gather(self):
class Foo(torch.nn.Module):
def forward(self, x):
@ -15858,7 +15819,6 @@ class GraphModule(torch.nn.Module):
torch.allclose(a, b) for a, b in zip(ep.module()(*inp), m(*inp))
)
@xfailIfDistributedNotSupported
def test_distributed_all_gather_into_tensor(self):
class Foo(torch.nn.Module):
def forward(self, x):
@ -15872,7 +15832,6 @@ class GraphModule(torch.nn.Module):
inp = (torch.randn(2),)
self.assertTrue(torch.allclose(ep.module()(*inp), m(*inp)))
@xfailIfDistributedNotSupported
@testing.expectedFailureCppRuntime
def test_distributed_all_to_all_single(self):
class Foo(torch.nn.Module):
@ -15890,7 +15849,6 @@ class GraphModule(torch.nn.Module):
)
self.assertEqual(len(nodes), 1)
@xfailIfDistributedNotSupported
@testing.expectedFailureCppRuntime
def test_distributed_reduce_scatter_tensor(self):
class Foo(torch.nn.Module):

View File

@ -8500,6 +8500,7 @@ class TestAOTAutogradWithCache(TestAOTAutogradWithDynamo):
{
"enable_autograd_cache": True,
"strict_autograd_cache": True,
"view_replay_for_aliased_outputs": False,
}
)
@torch._inductor.config.patch("fx_graph_cache", True)

View File

@ -20,7 +20,11 @@ from torch._inductor import config
from torch._inductor.codegen.cpp import CppScheduling
from torch._inductor.codegen.triton import TritonScheduling
from torch._inductor.codegen.wrapper import PythonWrapperCodegen
from torch._inductor.codegen.wrapper_fxir import FxConverter, WrapperFxCodegen
from torch._inductor.codegen.wrapper_fxir import (
FxConverter,
replace_floor_div,
WrapperFxCodegen,
)
from torch._inductor.test_case import TestCase as InductorTestCase
from torch.export import Dim
from torch.testing._internal.common_utils import (
@ -34,6 +38,7 @@ from torch.testing._internal.inductor_utils import (
requires_gpu,
TRITON_HAS_CPU,
)
from torch.utils._sympy.functions import FloorDiv
if HAS_GPU:
@ -483,10 +488,11 @@ class FxirTestCase(InductorTestCase):
)
self.assertIn("ks0", triton_node.kwargs["kwargs"])
def test_dynamic_launch_grid_calc_python(self):
def test_dynamic_launch_grid_calc(self):
"""
Test the dynamic launch grid calculation for Triton kernel wrapper using python mode
Test the dynamic launch grid calculation.
"""
func = torch.add
args = [torch.randn(shape, device=self.device) for shape in [(7, 12), (7, 1)]]
(gm,) = self._compile_and_check(func, args, compile_kwargs={"dynamic": True})
@ -505,41 +511,6 @@ class FxirTestCase(InductorTestCase):
self.assertEqual(grid[1], 1)
self.assertEqual(grid[2], 1)
def test_dynamic_launch_grid_calc_python_slow(self):
"""
Test the dynamic launch grid calculation for Triton kernel wrapper using python_slow mode
"""
from torch._inductor.runtime.triton_heuristics import GridExpr
# Mock GridExpr.from_meta to use "python_slow" mode explicitly
original_from_meta = GridExpr.from_meta
def mocked_from_meta(inductor_meta, cfg, mode="python"):
return original_from_meta(inductor_meta, cfg, mode="python_slow")
with unittest.mock.patch.object(GridExpr, "from_meta", mocked_from_meta):
func = torch.add
args = [
torch.randn(shape, device=self.device) for shape in [(7, 12), (7, 1)]
]
(gm,) = self._compile_and_check(
func, args, compile_kwargs={"dynamic": True}
)
# Check for the precomputed size arg.
(triton_node,) = gm.graph.find_nodes(
op="call_function", target=triton_kernel_wrapper_mutation
)
self.assertIn("grid", triton_node.kwargs)
self.assertIn("xnumel", triton_node.kwargs["kwargs"])
self.assertIn("XBLOCK", triton_node.kwargs["kwargs"])
grid = triton_node.kwargs["grid"][0]
xnumel = triton_node.kwargs["kwargs"]["xnumel"].meta["val"]
xblock = triton_node.kwargs["kwargs"]["XBLOCK"]
self.assertEqual(grid[0].meta["val"], ((xnumel + xblock - 1) // xblock))
self.assertEqual(grid[1], 1)
self.assertEqual(grid[2], 1)
@config.patch({"trace.enabled": True})
@unittest.mock.patch("torch._inductor.debug.DebugFormatter.output_code")
def test_debug(self, mock_output_code):
@ -990,6 +961,29 @@ def forward(self, arg0_1, arg1_1, arg2_1):
return [buf1, buf2]""", # noqa: B950
)
def test_dims_dynamic_outer_static_padded_inner(self):
"""
Test padding on inner dimensions, with dynamic outer dimensions.
"""
class M(torch.nn.Module):
def forward(self, x, y):
return x + y
def get_input_padded_inner(shape):
full_shape = shape[:-1] + (shape[-1] * 2,)
full = torch.randn(full_shape, dtype=torch.float32, device=self.device)
view = torch.as_strided(full, shape, full.stride())
return view
shape = (4, 4, 4)
args = tuple(get_input_padded_inner(shape) for _ in range(2))
self.check(
M(),
args,
dynamic_shapes=({0: Dim.DYNAMIC, 1: Dim.DYNAMIC, 2: Dim.STATIC},) * 2,
)
@parametrize("length", (4, 8))
def test_cond_dynamic_shape_pred_scalar_closure(self, length: int):
"""
@ -1033,6 +1027,132 @@ def forward(self, arg0_1, arg1_1, arg2_1):
self.check(M(), (x,), dynamic_shapes=({0: Dim.DYNAMIC},))
class TestReplaceFloorDiv(InductorTestCase):
"""
Tests for floor -> FloorDiv conversion.
"""
def _check(self, expr: sympy.Expr) -> sympy.Expr:
# Check that we started with floor's.
num_floors = expr.count(sympy.floor)
self.assertGreater(num_floors, 0)
replaced = replace_floor_div(expr)
# Check that all floors were replaced.
# We should have no more new FloorDiv's than floors in the original expression,
# although we can have fewer due to simplification.
self.assertEqual(replaced.count(sympy.floor), 0)
self.assertLessEqual(
replaced.count(FloorDiv) - expr.count(FloorDiv), num_floors
)
def expand_floor_div(
numerator: sympy.Expr, denominator: sympy.Expr
) -> sympy.Expr:
return sympy.floor(numerator / denominator)
# Expand FloorDiv back into floor and check for equality.
self.assertEqual(
*[
sympy.simplify(e.replace(FloorDiv, expand_floor_div))
for e in (replaced, expr)
]
)
return replaced
def test_rewrite_floor_div_mul_pow(self):
x, y = sympy.symbols("x y")
expr = sympy.floor(x / y)
self.assertEqual(expr.count(FloorDiv), 0)
self.assertEqual(expr.count(sympy.core.mul.Mul), 1)
self.assertEqual(expr.count(sympy.Pow), 1)
rewritten = self._check(expr)
self.assertTrue(isinstance(rewritten, FloorDiv))
self.assertEqual(rewritten.args, (x, y))
def test_rewrite_floor_div_mul_rational(self):
x = sympy.Symbol("x")
expr = sympy.floor(x / 5)
self.assertEqual(expr.count(FloorDiv), 0)
self.assertEqual(expr.count(sympy.core.mul.Mul), 1)
self.assertEqual(expr.count(sympy.Rational), 1)
rewritten = self._check(expr)
self.assertTrue(isinstance(rewritten, FloorDiv))
self.assertEqual(rewritten.args, (x, 5))
def test_no_rewrite_div(self):
x, y = sympy.symbols("x y")
expr = x / y
self.assertEqual(expr.count(FloorDiv), 0)
rewritten = replace_floor_div(expr)
self.assertEqual(rewritten, expr)
def test_rewrite_floor_div_nested(self):
x, y = sympy.symbols("x y")
expr = sympy.floor((sympy.floor(x / 5) + 1) / y)
self.assertEqual(expr.count(FloorDiv), 0)
rewritten = self._check(expr)
self.assertEqual(rewritten.count(FloorDiv), 2)
def test_rewrite_floor_div_rational_const(self):
expr = sympy.floor(sympy.S.One / 5, evaluate=False)
self.assertEqual(expr.count(FloorDiv), 0)
self.assertEqual(expr.count(sympy.Mul), 0)
self.assertEqual(expr.count(sympy.Rational), 1)
# Expression evaluates to a compile time constant
rewritten = self._check(expr)
self.assertEqual(rewritten, sympy.S.Zero)
def test_no_distribute_mul_floordiv(self):
"""
Test that multiplication doesn't distribute with floor division.
"""
x = sympy.Symbol("x")
expr = 2 * sympy.floor(x / 2)
rewritten = self._check(expr)
self.assertEqual(rewritten.count(sympy.Mul), 1)
self.assertEqual(rewritten.count(FloorDiv), 1)
def test_rational_multi_pows(self):
"""
Test an expression with a rational and multiple pows.
"""
x, y, z = sympy.symbols("x y z")
expr = sympy.floor((x / 5) * (y**2) * (z**3))
mul = expr.args[0]
self.assertTrue(isinstance(mul, sympy.Mul))
self.assertTrue(isinstance(mul.args[0], sympy.Rational))
self.assertEqual(expr.count(sympy.Pow), 2)
rewritten = self._check(expr)
self.assertEqual(rewritten.count(FloorDiv), 1)
def test_variable_exp(self):
"""
Test pow when the exponent is a variable.
"""
x = sympy.Symbol("x", positive=True)
expr = sympy.floor(2**-x)
replaced = self._check(expr)
# Check that x went to the denominator.
self.assertEqual(replaced.args, (1, 2**x))
def test_launch_grid_dynamic_padding(self):
"""
Test a complex launch grid expression arising from padding with dynamic shapes.
"""
x, y = sympy.symbols("x y")
expr = sympy.floor(-FloorDiv(x * y, 2) / FloorDiv(-x * y, 131070))
self._check(expr)
if __name__ == "__main__":
from torch._inductor.test_case import run_tests

View File

@ -3238,40 +3238,6 @@ aten::mm""",
assert "Overload Name" in key_averages.table()
validate_json(prof)
def test_expose_kineto_event_metadata(self):
def check_metadata(prof, op_name, metadata_key):
with TemporaryFileName(mode="w+") as fname:
prof.export_chrome_trace(fname)
with open(fname) as f:
events = json.load(f)["traceEvents"]
found_op = False
for e in events:
if "name" in e and "args" in e and e["name"] == op_name:
assert metadata_key in e["args"], (
f"Metadata for '{op_name}' in Chrome trace did not contain '{metadata_key}'."
)
found_op = True
assert found_op, f"Could not find op '{op_name}' in Chrome trace."
found_op = False
for event in prof.events():
if event.name == op_name:
assert metadata_key in event.metadata_json, (
f"Metadata for '{op_name}' in FunctionEvent did not contain '{metadata_key}'."
)
found_op = True
assert found_op, f"Could not find op '{op_name}' in prof.events()."
experimental_config = torch._C._profiler._ExperimentalConfig(
expose_kineto_event_metadata=True
)
with profile(
experimental_config=experimental_config,
activities=[ProfilerActivity.CPU],
) as prof:
torch.add(1, 5)
check_metadata(prof, op_name="aten::add", metadata_key="Ev Idx")
@unittest.skipIf(not torch.cuda.is_available(), "requires CUDA")
def test_profiler_debug_autotuner(self):
"""

View File

@ -7,7 +7,7 @@ import sys
from dataclasses import dataclass
from multiprocessing.context import SpawnProcess
from typing import Any, Optional
from unittest import skipUnless
from unittest import skipIf, skipUnless
from unittest.mock import mock_open, patch
import torch
@ -22,7 +22,7 @@ from torch.numa.binding import (
AffinityMode,
NumaOptions,
)
from torch.testing._internal.common_utils import run_tests, TestCase
from torch.testing._internal.common_utils import IS_MACOS, run_tests, TestCase
@dataclass(frozen=True)
@ -680,6 +680,7 @@ class NumaBindingTest(TestCase):
set(range(0, 2)),
)
@skipIf(IS_MACOS, "sched_getaffinity doesn't exist")
def test_binds_to_node_0_if_node_stored_as_minus_one(self) -> None:
self._add_mock_hardware(
num_sockets=1,

View File

@ -2488,9 +2488,9 @@ class TestSparseCSR(TestCase):
self.assertEqual(a.grad, a1.grad)
self.assertEqual(b.grad, b1.grad)
@skipCUDAIfRocm
@onlyCUDA
@skipCUDAIf(True, "Causes CUDA memory exception, see https://github.com/pytorch/pytorch/issues/72177")
# It works on ROCm; the CUDA issue is currently active
@skipCUDAIf(not TEST_WITH_ROCM, "Causes CUDA memory exception, see https://github.com/pytorch/pytorch/issues/72177")
@dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128)
@precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3,
torch.float64: 1e-8, torch.complex128: 1e-8})

View File

@ -88,8 +88,7 @@ def build_pytorch(
) -> None:
my_env = _create_build_env()
if (
not check_negative_env_flag("USE_DISTRIBUTED")
and not check_negative_env_flag("USE_CUDA")
not check_negative_env_flag("USE_CUDA")
and not check_negative_env_flag("USE_NCCL")
and not check_env_flag("USE_SYSTEM_NCCL")
):

View File

@ -469,30 +469,6 @@ class Op:
f"{p2p_info}, " if p2p_info else ""
)
def dtype_mismatch(self, other: "Op") -> bool:
if (
(
self.type not in ["scatter", "gather", "broadcast"]
and set(self.input_dtypes) != set(self.output_dtypes)
and self.input_sizes[0]
and self.output_sizes[0]
)
or (
self.type not in ["scatter", "broadcast"]
and set(self.input_dtypes) != set(other.input_dtypes)
and self.input_sizes[0]
and other.input_sizes[0]
)
or (
self.type not in ["gather"]
and set(self.output_dtypes) != set(other.output_dtypes)
and self.output_sizes[0]
and other.output_sizes[0]
)
):
return True
return False
def match(self, other: "Op") -> MatchInfo:
# TODO: I think this can validly not match,
# e.g. if one PG was used for p2p ops between only some of the peers?
@ -534,7 +510,23 @@ class Op:
MatchState.COLLECTIVE_STATE_MISMATCH,
f"Expected state: '{self.state}' does not match found state: '{other.state}'",
)
if self.dtype_mismatch(other):
if (
(
set(self.input_dtypes) != set(self.output_dtypes)
and self.input_sizes[0]
and self.output_sizes[0]
)
or (
set(self.input_dtypes) != set(other.input_dtypes)
and self.input_sizes[0]
and other.input_sizes[0]
)
or (
set(self.input_dtypes) != set(other.output_dtypes)
and self.input_sizes[0]
and other.output_sizes[0]
)
):
return MatchInfo(
MatchState.COLLECTIVE_DTYPE_MISMATCH,
f"Expected dtypes: '{set(self.input_dtypes)}' does not "

View File

@ -189,12 +189,6 @@ def main() -> None:
)
options = parser.parse_args()
# Path: aten/src/ATen
aten_path = os.path.dirname(os.path.dirname(options.native_functions_path))
operator_selector = get_selector(
options.selected_op_list_path, options.operators_yaml_path
)
generate_code(
options.gen_dir,
options.native_functions_path,
@ -204,37 +198,18 @@ def main() -> None:
options.disable_autograd,
options.force_schema_registration,
# options.selected_op_list
operator_selector=operator_selector,
)
# Generate the python bindings for functionalization's `ViewMeta` classes.
from torchgen.gen_functionalization_type import (
gen_functionalization_view_meta_classes,
)
functionalization_templates_dir = os.path.join(aten_path, "templates")
install_dir = options.install_dir or os.fspath(options.gen_dir / "torch/csrc")
functionalization_install_dir = os.path.join(
install_dir, "functionalization", "generated"
)
os.makedirs(functionalization_install_dir, exist_ok=True)
assert os.path.isdir(functionalization_install_dir)
assert os.path.isdir(functionalization_templates_dir)
gen_functionalization_view_meta_classes(
options.native_functions_path or NATIVE_FUNCTIONS_PATH,
options.tags_path or TAGS_PATH,
selector=operator_selector,
install_dir=functionalization_install_dir,
template_dir=functionalization_templates_dir,
operator_selector=get_selector(
options.selected_op_list_path, options.operators_yaml_path
),
)
if options.gen_lazy_ts_backend:
aten_path = os.path.dirname(os.path.dirname(options.native_functions_path))
ts_backend_yaml = os.path.join(aten_path, "native/ts_native_functions.yaml")
ts_native_functions = "torch/csrc/lazy/ts_backend/ts_native_functions.cpp"
ts_node_base = "torch/csrc/lazy/ts_backend/ts_node.h"
lazy_install_dir = os.path.join(install_dir, "lazy", "generated")
install_dir = options.install_dir or os.fspath(options.gen_dir / "torch/csrc")
lazy_install_dir = os.path.join(install_dir, "lazy/generated")
os.makedirs(lazy_install_dir, exist_ok=True)
assert os.path.isfile(ts_backend_yaml), (

View File

@ -276,32 +276,30 @@ add_custom_command(
WORKING_DIRECTORY
"${TORCH_ROOT}"
)
if(USE_DISTRIBUTED)
if(WIN32)
append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS)
else()
append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS)
endif()
# Disable certain warnings for GCC-9.X
if(CMAKE_COMPILER_IS_GNUCXX)
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
endif()
# NCCL is a private dependency of libtorch, but libtorch_python includes
# some private headers of libtorch, which in turn include NCCL. As a hacky
# alternative to making NCCL a public dependency of libtorch, we make it
# a private dependency of libtorch_python as well.
if(USE_NCCL)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
endif()
# Same for MPI.
if(USE_MPI)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX)
endif()
list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
if(WIN32)
append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS)
else()
append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS)
endif()
# Disable certain warnings for GCC-9.X
if(CMAKE_COMPILER_IS_GNUCXX)
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
endif()
# NCCL is a private dependency of libtorch, but libtorch_python includes
# some private headers of libtorch, which in turn include NCCL. As a hacky
# alternative to making NCCL a public dependency of libtorch, we make it
# a private dependency of libtorch_python as well.
if(USE_NCCL)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
endif()
# Same for MPI.
if(USE_MPI)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX)
endif()
list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
if(USE_NCCL AND NOT WIN32)
list(APPEND TORCH_PYTHON_SRCS
@ -369,10 +367,6 @@ if(BUILD_LIBTORCHLESS)
target_compile_definitions(torch_python PRIVATE USE_C10D_NCCL)
endif()
if(USE_DISTRIBUTED)
target_compile_definitions(torch_python PRIVATE USE_DISTRIBUTED)
endif()
if(USE_MPI AND USE_C10D_MPI)
target_compile_definitions(torch_python PRIVATE USE_C10D_MPI)
endif()

View File

@ -30,7 +30,6 @@ from torch._C import (
_cpu,
_dynamo,
_export,
_functionalization,
_functorch,
_lazy,
_lazy_ts_backend,

View File

@ -78,7 +78,6 @@ class _KinetoEvent:
def privateuse1_elapsed_us(self) -> int: ...
def is_user_annotation(self) -> bool: ...
def is_hidden_event(self) -> bool: ...
def metadata_json(self) -> str: ...
class _ProfilerResult:
def events(self) -> list[_KinetoEvent]: ...

View File

@ -851,3 +851,12 @@ class ProcessGroupXCCL(Backend):
def _set_process_group(pg: ProcessGroup) -> None: ...
def _current_process_group() -> ProcessGroup: ...
def _dump_nccl_trace_json(
includeCollectives: Optional[bool] = ...,
onlyActive: Optional[bool] = ...,
) -> bytes: ...
def _dump_nccl_trace(
includeCollectives: Optional[bool] = ...,
includeStackTraces: Optional[bool] = ...,
onlyActive: Optional[bool] = ...,
) -> bytes: ...

View File

@ -1,16 +0,0 @@
from torch import Tensor
from torch.types import _bool
# Defined in torch/csrc/functionalization/Module.cpp
class ViewMeta:
has_symbolic_inputs: _bool
# Returns the list of ViewMeta instances of the given functional tensor.
#
# Although we do have python bindings for their types, we won't
# expose them here, since they should not be used by users.
def get_view_meta_sequence(tensor: Tensor) -> list[ViewMeta]: ...
# Applies the ViewMeta sequence on top of the given base.
def apply_view_meta_sequence(base: Tensor, sequence: list[ViewMeta]) -> Tensor: ...

View File

@ -51,7 +51,6 @@ from .resume_execution import TORCH_DYNAMO_RESUME_IN_PREFIX
from .utils import (
getfile,
hashable,
is_annotate_wrapped_function,
is_lru_cache_wrapped_function,
NP_SUPPORTED_MODULES,
unwrap_if_wrapper,
@ -155,7 +154,6 @@ manual_torch_name_rule_map: dict[
type[UserFunctionVariable],
],
] = {
"torch.fx.traceback.annotate": UserFunctionVariable,
"torch.onnx.is_in_onnx_export": TorchInGraphFunctionVariable,
"torch.onnx.operators.shape_as_tensor": TorchInGraphFunctionVariable,
"torch.overrides.is_tensor_like": TorchInGraphFunctionVariable,
@ -3004,8 +3002,6 @@ def get_torch_obj_rule_map() -> dict[Any, type["VariableTracker"]]:
continue
obj = torch_dir + k[len("torch/") :]
if obj is not None:
if is_annotate_wrapped_function(obj):
obj = obj.__wrapped__
if is_lru_cache_wrapped_function(obj):
obj = obj.__wrapped__
if obj in d and d[obj] != v:

View File

@ -1101,14 +1101,6 @@ def is_lru_cache_wrapped_function(
)
def is_annotate_wrapped_function(
value: Any,
) -> bool:
return value == torch.fx.traceback.annotate and is_function(
inspect.getattr_static(value, "__wrapped__")
)
_FuncTypes: TypeAlias = Union[
types.FunctionType,
types.BuiltinFunctionType,

View File

@ -284,6 +284,19 @@ def check_cacheable(gm: torch.fx.GraphModule):
check_cacheable(gm.saved_tensors_hooks_unpack_0) # type: ignore[arg-type]
def check_metadata_cacheable(metadata: ViewAndMutationMeta):
"""
When view replay is turned on, we bypass autograd cache if
the output is aliased.
"""
if config.view_replay_for_aliased_outputs:
for info in metadata.output_info:
if info.functional_tensor is not None:
raise BypassAOTAutogradCache(
"Cannot cache a graph with functional tensor"
)
class AOTAutogradCacheDetails(FxGraphHashDetails):
"""
Object to capture all the details for a dynamo graph module relevant to computing
@ -790,6 +803,7 @@ class GenericAOTAutogradCacheEntry(Generic[TForward, TBackward]):
"""
Perform any preparations to make the cache entry ready for serialization.
"""
check_metadata_cacheable(self.runtime_metadata)
self.compiled_fw.pre_save()
if self.compiled_bw is not None:
self.compiled_bw.pre_save()

View File

@ -43,10 +43,10 @@ from .functional_utils import (
has_metadata_mutation,
MetadataKey,
to_fun,
ViewMetaSequence,
was_inductor_storage_resized,
)
from .schemas import (
FunctionalTensorMetadataEq,
InputAliasInfo,
MemoryFormatMeta,
MutationType,
@ -640,7 +640,7 @@ from a multi-output view call"
#
# The FunctionalTensor will be saved if one of the 2 conditions below
# is true:
view_meta_sequence = None
functional_tensor = None
if (
# 1. If the output_type is either of:
# (i) alias_of_intermediate;
@ -672,7 +672,7 @@ from a multi-output view call"
and not input_info[base_idx].mutates_metadata
):
if isinstance(o, FunctionalTensor):
view_meta_sequence = ViewMetaSequence(o)
functional_tensor = FunctionalTensorMetadataEq(o.elem)
out_info = OutputAliasInfo(
output_type=output_type,
@ -680,7 +680,7 @@ from a multi-output view call"
base_idx=base_idx,
dynamic_dims=dynamic_dims,
requires_grad=isinstance(o, torch.Tensor) and o.requires_grad,
view_meta_sequence=view_meta_sequence,
functional_tensor=functional_tensor,
)
output_info.append(out_info)

View File

@ -14,7 +14,6 @@ from typing import Optional
import torch
from torch import Tensor
from torch._C import _functionalization
from torch._logging import getArtifactLogger
from torch._subclasses.fake_tensor import FakeTensor
from torch._subclasses.functional_tensor import FunctionalTensor
@ -225,9 +224,9 @@ def gen_alias_from_base(
aliased_base_tensor,
target_meta_tensor,
target_requires_grad,
target_view_meta_sequence: Optional[ViewMetaSequence] = None,
target_functional_tensor: Optional[FunctionalTensorMetadataEq] = None,
*,
replay_views: bool,
replay_views,
):
# Patch the correct requires_grad field of the output tensor, depending on whether:
# (i) the reconstructed output (out) came from a tensor that requires grad or not;
@ -246,11 +245,13 @@ def gen_alias_from_base(
# to replay them (view functions) on the aliased_base_tensor.
if (
replay_views
and target_view_meta_sequence is not None
and not any(vm.has_symbolic_inputs for vm in target_view_meta_sequence.sequence)
and target_functional_tensor is not None
and not torch._functionalize_is_symbolic(target_functional_tensor.tensor)
):
out = _functionalization.apply_view_meta_sequence(
aliased_base_tensor, target_view_meta_sequence.sequence
functional_tensor = target_functional_tensor.tensor
out = torch._functionalize_apply_view_metas(
functional_tensor, aliased_base_tensor
)
# If re-applying the ViewMeta sequence succeeded, there should be no more
# problems going forward. We just check we got to the target shape and
@ -356,45 +357,25 @@ class MetadataKey:
)
# ViewMeta sequence wrapper for equality comparisons.
#
# Even though we can compare each ViewMeta instance, we compare the resulting
# tensor metadata, instead. That's because the creation of synthetic bases + the
# re-generation of input views might end-up creating a different sequence of
# ViewMeta that is semantically equivalent. i.e. gets to a tensor with the same
# metadata.
#
# Therefore, we store what the end result should look like as serializable
# metadata.
#
# When logging, this class should look like:
#
# ViewMetaSequence(view, select_int, slice_Tensor)
#
# i.e. a parenthesized list of view operations within that ViewMeta sequence.
class ViewMetaSequence:
def __init__(self, tensor: FunctionalTensor) -> None:
assert torch._is_functional_tensor(tensor.elem)
self.sequence = _functionalization.get_view_meta_sequence(tensor.elem)
self.metadata = MetadataKey.make(tensor)
def __repr__(self) -> str:
suffix = len("_ViewMeta")
types = ", ".join(type(vm).__name__[:-suffix] for vm in self.sequence)
return f"ViewMetaSequence({types})"
# Wrapper around a FunctionalTensorWrapper for comparing only the resulting metadata
# after applying all the ViewMeta operations.
class FunctionalTensorMetadataEq:
def __init__(self, tensor: torch.Tensor) -> None:
assert torch._is_functional_tensor(tensor)
self.tensor = tensor
def __eq__(self, other: object) -> bool:
# If other is None, then it probably means that we weren't able to recreate
# the ViewMeta sequence. One example is when we update the view metadata by
# calling: create_synthetic_base_metadata.
# the FunctionalTensorMetadataEq. One of these cases is when we update the
# view metadata by calling: create_synthetic_base_metadata.
if other is None:
return True
# Comparison against any other type is not implemented.
if not isinstance(other, ViewMetaSequence):
if not isinstance(other, FunctionalTensorMetadataEq):
return NotImplemented
return self.metadata == other.metadata
return has_same_metadata(self.tensor, other.tensor)
# new_arg and arg here are either:
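A note on the equality conventions restored above: treating None as equal covers the case where the functional tensor could not be recreated, and returning NotImplemented for unrelated types lets Python defer to the other operand instead of raising. A tiny PyTorch-free sketch of the same contract (the MetadataEq class here is hypothetical, not part of this change):

# Standalone sketch of the __eq__ conventions: None compares equal, and
# unrelated types defer to the other operand via NotImplemented.
class MetadataEq:
    def __init__(self, key):
        self.key = key

    def __eq__(self, other):
        if other is None:
            return True
        if not isinstance(other, MetadataEq):
            return NotImplemented
        return self.key == other.key

assert MetadataEq(1) == None  # noqa: E711 - mirrors the None convention above
assert MetadataEq(1) == MetadataEq(1)
assert (MetadataEq(1) == "x") is False  # str.__eq__ also declines, so Python falls back to identity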

View File

@ -89,7 +89,7 @@ def remove_dupe_metadata(
dynamic_dims=o.dynamic_dims,
base_idx=None if o.base_idx is None else add_dupe_map[o.base_idx],
requires_grad=o.requires_grad,
view_meta_sequence=o.view_meta_sequence,
functional_tensor=o.functional_tensor,
)
for o in m.output_info
],
@ -242,7 +242,7 @@ def create_synthetic_base_metadata(
# Map the input idx pre-synthetic-bases to the new idx post-synthetic-bases
base_idx=new_base_idx, # type: ignore[arg-type]
requires_grad=o.requires_grad,
view_meta_sequence=o.view_meta_sequence,
functional_tensor=o.functional_tensor,
)
)

View File

@ -150,7 +150,7 @@ class AliasOfInputHandler:
self.base_idx = info.base_idx
self.unwrap_out = _unwrap_tensoralias if trace_joint else _identity
self.requires_grad = info.requires_grad
self.view_meta_sequence = info.view_meta_sequence
self.functional_tensor = info.functional_tensor
self.replay_views = config.view_replay_for_aliased_outputs
def __call__(self, orig_inputs, fw_outs, out):
@ -159,7 +159,7 @@ class AliasOfInputHandler:
aliased_base_tensor,
self.unwrap_out(out),
self.requires_grad,
self.view_meta_sequence,
self.functional_tensor,
replay_views=self.replay_views,
)
@ -190,7 +190,7 @@ class AliasOfIntermediateHandler:
self.unwrap_out = _unwrap_tensoralias if trace_joint else _identity
self.requires_grad = info.requires_grad
self.view_meta_sequence = info.view_meta_sequence
self.functional_tensor = info.functional_tensor
self.replay_views = config.view_replay_for_aliased_outputs
def __call__(self, orig_inputs, fw_outs, out):
@ -199,7 +199,7 @@ class AliasOfIntermediateHandler:
self._unwrap_aliased_base_tensor(aliased_base_tensor),
self.unwrap_out(out),
self.requires_grad,
self.view_meta_sequence,
self.functional_tensor,
replay_views=self.replay_views,
)

View File

@ -7,6 +7,7 @@ input/output types, metadata, config, function signatures etc.
from __future__ import annotations
import collections
import dataclasses
import functools
import itertools
from dataclasses import dataclass, field
@ -31,7 +32,10 @@ from torch.fx.experimental._backward_state import BackwardState
from torch.utils._python_dispatch import is_traceable_wrapper_subclass
from .. import config
from .functional_utils import _check_if_mutation_can_be_in_graph, ViewMetaSequence
from .functional_utils import (
_check_if_mutation_can_be_in_graph,
FunctionalTensorMetadataEq,
)
from .utils import strict_zip
@ -113,14 +117,15 @@ class OutputAliasInfo:
dynamic_dims: Optional[set[int]]
# requires_grad
requires_grad: bool
# Sequence of ViewMeta objects.
# FunctionalTensorWrapper that represents this output.
#
# Provides us the means to re-run view functions on other tensors.
# Provides us the means to replay views from it.
#
# We need to wrap the actual list of ViewMeta with this class so that
# we compare the ViewMeta elements appropriately, i.e. their type and
# the elements returned by the `as_tuple()` call.
view_meta_sequence: Optional[ViewMetaSequence] = None
# We need to wrap the actual FunctionalTensorWrapper with this class so that
# we only compare the tensor's metadata. That's because with the transformations
# of the model throughout AOTAutograd, the sequence of ViewMeta and the base
# tensor might change.
functional_tensor: Optional[FunctionalTensorMetadataEq] = None
class MutationType(Enum):
@ -660,6 +665,17 @@ class ViewAndMutationMeta:
self.traced_tangent_metas = [extract_metadata(t) for t in self.traced_tangents]
# Clear traced tangents at runtime
self.traced_tangents = []
new_output_info = []
for out in self.output_info:
if config.view_replay_for_aliased_outputs:
new_out = out
else:
# If we're not using view_replay, remove the functional tensor.
# Functional tensors are unfortunately not serializable,
# so doing this is required for AOTAutograd caching.
new_out = dataclasses.replace(out, functional_tensor=None)
new_output_info.append(new_out)
self.output_info = new_output_info
for inp_meta in self.subclass_inp_meta:
if isinstance(inp_meta, SubclassCreationMeta):
inp_meta.make_runtime_safe()
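make_runtime_safe above clears functional_tensor with dataclasses.replace when view replay is off, since a FunctionalTensorWrapper cannot be serialized for the AOTAutograd cache. A standalone sketch of that pattern follows; OutputInfo is a made-up stand-in, not the real OutputAliasInfo.

# Drop an unpicklable field before caching, via dataclasses.replace.
import dataclasses
import pickle
from typing import Any, Optional

@dataclasses.dataclass(frozen=True)
class OutputInfo:
    base_idx: Optional[int]
    requires_grad: bool
    functional_tensor: Optional[Any] = None  # stand-in for the unpicklable wrapper

info = OutputInfo(base_idx=0, requires_grad=True, functional_tensor=lambda: None)
safe = dataclasses.replace(info, functional_tensor=None)  # copy with the field cleared
pickle.dumps(safe)  # succeeds; pickling `info` would fail on the lambda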

View File

@ -23,11 +23,7 @@ from torch._higher_order_ops.triton_kernel_wrap import (
from torch._inductor.codecache import LambdaFuture, PyCodeCache
from torch._inductor.runtime.triton_heuristics import CachingAutotuner
from torch._inductor.select_algorithm import extern_kernels # noqa: F401
from torch._inductor.utils import (
convert_shape_to_symint,
convert_to_symint,
sympy_product,
)
from torch._inductor.utils import convert_shape_to_symint, convert_to_symint
from torch._inductor.virtualized import V
from torch._library.triton import wrap_triton
from torch.fx import GraphModule
@ -120,30 +116,20 @@ def replace_floor_div(expr: sympy.Expr) -> sympy.Expr:
def replace(expr: sympy.Expr) -> sympy.Expr:
expr = sympy.together(expr)
# Find division operations in the sympy.floor expression
# Div is either represented as Mul with:
# Rational denominator or Pow with negative exponent
if not isinstance(expr, sympy.core.mul.Mul):
return sympy.floor(expr)
if isinstance(expr.args[0], sympy.Rational):
frac = expr.args[0]
numerator = sympy_product(expr.args[1:]) * frac.numerator
denominator = frac.denominator
return FloorDiv(numerator, denominator)
elif isinstance(expr.args[0], sympy.Pow):
base = expr.args[0].base
exp = expr.args[0].exp
numerator = sympy_product(expr.args[1:])
if exp < 0:
denominator = base ** (-exp)
# Division is represented as a Mul with a Rational factor or a Pow with negative
# exponent. We convert floor(Mul(...)) to FloorDiv(numerator, denominator) by
# partitioning factors into the numerator and denominator.
(numerator, denominator) = (sympy.S.One,) * 2
for arg in sympy.Mul.make_args(expr):
if isinstance(arg, sympy.Rational):
numerator *= arg.numerator
denominator *= arg.denominator
elif isinstance(arg, sympy.Pow) and arg.exp.is_negative:
denominator *= arg.base**-arg.exp
else:
numerator = numerator * (base**exp)
denominator = 1
return FloorDiv(numerator, denominator)
else:
return sympy.floor(expr)
numerator *= arg
return FloorDiv(numerator, denominator)
return expr.replace(sympy.floor, replace)
@ -930,10 +916,6 @@ class FxConverter:
call_args = self._lookup_args(line.call_args)
kernel = self.kernels[line.kernel_name]
tuner = kernel.tuner
# Use python_slow mode instead of python mode to avoid
# the round to neginf behaviour, which is not the convention
# in other languages.
tuner.grid_mode = "python_slow"
# Optionally autotune the kernels.
# The FX backend currently only supports compile-time tuning.
@ -1007,8 +989,7 @@ class FxConverter:
call_kwargs = dict(zip(signature, call_args))
call_kwargs.update(kernel_config.kwargs)
# Replace all sympy.floor with FloorDiv
# _generate_sym_node does not support sympy.floor
# Replace sympy.floor with FloorDiv, to make the expression traceable.
grid = [replace_floor_div(x) if isinstance(x, sympy.Expr) else x for x in grid]
wrapper_grid = [tuple(self._generate_sym_nodes(grid))]
call_kwargs = {
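The replace_floor_div rewrite above leans on how sympy normalizes division: x/5 becomes a Mul with a Rational(1, 5) factor, and x/y becomes a Mul containing Pow(y, -1), so the factors of a floored Mul can be partitioned into a numerator and denominator for FloorDiv. A short sketch of that structure (the printed forms are indicative, not asserted):

# Inspect the sympy structure that replace_floor_div partitions.
import sympy
from torch.utils._sympy.functions import FloorDiv

x, y = sympy.symbols("x y")

inner = sympy.floor(x / y).args[0]      # the Mul inside the floor
print(inner.func.__name__, inner.args)  # Mul, with factors x and Pow(y, -1)

inner5 = sympy.floor(x / 5).args[0]
print(inner5.args)                      # a Rational(1, 5) factor and x

print(FloorDiv(x, 5))                   # the target FloorDiv form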

View File

@ -880,14 +880,6 @@ def fuse_matmul_reduce_scatter(reduce_scatter: _ReduceScatterMatch) -> None:
if not is_symm_mem_enabled_for_group(group_name):
return
filter_matmul = None
if orig_scatter_dim == _get_tensor(input_node).ndim - 1:
# scaled_mm is not supported yet for last dim mm+rs
def _filter_out_scaled_matmul(matmul: _Matmul):
return not isinstance(matmul, _ScaledMatmul)
filter_matmul = _filter_out_scaled_matmul
# Currently fused_matmul_reduce_scatter doesn't return the matmul result,
# so we can't apply the fusion if the matmul result is used by multiple
# users. This is not a fundamental limitation of the fused op and can be
@ -899,16 +891,12 @@ def fuse_matmul_reduce_scatter(reduce_scatter: _ReduceScatterMatch) -> None:
return
matmul = _find_producer_matmul(input_node)
if matmul is None:
log.warning(
"no producer matmul found for reduce scatter, skipping fuse_matmul_reduce_scatter fusion"
)
return
if filter_matmul and not filter_matmul(matmul):
return
if rs_wait_tensor_node in matmul.arg_ancestor_nodes:
log.warning(
"reduce-scatter result node is an ancestor of matmul, skipping fuse_matmul_reduce_scatter fusion"

View File

@ -375,7 +375,7 @@ class CachingAutotuner(KernelInterface):
self.is_backward = False
# Mode for launch grid calculation
self.grid_mode: Literal["python", "python_slow", "cpp"] = "python"
self.grid_mode: Literal["python", "cpp"] = "python"
def is_statically_launchable(self):
"""
@ -3192,14 +3192,14 @@ class GridExpr:
"""Generate code for grid size expressions in launcher"""
inductor_meta: dict[str, Any]
mode: Literal["python", "cpp", "python_slow"] = "python"
mode: Literal["python", "cpp"] = "python"
prefix: list[str] = dataclasses.field(default_factory=list)
x_grid: Union[str, int] = 1
y_grid: Union[str, int] = 1
z_grid: Union[str, int] = 1
def __post_init__(self) -> None:
assert self.mode in ("python", "cpp", "python_slow")
assert self.mode in ("python", "cpp")
def generate(self, meta: dict[str, int]) -> None:
raise NotImplementedError
@ -3215,10 +3215,6 @@ class GridExpr:
# negative integer division is floored
if self.mode == "python":
return f"-(({numel}) // -({block}))"
# This is more generic than above, and works in languages where
# positive integer division is floored/truncated
elif self.mode == "python_slow":
return f"(({numel} + {block} - 1) // ({block}))"
# For cpp code gen
return f"(({numel} + ({block} - 1)) / ({block}))"
@ -3227,7 +3223,7 @@ class GridExpr:
items = self._constant_fold(max, seq)
if len(items) <= 1:
return items[0]
if self.mode in ("python", "python_slow"):
if self.mode == "python":
return f"max({', '.join(map(str, items))})"
return functools.reduce(lambda x, y: f"std::max({x}, {y})", items)
@ -3250,7 +3246,7 @@ class GridExpr:
def assign_tmp(self, name: str, expr: Union[str, int]) -> str:
# Grid functions are one per kernel, so name collisions are fine
if self.mode in ("python", "python_slow"):
if self.mode == "python":
return f"{name} = {expr}"
if self.mode == "cpp":
return f"uint32_t {name} = {expr};"
@ -3260,7 +3256,7 @@ class GridExpr:
def from_meta(
inductor_meta: dict[str, Any],
cfg: Union[Config, dict[str, int]],
mode: Literal["python", "cpp", "python_slow"] = "python",
mode: Literal["python", "cpp"] = "python",
) -> GridExpr:
grid_cls = globals()[inductor_meta["grid_type"]]
assert issubclass(grid_cls, GridExpr)
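On the ceil_div change earlier in this file: the surviving "python" mode uses Python's floor-toward-negative-infinity division, while the removed "python_slow" mode spelled out the add-and-divide form that also works under C-style truncation (the remaining "cpp" branch). A quick, self-contained check that the two agree for positive operands:

# Quick check of the two ceiling-division forms used for launch grids.
# Python floors integer division toward negative infinity, so -(n // -b) is a
# valid ceil; C-style truncating division needs the (n + b - 1) / b form.
import math

for numel in (1, 7, 64, 1000):
    for block in (1, 8, 32):
        fast = -(numel // -block)             # "python" mode
        slow = (numel + block - 1) // block   # removed "python_slow" / cpp form
        assert fast == slow == math.ceil(numel / block)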

View File

@ -638,7 +638,6 @@ class profile:
device_resource_id=kineto_event.device_resource_id(),
flops=kineto_event.flops(),
is_user_annotation=kineto_event.is_user_annotation(),
metadata_json=kineto_event.metadata_json(),
)
max_evt_id = max(max_evt_id, fe.id)
if fe.device_type == DeviceType.CPU and not fe.is_async:

View File

@ -491,7 +491,6 @@ class FunctionEvent(FormattedTimesMixin):
concrete_inputs=None,
kwinputs=None,
is_user_annotation=False,
metadata_json=None,
):
self.id: int = id
self.node_id: int = node_id
@ -527,7 +526,6 @@ class FunctionEvent(FormattedTimesMixin):
self.self_cpu_percent = -1
self.total_cpu_percent = -1
self.total_device_percent = -1
self.metadata_json = metadata_json
def append_kernel(self, name, device, duration):
assert self.device_type == DeviceType.CPU

View File

@ -15,9 +15,7 @@
#include <torch/csrc/utils/cpp_stacktraces.h>
#include <torch/csrc/utils/pybind.h>
#if defined(USE_DISTRIBUTED)
#include <torch/csrc/distributed/c10d/exception.h>
#endif
inline void PyErr_SetString(PyObject* type, const std::string& message) {
PyErr_SetString(type, message.c_str());

View File

@ -72,7 +72,6 @@
#include <torch/csrc/cpu/Module.h>
#include <torch/csrc/dynamo/init.h>
#include <torch/csrc/export/pybind.h>
#include <torch/csrc/functionalization/Module.h>
#include <torch/csrc/functorch/init.h>
#include <torch/csrc/fx/node.h>
#include <torch/csrc/inductor/aoti_package/pybind.h>
@ -122,14 +121,10 @@
#endif
#endif
#ifdef USE_DISTRIBUTED
#ifdef USE_C10D
#include <torch/csrc/distributed/autograd/python_autograd.h>
#include <torch/csrc/distributed/c10d/c10d.h>
#include <torch/csrc/distributed/rpc/rpc.h>
#include <torch/csrc/distributed/rpc/testing/testing.h>
#endif
#endif
#if defined(USE_VALGRIND)
#include <callgrind.h>
@ -409,9 +404,11 @@ static PyObject* THPModule_swap_tensor_impl(PyObject* _unused, PyObject* args) {
// The TensorImpls contain PyObjectSlots that have a reference to the PyObject
// associated with the TensorImpl. Swap this field as well.
std::optional<PyObject*> mb_obj_a =
a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false);
std::optional<PyObject*> mb_obj_b =
b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false);
TORCH_INTERNAL_ASSERT(
mb_obj_a.has_value() && mb_obj_b.has_value(),
"Both tensors should have PyObjects tagged by the current python interpreter");
@ -552,11 +549,7 @@ static PyObject* THPModule_getBackcompatKeepdimWarn(
}
static PyObject* THPModule_hasDistributed(PyObject* _unused, PyObject* noargs) {
#ifdef USE_DISTRIBUTED
Py_RETURN_TRUE;
#else
Py_RETURN_FALSE;
#endif
}
static PyObject* THPModule_showConfig(PyObject* module, PyObject* noargs) {
@ -2008,7 +2001,6 @@ PyObject* initModule() {
#ifdef USE_XPU
THPUtils_addPyMethodDefs(methods, THXPModule_methods());
#endif
#if defined(USE_DISTRIBUTED) && defined(USE_C10D)
THPUtils_addPyMethodDefs(
methods, torch::distributed::c10d::python_functions());
#ifndef _WIN32
@ -2018,7 +2010,6 @@ PyObject* initModule() {
methods, torch::distributed::autograd::python_functions());
THPUtils_addPyMethodDefs(
methods, torch::distributed::rpc::testing::python_functions());
#endif
#endif
static struct PyModuleDef torchmodule = {
@ -2091,7 +2082,6 @@ PyObject* initModule() {
torch::instruction_counter::initModule(module);
torch::initVerboseBindings(module);
ASSERT_TRUE(THPStorage_init(module));
torch::functionalization::initModule(module);
#ifdef USE_CUDA
// This will only initialise base classes and attach them to library namespace

View File

@ -614,7 +614,8 @@ static void set_tensor_attr_with_capsule(
const c10::TensorImpl* tensor,
py::capsule& capsule,
const char* attr_name) {
std::optional<PyObject*> mb_obj = tensor->pyobj_slot()->check_pyobj();
std::optional<PyObject*> mb_obj = tensor->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false);
TORCH_CHECK(
mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value");
auto obj = mb_obj.value();
@ -641,7 +642,8 @@ static c10::ArrayRef<T> get_set_cached_attr(
const c10::TensorImpl* tensor,
const char* base_attr_name,
const py::object& obj) {
std::optional<PyObject*> mb_obj = tensor->pyobj_slot()->check_pyobj();
std::optional<PyObject*> mb_obj =
tensor->pyobj_slot()->check_pyobj(getPyInterpreter());
TORCH_CHECK(
mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value");
auto tensor_obj = mb_obj.value();

View File

@ -41,8 +41,8 @@ PyObject* THPStorage_NewWithStorage(
"Creating a Storage subclass from a class that does not inherit from ",
"Storage is not possible. Make sure your class inherits from Storage.");
auto maybe_pyobj =
_storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj();
auto maybe_pyobj = _storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false);
if (maybe_pyobj.has_value() && maybe_pyobj.value()) {
TORCH_CHECK(
allow_preexisting_pyobj,
@ -93,7 +93,8 @@ PyObject* THPStorage_Wrap(c10::Storage storage) {
}
c10::impl::PyObjectSlot* pyobj_slot = storage_impl->pyobj_slot();
std::optional<PyObject*> maybe_pyobj = pyobj_slot->check_pyobj();
std::optional<PyObject*> maybe_pyobj = pyobj_slot->check_pyobj(
/*ignore_hermetic_tls=*/false);
if (maybe_pyobj.has_value()) {
auto obj = *maybe_pyobj;
if (obj) {
@ -126,8 +127,8 @@ static bool THPStorage_isPreservable(THPStorage* self) {
return false;
}
if (storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj() !=
(PyObject*)self) {
if (storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/true) != (PyObject*)self) {
return false;
}
if (storage.use_count() <= 1) {
@ -144,7 +145,8 @@ static bool THPStorage_tryPreserve(THPStorage* self) {
const auto& storage = THPStorage_Unpack(self);
c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl();
auto maybe_pyobj = storage_impl->pyobj_slot()->check_pyobj();
auto maybe_pyobj = storage_impl->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/true);
// NOTE: It is possible to just set the PyObjectSlot here, but the point is
// that we should have already set PyObjectSlot when the storage PyObject
// was created.

View File

@ -245,12 +245,13 @@ static void general_trace_function(
tracer::addInputs(
node, args[i].name().c_str(), iter->toBoolList().vec());
} else {
TORCH_CHECK(false, "unsupported input list type: ", elem_type->str());
throw std::runtime_error(
"unsupported input list type: " + elem_type->str());
}
} else if (iter->isObject()) {
tracer::addInputs(node, args[i].name().c_str(), iter->toObject());
} else {
TORCH_CHECK(false, "unsupported input type: ", type->str());
throw std::runtime_error("unsupported input type: " + type->str());
}
}
graph->insertNode(node);
@ -276,19 +277,16 @@ static void general_trace_function(
AT_ASSERT(iter->isTensorList());
tracer::addOutput(node, iter->toTensorList());
} else {
TORCH_CHECK(
false, "unsupported output list type: ", elem_type->str());
throw std::runtime_error(
"unsupported output list type: " + elem_type->str());
}
} else if (type->kind() == TypeKind::ClassType) {
AT_ASSERT(iter->isObject());
tracer::addOutput(node, iter->toObject());
} else {
TORCH_CHECK(
false,
"unsupported output type: ",
type->str(),
", from operator: ",
toString(op.operator_name()));
throw std::runtime_error(
"unsupported output type: " + type->str() +
", from operator: " + toString(op.operator_name()));
}
}
}

View File

@ -11,8 +11,10 @@ void check_single_result(
const at::TensorBase& value,
const at::TensorBase& result,
const std::string& hook_name) {
TORCH_CHECK(
value.defined(), "can't replace a empty gradient with a non-empty value");
if (!value.defined()) {
throw std::runtime_error(
"can't replace a empty gradient with a non-empty value");
}
torch::autograd::check_variable_result(value, result, hook_name);
}
} // namespace

View File

@ -482,31 +482,30 @@ void check_variable_result(
const at::TensorBase& original,
const at::TensorBase& result,
const std::string& hook_name) {
TORCH_CHECK(
original.options().type_equal(result.options()),
"hook '",
hook_name,
"' has changed the type of value (was ",
original.toString(),
" got ",
result.toString(),
")");
if (!original.options().type_equal(result.options())) {
std::stringstream ss;
ss << "hook '" << hook_name << "' has changed the type of value (";
ss << "was " << original.toString() << " got ";
ss << result.toString() << ")";
throw std::runtime_error(ss.str());
}
TORCH_CHECK(
original.is_cuda() == result.is_cuda(),
"hook '",
hook_name,
"' has changed the type of value (was ",
original.is_cuda() ? "CUDA tensor" : "CPU tensor",
" got ",
result.is_cuda() ? "CUDA tensor" : "CPU tensor",
")");
if (original.is_cuda() != result.is_cuda()) {
std::stringstream ss;
ss << "hook '" << hook_name << "' has changed the type of value";
if (original.is_cuda()) {
ss << " (was CUDA tensor got CPU tensor)";
} else {
ss << " (was CPU tensor got CUDA tensor)";
}
throw std::runtime_error(ss.str());
}
TORCH_CHECK(
original.sym_sizes().vec() == result.sym_sizes().vec(),
"hook '",
hook_name,
"' has changed the size of value");
if (original.sym_sizes().vec() != result.sym_sizes().vec()) {
std::stringstream ss;
ss << "hook '" << hook_name << "' has changed the size of value";
throw std::runtime_error(ss.str());
}
}
AutogradContext::AutogradContext(PackedArgs& packed_args) {

View File

@ -228,32 +228,30 @@ inline variable_list CppNode_apply_functional(
}
}
TORCH_CHECK(
num_outputs == num_forward_inputs,
"function ",
name,
" returned an incorrect number of gradients (expected ",
num_forward_inputs,
", got ",
num_outputs,
")");
if (num_outputs != num_forward_inputs) {
std::string msg("function ");
msg += name + " returned an incorrect number of gradients (expected ";
msg += std::to_string(num_forward_inputs) + ", got ";
msg += std::to_string(num_outputs) + ")";
throw std::runtime_error(msg);
}
variable_list results;
results.reserve(num_outputs);
for (const auto i : c10::irange(num_outputs)) {
if (!is_variable_input_[i]) {
TORCH_CHECK(
outputs[i].defined() == false,
"function ",
name,
" returned a gradient different that is defined at position ",
i + 1,
", std the corresponding forward input was not a Variable");
if (outputs[i].defined()) {
std::string msg("function ");
msg += name +
" returned a gradient different that is defined at position ";
msg += std::to_string(i + 1) +
", std the corresponding forward input was not a Variable";
throw std::runtime_error(msg);
}
continue;
}
results.emplace_back(outputs[i]);
}
return results;
}

View File

@ -707,8 +707,9 @@ void GraphTask::mark_as_completed_and_run_post_processing() {
}
void GraphTask::exec_post_processing() {
TORCH_CHECK(
not_ready_.empty(), "could not compute gradients for some functions");
if (!not_ready_.empty()) {
throw std::runtime_error("could not compute gradients for some functions");
}
// set the thread_local current_graph_task_ as more callbacks can be installed
// by existing final callbacks.
@ -1148,13 +1149,12 @@ void Engine::evaluate_function(
for (const auto i : c10::irange(num_outputs)) {
auto& output = outputs[i];
at::OptionalDeviceGuard guard(device_of(output));
TORCH_CHECK(
!output.defined() || !isnan(output)._is_any_true().item<bool>(),
"Function '",
fn.name(),
"' returned nan values in its ",
i,
"th output.");
if (output.defined() && isnan(output)._is_any_true().item<bool>()) {
std::stringstream ss;
ss << "Function '" << fn.name() << "' returned nan values in its " << i
<< "th output.";
throw std::runtime_error(ss.str());
}
}
}
@ -1175,7 +1175,7 @@ void Engine::evaluate_function(
if (it == dependencies.end()) {
auto name = next.function->name();
TORCH_CHECK(false, "dependency not found for ", name);
throw std::runtime_error(std::string("dependency not found for ") + name);
} else if (--it->second == 0) {
dependencies.erase(it);
is_ready = true;

View File

@ -17,7 +17,7 @@ variable_list Error::apply(variable_list&& inputs) {
}
variable_list Error::apply(variable_list&& inputs) const {
TORCH_CHECK(false, msg);
throw std::runtime_error(msg);
}
void Error::compiled_args(CompiledNodeArgs& args) const {

View File

@ -8,9 +8,7 @@
#include <torch/csrc/autograd/python_autograd.h>
#include <torch/csrc/autograd/python_cpp_function.h>
#include <torch/csrc/autograd/python_variable.h>
#ifdef USE_DISTRIBUTED
#include <torch/csrc/distributed/autograd/functions/sendrpc_backward.h>
#endif
#include <torch/csrc/jit/python/python_tracer.h>
#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/utils/python_numbers.h>
@ -49,7 +47,7 @@ struct UndefinedGradCtor {
struct NoCtor {
Node* operator()(PyObject* args) {
TORCH_CHECK(false, "Cannot construct");
throw std::runtime_error("Cannot construct");
}
};
@ -150,11 +148,9 @@ void THPAutograd_initFunctions() {
static PyTypeObject CopyBackwardsClass;
addClass<CopyBackwards, NoCtor>(module, CopyBackwardsClass, "CopyBackwards");
#ifdef USE_DISTRIBUTED
static PyTypeObject SendRpcBackwardClass;
addClass<torch::distributed::autograd::SendRpcBackward, NoCtor>(
module, SendRpcBackwardClass, "SendRpcBackward");
#endif
static PyTypeObject CopySlicesClass;
addClass<CopySlices, NoCtor>(module, CopySlicesClass, "CopySlices");

View File

@ -184,7 +184,9 @@ inline variable_list CopySlices::apply_impl(
// see Note [Thread Safety on Autograd Node]
std::lock_guard<std::mutex> lock(mutex_);
TORCH_CHECK(fn, ERR_BACKWARD_TWICE);
if (!fn) {
throw std::runtime_error(ERR_BACKWARD_TWICE);
}
auto result =
grad.new_empty_strided_symint(base.sym_sizes(), base.sym_strides());
@ -250,7 +252,9 @@ variable_list CopySlices::apply_with_saved(
auto results = variable_list(num_outputs());
if (grads[0].defined()) {
TORCH_CHECK(fn, ERR_BACKWARD_TWICE);
if (!fn) {
throw std::runtime_error(ERR_BACKWARD_TWICE);
}
update_exec_info();
std::vector<bool> needs_input_grad;

View File

@ -53,22 +53,18 @@ void check_input_variables(
if (required_args == -1) {
required_args = args;
}
TORCH_CHECK(
inputs.size() == static_cast<size_t>(args),
name,
": expected ",
args,
" arguments (got ",
inputs.size(),
")");
if (inputs.size() != static_cast<size_t>(args)) {
std::stringstream ss;
ss << name << ": expected " << args << " arguments (got " << inputs.size();
ss << ")";
throw std::runtime_error(ss.str());
}
for (const auto i : c10::irange(required_args)) {
TORCH_CHECK(
inputs[i].defined() || allow_undefined,
name,
": expected Tensor at argument ",
i,
" (got None)");
if (!inputs[i].defined() && !allow_undefined) {
std::stringstream ss;
ss << name << ": expected Tensor at argument " << i << " (got None)";
throw std::runtime_error(ss.str());
}
}
}
} // namespace torch::autograd

View File

@ -309,12 +309,8 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
})
.def("nbytes", [](const KinetoEvent& e) { return e.nBytes(); })
// whether the event is hidden
.def(
"is_hidden_event",
[](const KinetoEvent& e) { return e.isHiddenEvent(); })
// KinetoEvent metadata
.def("metadata_json", [](const KinetoEvent& e) {
return e.metadataJson();
.def("is_hidden_event", [](const KinetoEvent& e) {
return e.isHiddenEvent();
});
m.def("_soft_assert_raises", &setSoftAssertRaises);

View File

@ -37,8 +37,7 @@ extern "C" {
// https://github.com/pytorch/pytorch/issues/51026
__attribute__((weak)) int acc_get_device_type();
__attribute__((weak)) int acc_get_device_type() {
TORCH_CHECK(
false,
throw std::runtime_error(
"Dummy implementation of acc_get_device_type is not supposed to be called!");
}
} // extern "C"
@ -1068,17 +1067,6 @@ void KinetoEvent::getPerfEventCounters(std::vector<uint64_t>& in) const {
[](const auto&) -> void { return; }));
}
std::string KinetoEvent::metadataJson() const {
return result_->visit(c10::overloaded(
[](const ExtraFields<EventType::TorchOp>& op) -> std::string {
return op.metadata_json_;
},
[](const ExtraFields<EventType::Kineto>& op) -> std::string {
return op.metadata_json_;
},
[](const auto&) -> std::string { return std::string(""); }));
}
#define FORWARD_FROM_RESULT(method_name, result_expr) \
decltype(std::declval<KinetoEvent>().method_name()) \
KinetoEvent::method_name() const { \
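
The metadataJson accessor in this hunk dispatches with an overloaded-lambda visitor (c10::overloaded). The same idiom can be reproduced with only the standard library; the sketch below uses std::variant/std::visit, with hypothetical field structs standing in for the profiler's ExtraFields specializations.

#include <string>
#include <variant>

// Hypothetical stand-ins for the profiler's per-event extra fields.
struct TorchOpFields { std::string metadata_json_; };
struct KinetoFields  { std::string metadata_json_; };
struct OtherFields   {};

// Classic "overloaded" helper: inherits operator() from every lambda passed in.
template <class... Ts>
struct overloaded : Ts... { using Ts::operator()...; };
template <class... Ts>
overloaded(Ts...) -> overloaded<Ts...>;

using EventFields = std::variant<TorchOpFields, KinetoFields, OtherFields>;

std::string metadata_json(const EventFields& fields) {
  return std::visit(
      overloaded{
          [](const TorchOpFields& op) -> std::string { return op.metadata_json_; },
          [](const KinetoFields& op) -> std::string { return op.metadata_json_; },
          [](const auto&) -> std::string { return std::string(); }},
      fields);
}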

View File

@ -65,7 +65,6 @@ struct TORCH_API KinetoEvent {
int64_t privateuse1ElapsedUs() const;
void getPerfEventCounters(torch::profiler::perf_counters_t&) const;
extra_meta_t extraMeta() const;
std::string metadataJson() const;
private:
torch::profiler::impl::ProfilerVoidEventStub fallbackStart() const;

View File

@ -97,7 +97,7 @@ struct TORCH_API LegacyEvent {
case EventKind::MemoryAlloc:
return "memory_alloc";
}
TORCH_CHECK(false, "unknown event kind");
throw std::runtime_error("unknown event kind");
}
EventKind kind() const {

View File

@ -30,7 +30,7 @@ void PyAnomalyMetadata::store_stack() {
void PyAnomalyMetadata::print_stack(const std::string& current_node_name) {
pybind11::gil_scoped_acquire gil;
if (!PyDict_Check(dict())) {
TORCH_CHECK(false, "Anomaly metadata is not a python dictionary.");
throw std::runtime_error("Anomaly metadata is not a python dictionary.");
}
PyObject* trace_stack = nullptr;
if (PyDict_GetItemStringRef(dict(), ANOMALY_TRACE_KEY, &trace_stack) < 0) {

View File

@ -261,7 +261,8 @@ PyTypeObject* _initFunctionPyTypeObject(
type.tp_traverse = THPCppFunction_traverse;
type.tp_clear = THPCppFunction_clear;
if (PyType_Ready(&type) < 0) {
TORCH_CHECK(false, "Unable to instantiate PyTypeObject for ", name);
auto msg = std::string("Unable to instantiate PyTypeObject for ") + name;
throw std::runtime_error(msg);
}
return &type;
}

View File

@ -501,7 +501,7 @@ static void child_atfork() {
bool THPEngine_initModule(PyObject* module) {
#ifndef _WIN32
if (pthread_atfork(nullptr, nullptr, child_atfork) != 0) {
TORCH_CHECK(false, "unable to set pthread_atfork handler");
throw std::runtime_error("unable to set pthread_atfork handler");
}
#endif
if (PyType_Ready(&THPEngineType) < 0)
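
The hunk above registers a fork handler for the autograd engine. pthread_atfork is plain POSIX, so a standalone sketch of the registration-and-error-check pattern looks like this; reset_after_fork is a hypothetical callback standing in for whatever the child process needs to reinitialize.

// POSIX-only sketch; the real registration sits behind #ifndef _WIN32 as above.
#include <pthread.h>
#include <stdexcept>

namespace {
// Hypothetical child-side callback, run in the child right after fork().
void reset_after_fork() {}
}  // namespace

void install_fork_handler() {
  // No prepare/parent work is needed, so pass nullptr for those slots.
  if (pthread_atfork(nullptr, nullptr, reset_after_fork) != 0) {
    throw std::runtime_error("unable to set pthread_atfork handler");
  }
}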

View File

@ -188,15 +188,13 @@ auto PyNode::apply(variable_list&& inputs) -> variable_list {
}
// Now the number of gradients should match
TORCH_CHECK(
num_outputs == num_forward_inputs,
"function ",
name(),
" returned an incorrect number of gradients (expected ",
num_forward_inputs,
", got ",
num_outputs,
")");
if (num_outputs != num_forward_inputs) {
std::string msg("function ");
msg += name() + " returned an incorrect number of gradients (expected ";
msg += std::to_string(num_forward_inputs) + ", got ";
msg += std::to_string(num_outputs) + ")";
throw std::runtime_error(msg);
}
// Massage the Python results tuple back into a C++ variable_list
return to_variable_list(r.get(), is_variable_input);
@ -437,24 +435,24 @@ variable_list PyNode::to_variable_list(
PyObject* output = PyTuple_GET_ITEM(outputs, i);
bool was_variable = is_variable_input[i];
if (!was_variable) {
TORCH_CHECK(
output == Py_None,
"function ",
name(),
" returned a gradient different than None at position ",
i + 1,
", but the corresponding forward input was not a Variable");
if (output != Py_None) {
std::string msg("function ");
msg += name() + " returned a gradient different than None at position ";
msg += std::to_string(i + 1) +
", but the corresponding forward input was not a Variable";
throw std::runtime_error(msg);
}
continue;
}
if (output == Py_None) {
results.emplace_back();
} else {
TORCH_CHECK(
THPVariable_Check(output),
"expected Variable or None (got ",
THPUtils_typename(output),
")");
if (!THPVariable_Check(output)) {
std::string msg("expected Variable or None (got ");
msg += THPUtils_typename(output);
msg += ")";
throw std::runtime_error(msg);
}
results.emplace_back(THPVariable_Unpack(output));
}
}

View File

@ -289,7 +289,9 @@ static variable_list unwrap_variables(PyObject* py_variables) {
results[i] = THPVariable_Unpack(item);
} else {
// this should never happen, but just in case...
TORCH_CHECK(false, "expected variable but got ", Py_TYPE(item)->tp_name);
std::stringstream ss;
ss << "expected variable but got " << Py_TYPE(item)->tp_name;
throw std::runtime_error(ss.str());
}
}
return results;
@ -306,16 +308,14 @@ static void check_result(PyObject* prev, PyObject* result, PyObject* hook) {
auto prev_size = PyTuple_GET_SIZE(prev);
auto result_size = PyTuple_GET_SIZE(result);
TORCH_CHECK(
prev_size == result_size,
"hook '",
hook_name(hook),
"' has returned an incorrect number of values (got ",
result_size,
", but expected ",
prev_size,
")");
if (prev_size != result_size) {
std::stringstream ss;
auto name = hook_name(hook);
ss << "hook '" << name << "' has returned an incorrect number ";
ss << "of values (got " << result_size << ", but expected ";
ss << prev_size << ")";
throw std::runtime_error(ss.str());
}
for (const auto i : c10::irange(prev_size)) {
check_single_result(
@ -330,9 +330,10 @@ static void check_single_result(
if (_result == Py_None)
return;
TORCH_CHECK(
_original != Py_None,
"can't replace a None gradient with a non-None value");
if (_original == Py_None) {
throw std::runtime_error(
"can't replace a None gradient with a non-None value");
}
if (!PyObject_IsInstance(_result, THPVariableClass)) {
PyErr_Format(

View File

@ -644,6 +644,15 @@ void initTorchFunctions(PyObject* module) {
at::functionalization::impl::isFunctionalTensor(t));
at::functionalization::impl::mark_mutation_hidden_from_autograd(t);
});
py_module.def(
"_functionalize_apply_view_metas",
[](const at::Tensor& tensor, const at::Tensor& base) {
TORCH_INTERNAL_ASSERT(
at::functionalization::impl::isFunctionalTensor(tensor));
auto impl =
at::functionalization::impl::unsafeGetFunctionalWrapper(tensor);
return impl->apply_view_metas(base);
});
py_module.def("_functionalize_is_symbolic", [](const at::Tensor& t) {
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(t));
auto impl = at::functionalization::impl::unsafeGetFunctionalWrapper(t);
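
The _functionalize_apply_view_metas binding follows the same shape as its neighbours: assert a precondition, unwrap the functional wrapper, forward the call. A self-contained pybind11 sketch of that shape, with a hypothetical module name and is_wrapped/unwrap helpers (integer handles stand in for the functionalization internals):

#include <stdexcept>
#include <pybind11/pybind11.h>

namespace py = pybind11;

// Hypothetical wrapper standing in for the functional tensor wrapper.
struct Wrapper {
  int apply(int base) const { return base + 1; }
};

bool is_wrapped(int /*handle*/) { return true; }      // hypothetical check
Wrapper unwrap(int /*handle*/) { return Wrapper{}; }  // hypothetical accessor

PYBIND11_MODULE(example_bindings, m) {
  m.def("apply_view_metas", [](int tensor_handle, int base_handle) {
    // Precondition first, mirroring the TORCH_INTERNAL_ASSERT in the hunk.
    if (!is_wrapped(tensor_handle)) {
      throw std::runtime_error("expected a wrapped (functional) tensor");
    }
    // Unwrap and forward, returning whatever the wrapper computes.
    return unwrap(tensor_handle).apply(base_handle);
  });
}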

View File

@ -265,7 +265,8 @@ PyObject* THPVariable_Wrap(const at::TensorBase& var) {
}
std::optional<PyObject*> mb_obj =
var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false);
if (mb_obj.has_value()) {
auto obj = *mb_obj;
if (obj) {
@ -328,8 +329,8 @@ static bool isResurrectable(THPVariable* self) {
return false;
}
// Check if this is hermetic. If it is, no resurrection.
if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj() !=
(PyObject*)self) {
if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false) != (PyObject*)self) {
return false;
}
return true;
@ -354,7 +355,8 @@ static bool THPVariable_tryResurrect(THPVariable* self) {
!tensor.unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj());
c10::TensorImpl* tensor_impl = tensor.unsafeGetTensorImpl();
auto maybe_pyobj = tensor_impl->pyobj_slot()->check_pyobj();
auto maybe_pyobj = tensor_impl->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false);
TORCH_INTERNAL_ASSERT(
maybe_pyobj.has_value(),
@ -2221,8 +2223,8 @@ static int THPVariable_subclass_clear(THPVariable* self) {
// because Tensor asked us to (it's already destructing).
if (!self->cdata.unsafeIsBorrowed() &&
tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj() ==
(PyObject*)self) {
tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false) == (PyObject*)self) {
// TODO: empirically, on OS X this assert appears to be untrue
// In test_py_tensors_multi_async_call - ProcessGroupRpcTestWithSpawn
// distributed/rpc/test_process_group_agent.py
@ -2408,7 +2410,8 @@ static PyObject* THPVariable_NewWithVar(
// This function overwrites the Tensor's pyobj field without extra checks
// Make sure it is not set, otherwise we would leak memory
auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false);
// Under some circumstances, we may attempt to create a new Python
// object for a variable that already has a Python object. The most common

View File

@ -11,8 +11,8 @@ struct TORCH_API SavedVariableHooks {
virtual ~SavedVariableHooks() = default;
virtual std::optional<std::pair<c10::SafePyObject, c10::SafePyObject>>
retrieve_unpack_hook_data() const {
TORCH_CHECK(
false, "Compiled Autograd only supports python saved tensor hooks ");
throw std::runtime_error(
"Compiled Autograd only supports python saved tensor hooks ");
}
};
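
SavedVariableHooks, shown above, gates an optional capability behind a virtual member whose default implementation throws; subclasses that support the capability override it and return the payload. A reduced sketch of that pattern, with a placeholder payload type and a hypothetical subclass in place of c10::SafePyObject and the real Python-backed hooks:

#include <optional>
#include <stdexcept>
#include <string>
#include <utility>

// Placeholder payload; the real hooks carry a pair of c10::SafePyObject.
using Payload = std::pair<std::string, std::string>;

struct SavedVariableHooksBase {
  virtual ~SavedVariableHooksBase() = default;
  // Default: the capability is unsupported, so calling it is an error.
  virtual std::optional<Payload> retrieve_unpack_hook_data() const {
    throw std::runtime_error(
        "Compiled Autograd only supports python saved tensor hooks");
  }
};

// Hypothetical subclass that does support the capability.
struct PySavedVariableHooks : SavedVariableHooksBase {
  std::optional<Payload> retrieve_unpack_hook_data() const override {
    return Payload{"pack_hook", "unpack_hook"};
  }
};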

View File

@ -17,8 +17,8 @@ inline std::tuple<
std::optional<at::MemoryFormat>>
parse_to_conversion(PythonArgs& r, bool allow_copy) {
if (r.idx == 0) {
TORCH_CHECK(
allow_copy || r.isNone(3), ".to() does not accept copy argument");
if (!allow_copy && !r.isNone(3))
throw std::runtime_error(".to() does not accept copy argument");
return std::make_tuple(
r.deviceOptional(0),
r.scalartypeOptional(1),
@ -26,8 +26,8 @@ parse_to_conversion(PythonArgs& r, bool allow_copy) {
r.toBool(3),
r.memoryformatOptional(4));
} else if (r.idx == 1) {
TORCH_CHECK(
allow_copy || r.isNone(2), ".to() does not accept copy argument");
if (!allow_copy && !r.isNone(2))
throw std::runtime_error(".to() does not accept copy argument");
return std::make_tuple(
std::nullopt,
r.scalartype(0),
@ -36,8 +36,8 @@ parse_to_conversion(PythonArgs& r, bool allow_copy) {
r.memoryformatOptional(3));
} else {
auto tensor = r.tensor(0);
TORCH_CHECK(
allow_copy || r.isNone(2), ".to() does not accept copy argument");
if (!allow_copy && !r.isNone(2))
throw std::runtime_error(".to() does not accept copy argument");
return std::make_tuple(
tensor.device(),
tensor.scalar_type(),

View File

@ -597,9 +597,10 @@ void VariableHooks::_backward(
void VariableHooks::requires_grad_(
const at::TensorBase& self,
bool _requires_grad) const {
TORCH_CHECK(
self.is_leaf() || _requires_grad,
autograd::utils::requires_grad_leaf_error(_requires_grad));
if (!self.is_leaf() && !_requires_grad) {
throw std::runtime_error(
autograd::utils::requires_grad_leaf_error(_requires_grad));
}
self.set_requires_grad(_requires_grad);
}
@ -623,7 +624,7 @@ const at::TensorBase& VariableHooks::base(const at::TensorBase& self) const {
"Can't get base of non-backward view Tensor");
return diff_view_meta->get_backward_view().base_;
} else {
TORCH_CHECK(false, "Can't get base of non-view Tensor");
throw std::runtime_error("Can't get base of non-view Tensor");
}
}

View File

@ -1,6 +1,5 @@
#include <torch/csrc/distributed/c10d/HashStore.hpp>
#include <unistd.h>
#include <cstdint>
#include <chrono>

View File

@ -1,5 +1,5 @@
#include <ATen/ThreadLocalState.h>
#include <distributed/c10d/ProcessGroup.hpp>
#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
#include <torch/csrc/distributed/c10d/cuda/StreamBlock.hpp>
#include <torch/csrc/distributed/c10d/Work.hpp>

Some files were not shown because too many files have changed in this diff.