Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-25 16:14:55 +08:00)

Compare commits: 8 commits, logsumexp...brister/fx

| SHA1 |
|---|
| 1175db1e14 |
| ee9ff543ea |
| a01600a0b4 |
| de01602ff6 |
| 3460bd6897 |
| c53a2ae78e |
| 194480f0ec |
| 0a11d1d0db |
@ -35,11 +35,10 @@ fi
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
  # Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
  USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
  # NB: we always build with distributed; USE_DISTRIBUTED turns off all
  # backends (specifically the gloo backend), so test that this case works too
  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi

if which sccache > /dev/null; then
@ -13,9 +13,13 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd

python -mpip install -r requirements.txt

# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1

python -mpip install --no-input -r requirements.txt

setup_test_python() {
  # The CircleCI worker hostname doesn't resolve to an address.
  # This environment variable makes ProcessGroupGloo default to
@ -177,7 +177,8 @@ source ~/${desired_python}-build/bin/activate
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
retry brew install libomp

# For USE_DISTRIBUTED=1 on macOS, need libuv, which is built as part of the tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is built as part of the tensorpipe submodule
export USE_DISTRIBUTED=1

export USE_MKLDNN=OFF
.github/workflows/pull.yml (vendored, 2 changed lines)
@ -127,8 +127,6 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      # More memory is needed to build with asan
      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
.github/workflows/slow.yml (vendored, 2 changed lines)
@ -140,8 +140,6 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      # More memory is needed to build with asan
      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
.gitignore (vendored, 1 changed line)
@ -82,7 +82,6 @@ torch/return_types.pyi
torch/nn/functional.pyi
torch/utils/data/datapipes/datapipe.pyi
torch/csrc/autograd/generated/*
torch/csrc/functionalization/generated/*
torch/csrc/lazy/generated/*.[!m]*
torch_compile_debug/
# Listed manually because some files in this directory are not generated
@ -22,7 +22,6 @@ COMMON_COPTS = [
    "-DHAVE_SHM_UNLINK=1",
    "-D_FILE_OFFSET_BITS=64",
    "-DUSE_FBGEMM",
    "-DUSE_DISTRIBUTED",
    "-DAT_PER_OPERATOR_HEADERS",
    "-DATEN_THREADING=NATIVE",
    "-DNO_CUDNN_DESTROY_HANDLE",
@ -91,8 +90,6 @@ generated_cpu_cpp = [
    "aten/src/ATen/NativeMetaFunctions.h",
    "aten/src/ATen/RegistrationDeclarations.h",
    "aten/src/ATen/VmapGeneratedPlumbing.h",
    "aten/src/ATen/ViewMetaClasses.h",
    "aten/src/ATen/ViewMetaClasses.cpp",
    "aten/src/ATen/core/aten_interned_strings.h",
    "aten/src/ATen/core/enum_tag.h",
    "aten/src/ATen/core/TensorBody.h",
@ -813,7 +810,7 @@ cc_library(
    name = "torch_python",
    srcs = libtorch_python_core_sources
        + if_cuda(libtorch_python_cuda_sources)
        + if_cuda(libtorch_python_distributed_sources)
        + libtorch_python_distributed_sources
        + GENERATED_AUTOGRAD_PYTHON,
    hdrs = glob([
        "torch/csrc/generic/*.cpp",
@ -1077,7 +1074,6 @@ test_suite(
    "aten/src/ATen/templates/LazyNonNativeIr.h",
    "aten/src/ATen/templates/RegisterDispatchKey.cpp",
    "aten/src/ATen/templates/RegisterDispatchDefinitions.ini",
    "aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp",
    "aten/src/ATen/native/native_functions.yaml",
    "aten/src/ATen/native/tags.yaml",
    "aten/src/ATen/native/ts_native_functions.yaml",
@ -180,8 +180,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
  set(CPU_POWER ON)
endif()

# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
if(NOT LINUX AND NOT WIN32)
  set(USE_DISTRIBUTED
      OFF
@ -261,11 +262,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Use distributed" ON)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
    "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
    "USE_XPU;UNIX;NOT APPLE" OFF)
    "USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@ -437,11 +438,10 @@ if(WIN32)
    PATH_SUFFIXES lib
    NO_DEFAULT_PATH)
  if(NOT libuv_tmp_LIBRARY)
    set(USE_DISTRIBUTED OFF)
    set(USE_GLOO OFF)
    message(
      WARNING
      "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
      "Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
      "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
    )
  else()
@ -9,6 +9,11 @@

namespace at::functionalization {

ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
  if (out_idx == this->out_index) return *this;
  return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx);
}

// Note [Functionalization: Alias Removal Part 2]
// See Note [Functionalization: Alias Removal] for more details.
// This function applies a single update from one of the views to the StorageImpl.
@ -37,12 +42,12 @@ namespace at::functionalization {
static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) {
  at::Tensor t = update.new_val;
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
  if (update.view_metas.empty()) { return t; }
  if (update.view_metas.empty()) return t;

  std::vector<at::Tensor> tmp_values({base});
  tmp_values.reserve(update.view_metas.size());
  for (size_t i = 0; i < update.view_metas.size() - 1; ++i) {
    at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back());
    at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index);
    // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided
    // All of these ops require additional information to recover the sizes of the original tensor.
    // If need to, we could probably apply this optimization and only bother computing tmp_values
@ -50,8 +55,9 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co
    tmp_values.push_back(std::move(next_view));
  }
  for(int64_t i = static_cast<int64_t>(update.view_metas.size()) - 1; i >= 0; --i) {
    int64_t out_idx = update.view_metas[i].out_index;
    // Each view inverse is implemented in ViewInverses.cpp.
    t = update.view_metas[i]->reverse(tmp_values[i], t);
    t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx);
  }
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
  return t;
@ -105,13 +111,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_));
}

void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<std::shared_ptr<ViewMeta>>& metas) {
void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& metas) {
  TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage");

  if (metas.size() > 1) {
    for (size_t i = 1; i < metas.size(); ++i) {
      // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI
      TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided,
      TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided,
        "During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i,
        " was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today,"
        "so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you "
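The apply_update() hunk above replays the stored view chain forward (through each forward_fn) to rebuild the intermediate views, then walks the chain backwards through the reverse_fn inverses to scatter the mutated value back into the base. A minimal standalone sketch of that replay-then-invert pattern, using integers and plain lambdas instead of tensors (all names here are illustrative, not PyTorch API):

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

// Hypothetical stand-in for ViewMeta: forward_fn replays a "view" on a base,
// reverse_fn maps a mutated view back onto its base.
struct MiniViewMeta {
  std::function<int(int)> forward_fn;
  std::function<int(int, int)> reverse_fn;  // (base, mutated_view) -> new base
};

// Mirrors the shape of apply_update(): replay forward over the chain to get
// each intermediate view, then invert from the innermost view outwards.
int apply_update(int base, int new_val, const std::vector<MiniViewMeta>& metas) {
  std::vector<int> tmp_values{base};
  for (size_t i = 0; i + 1 < metas.size(); ++i) {
    tmp_values.push_back(metas[i].forward_fn(tmp_values.back()));
  }
  int t = new_val;
  for (int64_t i = static_cast<int64_t>(metas.size()) - 1; i >= 0; --i) {
    t = metas[i].reverse_fn(tmp_values[i], t);
  }
  return t;
}

int main() {
  std::vector<MiniViewMeta> metas = {
      // "view 1": add 10; its inverse subtracts 10 from the mutated view.
      {[](int b) { return b + 10; }, [](int, int mv) { return mv - 10; }},
      // "view 2": double; its inverse halves the mutated view.
      {[](int b) { return b * 2; }, [](int, int mv) { return mv / 2; }},
  };
  // base = 5, so the view chain yields (5 + 10) * 2 = 30; the caller mutated
  // that view to 42, so the new base becomes 42 / 2 - 10 = 11.
  std::cout << apply_update(5, 42, metas) << "\n";  // prints 11
}
```

As in the real code, the forward pass only needs to run up to the second-to-last view, since the last view's value is exactly the mutated tensor being written back.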
@ -8,89 +8,44 @@ namespace at::functionalization {

// See Note [Functionalization Pass In Core]

enum class InverseReturnMode {
  /// Specifies that functional inverses should always return a view.
  AlwaysView,
  /// Specifies that functional inverses should always return a non-view / copy.
  NeverView,
  /// Specifies that functional inverses should return a view unless a (copying)
  /// scatter
  /// inverse exists, in which case that will be used instead.
  /// This avoids as_strided() calls that can be difficult for subclasses to
  /// handle.
  ViewOrScatterInverse,
};

#define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \
  static const char* name() {                 \
    return #TYPE;                             \
  }

#define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \
  using SerializableTuple = std::tuple<__VA_ARGS__>

// ViewMeta is a class used by the functionalization pass to navigate between
// a base tensor and a view tensor.
// For example, if I call `b = a.view1(...)`
// the functionalization pass will generate and store a ViewMeta specialization
// for `view1` operation on b that looks like:
// the functionalization pass will generate and store a ViewMeta on b that looks
// like:
//
// struct TORCH_API view1_ViewMeta : public ViewMeta {
//   FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta);
//   FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
//       bool /* reapply_views */,
//       const std::vector<int64_t>&);
//
//   view1_ViewMeta(const SerializableTuple& tpl)
//       : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
//
//   view1_ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
//       : ViewMeta(/*has_symbolic_inputs=*/false),
//         reapply_views(reapply_views),
//         size(size) {}
//
//   Tensor forward(const Tensor& base) override {
//     return base.view1(...);
//   ViewMeta(
//     [<captures>](const Tensor& base, int64_t mutated_view_idx) {
//       return base.view1(...);
//     },
//     [<captures>](const at::Tensor& base, const at::Tensor& mutated_view,
//         int64_t mutated_view_idx) -> at::Tensor {
//       return at::functionalization::impl::view1_inverse(base, mutated_view,
//           ...);
//     }
//
//   Tensor reverse(const Tensor& base, const Tensor& mutated_view) override {
//     return at::functionalization::impl::view1_inverse(base, mutated_view,
//         ...);
//   }
// The forward_fn lambda describes how to replay view1 on a tensor.
//
//   SerializableTuple to_serializable_tuple() {
//     return std::make_tuple(reapply_views, size);
//   }
//
//   bool reapply_views;
//   std::vector<int64_t> size;
// };
//
// The forward function describes how to replay view1 on a tensor.
//
// The reverse function describes how, given a tensor that is already a view,
// The reverse_fn lambda describes how, given a tensor that is already a view,
// how to get the corresponding base tensor. See Note [Functionalization Pass:
// View Inverses] for details.
//
// `SerializedTuple` is a typedef that defines an `std::tuple<...>` type
// representing the `ViewMeta` instance state. Methods that take in/return such
// a type are used for supporting pickle serialization.
struct ViewMeta {
  ViewMeta(
      std::function<Tensor(const Tensor&, int64_t)> forward,
      std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse,
      bool has_symbolic_inputs,
      bool is_multi_output = false,
      bool is_as_strided = false,
      int64_t out_idx = 0)
      : out_index(out_idx),
      : forward_fn(std::move(forward)),
        reverse_fn(std::move(reverse)),
        out_index(out_idx),
        is_multi_output(is_multi_output),
        is_as_strided(is_as_strided),
        has_symbolic_inputs(has_symbolic_inputs) {}

  virtual ~ViewMeta() = default;

  virtual Tensor forward(const Tensor& base) = 0;
  virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0;

  std::function<Tensor(const Tensor&, int64_t)> forward_fn;
  std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn;
  // See Note [out_idx in ViewMeta]
  int64_t out_index;
@ -102,17 +57,10 @@ struct ViewMeta {
  // Tells us if this view operation has any symbolic inputs
  bool has_symbolic_inputs;

  // Returns a new ViewMeta with the same forward/reverse
  // Returns a copy of the current ViewMeta, if out_idx matches the current
  // out_index. Otherwise, returns a new ViewMeta with the same forward/reverse
  // functions, but a new out index.
  //
  // This method should be implemented by those `ViewMeta` that have more than
  // one output.
  virtual std::shared_ptr<ViewMeta> to_out_index(int64_t out_index) {
    TORCH_CHECK_NOT_IMPLEMENTED(
        false,
        "ViewMeta::to_out_index not implemented. ",
        "Likely because there's only one output.");
  }
  ViewMeta to_out_idx(int64_t out_idx);
};

// FunctionalStorageImpl is a subclass of StorageImpl used by the
@ -145,14 +93,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
    const at::Tensor new_val;
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
    const std::vector<std::shared_ptr<ViewMeta>> view_metas;
    const std::vector<ViewMeta> view_metas;
  };

  explicit FunctionalStorageImpl(const Tensor& value);

  void add_update(
      const Tensor& updated_val,
      const std::vector<std::shared_ptr<ViewMeta>>& view_metas);
      const std::vector<ViewMeta>& view_metas);
  bool apply_updates();
  const Tensor& base() {
    return base_;
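With the refactor above, a view op no longer subclasses ViewMeta; it packages its replay and inverse behavior as lambdas handed to the ViewMeta constructor. A rough sketch of such a construction, modeled on the resize_/_unsafe_view call sites further down in this diff; the narrow/slice_scatter pairing is an illustrative assumption, not code from this PR:

```cpp
// Sketch of a lambda-based ViewMeta, assuming the constructor exactly as
// declared above; compile against libtorch.
#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

at::functionalization::ViewMeta make_narrow_view_meta(int64_t length) {
  return at::functionalization::ViewMeta(
      // forward_fn: how to replay the view op on a (possibly different) base.
      [length](const at::Tensor& base, int64_t /*mutated_view_idx*/) -> at::Tensor {
        return base.narrow(/*dim=*/0, /*start=*/0, /*length=*/length);
      },
      // reverse_fn: how to scatter a mutated view back into the base.
      [length](const at::Tensor& base,
               const at::Tensor& mutated_view,
               int64_t /*mutated_view_idx*/) -> at::Tensor {
        return base.slice_scatter(
            mutated_view, /*dim=*/0, /*start=*/0, /*end=*/length);
      },
      /*has_symbolic_inputs=*/false);
}
```

Because the state lives in the lambda captures, ViewMeta values can be copied and held by value, which is what the std::vector<ViewMeta> signatures above switch to.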
@ -129,19 +129,17 @@ void FunctionalTensorWrapper::freeze_storage() const {
|
||||
// - view_value: The output tensor that we need to wrap.
|
||||
// - base: The "base" of the view that `view_value` was generated from.
|
||||
// See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic.
|
||||
FunctionalTensorWrapper::FunctionalTensorWrapper(
|
||||
const Tensor& view_value,
|
||||
const FunctionalTensorWrapper* base,
|
||||
const std::shared_ptr<functionalization::ViewMeta>& meta)
|
||||
: c10::TensorImpl(
|
||||
c10::DispatchKeySet(DispatchKey::Functionalize),
|
||||
view_value.dtype(),
|
||||
base->storage().data_ptr().device()),
|
||||
value_(view_value),
|
||||
is_multi_output_view_(
|
||||
base->is_multi_output_view_ || meta->is_multi_output),
|
||||
was_storage_changed_(base->was_storage_changed_),
|
||||
is_symbolic_(base->is_symbolic_) {
|
||||
FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta)
|
||||
: c10::TensorImpl(
|
||||
c10::DispatchKeySet(DispatchKey::Functionalize),
|
||||
view_value.dtype(),
|
||||
base->storage().data_ptr().device()
|
||||
),
|
||||
value_(view_value),
|
||||
is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
|
||||
was_storage_changed_(base->was_storage_changed_),
|
||||
is_symbolic_(base->is_symbolic_)
|
||||
{
|
||||
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_));
|
||||
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
|
||||
set_constructor_metadata();
|
||||
@ -150,10 +148,11 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(
|
||||
view_metas_ = base->view_metas_; // copy
|
||||
}
|
||||
view_metas_.push_back(meta);
|
||||
maybe_mark_symbolic(meta.get());
|
||||
maybe_mark_symbolic(meta);
|
||||
storage_ = base->storage_; // alias this tensor's storage with the base tensor's
|
||||
}
|
||||
|
||||
|
||||
functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const {
|
||||
return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl());
|
||||
}
|
||||
@ -177,18 +176,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const {
|
||||
}
|
||||
|
||||
// See Note [Functionalization Pass - Inplace View Ops]
|
||||
void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr<at::functionalization::ViewMeta>& meta) {
|
||||
void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) {
|
||||
view_metas_.push_back(meta);
|
||||
// Manually track the fact that this tensor received a metadata mutation!
|
||||
has_metadata_mutation_ = true;
|
||||
// Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation.
|
||||
maybe_mark_symbolic(meta.get());
|
||||
maybe_mark_symbolic(meta);
|
||||
// Note [Functionalization Pass - Inplace View Ops]
|
||||
// So, these ops are special - they're mutation AND view ops. They get special codegen.
|
||||
// An example is transpose_, e.g. `a.transpose_()`
|
||||
// Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas.
|
||||
at::AutoDispatchSkipFunctionalize guard;
|
||||
value_ = meta->forward(value_);
|
||||
value_ = meta.forward_fn(value_, meta.out_index);
|
||||
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
|
||||
}
|
||||
|
||||
@ -369,8 +368,15 @@ void FunctionalTensorWrapper::sync_() {
|
||||
regenerate_from_base();
|
||||
}
|
||||
|
||||
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& FunctionalTensorWrapper::view_metas() const {
|
||||
return view_metas_;
|
||||
Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) {
|
||||
auto t = base;
|
||||
|
||||
// Reapply views to get the viewed tensor from the base in alias_
|
||||
for (auto& view_meta: view_metas_) {
|
||||
t = view_meta.forward_fn(t, view_meta.out_index);
|
||||
}
|
||||
|
||||
return t;
|
||||
}
|
||||
|
||||
void FunctionalTensorWrapper::regenerate_from_base() {
|
||||
@ -379,7 +385,7 @@ void FunctionalTensorWrapper::regenerate_from_base() {
|
||||
auto t = storage_impl->base();
|
||||
|
||||
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
|
||||
t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_);
|
||||
t = apply_view_metas(t);
|
||||
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
|
||||
|
||||
replace_(t, /*from_lazy_regenerate=*/true);
|
||||
@ -721,11 +727,11 @@ bool isFunctionalTensor(const std::optional<Tensor>& t) {
|
||||
}
|
||||
|
||||
bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
|
||||
if (t_list.empty()) { return false; }
|
||||
if (t_list.empty()) return false;
|
||||
auto functional_count = 0;
|
||||
for (const auto i : c10::irange(t_list.size())) {
|
||||
auto const & e= t_list[i];
|
||||
if (!e.has_value() || !e->defined()) { continue; }
|
||||
if (!e.has_value() || !e->defined()) continue;
|
||||
if (isFunctionalTensor(e)) {
|
||||
++functional_count;
|
||||
}
|
||||
@ -735,10 +741,10 @@ bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
|
||||
|
||||
template <typename T>
|
||||
static bool isFunctionalTensorIListRef(c10::IListRef<T> list) {
|
||||
if (list.size() == 0) { return false; }
|
||||
if (list.size() == 0) return false;
|
||||
auto functional_count = 0;
|
||||
for (const auto& tensor : list) {
|
||||
if (!tensor.defined()) { continue; }
|
||||
if (!tensor.defined()) continue;
|
||||
if (isFunctionalTensor(tensor)) {
|
||||
++functional_count;
|
||||
}
|
||||
@ -756,28 +762,20 @@ void freeze_functional_tensor(const Tensor& tensor) {
|
||||
functional_base_impl->freeze_storage();
|
||||
}
|
||||
|
||||
Tensor create_functional_tensor_with_view_meta(
|
||||
const at::Tensor& view_to_wrap,
|
||||
const at::Tensor& base,
|
||||
const std::shared_ptr<functionalization::ViewMeta>& meta,
|
||||
int64_t out_idx) {
|
||||
Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) {
|
||||
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap));
|
||||
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base));
|
||||
auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base);
|
||||
auto meta_ = meta;
|
||||
if (out_idx != 0) {
|
||||
// Note [out_idx in ViewMeta]
|
||||
// When a view op outputs multiple tensors, each output needs its own separate ViewMeta.
|
||||
// Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function.
|
||||
meta_ = meta->to_out_index(out_idx);
|
||||
meta = meta.to_out_idx(out_idx);
|
||||
}
|
||||
return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta_);
|
||||
return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta);
|
||||
}
|
||||
|
||||
std::vector<Tensor> create_functional_tensor_with_view_meta(
|
||||
ITensorListRef view_to_wrap,
|
||||
const at::Tensor& base,
|
||||
const std::shared_ptr<functionalization::ViewMeta>& meta) {
|
||||
std::vector<Tensor> create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) {
|
||||
std::vector<Tensor> outputs(view_to_wrap.size());
|
||||
int64_t i = 0;
|
||||
for (const auto& tensor : view_to_wrap) {
|
||||
@ -787,22 +785,12 @@ std::vector<Tensor> create_functional_tensor_with_view_meta(
|
||||
return outputs;
|
||||
}
|
||||
|
||||
void mutate_view_meta(const at::Tensor& self, const std::shared_ptr<functionalization::ViewMeta>& meta) {
|
||||
void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) {
|
||||
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self));
|
||||
auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
|
||||
self_impl->mutate_view_meta(meta);
|
||||
}
|
||||
|
||||
Tensor apply_view_meta_sequence(
|
||||
const Tensor& base,
|
||||
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence) {
|
||||
Tensor r = base;
|
||||
for (auto& vm : sequence) {
|
||||
r = vm->forward(r);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
// Note [Propagating strides in the functionalization pass]
|
||||
// In order to properly compute stride information, the functionalization pass
|
||||
// calls each {view} reference implementations with meta tensors.
|
||||
@ -896,7 +884,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s
|
||||
const auto& ivalue = returns[idx];
|
||||
if (ivalue.isTensor()) {
|
||||
const auto& t = ivalue.toTensor();
|
||||
if (!t.defined()) { continue; }
|
||||
if (!t.defined()) continue;
|
||||
at::functionalization::impl::sync(t);
|
||||
auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t));
|
||||
(*stack)[returns_begin + idx] = t_new;
|
||||
|
||||
@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
|
||||
explicit FunctionalTensorWrapper(
|
||||
const Tensor& view_value,
|
||||
const FunctionalTensorWrapper* base,
|
||||
const std::shared_ptr<functionalization::ViewMeta>& meta);
|
||||
const functionalization::ViewMeta& meta);
|
||||
|
||||
// Get the underlying, actual tensor, that doesn't know anything about
|
||||
// functionalization.
|
||||
@ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
|
||||
->are_all_mutations_under_no_grad_or_inference_mode();
|
||||
}
|
||||
|
||||
void maybe_mark_symbolic(functionalization::ViewMeta* meta) {
|
||||
is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs;
|
||||
void maybe_mark_symbolic(const functionalization::ViewMeta& meta) {
|
||||
is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs;
|
||||
}
|
||||
|
||||
bool is_symbolic() const {
|
||||
return is_symbolic_;
|
||||
}
|
||||
|
||||
// Retrieves the ViewMeta sequence of this tensor.
|
||||
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& view_metas()
|
||||
const;
|
||||
// Runs the forward_fn of every ViewMeta collected in the current instance
|
||||
// to some other base.
|
||||
Tensor apply_view_metas(const Tensor& base);
|
||||
|
||||
// Sync's the underlying tensor with its alias, if it's out of date. This
|
||||
// involves two steps: 1) Apply any pending updates/mutations to the alias 2)
|
||||
@ -146,8 +146,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
|
||||
// from the base tensor. This method is used by inplace-view ops like
|
||||
// transpose_. It appends a ViewMeta to the existing stack, and refreshes the
|
||||
// tensor by replaying the views off of the alias.
|
||||
void mutate_view_meta(
|
||||
const std::shared_ptr<at::functionalization::ViewMeta>& meta);
|
||||
void mutate_view_meta(const at::functionalization::ViewMeta& meta);
|
||||
|
||||
// Custom implementation of self.set_(src)
|
||||
void set__impl(const FunctionalTensorWrapper* other);
|
||||
@ -286,7 +285,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
|
||||
bool is_symbolic_ = false;
|
||||
|
||||
size_t generation_ = 0;
|
||||
std::vector<std::shared_ptr<at::functionalization::ViewMeta>> view_metas_;
|
||||
std::vector<at::functionalization::ViewMeta> view_metas_;
|
||||
|
||||
protected:
|
||||
static void copy_tensor_metadata(
|
||||
@ -378,20 +377,16 @@ TORCH_API void propagate_xla_data_direct(
|
||||
Tensor create_functional_tensor_with_view_meta(
|
||||
const Tensor& view_to_wrap,
|
||||
const Tensor& base,
|
||||
const std::shared_ptr<functionalization::ViewMeta>& meta,
|
||||
functionalization::ViewMeta meta,
|
||||
int64_t out_idx = 0);
|
||||
std::vector<Tensor> create_functional_tensor_with_view_meta(
|
||||
ITensorListRef view_to_wrap,
|
||||
const Tensor& base,
|
||||
const std::shared_ptr<functionalization::ViewMeta>& meta);
|
||||
const functionalization::ViewMeta& meta);
|
||||
|
||||
void mutate_view_meta(
|
||||
const Tensor& self,
|
||||
const std::shared_ptr<functionalization::ViewMeta>& meta);
|
||||
|
||||
TORCH_API Tensor apply_view_meta_sequence(
|
||||
const Tensor& base,
|
||||
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence);
|
||||
const functionalization::ViewMeta& meta);
|
||||
|
||||
void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
|
||||
void set_sizes_strides_offset(
|
||||
|
||||
@ -1,5 +1,3 @@
|
||||
#include <ATen/FunctionalizeFallbackKernel.h>
|
||||
|
||||
#include <ATen/core/dispatch/Dispatcher.h>
|
||||
#include <ATen/core/LegacyTypeDispatch.h>
|
||||
#include <ATen/EmptyTensor.h>
|
||||
@ -9,6 +7,7 @@
|
||||
#include <torch/library.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/strides.h>
|
||||
#include <ATen/EmptyTensor.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/ATen.h>
|
||||
@ -29,31 +28,6 @@
|
||||
#include <utility>
|
||||
#endif
|
||||
|
||||
namespace at::functionalization {
|
||||
|
||||
Tensor resize__ViewMeta::forward(const Tensor& base) {
|
||||
if (reapply_views) {
|
||||
return base.as_strided(size, c10::contiguous_strides(size));
|
||||
} else {
|
||||
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
|
||||
}
|
||||
}
|
||||
|
||||
Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
|
||||
return base.as_strided_scatter(
|
||||
mutated_view, size, c10::contiguous_strides(size));
|
||||
}
|
||||
|
||||
Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) {
|
||||
return at::_unsafe_view_symint(base, size);
|
||||
}
|
||||
|
||||
Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
|
||||
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
|
||||
}
|
||||
|
||||
} // namespace at::functionalization
|
||||
|
||||
namespace {
|
||||
void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) {
|
||||
const auto& schema = op.schema();
|
||||
@ -132,9 +106,7 @@ namespace {
|
||||
const auto& ivalue = returns[idx];
|
||||
if (ivalue.isTensor() && should_wrap_outputs) {
|
||||
const auto& t = ivalue.toTensor();
|
||||
if (!t.defined()) {
|
||||
continue;
|
||||
}
|
||||
if (!t.defined()) continue;
|
||||
auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t));
|
||||
(*stack)[returns_begin + idx] = t_new;
|
||||
} else if (ivalue.isTensorList() && should_wrap_outputs) {
|
||||
@ -197,8 +169,19 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch
|
||||
// The output of resizing is equivalent to taking a slice of a larger tensor.
|
||||
// We have to emulate this "slicing" with an as_strided call.
|
||||
auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS();
|
||||
auto view_meta = std::make_shared<at::functionalization::resize__ViewMeta>(
|
||||
reapply_views, size.vec());
|
||||
at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
|
||||
[reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
|
||||
if (reapply_views) {
|
||||
return base.as_strided(size, c10::contiguous_strides(size));
|
||||
} else {
|
||||
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
|
||||
}
|
||||
},
|
||||
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
|
||||
return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size));
|
||||
},
|
||||
/*has_symbolic_inputs=*/false
|
||||
);
|
||||
at::functionalization::impl::mutate_view_meta(self, view_meta);
|
||||
return self;
|
||||
}
|
||||
@ -317,11 +300,17 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt
|
||||
tmp_output = at::_unsafe_view_symint(self_, size);
|
||||
}
|
||||
|
||||
bool has_symbolic_inputs = std::any_of(
|
||||
size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
|
||||
auto view_meta =
|
||||
std::make_shared<at::functionalization::_unsafe_view_ViewMeta>(
|
||||
has_symbolic_inputs, size.vec());
|
||||
bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
|
||||
|
||||
at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
|
||||
[size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
|
||||
return at::_unsafe_view_symint(base, size);
|
||||
},
|
||||
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
|
||||
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
|
||||
},
|
||||
/*has_symbolic_inputs=*/has_symbolic_inputs
|
||||
);
|
||||
|
||||
auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta));
|
||||
// See Note [Propagating strides in the functionalization pass]
|
||||
|
||||
@ -1,58 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/FunctionalStorageImpl.h>
|
||||
|
||||
namespace at::functionalization {
|
||||
|
||||
// `ViewMeta` implementation for `resize_` operation.
|
||||
struct TORCH_API resize__ViewMeta : public ViewMeta {
|
||||
FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta)
|
||||
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
|
||||
bool /* reapply_views */,
|
||||
const std::vector<int64_t>&);
|
||||
|
||||
resize__ViewMeta(const SerializableTuple& tpl)
|
||||
: resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
|
||||
|
||||
resize__ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
|
||||
: ViewMeta(/*has_symbolic_inputs=*/false),
|
||||
reapply_views(reapply_views),
|
||||
size(size) {}
|
||||
|
||||
Tensor forward(const Tensor& base) override;
|
||||
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
|
||||
|
||||
SerializableTuple to_serializable_tuple() {
|
||||
return std::make_tuple(reapply_views, size);
|
||||
}
|
||||
|
||||
bool reapply_views;
|
||||
std::vector<int64_t> size;
|
||||
};
|
||||
|
||||
// `ViewMeta` implementation for `_unsafe_view` operation.
|
||||
struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta {
|
||||
FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta)
|
||||
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
|
||||
bool /* has_symbolic_inputs */,
|
||||
const std::vector<c10::SymInt>&);
|
||||
|
||||
_unsafe_view_ViewMeta(const SerializableTuple& tpl)
|
||||
: _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
|
||||
|
||||
_unsafe_view_ViewMeta(
|
||||
bool has_symbolic_inputs,
|
||||
const std::vector<c10::SymInt>& size)
|
||||
: ViewMeta(has_symbolic_inputs), size(size) {}
|
||||
|
||||
Tensor forward(const Tensor& base) override;
|
||||
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
|
||||
|
||||
SerializableTuple to_serializable_tuple() {
|
||||
return std::make_tuple(has_symbolic_inputs, size);
|
||||
}
|
||||
|
||||
std::vector<c10::SymInt> size;
|
||||
};
|
||||
|
||||
} // namespace at::functionalization
|
||||
@ -120,7 +120,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
|
||||
// buffer (in bytes)
|
||||
size_t orig_m = sparse_input.size(0);
|
||||
size_t div = orig_m * sparse_input.itemsize();
|
||||
size_t new_n = (compressed_size + div - 1) / div; // ceil(s,d) = (s+d-1)/d
|
||||
size_t new_n = (compressed_size + div - 1) / div; // floor
|
||||
auto compressed_tensor = sparse_input.new_empty({(int64_t)orig_m, (int64_t)new_n});
|
||||
|
||||
auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
|
||||
@ -155,7 +155,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
|
||||
TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle));
|
||||
handle_initialized = true;
|
||||
}
|
||||
// cuSPARSELt constructs
|
||||
// cupsarselt constructs
|
||||
cusparseLtMatmulDescriptor_t matmul;
|
||||
cusparseLtMatmulPlan_t plan;
|
||||
cusparseLtMatmulAlgSelection_t alg_sel;
|
||||
|
||||
@ -2,12 +2,22 @@
|
||||
|
||||
// ${generated_comment}
|
||||
|
||||
#include <ATen/FunctionalStorageImpl.h>
|
||||
#include <ATen/Tensor.h>
|
||||
|
||||
namespace at {
|
||||
namespace functionalization {
|
||||
|
||||
enum class InverseReturnMode {
|
||||
/// Specifies that functional inverses should always return a view.
|
||||
AlwaysView,
|
||||
/// Specifies that functional inverses should always return a non-view / copy.
|
||||
NeverView,
|
||||
/// Specifies that functional inverses should return a view unless a (copying) scatter
|
||||
/// inverse exists, in which case that will be used instead.
|
||||
/// This avoids as_strided() calls that can be difficult for subclasses to handle.
|
||||
ViewOrScatterInverse,
|
||||
};
|
||||
|
||||
struct FunctionalInverses {
|
||||
|
||||
${view_inverse_declarations}
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
#include <ATen/core/LegacyTypeDispatch.h>
|
||||
#include <ATen/EmptyTensor.h>
|
||||
#include <ATen/FunctionalTensorWrapper.h>
|
||||
#include <ATen/ViewMetaClasses.h>
|
||||
#include <ATen/FunctionalInverses.h>
|
||||
#include <ATen/MemoryOverlap.h>
|
||||
#include <torch/library.h>
|
||||
|
||||
|
||||
@ -1,19 +0,0 @@
|
||||
// ${generated_comment}
|
||||
|
||||
#include <ATen/FunctionalInverses.h>
|
||||
#include <ATen/ViewMetaClasses.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Operators.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
${op_headers}
|
||||
#endif
|
||||
|
||||
namespace at {
|
||||
namespace functionalization {
|
||||
|
||||
${view_meta_implementations}
|
||||
|
||||
} // namespace functionalization
|
||||
} // namespace at
|
||||
@ -1,12 +0,0 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
// ${generated_comment}
|
||||
|
||||
#include <ATen/FunctionalStorageImpl.h>
|
||||
|
||||
namespace at {
|
||||
namespace functionalization {
|
||||
|
||||
${view_meta_declarations}
|
||||
|
||||
} // namespace functionalization
|
||||
} // namespace at
|
||||
@ -1,11 +0,0 @@
|
||||
#include <ATen/ViewMetaClasses.h>
|
||||
#include <torch/csrc/functionalization/Module.h>
|
||||
|
||||
namespace torch::functionalization {
|
||||
|
||||
void initGenerated(PyObject* module) {
|
||||
auto functionalization = py::handle(module).cast<py::module>();
|
||||
$view_meta_bindings
|
||||
}
|
||||
|
||||
} // namespace torch::functionalization
|
||||
@ -156,7 +156,7 @@ ROOT = "//" if IS_OSS else "//xplat/caffe2"
|
||||
# for targets in subfolders
|
||||
ROOT_PATH = "//" if IS_OSS else "//xplat/caffe2/"
|
||||
|
||||
C10 = "//c10:c10" if IS_OSS else "//xplat/caffe2/c10:c10"
|
||||
C10 = "//c10:c10" if IS_OSS else ("//xplat/caffe2/c10:c10_ovrsource" if is_arvr_mode() else "//xplat/caffe2/c10:c10")
|
||||
|
||||
# a dictionary maps third party library name to fbsource and oss target
|
||||
THIRD_PARTY_LIBS = {
|
||||
@ -391,8 +391,6 @@ def get_aten_generated_files(enabled_backends):
|
||||
"CompositeExplicitAutogradFunctions_inl.h",
|
||||
"CompositeExplicitAutogradNonFunctionalFunctions.h",
|
||||
"CompositeExplicitAutogradNonFunctionalFunctions_inl.h",
|
||||
"ViewMetaClasses.h",
|
||||
"ViewMetaClasses.cpp",
|
||||
"VmapGeneratedPlumbing.h",
|
||||
"core/ATenOpList.cpp",
|
||||
"core/TensorBody.h",
|
||||
@ -950,6 +948,7 @@ def define_buck_targets(
|
||||
[
|
||||
("torch/csrc/api/include", "torch/**/*.h"),
|
||||
("", "torch/csrc/**/*.h"),
|
||||
("", "torch/csrc/**/*.hpp"),
|
||||
("", "torch/nativert/**/*.h"),
|
||||
("", "torch/headeronly/**/*.h"),
|
||||
("", "torch/script.h"),
|
||||
@ -1194,7 +1193,6 @@ def define_buck_targets(
|
||||
"NativeMetaFunctions.h": ":gen_aten[NativeMetaFunctions.h]",
|
||||
"Operators.h": ":gen_aten[Operators.h]",
|
||||
"RedispatchFunctions.h": ":gen_aten[RedispatchFunctions.h]",
|
||||
"ViewMetaClasses.h": ":gen_aten[ViewMetaClasses.h]",
|
||||
"core/TensorBody.h": ":gen_aten[core/TensorBody.h]",
|
||||
"core/aten_interned_strings.h": ":gen_aten[core/aten_interned_strings.h]",
|
||||
"core/enum_tag.h": ":gen_aten[core/enum_tag.h]",
|
||||
@ -2050,6 +2048,7 @@ def define_buck_targets(
|
||||
("", "caffe2/utils/*.h"),
|
||||
("", "caffe2/core/*.h"),
|
||||
("", "torch/csrc/*.h"),
|
||||
("", "torch/csrc/*.hpp"),
|
||||
("", "torch/csrc/api/include/torch/*.h"),
|
||||
("", "torch/csrc/autograd/*.h"),
|
||||
("", "torch/csrc/autograd/*/*.h"),
|
||||
|
||||
@ -118,9 +118,6 @@ def define_targets(rules):
|
||||
":LazyNonNativeIr.h",
|
||||
":RegisterDispatchDefinitions.ini",
|
||||
":RegisterDispatchKey.cpp",
|
||||
":ViewMetaClassesPythonBinding.cpp",
|
||||
":ViewMetaClasses.cpp",
|
||||
":ViewMetaClasses.h",
|
||||
":native_functions.yaml",
|
||||
":shape_inference.h",
|
||||
":tags.yaml",
|
||||
@ -173,7 +170,6 @@ GENERATED_H = [
|
||||
"FunctionalInverses.h",
|
||||
"RedispatchFunctions.h",
|
||||
"RegistrationDeclarations.h",
|
||||
"ViewMetaClasses.h",
|
||||
"VmapGeneratedPlumbing.h",
|
||||
]
|
||||
|
||||
@ -250,7 +246,6 @@ GENERATED_CPP = [
|
||||
"RegisterFunctionalization_1.cpp",
|
||||
"RegisterFunctionalization_2.cpp",
|
||||
"RegisterFunctionalization_3.cpp",
|
||||
"ViewMetaClasses.cpp",
|
||||
]
|
||||
|
||||
GENERATED_CPP_CORE = [
|
||||
@ -312,7 +307,6 @@ _GENERATED_AUTOGRAD_PYTHON_CPP = [
|
||||
"torch/csrc/autograd/generated/python_torch_functions_1.cpp",
|
||||
"torch/csrc/autograd/generated/python_torch_functions_2.cpp",
|
||||
"torch/csrc/autograd/generated/python_variable_methods.cpp",
|
||||
"torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp"
|
||||
]
|
||||
|
||||
GENERATED_AUTOGRAD_PYTHON = _GENERATED_AUTOGRAD_PYTHON_HEADERS + _GENERATED_AUTOGRAD_PYTHON_CPP
|
||||
|
||||
@ -1010,7 +1010,6 @@ libtorch_python_core_sources = [
|
||||
"torch/csrc/utils/disable_torch_function.cpp",
|
||||
"torch/csrc/utils/verbose.cpp",
|
||||
"torch/csrc/cpu/Module.cpp",
|
||||
"torch/csrc/functionalization/Module.cpp",
|
||||
"torch/csrc/instruction_counter/Module.cpp",
|
||||
"torch/nativert/python/Bindings.cpp",
|
||||
] + lazy_tensor_core_python_sources
|
||||
@ -1053,7 +1052,6 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"):
|
||||
"torch/csrc/autograd/generated/python_torch_functions_1.cpp",
|
||||
"torch/csrc/autograd/generated/python_torch_functions_2.cpp",
|
||||
"torch/csrc/autograd/generated/python_variable_methods.cpp",
|
||||
"torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp",
|
||||
]]
|
||||
|
||||
_libtorch_python_sources.extend(libtorch_python_core_sources)
|
||||
|
||||
@ -3244,7 +3244,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
|
||||
are_equal<sizeof(autograd_meta_), 4, FieldNameEnum::autograd_meta_>();
|
||||
are_equal<sizeof(extra_meta_), 4, FieldNameEnum::extra_meta_>();
|
||||
are_equal<sizeof(version_counter_), 4, FieldNameEnum::version_counter_>();
|
||||
are_equal<sizeof(pyobj_slot_), 4, FieldNameEnum::pyobj_slot_>();
|
||||
are_equal<sizeof(pyobj_slot_), 8, FieldNameEnum::pyobj_slot_>();
|
||||
is_le<sizeof(sizes_and_strides_), 88, FieldNameEnum::sizes_and_strides_>();
|
||||
are_equal<sizeof(storage_offset_), 8, FieldNameEnum::storage_offset_>();
|
||||
are_equal<sizeof(numel_), 8, FieldNameEnum::numel_>();
|
||||
@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
|
||||
is_le<sizeof(autograd_meta_), 16, FieldNameEnum::autograd_meta_>();
|
||||
is_le<sizeof(extra_meta_), 16, FieldNameEnum::extra_meta_>();
|
||||
are_equal<sizeof(version_counter_), 8, FieldNameEnum::version_counter_>();
|
||||
are_equal<sizeof(pyobj_slot_), 8, FieldNameEnum::pyobj_slot_>();
|
||||
are_equal<sizeof(pyobj_slot_), 16, FieldNameEnum::pyobj_slot_>();
|
||||
are_equal<sizeof(sizes_and_strides_), 88, FieldNameEnum::sizes_and_strides_>();
|
||||
are_equal<sizeof(storage_offset_), 8, FieldNameEnum::storage_offset_>();
|
||||
are_equal<sizeof(numel_), 8, FieldNameEnum::numel_>();
|
||||
|
||||
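The pyobj_slot_ size expectations above change from 4 to 8 in one configuration and from 8 to 16 in the other (presumably 32- vs 64-bit pointers), because PyObjectSlot gains an atomic interpreter pointer alongside the existing PyObject pointer (see the PyObjectSlot diff below). A trivial sketch of the layout change, with hypothetical struct names:

```cpp
#include <atomic>
#include <cstdio>

struct SlotBefore {
  void* pyobj;  // one pointer-sized field
};

struct SlotAfter {
  std::atomic<void*> pyobj_interpreter;  // new atomic interpreter tag
  void* pyobj;
};

int main() {
  // On a typical 64-bit platform this prints "8 16", matching the bumped
  // are_equal<sizeof(pyobj_slot_), ...> expectations in the hunks above.
  std::printf("%zu %zu\n", sizeof(SlotBefore), sizeof(SlotAfter));
}
```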
@ -13,10 +13,11 @@ struct C10_API PyInterpreterHooksInterface {
|
||||
|
||||
// Get the PyInterpreter instance
|
||||
// Stub implementation throws error when Python is not available
|
||||
// We return nullptr rather than throwing an error since there are bits of c10
|
||||
// that expect an empty PyObjectSlot when python is not available.
|
||||
virtual PyInterpreter* getPyInterpreter() const {
|
||||
return nullptr;
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"PyTorch was compiled without Python support. "
|
||||
"Cannot access Python interpreter from C++.");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
namespace c10::impl {
|
||||
|
||||
PyObjectSlot::PyObjectSlot() : pyobj_(nullptr) {}
|
||||
PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
|
||||
|
||||
PyObjectSlot::~PyObjectSlot() {
|
||||
maybe_destroy_pyobj();
|
||||
@ -10,9 +10,9 @@ PyObjectSlot::~PyObjectSlot() {
|
||||
|
||||
void PyObjectSlot::maybe_destroy_pyobj() {
|
||||
if (owns_pyobj()) {
|
||||
TORCH_INTERNAL_ASSERT(getGlobalPyInterpreter() != nullptr);
|
||||
TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
|
||||
TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
|
||||
(*getGlobalPyInterpreter())
|
||||
(*pyobj_interpreter_.load(std::memory_order_acquire))
|
||||
->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
|
||||
// NB: this destructor can only be entered when there are no
|
||||
// references to this C++ object (obviously), NOR any references
|
||||
@ -25,7 +25,7 @@ void PyObjectSlot::maybe_destroy_pyobj() {
|
||||
}
|
||||
|
||||
PyInterpreter* PyObjectSlot::pyobj_interpreter() {
|
||||
return getGlobalPyInterpreter();
|
||||
return pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
|
||||
@ -35,7 +35,7 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
|
||||
}
|
||||
|
||||
PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
|
||||
auto interpreter = getGlobalPyInterpreter();
|
||||
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
if (interpreter) {
|
||||
return *interpreter;
|
||||
}
|
||||
|
||||
@ -6,17 +6,10 @@
|
||||
#include <c10/util/python_stub.h>
|
||||
#include <optional>
|
||||
|
||||
#include <atomic>
|
||||
|
||||
namespace c10::impl {
|
||||
|
||||
// Function pointer type for getting the global interpreter
|
||||
using GetPyInterpreterFn = PyInterpreter* (*)();
|
||||
|
||||
// Global function pointer (set by csrc initialization)
|
||||
C10_API extern GetPyInterpreterFn g_get_pyinterpreter_fn;
|
||||
|
||||
// Helper function to get the global interpreter
|
||||
C10_API PyInterpreter* getGlobalPyInterpreter();
|
||||
|
||||
struct C10_API PyObjectSlot {
|
||||
public:
|
||||
PyObjectSlot();
|
||||
@ -33,6 +26,8 @@ struct C10_API PyObjectSlot {
|
||||
// NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
|
||||
// PyObject if necessary!
|
||||
void init_pyobj(PyObject* pyobj) {
|
||||
pyobj_interpreter_.store(
|
||||
getGlobalPyInterpreter(), std::memory_order_relaxed);
|
||||
pyobj_ = pyobj;
|
||||
}
|
||||
|
||||
@ -60,15 +55,18 @@ struct C10_API PyObjectSlot {
|
||||
|
||||
// @todo alban: I'm not too sure what's going on here, we can probably delete
|
||||
// it but it's worthwhile making sure
|
||||
std::optional<PyObject*> check_pyobj() const {
|
||||
impl::PyInterpreter* interpreter = getGlobalPyInterpreter();
|
||||
if (interpreter == nullptr || pyobj_ == nullptr) {
|
||||
std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
|
||||
impl::PyInterpreter* interpreter =
|
||||
pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
if (interpreter == nullptr) {
|
||||
return std::nullopt;
|
||||
}
|
||||
if (c10::impl::HermeticPyObjectTLS::get_state()) {
|
||||
|
||||
if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
|
||||
return std::nullopt;
|
||||
} else {
|
||||
return _unchecked_untagged_pyobj();
|
||||
}
|
||||
return _unchecked_untagged_pyobj();
|
||||
}
|
||||
|
||||
PyInterpreter& load_pyobj_interpreter() const;
|
||||
@ -78,6 +76,30 @@ struct C10_API PyObjectSlot {
|
||||
void set_owns_pyobj(bool b);
|
||||
|
||||
private:
|
||||
// This field contains the interpreter tag for this object. See
|
||||
// Note [Python interpreter tag] for general context
|
||||
//
|
||||
// Note [Memory ordering on Python interpreter tag]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// What memory_order do we need when accessing this atomic? We don't
|
||||
// need a single total modification order (as provided by
|
||||
// memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
|
||||
// transition from -1 to some positive integer and never changes afterwards.
|
||||
// Because there is only one modification, it trivially already has a total
|
||||
// modification order (e.g., we don't need fences or locked instructions on
|
||||
// x86)
|
||||
//
|
||||
// In fact, one could make a reasonable argument that relaxed reads are OK,
|
||||
// due to the presence of external locking (GIL) to ensure that interactions
|
||||
// with other data structures are still correctly synchronized, so that
|
||||
// we fall in the "Single-Location Data Structures" case as described in
|
||||
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
|
||||
// However, on x86, it doesn't matter if I use acquire or relaxed on the load
|
||||
// as I get the same assembly in both cases. So I just use the more
|
||||
// conservative acquire (which will impede compiler optimizations but I don't
|
||||
// care)
|
||||
std::atomic<PyInterpreter*> pyobj_interpreter_;
|
||||
|
||||
// This field contains a reference to a PyObject representing this Tensor.
|
||||
// If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
|
||||
// PyObject for it and set this field. This field does not have to be
|
||||
|
||||
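Note [Memory ordering on Python interpreter tag] above argues that the interpreter pointer is written once (nullptr to a live interpreter) and never changes, so acquire loads, or arguably even relaxed ones given the external locking of the GIL, are sufficient. A small self-contained sketch of that publish-once pattern with std::atomic (generic C++, not the actual PyObjectSlot code):

```cpp
#include <atomic>
#include <cstdio>

struct Interpreter {
  const char* name;
};

// Write-once "tag": it only ever transitions from nullptr to a live
// interpreter, mirroring pyobj_interpreter_ in the diff above.
std::atomic<Interpreter*> g_interpreter{nullptr};

void init_interpreter(Interpreter* interp) {
  // Relaxed store, as in init_pyobj(); the note above argues this is safe
  // because the pointer changes exactly once and external locking (the GIL)
  // orders interactions with other data structures.
  g_interpreter.store(interp, std::memory_order_relaxed);
}

const char* interpreter_name_or_default() {
  // Conservative acquire load, matching load_pyobj_interpreter()/check_pyobj().
  Interpreter* interp = g_interpreter.load(std::memory_order_acquire);
  return interp != nullptr ? interp->name : "<no python>";
}

int main() {
  std::printf("%s\n", interpreter_name_or_default());  // <no python>
  static Interpreter cpython{"cpython"};
  init_interpreter(&cpython);
  std::printf("%s\n", interpreter_name_or_default());  // cpython
}
```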
@ -18,9 +18,9 @@ cuda_supported_platforms = [
|
||||
|
||||
def define_c10_ovrsource(name, is_mobile):
|
||||
if is_mobile:
|
||||
pp_flags = ["-DC10_MOBILE=1"]
|
||||
pp_flags = ["-DC10_MOBILE=1", "-DC10_USE_GLOG"]
|
||||
else:
|
||||
pp_flags = []
|
||||
pp_flags = ["-DC10_USE_GLOG"]
|
||||
|
||||
oxx_static_library(
|
||||
name = name,
|
||||
|
||||
@ -316,7 +316,6 @@ set(GENERATED_CXX_PYTHON
|
||||
"${TORCH_SRC_DIR}/csrc/autograd/generated/python_special_functions.cpp"
|
||||
"${TORCH_SRC_DIR}/csrc/autograd/generated/python_return_types.cpp"
|
||||
"${TORCH_SRC_DIR}/csrc/autograd/generated/python_enum_tag.cpp"
|
||||
"${TORCH_SRC_DIR}/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp"
|
||||
)
|
||||
|
||||
set(GENERATED_H_PYTHON
|
||||
@ -380,9 +379,6 @@ add_custom_command(
|
||||
"${TORCH_ROOT}/aten/src/ATen/templates/LazyIr.h"
|
||||
"${TORCH_ROOT}/aten/src/ATen/templates/LazyNonNativeIr.h"
|
||||
"${TORCH_ROOT}/aten/src/ATen/templates/RegisterDispatchKey.cpp"
|
||||
"${TORCH_ROOT}/aten/src/ATen/templates/ViewMetaClasses.h"
|
||||
"${TORCH_ROOT}/aten/src/ATen/templates/ViewMetaClasses.cpp"
|
||||
"${TORCH_ROOT}/aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp"
|
||||
${autograd_python}
|
||||
${autograd_yaml}
|
||||
${autograd_templates}
|
||||
@ -544,11 +540,9 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
|
||||
${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
|
||||
)
|
||||
|
||||
if(USE_DISTRIBUTED)
|
||||
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
|
||||
if(NOT WIN32)
|
||||
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
|
||||
endif()
|
||||
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
|
||||
if(NOT WIN32)
|
||||
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@ -579,32 +573,30 @@ if(USE_CUDA)
|
||||
list(APPEND Caffe2_GPU_SRCS
|
||||
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
|
||||
endif()
|
||||
if(USE_DISTRIBUTED)
|
||||
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
|
||||
if(NOT WIN32)
|
||||
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
|
||||
set_source_files_properties(
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
|
||||
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
|
||||
)
|
||||
endif()
|
||||
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
|
||||
if(NOT WIN32)
|
||||
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
|
||||
set_source_files_properties(
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
|
||||
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
|
||||
)
|
||||
endif()
|
||||
|
||||
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
|
||||
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
|
||||
endif()
|
||||
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
|
||||
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
|
||||
endif()
|
||||
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
|
||||
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
|
||||
endif()
|
||||
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
|
||||
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
|
||||
endif()
|
||||
set_source_files_properties(
|
||||
${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
|
||||
@ -637,11 +629,9 @@ if(USE_ROCM)
list(APPEND Caffe2_HIP_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
if(USE_DISTRIBUTED)
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
endif()
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
endif()
# caffe2_nvrtc's stubs to driver APIs are useful for HIP.
# See NOTE [ ATen NVRTC Stub and HIP ]
@ -1362,12 +1352,10 @@ if(BUILD_TEST)
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
if(USE_DISTRIBUTED)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
endif()
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
endif()
if(NOT NO_API)
add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api)
@ -1472,46 +1460,40 @@ if(BUILD_LITE_INTERPRETER)
endif()
endif()

# Pass USE_DISTRIBUTED to torch_cpu, as some codes in jit/pickler.cpp and
# jit/unpickler.cpp need to be compiled only when USE_DISTRIBUTED is set
if(USE_DISTRIBUTED)
target_compile_definitions(torch_cpu PUBLIC USE_DISTRIBUTED)
if(USE_GLOO AND USE_C10D_GLOO)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
endif()
if(USE_UCC AND USE_C10D_UCC)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
if(USE_CUDA)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
endif()
endif()
if(USE_NCCL AND USE_C10D_NCCL)
if(USE_ROCM)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
endif()
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
endif()
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
endif()
# Pass USE_RPC in order to reduce use of
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
# need to be removed when RPC is supported
if(NOT WIN32)
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
endif()
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
# can only be compiled with USE_TENSORPIPE is set.
if(USE_TENSORPIPE)
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
endif()

if(NOT INTERN_BUILD_MOBILE)
@ -1134,7 +1134,7 @@ if(USE_CUDA AND CUDA_VERSION VERSION_LESS 13.0)
include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
endif()

if(USE_DISTRIBUTED AND USE_TENSORPIPE)
if(USE_TENSORPIPE)
if(MSVC)
message(WARNING "Tensorpipe cannot be used on Windows.")
else()

@ -193,13 +193,11 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}")
message(STATUS " USE_XNNPACK : ${USE_XNNPACK}")
message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}")
if(${USE_DISTRIBUTED})
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
endif()
if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}")
endif()
@ -3305,13 +3305,6 @@ def coverage_post_process(app, exception):
    if not isinstance(app.builder, CoverageBuilder):
        return

    if not torch.distributed.is_available():
        raise RuntimeError(
            "The coverage tool cannot run with a version "
            "of PyTorch that was built with USE_DISTRIBUTED=0 "
            "as this module's API changes."
        )

    # These are all the modules that have "automodule" in an rst file
    # These modules are the ones for which coverage is checked
    # Here, we make sure that no module is missing from that list

@ -1093,9 +1093,6 @@ The set of leaf modules can be customized by overriding
```{eval-rst}
.. autofunction:: torch.fx.replace_pattern
```
```{eval-rst}
.. autofunction:: torch.fx.traceback.annotate
```

<!-- The experimental and passes submodules are missing docs. -->
<!-- Adding it here for coverage but this doesn't add anything to the -->
@ -156,7 +156,6 @@ def get_generate_code_bin_outs():
|
||||
"autograd/generated/python_torch_functions_1.cpp": ["autograd/generated/python_torch_functions_1.cpp"],
|
||||
"autograd/generated/python_torch_functions_2.cpp": ["autograd/generated/python_torch_functions_2.cpp"],
|
||||
"autograd/generated/python_variable_methods.cpp": ["autograd/generated/python_variable_methods.cpp"],
|
||||
"functionalization/generated/ViewMetaClassesPythonBinding.cpp": ["functionalization/generated/ViewMetaClassesPythonBinding.cpp"],
|
||||
})
|
||||
return outs
|
||||
|
||||
|
||||
13
setup.py
@ -1704,18 +1704,7 @@ def main() -> None:
    package_data = {
        "torch": torch_package_data,
    }
    # some win libraries are excluded
    # these are statically linked
    exclude_windows_libs = [
        "lib/dnnl.lib",
        "lib/kineto.lib",
        "lib/libprotobuf-lite.lib",
        "lib/libprotobuf.lib",
        "lib/libprotoc.lib",
    ]
    exclude_package_data = {
        "torch": exclude_windows_libs,
    }
    exclude_package_data = {}

    if not BUILD_LIBTORCH_WHL:
        package_data["torchgen"] = torchgen_package_data
@ -1,4 +1,4 @@
if(USE_DISTRIBUTED AND NOT WIN32)
if(NOT WIN32)
set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd")
set(DIST_AUTOGRAD_TEST_SOURCES
${TORCH_ROOT}/test/cpp/common/main.cpp

@ -1,7 +1,9 @@
if(WIN32)
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/${CMAKE_IMPORT_LIBRARY_PREFIX}torch_python${CMAKE_IMPORT_LIBRARY_SUFFIX}")
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/torch_python.lib")
elseif(APPLE)
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/libtorch_python.dylib")
else()
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}torch_python${CMAKE_SHARED_LIBRARY_SUFFIX}")
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/libtorch_python.so")
endif()

add_library(torch_python SHARED IMPORTED)
@ -143,19 +143,6 @@ class FlightRecorderEventTest(TestCase):
|
||||
match_one_event(e11, e12, membership, "0").state,
|
||||
MatchState.FULLY_MATCHED,
|
||||
)
|
||||
e13 = create_one_event(
|
||||
"gather",
|
||||
("0", "default"),
|
||||
[[4, 4]],
|
||||
[[4, 4]],
|
||||
"completed",
|
||||
1,
|
||||
output_dtypes="",
|
||||
)
|
||||
self.assertEqual(
|
||||
match_one_event(e11, e13, membership, "0").state,
|
||||
MatchState.FULLY_MATCHED,
|
||||
)
|
||||
|
||||
def test_all_events(self):
|
||||
for collective in sorted(COLLECTIVES):
|
||||
|
||||
@ -202,62 +202,6 @@ class ScheduleTest(TestCase):
|
||||
|
||||
torch.distributed.destroy_process_group()
|
||||
|
||||
@parametrize(
|
||||
"ScheduleClass",
|
||||
[
|
||||
Schedule1F1B,
|
||||
ScheduleGPipe,
|
||||
ScheduleInterleaved1F1B,
|
||||
ScheduleInterleavedZeroBubble,
|
||||
ScheduleLoopedBFS,
|
||||
],
|
||||
)
|
||||
def test_schedule_eval_then_train(self, ScheduleClass):
|
||||
"""
|
||||
Test that simply runs evaluation followed by training.
|
||||
"""
|
||||
store = FakeStore()
|
||||
torch.distributed.init_process_group(
|
||||
backend="fake", rank=0, world_size=1, store=store
|
||||
)
|
||||
d_hid, batch_size = 512, 256
|
||||
n_stages = 1
|
||||
device = "cpu"
|
||||
full_mod = MultiMLP(d_hid, n_layers=n_stages)
|
||||
full_mod.to(device)
|
||||
|
||||
x = torch.randn(batch_size, d_hid, device=device)
|
||||
target = torch.randn(batch_size, d_hid, device=device)
|
||||
|
||||
def loss_fn(y, target):
|
||||
return torch.nn.functional.cross_entropy(y, target)
|
||||
|
||||
submod_name = "layers.0"
|
||||
stage_module = full_mod.get_submodule(submod_name)
|
||||
|
||||
# Create a pipeline stage to wrap that submodule
|
||||
num_microbatches = 2
|
||||
stages = [PipelineStage(stage_module, 0, n_stages, device)]
|
||||
|
||||
if issubclass(ScheduleClass, PipelineScheduleSingle):
|
||||
stages = stages[0]
|
||||
|
||||
# Attach to a schedule
|
||||
schedule = ScheduleClass(stages, num_microbatches, loss_fn=loss_fn)
|
||||
# Run eval
|
||||
for _ in range(2):
|
||||
# Zero gradients
|
||||
stage_module.zero_grad()
|
||||
losses = []
|
||||
schedule.eval(x, target=target, losses=losses)
|
||||
# Run training
|
||||
try:
|
||||
for _ in range(2):
|
||||
losses = []
|
||||
schedule.step(x, target=target, losses=losses)
|
||||
finally:
|
||||
torch.distributed.destroy_process_group()
|
||||
|
||||
def test_zero_bubble_schedule_errors_with_compile(self):
|
||||
"""
|
||||
Test that zero bubble schedules raise an error when used with torch.compile.
|
||||
|
||||
@ -352,7 +352,7 @@ class MicroPipelineTPTest(TestCase):
|
||||
@parametrize("scatter_dim", [0, 1, 2])
|
||||
@fresh_cache()
|
||||
def test_fuse_scaled_matmul_reduce_scatter(self, A_dims, scatter_dim):
|
||||
if scatter_dim >= A_dims - 1:
|
||||
if scatter_dim >= A_dims:
|
||||
return
|
||||
|
||||
group = dist.group.WORLD
|
||||
@ -402,7 +402,7 @@ class MicroPipelineTPTest(TestCase):
|
||||
|
||||
@runOnRocmArch(MI300_ARCH)
|
||||
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
|
||||
@parametrize("scatter_dim", [0, 1])
|
||||
@parametrize("scatter_dim", [0, 1, 2])
|
||||
@fresh_cache()
|
||||
def test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape(
|
||||
self, scatter_dim
|
||||
|
||||
41
test/distributed/tensor/test_fake.py
Normal file
@ -0,0 +1,41 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
# Owner(s): ["oncall: distributed"]

import torch
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.distributed.tensor import DTensor
from torch.distributed.tensor.placement_types import Shard
from torch.testing._internal.common_utils import run_tests, TestCase
from torch.testing._internal.distributed.fake_pg import FakeStore


class TestFakeDTensor(TestCase):
    def test_fake_dtensor_operations(self):
        # Use FakeTensorMode to handle CUDA tensors without actual CUDA
        fake_mode = FakeTensorMode()
        world_size = 4

        fake_store = FakeStore()
        torch.distributed.init_process_group(
            "fake", store=fake_store, rank=0, world_size=world_size
        )
        device_mesh = torch.distributed.device_mesh.init_device_mesh(
            "cuda",
            (2, world_size // 2),
        )

        # Create fake CUDA tensor using FakeTensorMode
        with fake_mode:
            x = torch.randn(1, 1, device="cuda")
            x = DTensor.from_local(x, device_mesh, [Shard(0), Shard(1)])

        # Test basic DTensor operations
        self.assertIsInstance(x, DTensor)

        # Test sum operation
        r = x.sum(1)
        self.assertIsInstance(r, DTensor)


if __name__ == "__main__":
    run_tests()
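A rough usage sketch of the pattern the new test relies on (assumptions: single process, CPU mesh, no FakeTensorMode; the "fake" backend and FakeStore are the same internal test helpers the file imports):

```python
import torch
from torch.testing._internal.distributed.fake_pg import FakeStore

# The "fake" process-group backend lets one process exercise
# torch.distributed APIs without spawning workers or needing GPUs.
store = FakeStore()
torch.distributed.init_process_group("fake", store=store, rank=0, world_size=4)
mesh = torch.distributed.device_mesh.init_device_mesh("cpu", (2, 2))
print(mesh)
torch.distributed.destroy_process_group()
```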
@ -880,34 +880,6 @@ class DistMathOpsTest(DTensorTestBase):
        out_full = out_dt.full_tensor()
        self.assertEqual(global_bins, out_full)

    @with_comms
    def test_logsumexp(self):
        mesh = self.build_device_mesh()
        comm_mode = CommDebugMode()
        inp = torch.rand(3, 5, device=self.device_type)

        shard_dim = 0
        input_dtensor = distribute_tensor(
            inp, device_mesh=mesh, placements=[Shard(shard_dim)]
        )

        logsumexp_dims = [0, 1]
        for dim in logsumexp_dims:
            output = torch.logsumexp(inp, dim=dim)
            with comm_mode:
                output_dtensor = torch.logsumexp(input_dtensor, dim=dim)
            if dim == shard_dim:
                self.assertEqual(comm_mode.get_total_counts(), 1)
                self.assertEqual(
                    comm_mode.get_comm_counts()[funcol.all_gather_into_tensor],
                    1,
                )
                self.assertTrue(output_dtensor.placements[0].is_replicate())
            else:
                self.assertEqual(comm_mode.get_total_counts(), 0)
                self.assertTrue(output_dtensor.placements[0].is_shard(shard_dim))
            self.assertEqual(output_dtensor.full_tensor(), output)


if __name__ == "__main__":
    run_tests()
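As a side note (plain tensors only, not part of the change): reducing over the sharded dimension cannot stay local because each shard sees only part of the reduction; the per-shard results must still be combined, for example with another logsumexp.

```python
import torch

a, b = torch.rand(2, 5), torch.rand(1, 5)      # two "shards" along dim 0
full = torch.cat([a, b], dim=0)
partial = torch.stack([torch.logsumexp(a, 0), torch.logsumexp(b, 0)])
# logsumexp over the full dim equals logsumexp of the per-shard partial results
torch.testing.assert_close(torch.logsumexp(full, 0), torch.logsumexp(partial, 0))
```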
@ -505,7 +505,7 @@ class AsyncTPTest(MultiProcContinuousTest):
|
||||
not PLATFORM_SUPPORTS_SYMM_MEM, "SymmMem is not supported on this ROCm arch"
|
||||
)
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@parametrize("scatter_dim", [0, 1, 2])
|
||||
@parametrize("scatter_dim", [0, 1])
|
||||
def test_fused_matmul_reduce_scatter(self, scatter_dim: int) -> None:
|
||||
self._init_process()
|
||||
|
||||
|
||||
@ -519,7 +519,11 @@ class AOTAutogradCacheTests(InductorTestCase):
|
||||
@functorch_config.patch(
|
||||
{"enable_autograd_cache": True, "view_replay_for_aliased_outputs": True}
|
||||
)
|
||||
def test_view_replay(self):
|
||||
def test_view_replay_bypass(self):
|
||||
"""
|
||||
Should bypass when view replay is turned on
|
||||
"""
|
||||
|
||||
def fn(a):
|
||||
tmp = a.detach()
|
||||
a.mul_(2)
|
||||
@ -527,25 +531,10 @@ class AOTAutogradCacheTests(InductorTestCase):
|
||||
|
||||
with torch.autograd._force_original_view_tracking(True):
|
||||
compiled_fn = torch.compile(fn)
|
||||
compiled_fn(torch.rand(2, 3))
|
||||
|
||||
def run_and_check(miss, hit, bypass):
|
||||
self._clear_dynamo_and_codecache()
|
||||
|
||||
inp = torch.rand(2, 3)
|
||||
compiled_inp = inp.clone().detach()
|
||||
|
||||
with torch.autograd._force_original_view_tracking(True):
|
||||
out = fn(inp)
|
||||
compiled_out = compiled_fn(compiled_inp)
|
||||
|
||||
self.assertEqual(out, compiled_out)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], miss)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], hit)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_bypass"], bypass)
|
||||
|
||||
run_and_check(miss=1, hit=0, bypass=0)
|
||||
run_and_check(miss=1, hit=1, bypass=0)
|
||||
run_and_check(miss=1, hit=2, bypass=0)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_bypass"], 1)
|
||||
|
||||
@inductor_config.patch("fx_graph_remote_cache", False)
|
||||
@inductor_config.patch("fx_graph_cache", True)
|
||||
|
||||
@ -21,7 +21,6 @@ from unittest.mock import MagicMock, patch
|
||||
|
||||
import torch
|
||||
import torch._dynamo as torchdynamo
|
||||
import torch.fx.traceback as fx_traceback
|
||||
import torch.nn.functional as F
|
||||
import torch.utils._pytree as pytree
|
||||
from functorch.experimental.control_flow import cond, map
|
||||
@ -62,10 +61,7 @@ from torch.export.passes import move_to_device_pass
|
||||
from torch.fx.experimental.proxy_tensor import make_fx
|
||||
from torch.fx.experimental.symbolic_shapes import ShapeEnv
|
||||
from torch.testing import FileCheck
|
||||
from torch.testing._internal.common_cuda import (
|
||||
PLATFORM_SUPPORTS_FLASH_ATTENTION,
|
||||
xfailIfDistributedNotSupported,
|
||||
)
|
||||
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION
|
||||
from torch.testing._internal.common_utils import (
|
||||
find_library_location,
|
||||
IS_FBCODE,
|
||||
@ -15072,39 +15068,6 @@ def forward(self, x):
|
||||
test_serdes=True,
|
||||
)
|
||||
|
||||
# TODO: following tests should be fixed
|
||||
@testing.expectedFailureTrainingIRToRunDecomp
|
||||
@testing.expectedFailureTrainingIRToRunDecompNonStrict
|
||||
def test_preserve_annotation(self):
|
||||
class M(torch.nn.Module):
|
||||
def forward(self, x):
|
||||
with fx_traceback.annotate({"pp_stage": 0}):
|
||||
with fx_traceback.annotate({"fdsp_bucket": 0}):
|
||||
x = x + 1
|
||||
x = x - 2
|
||||
with fx_traceback.annotate({"cuda_stream": 2, "fsdp_bucket": 1}):
|
||||
x = x * 2
|
||||
x = x / 3
|
||||
return x
|
||||
|
||||
m = M()
|
||||
|
||||
with fx_traceback.preserve_node_meta():
|
||||
ep = export(m, (torch.randn(10),))
|
||||
|
||||
for node in ep.graph.nodes:
|
||||
if node.target == torch.ops.aten.add.default:
|
||||
self.assertTrue(node.meta["custom"], {"pp_stage": 0, "fdsp_bucket": 0})
|
||||
if node.target == torch.ops.aten.sub.default:
|
||||
self.assertTrue(node.meta["custom"], {"pp_stage": 0})
|
||||
if node.target == torch.ops.aten.mul.default:
|
||||
self.assertTrue(
|
||||
node.meta["custom"],
|
||||
{"pp_stage": 0, "cuda_stream": 2, "fsdp_bucket": 1},
|
||||
)
|
||||
if node.target == torch.ops.aten.div.default:
|
||||
self.assertTrue(node.meta["custom"], {})
|
||||
|
||||
def test_dynamic_shapes_serdes_generic(self):
|
||||
from torch._export.serde.dynamic_shapes import (
|
||||
_dump_dynamic_shapes,
|
||||
@ -15824,7 +15787,6 @@ class GraphModule(torch.nn.Module):
|
||||
finally:
|
||||
torch.distributed.destroy_process_group()
|
||||
|
||||
@xfailIfDistributedNotSupported
|
||||
def test_distributed_all_reduce(self):
|
||||
class Foo(torch.nn.Module):
|
||||
def __init__(self):
|
||||
@ -15842,7 +15804,6 @@ class GraphModule(torch.nn.Module):
|
||||
inp = (torch.randn(4, 4),)
|
||||
self.assertTrue(torch.allclose(ep.module()(*inp), m(*inp)))
|
||||
|
||||
@xfailIfDistributedNotSupported
|
||||
def test_distributed_all_gather(self):
|
||||
class Foo(torch.nn.Module):
|
||||
def forward(self, x):
|
||||
@ -15858,7 +15819,6 @@ class GraphModule(torch.nn.Module):
|
||||
torch.allclose(a, b) for a, b in zip(ep.module()(*inp), m(*inp))
|
||||
)
|
||||
|
||||
@xfailIfDistributedNotSupported
|
||||
def test_distributed_all_gather_into_tensor(self):
|
||||
class Foo(torch.nn.Module):
|
||||
def forward(self, x):
|
||||
@ -15872,7 +15832,6 @@ class GraphModule(torch.nn.Module):
|
||||
inp = (torch.randn(2),)
|
||||
self.assertTrue(torch.allclose(ep.module()(*inp), m(*inp)))
|
||||
|
||||
@xfailIfDistributedNotSupported
|
||||
@testing.expectedFailureCppRuntime
|
||||
def test_distributed_all_to_all_single(self):
|
||||
class Foo(torch.nn.Module):
|
||||
@ -15890,7 +15849,6 @@ class GraphModule(torch.nn.Module):
|
||||
)
|
||||
self.assertEqual(len(nodes), 1)
|
||||
|
||||
@xfailIfDistributedNotSupported
|
||||
@testing.expectedFailureCppRuntime
|
||||
def test_distributed_reduce_scatter_tensor(self):
|
||||
class Foo(torch.nn.Module):
|
||||
|
||||
@ -8500,6 +8500,7 @@ class TestAOTAutogradWithCache(TestAOTAutogradWithDynamo):
|
||||
{
|
||||
"enable_autograd_cache": True,
|
||||
"strict_autograd_cache": True,
|
||||
"view_replay_for_aliased_outputs": False,
|
||||
}
|
||||
)
|
||||
@torch._inductor.config.patch("fx_graph_cache", True)
|
||||
|
||||
@ -20,7 +20,11 @@ from torch._inductor import config
|
||||
from torch._inductor.codegen.cpp import CppScheduling
|
||||
from torch._inductor.codegen.triton import TritonScheduling
|
||||
from torch._inductor.codegen.wrapper import PythonWrapperCodegen
|
||||
from torch._inductor.codegen.wrapper_fxir import FxConverter, WrapperFxCodegen
|
||||
from torch._inductor.codegen.wrapper_fxir import (
|
||||
FxConverter,
|
||||
replace_floor_div,
|
||||
WrapperFxCodegen,
|
||||
)
|
||||
from torch._inductor.test_case import TestCase as InductorTestCase
|
||||
from torch.export import Dim
|
||||
from torch.testing._internal.common_utils import (
|
||||
@ -34,6 +38,7 @@ from torch.testing._internal.inductor_utils import (
|
||||
requires_gpu,
|
||||
TRITON_HAS_CPU,
|
||||
)
|
||||
from torch.utils._sympy.functions import FloorDiv
|
||||
|
||||
|
||||
if HAS_GPU:
|
||||
@ -483,10 +488,11 @@ class FxirTestCase(InductorTestCase):
|
||||
)
|
||||
self.assertIn("ks0", triton_node.kwargs["kwargs"])
|
||||
|
||||
def test_dynamic_launch_grid_calc_python(self):
|
||||
def test_dynamic_launch_grid_calc(self):
|
||||
"""
|
||||
Test the dyanmic launch grid calculation for Triton kernel wrapper using python mode
|
||||
Test the dyanmic launch grid calculation.
|
||||
"""
|
||||
|
||||
func = torch.add
|
||||
args = [torch.randn(shape, device=self.device) for shape in [(7, 12), (7, 1)]]
|
||||
(gm,) = self._compile_and_check(func, args, compile_kwargs={"dynamic": True})
|
||||
@ -505,41 +511,6 @@ class FxirTestCase(InductorTestCase):
|
||||
self.assertEqual(grid[1], 1)
|
||||
self.assertEqual(grid[2], 1)
|
||||
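The launch-grid expression these tests check is plain ceiling division of the element count by the block size; a minimal standalone check (the values here are made up):

```python
xnumel, XBLOCK = 84, 16
grid0 = (xnumel + XBLOCK - 1) // XBLOCK   # ceil(84 / 16)
assert grid0 == -(-xnumel // XBLOCK) == 6
```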
|
||||
def test_dynamic_launch_grid_calc_python_slow(self):
|
||||
"""
|
||||
Test the dyanmic launch grid calculation for Triton kernel wrapper using python_slow mode
|
||||
"""
|
||||
from torch._inductor.runtime.triton_heuristics import GridExpr
|
||||
|
||||
# Mock GridExpr.from_meta to use "python_slow" mode explicitly
|
||||
original_from_meta = GridExpr.from_meta
|
||||
|
||||
def mocked_from_meta(inductor_meta, cfg, mode="python"):
|
||||
return original_from_meta(inductor_meta, cfg, mode="python_slow")
|
||||
|
||||
with unittest.mock.patch.object(GridExpr, "from_meta", mocked_from_meta):
|
||||
func = torch.add
|
||||
args = [
|
||||
torch.randn(shape, device=self.device) for shape in [(7, 12), (7, 1)]
|
||||
]
|
||||
(gm,) = self._compile_and_check(
|
||||
func, args, compile_kwargs={"dynamic": True}
|
||||
)
|
||||
|
||||
# Check for the precomputed size arg.
|
||||
(triton_node,) = gm.graph.find_nodes(
|
||||
op="call_function", target=triton_kernel_wrapper_mutation
|
||||
)
|
||||
self.assertIn("grid", triton_node.kwargs)
|
||||
self.assertIn("xnumel", triton_node.kwargs["kwargs"])
|
||||
self.assertIn("XBLOCK", triton_node.kwargs["kwargs"])
|
||||
grid = triton_node.kwargs["grid"][0]
|
||||
xnumel = triton_node.kwargs["kwargs"]["xnumel"].meta["val"]
|
||||
xblock = triton_node.kwargs["kwargs"]["XBLOCK"]
|
||||
self.assertEqual(grid[0].meta["val"], ((xnumel + xblock - 1) // xblock))
|
||||
self.assertEqual(grid[1], 1)
|
||||
self.assertEqual(grid[2], 1)
|
||||
|
||||
@config.patch({"trace.enabled": True})
|
||||
@unittest.mock.patch("torch._inductor.debug.DebugFormatter.output_code")
|
||||
def test_debug(self, mock_output_code):
|
||||
@ -990,6 +961,29 @@ def forward(self, arg0_1, arg1_1, arg2_1):
|
||||
return [buf1, buf2]""", # noqa: B950
|
||||
)
|
||||
|
||||
def test_dims_dynamic_outer_static_padded_inner(self):
|
||||
"""
|
||||
Test padding on inner dimensions, with dynamic outer dimensions.
|
||||
"""
|
||||
|
||||
class M(torch.nn.Module):
|
||||
def forward(self, x, y):
|
||||
return x + y
|
||||
|
||||
def get_input_padded_inner(shape):
|
||||
full_shape = shape[:-1] + (shape[-1] * 2,)
|
||||
full = torch.randn(full_shape, dtype=torch.float32, device=self.device)
|
||||
view = torch.as_strided(full, shape, full.stride())
|
||||
return view
|
||||
|
||||
shape = (4, 4, 4)
|
||||
args = tuple(get_input_padded_inner(shape) for _ in range(2))
|
||||
self.check(
|
||||
M(),
|
||||
args,
|
||||
dynamic_shapes=({0: Dim.DYNAMIC, 1: Dim.DYNAMIC, 2: Dim.STATIC},) * 2,
|
||||
)
|
||||
|
||||
@parametrize("length", (4, 8))
|
||||
def test_cond_dynamic_shape_pred_scalar_closure(self, length: int):
|
||||
"""
|
||||
@ -1033,6 +1027,132 @@ def forward(self, arg0_1, arg1_1, arg2_1):
|
||||
self.check(M(), (x,), dynamic_shapes=({0: Dim.DYNAMIC},))
|
||||
|
||||
|
||||
class TestReplaceFloorDiv(InductorTestCase):
|
||||
"""
|
||||
Tests for floor -> FloorDiv conversion.
|
||||
"""
|
||||
|
||||
def _check(self, expr: sympy.Expr) -> sympy.Expr:
|
||||
# Check that we started with floor's.
|
||||
num_floors = expr.count(sympy.floor)
|
||||
self.assertGreater(num_floors, 0)
|
||||
|
||||
replaced = replace_floor_div(expr)
|
||||
|
||||
# Check that all floor's were replaced.
|
||||
# We shoud have no more new FloorDiv's than floor's in the original expression,
|
||||
# although we can have less due to simplification.
|
||||
self.assertEqual(replaced.count(sympy.floor), 0)
|
||||
self.assertLessEqual(
|
||||
replaced.count(FloorDiv) - expr.count(FloorDiv), num_floors
|
||||
)
|
||||
|
||||
def expand_floor_div(
|
||||
numerator: sympy.Expr, denominator: sympy.Expr
|
||||
) -> sympy.Expr:
|
||||
return sympy.floor(numerator / denominator)
|
||||
|
||||
# Expand FloorDiv back into floor and check for equality.
|
||||
self.assertEqual(
|
||||
*[
|
||||
sympy.simplify(e.replace(FloorDiv, expand_floor_div))
|
||||
for e in (replaced, expr)
|
||||
]
|
||||
)
|
||||
|
||||
return replaced
|
||||
|
||||
def test_rewrite_floor_div_mul_pow(self):
|
||||
x, y = sympy.symbols("x y")
|
||||
expr = sympy.floor(x / y)
|
||||
self.assertEqual(expr.count(FloorDiv), 0)
|
||||
self.assertEqual(expr.count(sympy.core.mul.Mul), 1)
|
||||
self.assertEqual(expr.count(sympy.Pow), 1)
|
||||
|
||||
rewritten = self._check(expr)
|
||||
self.assertTrue(isinstance(rewritten, FloorDiv))
|
||||
self.assertEqual(rewritten.args, (x, y))
|
||||
|
||||
def test_rewrite_floor_div_mul_rational(self):
|
||||
x = sympy.Symbol("x")
|
||||
expr = sympy.floor(x / 5)
|
||||
self.assertEqual(expr.count(FloorDiv), 0)
|
||||
self.assertEqual(expr.count(sympy.core.mul.Mul), 1)
|
||||
self.assertEqual(expr.count(sympy.Rational), 1)
|
||||
|
||||
rewritten = self._check(expr)
|
||||
self.assertTrue(isinstance(rewritten, FloorDiv))
|
||||
self.assertEqual(rewritten.args, (x, 5))
|
||||
|
||||
def test_no_rewrite_div(self):
|
||||
x, y = sympy.symbols("x y")
|
||||
expr = x / y
|
||||
self.assertEqual(expr.count(FloorDiv), 0)
|
||||
|
||||
rewritten = replace_floor_div(expr)
|
||||
self.assertEqual(rewritten, expr)
|
||||
|
||||
def test_rewrite_floor_div_nested(self):
|
||||
x, y = sympy.symbols("x y")
|
||||
expr = sympy.floor((sympy.floor(x / 5) + 1) / y)
|
||||
self.assertEqual(expr.count(FloorDiv), 0)
|
||||
|
||||
rewritten = self._check(expr)
|
||||
self.assertEqual(rewritten.count(FloorDiv), 2)
|
||||
|
||||
def test_rewrite_floor_div_rational_const(self):
|
||||
expr = sympy.floor(sympy.S.One / 5, evaluate=False)
|
||||
self.assertEqual(expr.count(FloorDiv), 0)
|
||||
self.assertEqual(expr.count(sympy.Mul), 0)
|
||||
self.assertEqual(expr.count(sympy.Rational), 1)
|
||||
|
||||
# Expression evaluates to a compile time constant
|
||||
rewritten = self._check(expr)
|
||||
self.assertEqual(rewritten, sympy.S.Zero)
|
||||
|
||||
def test_no_distribute_mul_floordiv(self):
|
||||
"""
|
||||
Test that multiplication doesn't distribute with floor division.
|
||||
"""
|
||||
x = sympy.Symbol("x")
|
||||
expr = 2 * sympy.floor(x / 2)
|
||||
rewritten = self._check(expr)
|
||||
self.assertEqual(rewritten.count(sympy.Mul), 1)
|
||||
self.assertEqual(rewritten.count(FloorDiv), 1)
|
||||
|
||||
def test_rational_multi_pows(self):
|
||||
"""
|
||||
Test an expression with a rational and multiple pows.
|
||||
"""
|
||||
x, y, z = sympy.symbols("x y z")
|
||||
expr = sympy.floor((x / 5) * (y**2) * (z**3))
|
||||
mul = expr.args[0]
|
||||
self.assertTrue(isinstance(mul, sympy.Mul))
|
||||
self.assertTrue(isinstance(mul.args[0], sympy.Rational))
|
||||
self.assertEqual(expr.count(sympy.Pow), 2)
|
||||
rewritten = self._check(expr)
|
||||
self.assertEqual(rewritten.count(FloorDiv), 1)
|
||||
|
||||
def test_variable_exp(self):
|
||||
"""
|
||||
Test pow when the exponent is a variable.
|
||||
"""
|
||||
x = sympy.Symbol("x", positive=True)
|
||||
expr = sympy.floor(2**-x)
|
||||
replaced = self._check(expr)
|
||||
|
||||
# Check that x went to the denominator.
|
||||
self.assertEqual(replaced.args, (1, 2**x))
|
||||
|
||||
def test_launch_grid_dynamic_padding(self):
|
||||
"""
|
||||
Test a complex launch grid expression arising from padding with dynamic shapes.
|
||||
"""
|
||||
x, y = sympy.symbols("x y")
|
||||
expr = sympy.floor(-FloorDiv(x * y, 2) / FloorDiv(-x * y, 131070))
|
||||
self._check(expr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from torch._inductor.test_case import run_tests
|
||||
|
||||
|
||||
@ -3238,40 +3238,6 @@ aten::mm""",
|
||||
assert "Overload Name" in key_averages.table()
|
||||
validate_json(prof)
|
||||
|
||||
def test_expose_kineto_event_metadata(self):
|
||||
def check_metadata(prof, op_name, metadata_key):
|
||||
with TemporaryFileName(mode="w+") as fname:
|
||||
prof.export_chrome_trace(fname)
|
||||
with open(fname) as f:
|
||||
events = json.load(f)["traceEvents"]
|
||||
found_op = False
|
||||
for e in events:
|
||||
if "name" in e and "args" in e and e["name"] == op_name:
|
||||
assert metadata_key in e["args"], (
|
||||
f"Metadata for '{op_name}' in Chrome trace did not contain '{metadata_key}'."
|
||||
)
|
||||
found_op = True
|
||||
assert found_op, f"Could not find op '{op_name}' in Chrome trace."
|
||||
found_op = False
|
||||
for event in prof.events():
|
||||
if event.name == op_name:
|
||||
assert metadata_key in event.metadata_json, (
|
||||
f"Metadata for '{op_name}' in FunctionEvent did not contain '{metadata_key}'."
|
||||
)
|
||||
found_op = True
|
||||
assert found_op, f"Could not find op '{op_name}' in prof.events()."
|
||||
|
||||
experimental_config = torch._C._profiler._ExperimentalConfig(
|
||||
expose_kineto_event_metadata=True
|
||||
)
|
||||
with profile(
|
||||
experimental_config=experimental_config,
|
||||
activities=[ProfilerActivity.CPU],
|
||||
) as prof:
|
||||
torch.add(1, 5)
|
||||
|
||||
check_metadata(prof, op_name="aten::add", metadata_key="Ev Idx")
|
||||
|
||||
@unittest.skipIf(not torch.cuda.is_available(), "requries CUDA")
|
||||
def test_profiler_debug_autotuner(self):
|
||||
"""
|
||||
|
||||
@ -7,7 +7,7 @@ import sys
|
||||
from dataclasses import dataclass
|
||||
from multiprocessing.context import SpawnProcess
|
||||
from typing import Any, Optional
|
||||
from unittest import skipUnless
|
||||
from unittest import skipIf, skipUnless
|
||||
from unittest.mock import mock_open, patch
|
||||
|
||||
import torch
|
||||
@ -22,7 +22,7 @@ from torch.numa.binding import (
|
||||
AffinityMode,
|
||||
NumaOptions,
|
||||
)
|
||||
from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
from torch.testing._internal.common_utils import IS_MACOS, run_tests, TestCase
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@ -680,6 +680,7 @@ class NumaBindingTest(TestCase):
|
||||
set(range(0, 2)),
|
||||
)
|
||||
|
||||
@skipIf(IS_MACOS, "sched_getaffinity doesn't exist")
|
||||
def test_binds_to_node_0_if_node_stored_as_minus_one(self) -> None:
|
||||
self._add_mock_hardware(
|
||||
num_sockets=1,
|
||||
|
||||
@ -2488,9 +2488,9 @@ class TestSparseCSR(TestCase):
|
||||
self.assertEqual(a.grad, a1.grad)
|
||||
self.assertEqual(b.grad, b1.grad)
|
||||
|
||||
@skipCUDAIfRocm
|
||||
@onlyCUDA
|
||||
@skipCUDAIf(True, "Causes CUDA memory exception, see https://github.com/pytorch/pytorch/issues/72177")
|
||||
# It works on ROCm and CUDA issue is currently active
|
||||
@skipCUDAIf(not TEST_WITH_ROCM, "Causes CUDA memory exception, see https://github.com/pytorch/pytorch/issues/72177")
|
||||
@dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128)
|
||||
@precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3,
|
||||
torch.float64: 1e-8, torch.complex128: 1e-8})
|
||||
|
||||
@ -88,8 +88,7 @@ def build_pytorch(
|
||||
) -> None:
|
||||
my_env = _create_build_env()
|
||||
if (
|
||||
not check_negative_env_flag("USE_DISTRIBUTED")
|
||||
and not check_negative_env_flag("USE_CUDA")
|
||||
not check_negative_env_flag("USE_CUDA")
|
||||
and not check_negative_env_flag("USE_NCCL")
|
||||
and not check_env_flag("USE_SYSTEM_NCCL")
|
||||
):
|
||||
|
||||
@ -469,30 +469,6 @@ class Op:
|
||||
f"{p2p_info}, " if p2p_info else ""
|
||||
)
|
||||
|
||||
def dtype_mismatch(self, other: "Op") -> bool:
|
||||
if (
|
||||
(
|
||||
self.type not in ["scatter", "gather", "broadcast"]
|
||||
and set(self.input_dtypes) != set(self.output_dtypes)
|
||||
and self.input_sizes[0]
|
||||
and self.output_sizes[0]
|
||||
)
|
||||
or (
|
||||
self.type not in ["scatter", "broadcast"]
|
||||
and set(self.input_dtypes) != set(other.input_dtypes)
|
||||
and self.input_sizes[0]
|
||||
and other.input_sizes[0]
|
||||
)
|
||||
or (
|
||||
self.type not in ["gather"]
|
||||
and set(self.output_dtypes) != set(other.output_dtypes)
|
||||
and self.output_sizes[0]
|
||||
and other.output_sizes[0]
|
||||
)
|
||||
):
|
||||
return True
|
||||
return False
|
||||
|
||||
def match(self, other: "Op") -> MatchInfo:
|
||||
# TODO: I think this can validly not match,
|
||||
# e.g. if one PG was used for p2p ops between only some of the peers?
|
||||
@ -534,7 +510,23 @@ class Op:
|
||||
MatchState.COLLECTIVE_STATE_MISMATCH,
|
||||
f"Expected state: '{self.state}' does not match found state: '{other.state}'",
|
||||
)
|
||||
if self.dtype_mismatch(other):
|
||||
if (
|
||||
(
|
||||
set(self.input_dtypes) != set(self.output_dtypes)
|
||||
and self.input_sizes[0]
|
||||
and self.output_sizes[0]
|
||||
)
|
||||
or (
|
||||
set(self.input_dtypes) != set(other.input_dtypes)
|
||||
and self.input_sizes[0]
|
||||
and other.input_sizes[0]
|
||||
)
|
||||
or (
|
||||
set(self.input_dtypes) != set(other.output_dtypes)
|
||||
and self.input_sizes[0]
|
||||
and other.output_sizes[0]
|
||||
)
|
||||
):
|
||||
return MatchInfo(
|
||||
MatchState.COLLECTIVE_DTYPE_MISMATCH,
|
||||
f"Expected dtypes: '{set(self.input_dtypes)}' does not "
|
||||
|
||||
@ -189,12 +189,6 @@ def main() -> None:
|
||||
)
|
||||
options = parser.parse_args()
|
||||
|
||||
# Path: aten/src/ATen
|
||||
aten_path = os.path.dirname(os.path.dirname(options.native_functions_path))
|
||||
operator_selector = get_selector(
|
||||
options.selected_op_list_path, options.operators_yaml_path
|
||||
)
|
||||
|
||||
generate_code(
|
||||
options.gen_dir,
|
||||
options.native_functions_path,
|
||||
@ -204,37 +198,18 @@ def main() -> None:
|
||||
options.disable_autograd,
|
||||
options.force_schema_registration,
|
||||
# options.selected_op_list
|
||||
operator_selector=operator_selector,
|
||||
)
|
||||
|
||||
# Generate the python bindings for functionalization's `ViewMeta` classes.
|
||||
from torchgen.gen_functionalization_type import (
|
||||
gen_functionalization_view_meta_classes,
|
||||
)
|
||||
|
||||
functionalization_templates_dir = os.path.join(aten_path, "templates")
|
||||
install_dir = options.install_dir or os.fspath(options.gen_dir / "torch/csrc")
|
||||
functionalization_install_dir = os.path.join(
|
||||
install_dir, "functionalization", "generated"
|
||||
)
|
||||
|
||||
os.makedirs(functionalization_install_dir, exist_ok=True)
|
||||
assert os.path.isdir(functionalization_install_dir)
|
||||
assert os.path.isdir(functionalization_templates_dir)
|
||||
|
||||
gen_functionalization_view_meta_classes(
|
||||
options.native_functions_path or NATIVE_FUNCTIONS_PATH,
|
||||
options.tags_path or TAGS_PATH,
|
||||
selector=operator_selector,
|
||||
install_dir=functionalization_install_dir,
|
||||
template_dir=functionalization_templates_dir,
|
||||
operator_selector=get_selector(
|
||||
options.selected_op_list_path, options.operators_yaml_path
|
||||
),
|
||||
)
|
||||
|
||||
if options.gen_lazy_ts_backend:
|
||||
aten_path = os.path.dirname(os.path.dirname(options.native_functions_path))
|
||||
ts_backend_yaml = os.path.join(aten_path, "native/ts_native_functions.yaml")
|
||||
ts_native_functions = "torch/csrc/lazy/ts_backend/ts_native_functions.cpp"
|
||||
ts_node_base = "torch/csrc/lazy/ts_backend/ts_node.h"
|
||||
lazy_install_dir = os.path.join(install_dir, "lazy", "generated")
|
||||
install_dir = options.install_dir or os.fspath(options.gen_dir / "torch/csrc")
|
||||
lazy_install_dir = os.path.join(install_dir, "lazy/generated")
|
||||
os.makedirs(lazy_install_dir, exist_ok=True)
|
||||
|
||||
assert os.path.isfile(ts_backend_yaml), (
|
||||
|
||||
@ -276,32 +276,30 @@ add_custom_command(
|
||||
WORKING_DIRECTORY
|
||||
"${TORCH_ROOT}"
|
||||
)
|
||||
if(USE_DISTRIBUTED)
|
||||
if(WIN32)
|
||||
append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS)
|
||||
else()
|
||||
append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS)
|
||||
endif()
|
||||
# Disable certain warnings for GCC-9.X
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
|
||||
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
|
||||
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
|
||||
endif()
|
||||
# NCCL is a private dependency of libtorch, but libtorch_python includes
|
||||
# some private headers of libtorch, which in turn include NCCL. As a hacky
|
||||
# alternative to making NCCL a public dependency of libtorch, we make it
|
||||
# a private dependency of libtorch_python as well.
|
||||
if(USE_NCCL)
|
||||
list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
|
||||
endif()
|
||||
# Same for MPI.
|
||||
if(USE_MPI)
|
||||
list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX)
|
||||
endif()
|
||||
list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
|
||||
|
||||
if(WIN32)
|
||||
append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS)
|
||||
else()
|
||||
append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS)
|
||||
endif()
|
||||
# Disable certain warnings for GCC-9.X
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
|
||||
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
|
||||
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
|
||||
endif()
|
||||
# NCCL is a private dependency of libtorch, but libtorch_python includes
|
||||
# some private headers of libtorch, which in turn include NCCL. As a hacky
|
||||
# alternative to making NCCL a public dependency of libtorch, we make it
|
||||
# a private dependency of libtorch_python as well.
|
||||
if(USE_NCCL)
|
||||
list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
|
||||
endif()
|
||||
# Same for MPI.
|
||||
if(USE_MPI)
|
||||
list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX)
|
||||
endif()
|
||||
list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
|
||||
|
||||
if(USE_NCCL AND NOT WIN32)
|
||||
list(APPEND TORCH_PYTHON_SRCS
|
||||
@ -369,10 +367,6 @@ if(BUILD_LIBTORCHLESS)
|
||||
target_compile_definitions(torch_python PRIVATE USE_C10D_NCCL)
|
||||
endif()
|
||||
|
||||
if(USE_DISTRIBUTED)
|
||||
target_compile_definitions(torch_python PRIVATE USE_DISTRIBUTED)
|
||||
endif()
|
||||
|
||||
if(USE_MPI AND USE_C10D_MPI)
|
||||
target_compile_definitions(torch_python PRIVATE USE_C10D_MPI)
|
||||
endif()
|
||||
|
||||
@ -30,7 +30,6 @@ from torch._C import (
|
||||
_cpu,
|
||||
_dynamo,
|
||||
_export,
|
||||
_functionalization,
|
||||
_functorch,
|
||||
_lazy,
|
||||
_lazy_ts_backend,
|
||||
|
||||
@ -78,7 +78,6 @@ class _KinetoEvent:
|
||||
def privateuse1_elapsed_us(self) -> int: ...
|
||||
def is_user_annotation(self) -> bool: ...
|
||||
def is_hidden_event(self) -> bool: ...
|
||||
def metadata_json(self) -> str: ...
|
||||
|
||||
class _ProfilerResult:
|
||||
def events(self) -> list[_KinetoEvent]: ...
|
||||
|
||||
@ -851,3 +851,12 @@ class ProcessGroupXCCL(Backend):
|
||||
|
||||
def _set_process_group(pg: ProcessGroup) -> None: ...
|
||||
def _current_process_group() -> ProcessGroup: ...
|
||||
def _dump_nccl_trace_json(
|
||||
includeCollectives: Optional[bool] = ...,
|
||||
onlyActive: Optional[bool] = ...,
|
||||
) -> bytes: ...
|
||||
def _dump_nccl_trace(
|
||||
includeCollectives: Optional[bool] = ...,
|
||||
includeStackTraces: Optional[bool] = ...,
|
||||
onlyActive: Optional[bool] = ...,
|
||||
) -> bytes: ...
|
||||
|
||||
@ -1,16 +0,0 @@
|
||||
from torch import Tensor
|
||||
from torch.types import _bool
|
||||
|
||||
# Defined in torch/csrc/functionalization/Module.cpp
|
||||
|
||||
class ViewMeta:
|
||||
has_symbolic_inputs: _bool
|
||||
|
||||
# Returns the list of ViewMeta instances of the given functional tensor.
|
||||
#
|
||||
# Although we do have python bindings for their types, we won't
|
||||
# expose them here, since they should not be used by users.
|
||||
def get_view_meta_sequence(tensor: Tensor) -> list[ViewMeta]: ...
|
||||
|
||||
# Applies the ViewMeta sequence on top of the given base.
|
||||
def apply_view_meta_sequence(base: Tensor, sequence: list[ViewMeta]) -> Tensor: ...
|
||||
@ -51,7 +51,6 @@ from .resume_execution import TORCH_DYNAMO_RESUME_IN_PREFIX
|
||||
from .utils import (
|
||||
getfile,
|
||||
hashable,
|
||||
is_annotate_wrapped_function,
|
||||
is_lru_cache_wrapped_function,
|
||||
NP_SUPPORTED_MODULES,
|
||||
unwrap_if_wrapper,
|
||||
@ -155,7 +154,6 @@ manual_torch_name_rule_map: dict[
|
||||
type[UserFunctionVariable],
|
||||
],
|
||||
] = {
|
||||
"torch.fx.traceback.annotate": UserFunctionVariable,
|
||||
"torch.onnx.is_in_onnx_export": TorchInGraphFunctionVariable,
|
||||
"torch.onnx.operators.shape_as_tensor": TorchInGraphFunctionVariable,
|
||||
"torch.overrides.is_tensor_like": TorchInGraphFunctionVariable,
|
||||
@ -3004,8 +3002,6 @@ def get_torch_obj_rule_map() -> dict[Any, type["VariableTracker"]]:
|
||||
continue
|
||||
obj = torch_dir + k[len("torch/") :]
|
||||
if obj is not None:
|
||||
if is_annotate_wrapped_function(obj):
|
||||
obj = obj.__wrapped__
|
||||
if is_lru_cache_wrapped_function(obj):
|
||||
obj = obj.__wrapped__
|
||||
if obj in d and d[obj] != v:
|
||||
|
||||
@ -1101,14 +1101,6 @@ def is_lru_cache_wrapped_function(
|
||||
)
|
||||
|
||||
|
||||
def is_annotate_wrapped_function(
|
||||
value: Any,
|
||||
) -> bool:
|
||||
return value == torch.fx.traceback.annotate and is_function(
|
||||
inspect.getattr_static(value, "__wrapped__")
|
||||
)
|
||||
|
||||
|
||||
_FuncTypes: TypeAlias = Union[
|
||||
types.FunctionType,
|
||||
types.BuiltinFunctionType,
|
||||
|
||||
@ -284,6 +284,19 @@ def check_cacheable(gm: torch.fx.GraphModule):
|
||||
check_cacheable(gm.saved_tensors_hooks_unpack_0) # type: ignore[arg-type]
|
||||
|
||||
|
||||
def check_metadata_cacheable(metadata: ViewAndMutationMeta):
|
||||
"""
|
||||
When view replay is turned on, we bypass autograd cache if
|
||||
the output is aliased.
|
||||
"""
|
||||
if config.view_replay_for_aliased_outputs:
|
||||
for info in metadata.output_info:
|
||||
if info.functional_tensor is not None:
|
||||
raise BypassAOTAutogradCache(
|
||||
"Cannot cache a graph with functional tensor"
|
||||
)
|
||||
|
||||
|
||||
class AOTAutogradCacheDetails(FxGraphHashDetails):
|
||||
"""
|
||||
Object to capture all the details for a dynamo graph module relevant to computing
|
||||
@ -790,6 +803,7 @@ class GenericAOTAutogradCacheEntry(Generic[TForward, TBackward]):
|
||||
"""
|
||||
Perform any preparations to make the cache entry ready for serialization.
|
||||
"""
|
||||
check_metadata_cacheable(self.runtime_metadata)
|
||||
self.compiled_fw.pre_save()
|
||||
if self.compiled_bw is not None:
|
||||
self.compiled_bw.pre_save()
|
||||
|
||||
@ -43,10 +43,10 @@ from .functional_utils import (
|
||||
has_metadata_mutation,
|
||||
MetadataKey,
|
||||
to_fun,
|
||||
ViewMetaSequence,
|
||||
was_inductor_storage_resized,
|
||||
)
|
||||
from .schemas import (
|
||||
FunctionalTensorMetadataEq,
|
||||
InputAliasInfo,
|
||||
MemoryFormatMeta,
|
||||
MutationType,
|
||||
@ -640,7 +640,7 @@ from a multi-output view call"
|
||||
#
|
||||
# The FunctionalTensor will be saved if one of the 2 conditions below
|
||||
# is true:
|
||||
view_meta_sequence = None
|
||||
functional_tensor = None
|
||||
if (
|
||||
# 1. If the output_type is either of:
|
||||
# (i) alias_of_intermediate;
|
||||
@ -672,7 +672,7 @@ from a multi-output view call"
|
||||
and not input_info[base_idx].mutates_metadata
|
||||
):
|
||||
if isinstance(o, FunctionalTensor):
|
||||
view_meta_sequence = ViewMetaSequence(o)
|
||||
functional_tensor = FunctionalTensorMetadataEq(o.elem)
|
||||
|
||||
out_info = OutputAliasInfo(
|
||||
output_type=output_type,
|
||||
@ -680,7 +680,7 @@ from a multi-output view call"
|
||||
base_idx=base_idx,
|
||||
dynamic_dims=dynamic_dims,
|
||||
requires_grad=isinstance(o, torch.Tensor) and o.requires_grad,
|
||||
view_meta_sequence=view_meta_sequence,
|
||||
functional_tensor=functional_tensor,
|
||||
)
|
||||
output_info.append(out_info)
|
||||
|
||||
|
||||
@ -14,7 +14,6 @@ from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import Tensor
|
||||
from torch._C import _functionalization
|
||||
from torch._logging import getArtifactLogger
|
||||
from torch._subclasses.fake_tensor import FakeTensor
|
||||
from torch._subclasses.functional_tensor import FunctionalTensor
|
||||
@ -225,9 +224,9 @@ def gen_alias_from_base(
|
||||
aliased_base_tensor,
|
||||
target_meta_tensor,
|
||||
target_requires_grad,
|
||||
target_view_meta_sequence: Optional[ViewMetaSequence] = None,
|
||||
target_functional_tensor: Optional[FunctionalTensorMetadataEq] = None,
|
||||
*,
|
||||
replay_views: bool,
|
||||
replay_views,
|
||||
):
|
||||
# Patch the correct requires_grad field of the output tensor, depending on whether:
|
||||
# (i) the reconstructed output (out) was came from a tensor that requires grad or not;
|
||||
@ -246,11 +245,13 @@ def gen_alias_from_base(
|
||||
# to replay them (view functions) on the aliased_base_tensor.
|
||||
if (
|
||||
replay_views
|
||||
and target_view_meta_sequence is not None
|
||||
and not any(vm.has_symbolic_inputs for vm in target_view_meta_sequence.sequence)
|
||||
and target_functional_tensor is not None
|
||||
and not torch._functionalize_is_symbolic(target_functional_tensor.tensor)
|
||||
):
|
||||
out = _functionalization.apply_view_meta_sequence(
|
||||
aliased_base_tensor, target_view_meta_sequence.sequence
|
||||
functional_tensor = target_functional_tensor.tensor
|
||||
|
||||
out = torch._functionalize_apply_view_metas(
|
||||
functional_tensor, aliased_base_tensor
|
||||
)
|
||||
# If re-applying the ViewMeta sequence succeeded, there should be no more
|
||||
# problems going forward. We just check we got to the target shape and
|
||||
@ -356,45 +357,25 @@ class MetadataKey:
|
||||
)
|
||||
|
||||
|
||||
# ViewMeta sequence wrapper for equality comparisons.
|
||||
#
|
||||
# Even though we can compare each ViewMeta instance, we compare the resulting
|
||||
# tensor metadata, instead. That's because the creation of synthetic bases + the
|
||||
# re-generation of input views might end-up creating a different sequence of
|
||||
# ViewMeta that is semantically equivalent. i.e. gets to a tensor with the same
|
||||
# metadata.
|
||||
#
|
||||
# Therefore, we store what the end result should look like as serializable
|
||||
# metadata.
|
||||
#
|
||||
# When logging, this class should look like:
|
||||
#
|
||||
# ViewMetaSequence(view, select_int, slice_Tensor)
|
||||
#
|
||||
# i.e. a parenthesized list of view operations within that ViewMeta sequence.
|
||||
class ViewMetaSequence:
|
||||
def __init__(self, tensor: FunctionalTensor) -> None:
|
||||
assert torch._is_functional_tensor(tensor.elem)
|
||||
self.sequence = _functionalization.get_view_meta_sequence(tensor.elem)
|
||||
self.metadata = MetadataKey.make(tensor)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
suffix = len("_ViewMeta")
|
||||
types = ", ".join(type(vm).__name__[:-suffix] for vm in self.sequence)
|
||||
return f"ViewMetaSequence({types})"
|
||||
# Wrapper around a FunctionalTensorWrapper for comparing only the resulting metadata
|
||||
# after applying all the ViewMeta operations.
|
||||
class FunctionalTensorMetadataEq:
|
||||
def __init__(self, tensor: torch.Tensor) -> None:
|
||||
assert torch._is_functional_tensor(tensor)
|
||||
self.tensor = tensor
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
# If other is None, then it probably means that we weren't able to recreate
|
||||
# the ViewMeta sequence. One example is when we update the view metadata by
|
||||
# calling: create_synthetic_base_metadata.
|
||||
# the FunctionalTensorMetadataEq. One of this cases is when we update the
|
||||
# view metadata by calling: create_synthetic_base_metadata.
|
||||
if other is None:
|
||||
return True
|
||||
|
||||
# Comparison against any other type is not implemented.
|
||||
if not isinstance(other, ViewMetaSequence):
|
||||
if not isinstance(other, FunctionalTensorMetadataEq):
|
||||
return NotImplemented
|
||||
|
||||
return self.metadata == other.metadata
|
||||
return has_same_metadata(self.tensor, other.tensor)
|
||||
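A rough sketch of the metadata-only comparison this equality relies on (an assumption about what has_same_metadata checks; the real helper may compare more fields):

```python
import torch

def same_metadata(a: torch.Tensor, b: torch.Tensor) -> bool:
    # Compare only shape/stride/dtype/offset, never values.
    return (
        a.shape == b.shape
        and a.stride() == b.stride()
        and a.dtype == b.dtype
        and a.storage_offset() == b.storage_offset()
    )

x = torch.arange(6.0).reshape(2, 3)
assert same_metadata(x, torch.zeros(2, 3))
assert not same_metadata(x, x.t())
```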
|
||||
|
||||
# new_arg and arg here are either:
|
||||
|
||||
@ -89,7 +89,7 @@ def remove_dupe_metadata(
|
||||
dynamic_dims=o.dynamic_dims,
|
||||
base_idx=None if o.base_idx is None else add_dupe_map[o.base_idx],
|
||||
requires_grad=o.requires_grad,
|
||||
view_meta_sequence=o.view_meta_sequence,
|
||||
functional_tensor=o.functional_tensor,
|
||||
)
|
||||
for o in m.output_info
|
||||
],
|
||||
@ -242,7 +242,7 @@ def create_synthetic_base_metadata(
|
||||
# Map the input idx pre-synthetic-bases to the new idx post-synthetic-bases
|
||||
base_idx=new_base_idx, # type: ignore[arg-type]
|
||||
requires_grad=o.requires_grad,
|
||||
view_meta_sequence=o.view_meta_sequence,
|
||||
functional_tensor=o.functional_tensor,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@ -150,7 +150,7 @@ class AliasOfInputHandler:
|
||||
self.base_idx = info.base_idx
|
||||
self.unwrap_out = _unwrap_tensoralias if trace_joint else _identity
|
||||
self.requires_grad = info.requires_grad
|
||||
self.view_meta_sequence = info.view_meta_sequence
|
||||
self.functional_tensor = info.functional_tensor
|
||||
self.replay_views = config.view_replay_for_aliased_outputs
|
||||
|
||||
def __call__(self, orig_inputs, fw_outs, out):
|
||||
@ -159,7 +159,7 @@ class AliasOfInputHandler:
|
||||
aliased_base_tensor,
|
||||
self.unwrap_out(out),
|
||||
self.requires_grad,
|
||||
self.view_meta_sequence,
|
||||
self.functional_tensor,
|
||||
replay_views=self.replay_views,
|
||||
)
|
||||
|
||||
@ -190,7 +190,7 @@ class AliasOfIntermediateHandler:
|
||||
|
||||
self.unwrap_out = _unwrap_tensoralias if trace_joint else _identity
|
||||
self.requires_grad = info.requires_grad
|
||||
self.view_meta_sequence = info.view_meta_sequence
|
||||
self.functional_tensor = info.functional_tensor
|
||||
self.replay_views = config.view_replay_for_aliased_outputs
|
||||
|
||||
def __call__(self, orig_inputs, fw_outs, out):
|
||||
@ -199,7 +199,7 @@ class AliasOfIntermediateHandler:
|
||||
self._unwrap_aliased_base_tensor(aliased_base_tensor),
|
||||
self.unwrap_out(out),
|
||||
self.requires_grad,
|
||||
self.view_meta_sequence,
|
||||
self.functional_tensor,
|
||||
replay_views=self.replay_views,
|
||||
)
|
||||
|
||||
|
||||
@ -7,6 +7,7 @@ input/output types, metadata, config, function signatures etc.
|
||||
from __future__ import annotations
|
||||
|
||||
import collections
|
||||
import dataclasses
|
||||
import functools
|
||||
import itertools
|
||||
from dataclasses import dataclass, field
|
||||
@ -31,7 +32,10 @@ from torch.fx.experimental._backward_state import BackwardState
|
||||
from torch.utils._python_dispatch import is_traceable_wrapper_subclass
|
||||
|
||||
from .. import config
|
||||
from .functional_utils import _check_if_mutation_can_be_in_graph, ViewMetaSequence
|
||||
from .functional_utils import (
|
||||
_check_if_mutation_can_be_in_graph,
|
||||
FunctionalTensorMetadataEq,
|
||||
)
|
||||
from .utils import strict_zip
|
||||
|
||||
|
||||
@ -113,14 +117,15 @@ class OutputAliasInfo:
|
||||
dynamic_dims: Optional[set[int]]
|
||||
# requires_grad
|
||||
requires_grad: bool
|
||||
# Sequence of ViewMeta objects.
|
||||
# FunctionalTensorWrapper that represents this output.
|
||||
#
|
||||
# Provides us the means to re-run view functions on other tensors.
|
||||
# Provides us the means to replay views from it.
|
||||
#
|
||||
# We need to wrap the actual list of ViewMeta with this class so that
|
||||
# we compare the ViewMeta elements appropriately, i.e. their type and
|
||||
# the elements returned by the `as_tuple()` call.
|
||||
view_meta_sequence: Optional[ViewMetaSequence] = None
|
||||
# We need to wrap the actual FunctionalTensorWrapper with this class so that
|
||||
# we only compare the tensor's metadata. That's because with the transformations
|
||||
# of the model throughout AOTAutograd, the sequence of ViewMeta and the base
|
||||
# tensor might change.
|
||||
functional_tensor: Optional[FunctionalTensorMetadataEq] = None
|
||||
|
||||
|
||||
class MutationType(Enum):
|
||||
@ -660,6 +665,17 @@ class ViewAndMutationMeta:
|
||||
self.traced_tangent_metas = [extract_metadata(t) for t in self.traced_tangents]
|
||||
# Clear traced tangents at runtime
|
||||
self.traced_tangents = []
|
||||
new_output_info = []
|
||||
for out in self.output_info:
|
||||
if config.view_replay_for_aliased_outputs:
|
||||
new_out = out
|
||||
else:
|
||||
# If we're not using view_replay, remove the functional tensor.
|
||||
# Functional tensors are unfortunately not serializable,
|
||||
# so doing this is required for AOTAutograd caching.
|
||||
new_out = dataclasses.replace(out, functional_tensor=None)
|
||||
new_output_info.append(new_out)
|
||||
self.output_info = new_output_info
|
||||
for inp_meta in self.subclass_inp_meta:
|
||||
if isinstance(inp_meta, SubclassCreationMeta):
|
||||
inp_meta.make_runtime_safe()
|
||||
|
||||
@ -23,11 +23,7 @@ from torch._higher_order_ops.triton_kernel_wrap import (
from torch._inductor.codecache import LambdaFuture, PyCodeCache
from torch._inductor.runtime.triton_heuristics import CachingAutotuner
from torch._inductor.select_algorithm import extern_kernels # noqa: F401
from torch._inductor.utils import (
convert_shape_to_symint,
convert_to_symint,
sympy_product,
)
from torch._inductor.utils import convert_shape_to_symint, convert_to_symint
from torch._inductor.virtualized import V
from torch._library.triton import wrap_triton
from torch.fx import GraphModule

@ -120,30 +116,20 @@ def replace_floor_div(expr: sympy.Expr) -> sympy.Expr:
def replace(expr: sympy.Expr) -> sympy.Expr:
expr = sympy.together(expr)

# Find division operations in the sympy.floor expression
# Div is either represented as Mul with:
# Rational denominator or Pow with negative exponent
if not isinstance(expr, sympy.core.mul.Mul):
return sympy.floor(expr)

if isinstance(expr.args[0], sympy.Rational):
frac = expr.args[0]
numerator = sympy_product(expr.args[1:]) * frac.numerator
denominator = frac.denominator

return FloorDiv(numerator, denominator)
elif isinstance(expr.args[0], sympy.Pow):
base = expr.args[0].base
exp = expr.args[0].exp
numerator = sympy_product(expr.args[1:])
if exp < 0:
denominator = base ** (-exp)
# Division is represented as a Mul with a Rational factor or a Pow with negative
# exponent. We convert floor(Mul(...)) to FloorDiv(numerator, denominator) by
# partitioning factors into the numerator and denominator.
(numerator, denominator) = (sympy.S.One,) * 2
for arg in sympy.Mul.make_args(expr):
if isinstance(arg, sympy.Rational):
numerator *= arg.numerator
denominator *= arg.denominator
elif isinstance(arg, sympy.Pow) and arg.exp.is_negative:
denominator *= arg.base**-arg.exp
else:
numerator = numerator * (base**exp)
denominator = 1
return FloorDiv(numerator, denominator)
else:
return sympy.floor(expr)
numerator *= arg

return FloorDiv(numerator, denominator)

return expr.replace(sympy.floor, replace)

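Editorial aside, not part of the diff: the rewritten replace() above partitions the factors of a sympy Mul into a numerator and a denominator so that floor(...) can be re-expressed as an integer FloorDiv. A minimal sketch of that partitioning is below; the real FloorDiv class from Inductor is not constructed here, the two parts are only printed.

import sympy

x, y = sympy.symbols("x y", positive=True, integer=True)
expr = sympy.together(x * y / 6)  # Mul(1/6, x, y)

numerator, denominator = sympy.S.One, sympy.S.One
for arg in sympy.Mul.make_args(expr):
    if isinstance(arg, sympy.Rational):
        # A Rational factor contributes to both sides, e.g. 1/6.
        numerator *= arg.numerator
        denominator *= arg.denominator
    elif isinstance(arg, sympy.Pow) and arg.exp.is_negative:
        # Negative powers go to the denominator, e.g. z**-1.
        denominator *= arg.base ** -arg.exp
    else:
        numerator *= arg

print(numerator, denominator)  # x*y 6, i.e. floor(x*y/6) -> FloorDiv(x*y, 6)
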
@ -930,10 +916,6 @@ class FxConverter:
call_args = self._lookup_args(line.call_args)
kernel = self.kernels[line.kernel_name]
tuner = kernel.tuner
# Use python_slow mode instead of python mode to avoid
# the round to neginf behaviour, which is not the convention
# in other languages.
tuner.grid_mode = "python_slow"

# Optionally autotune the kernels.
# The FX backend currently only supports compile-time tuning.

@ -1007,8 +989,7 @@ class FxConverter:
call_kwargs = dict(zip(signature, call_args))
call_kwargs.update(kernel_config.kwargs)

# Replace all sympy.floor with FloorDiv
# _generate_sym_node does not support sympy.floor
# Replace sympy.floor with FloorDiv, to make the expression traceable.
grid = [replace_floor_div(x) if isinstance(x, sympy.Expr) else x for x in grid]
wrapper_grid = [tuple(self._generate_sym_nodes(grid))]
call_kwargs = {

@ -880,14 +880,6 @@ def fuse_matmul_reduce_scatter(reduce_scatter: _ReduceScatterMatch) -> None:
if not is_symm_mem_enabled_for_group(group_name):
return

filter_matmul = None
if orig_scatter_dim == _get_tensor(input_node).ndim - 1:
# scaled_mm is not supported yet for last dim mm+rs
def _filter_out_scaled_matmul(matmul: _Matmul):
return not isinstance(matmul, _ScaledMatmul)

filter_matmul = _filter_out_scaled_matmul

# Currently fused_matmul_reduce_scatter doesn't return the matmul result,
# so we can't apply the fusion if the matmul result is used by multiple
# users. This is not a fundamental limitation of the fused op and can be

@ -899,16 +891,12 @@ fuse_matmul_reduce_scatter(reduce_scatter: _ReduceScatterMatch) -> None:
return

matmul = _find_producer_matmul(input_node)

if matmul is None:
log.warning(
"no producer matmul found for reduce scatter, skipping fuse_matmul_reduce_scatter fusion"
)
return

if filter_matmul and not filter_matmul(matmul):
return

if rs_wait_tensor_node in matmul.arg_ancestor_nodes:
log.warning(
"reduce-scatter result node is an ancestor of matmul, skipping fuse_matmul_reduce_scatter fusion"

@ -375,7 +375,7 @@ class CachingAutotuner(KernelInterface):
self.is_backward = False

# Mode for launch grid calculation
self.grid_mode: Literal["python", "python_slow", "cpp"] = "python"
self.grid_mode: Literal["python", "cpp"] = "python"

def is_statically_launchable(self):
"""

@ -3192,14 +3192,14 @@ class GridExpr:
"""Generate code for grid size expressions in launcher"""

inductor_meta: dict[str, Any]
mode: Literal["python", "cpp", "python_slow"] = "python"
mode: Literal["python", "cpp"] = "python"
prefix: list[str] = dataclasses.field(default_factory=list)
x_grid: Union[str, int] = 1
y_grid: Union[str, int] = 1
z_grid: Union[str, int] = 1

def __post_init__(self) -> None:
assert self.mode in ("python", "cpp", "python_slow")
assert self.mode in ("python", "cpp")

def generate(self, meta: dict[str, int]) -> None:
raise NotImplementedError

@ -3215,10 +3215,6 @@ class GridExpr:
# negative integer division is floored
if self.mode == "python":
return f"-(({numel}) // -({block}))"
# This is more generic than above, and works in languages where
# positive integer division is floored/truncated
elif self.mode == "python_slow":
return f"(({numel} + {block} - 1) // ({block}))"
# For cpp code gen
return f"(({numel} + ({block} - 1)) / ({block}))"

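Editorial aside, not part of the diff: the two Python formulas above compute the same ceiling division for positive operands. The "python" form leans on Python's floored division of negative numbers, which is exactly what other languages do not guarantee, hence the separate portable form. A quick sanity check of the equivalence:

def ceil_div_python(numel: int, block: int) -> int:
    # Relies on floored division of a negated numerator.
    return -((numel) // -(block))

def ceil_div_portable(numel: int, block: int) -> int:
    # Works wherever positive integer division is floored/truncated.
    return (numel + block - 1) // block

assert all(
    ceil_div_python(n, b) == ceil_div_portable(n, b)
    for n in range(1, 100)
    for b in range(1, 17)
)
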
@ -3227,7 +3223,7 @@ class GridExpr:
items = self._constant_fold(max, seq)
if len(items) <= 1:
return items[0]
if self.mode in ("python", "python_slow"):
if self.mode == "python":
return f"max({', '.join(map(str, items))})"
return functools.reduce(lambda x, y: f"std::max({x}, {y})", items)

@ -3250,7 +3246,7 @@ class GridExpr:

def assign_tmp(self, name: str, expr: Union[str, int]) -> str:
# Grid functions are one per kernel, so name collisions are fine
if self.mode in ("python", "python_slow"):
if self.mode == "python":
return f"{name} = {expr}"
if self.mode == "cpp":
return f"uint32_t {name} = {expr};"
@ -3260,7 +3256,7 @@
def from_meta(
inductor_meta: dict[str, Any],
cfg: Union[Config, dict[str, int]],
mode: Literal["python", "cpp", "python_slow"] = "python",
mode: Literal["python", "cpp"] = "python",
) -> GridExpr:
grid_cls = globals()[inductor_meta["grid_type"]]
assert issubclass(grid_cls, GridExpr)

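Editorial aside, not part of the diff: the maximum helper above emits a single max(...) call for Python launch code but folds the terms into nested std::max calls for C++. A small illustration with made-up grid terms:

import functools

items = ["((xnumel + 127) // 128)", "4", "ynumel"]  # hypothetical grid terms
python_form = f"max({', '.join(map(str, items))})"
cpp_form = functools.reduce(lambda x, y: f"std::max({x}, {y})", items)
print(python_form)  # max(((xnumel + 127) // 128), 4, ynumel)
print(cpp_form)     # std::max(std::max(((xnumel + 127) // 128), 4), ynumel)
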
@ -638,7 +638,6 @@ class profile:
device_resource_id=kineto_event.device_resource_id(),
flops=kineto_event.flops(),
is_user_annotation=kineto_event.is_user_annotation(),
metadata_json=kineto_event.metadata_json(),
)
max_evt_id = max(max_evt_id, fe.id)
if fe.device_type == DeviceType.CPU and not fe.is_async:

@ -491,7 +491,6 @@ class FunctionEvent(FormattedTimesMixin):
concrete_inputs=None,
kwinputs=None,
is_user_annotation=False,
metadata_json=None,
):
self.id: int = id
self.node_id: int = node_id
@ -527,7 +526,6 @@ class FunctionEvent(FormattedTimesMixin):
self.self_cpu_percent = -1
self.total_cpu_percent = -1
self.total_device_percent = -1
self.metadata_json = metadata_json

def append_kernel(self, name, device, duration):
assert self.device_type == DeviceType.CPU

@ -15,9 +15,7 @@
#include <torch/csrc/utils/cpp_stacktraces.h>
#include <torch/csrc/utils/pybind.h>

#if defined(USE_DISTRIBUTED)
#include <torch/csrc/distributed/c10d/exception.h>
#endif

inline void PyErr_SetString(PyObject* type, const std::string& message) {
PyErr_SetString(type, message.c_str());

@ -72,7 +72,6 @@
#include <torch/csrc/cpu/Module.h>
#include <torch/csrc/dynamo/init.h>
#include <torch/csrc/export/pybind.h>
#include <torch/csrc/functionalization/Module.h>
#include <torch/csrc/functorch/init.h>
#include <torch/csrc/fx/node.h>
#include <torch/csrc/inductor/aoti_package/pybind.h>

@ -122,14 +121,10 @@
#endif
#endif

#ifdef USE_DISTRIBUTED
#ifdef USE_C10D
#include <torch/csrc/distributed/autograd/python_autograd.h>
#include <torch/csrc/distributed/c10d/c10d.h>
#include <torch/csrc/distributed/rpc/rpc.h>
#include <torch/csrc/distributed/rpc/testing/testing.h>
#endif
#endif

#if defined(USE_VALGRIND)
#include <callgrind.h>

@ -409,9 +404,11 @@ static PyObject* THPModule_swap_tensor_impl(PyObject* _unused, PyObject* args) {
// The TensorImpls contain PyObjectSlots that have a reference to the PyObject
// associated with the TensorImpl. Swap this field as well.
std::optional<PyObject*> mb_obj_a =
a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false);
std::optional<PyObject*> mb_obj_b =
b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false);
TORCH_INTERNAL_ASSERT(
mb_obj_a.has_value() && mb_obj_b.has_value(),
"Both tensors should have PyObjects tagged by the current python interpreter");

@ -552,11 +549,7 @@ static PyObject* THPModule_getBackcompatKeepdimWarn(
}

static PyObject* THPModule_hasDistributed(PyObject* _unused, PyObject* noargs) {
#ifdef USE_DISTRIBUTED
Py_RETURN_TRUE;
#else
Py_RETURN_FALSE;
#endif
}

static PyObject* THPModule_showConfig(PyObject* module, PyObject* noargs) {

@ -2008,7 +2001,6 @@ PyObject* initModule() {
#ifdef USE_XPU
THPUtils_addPyMethodDefs(methods, THXPModule_methods());
#endif
#if defined(USE_DISTRIBUTED) && defined(USE_C10D)
THPUtils_addPyMethodDefs(
methods, torch::distributed::c10d::python_functions());
#ifndef _WIN32
@ -2018,7 +2010,6 @@ PyObject* initModule() {
methods, torch::distributed::autograd::python_functions());
THPUtils_addPyMethodDefs(
methods, torch::distributed::rpc::testing::python_functions());
#endif
#endif

static struct PyModuleDef torchmodule = {

@ -2091,7 +2082,6 @@ PyObject* initModule() {
torch::instruction_counter::initModule(module);
torch::initVerboseBindings(module);
ASSERT_TRUE(THPStorage_init(module));
torch::functionalization::initModule(module);

#ifdef USE_CUDA
// This will only initialise base classes and attach them to library namespace

@ -614,7 +614,8 @@ static void set_tensor_attr_with_capsule(
const c10::TensorImpl* tensor,
py::capsule& capsule,
const char* attr_name) {
std::optional<PyObject*> mb_obj = tensor->pyobj_slot()->check_pyobj();
std::optional<PyObject*> mb_obj = tensor->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false);
TORCH_CHECK(
mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value");
auto obj = mb_obj.value();
@ -641,7 +642,8 @@ static c10::ArrayRef<T> get_set_cached_attr(
const c10::TensorImpl* tensor,
const char* base_attr_name,
const py::object& obj) {
std::optional<PyObject*> mb_obj = tensor->pyobj_slot()->check_pyobj();
std::optional<PyObject*> mb_obj =
tensor->pyobj_slot()->check_pyobj(getPyInterpreter());
TORCH_CHECK(
mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value");
auto tensor_obj = mb_obj.value();

@ -41,8 +41,8 @@ PyObject* THPStorage_NewWithStorage(
|
||||
"Creating a Storage subclass from a class that does not inherit from ",
|
||||
"Storage is not possible. Make sure your class inherits from Storage.");
|
||||
|
||||
auto maybe_pyobj =
|
||||
_storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj();
|
||||
auto maybe_pyobj = _storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false);
|
||||
if (maybe_pyobj.has_value() && maybe_pyobj.value()) {
|
||||
TORCH_CHECK(
|
||||
allow_preexisting_pyobj,
|
||||
@ -93,7 +93,8 @@ PyObject* THPStorage_Wrap(c10::Storage storage) {
|
||||
}
|
||||
c10::impl::PyObjectSlot* pyobj_slot = storage_impl->pyobj_slot();
|
||||
|
||||
std::optional<PyObject*> maybe_pyobj = pyobj_slot->check_pyobj();
|
||||
std::optional<PyObject*> maybe_pyobj = pyobj_slot->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false);
|
||||
if (maybe_pyobj.has_value()) {
|
||||
auto obj = *maybe_pyobj;
|
||||
if (obj) {
|
||||
@ -126,8 +127,8 @@ static bool THPStorage_isPreservable(THPStorage* self) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj() !=
|
||||
(PyObject*)self) {
|
||||
if (storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/true) != (PyObject*)self) {
|
||||
return false;
|
||||
}
|
||||
if (storage.use_count() <= 1) {
|
||||
@ -144,7 +145,8 @@ static bool THPStorage_tryPreserve(THPStorage* self) {
|
||||
const auto& storage = THPStorage_Unpack(self);
|
||||
c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl();
|
||||
|
||||
auto maybe_pyobj = storage_impl->pyobj_slot()->check_pyobj();
|
||||
auto maybe_pyobj = storage_impl->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/true);
|
||||
// NOTE: It is possible to just set the PyObjectSlot here, but the point is
|
||||
// that we should have already set PyObjectSlot when the storage PyObject
|
||||
// was created.
|
||||
|
||||
@ -245,12 +245,13 @@ static void general_trace_function(
|
||||
tracer::addInputs(
|
||||
node, args[i].name().c_str(), iter->toBoolList().vec());
|
||||
} else {
|
||||
TORCH_CHECK(false, "unsupported input list type: ", elem_type->str());
|
||||
throw std::runtime_error(
|
||||
"unsupported input list type: " + elem_type->str());
|
||||
}
|
||||
} else if (iter->isObject()) {
|
||||
tracer::addInputs(node, args[i].name().c_str(), iter->toObject());
|
||||
} else {
|
||||
TORCH_CHECK(false, "unsupported input type: ", type->str());
|
||||
throw std::runtime_error("unsupported input type: " + type->str());
|
||||
}
|
||||
}
|
||||
graph->insertNode(node);
|
||||
@ -276,19 +277,16 @@ static void general_trace_function(
|
||||
AT_ASSERT(iter->isTensorList());
|
||||
tracer::addOutput(node, iter->toTensorList());
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
false, "unsupported output list type: ", elem_type->str());
|
||||
throw std::runtime_error(
|
||||
"unsupported output list type: " + elem_type->str());
|
||||
}
|
||||
} else if (type->kind() == TypeKind::ClassType) {
|
||||
AT_ASSERT(iter->isObject());
|
||||
tracer::addOutput(node, iter->toObject());
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"unsupported output type: ",
|
||||
type->str(),
|
||||
", from operator: ",
|
||||
toString(op.operator_name()));
|
||||
throw std::runtime_error(
|
||||
"unsupported output type: " + type->str() +
|
||||
", from operator: " + toString(op.operator_name()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -11,8 +11,10 @@ void check_single_result(
|
||||
const at::TensorBase& value,
|
||||
const at::TensorBase& result,
|
||||
const std::string& hook_name) {
|
||||
TORCH_CHECK(
|
||||
value.defined(), "can't replace a empty gradient with a non-empty value");
|
||||
if (!value.defined()) {
|
||||
throw std::runtime_error(
|
||||
"can't replace a empty gradient with a non-empty value");
|
||||
}
|
||||
torch::autograd::check_variable_result(value, result, hook_name);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
@ -482,31 +482,30 @@ void check_variable_result(
const at::TensorBase& original,
const at::TensorBase& result,
const std::string& hook_name) {
TORCH_CHECK(
original.options().type_equal(result.options()),
"hook '",
hook_name,
"' has changed the type of value (was ",
original.toString(),
" got ",
result.toString(),
")");
if (!original.options().type_equal(result.options())) {
std::stringstream ss;
ss << "hook '" << hook_name << "' has changed the type of value (";
ss << "was " << original.toString() << " got ";
ss << result.toString() << ")";
throw std::runtime_error(ss.str());
}

TORCH_CHECK(
original.is_cuda() == result.is_cuda(),
"hook '",
hook_name,
"' has changed the type of value (was ",
original.is_cuda() ? "CUDA tensor" : "CPU tensor",
" got ",
result.is_cuda() ? "CUDA tensor" : "CPU tensor",
")");
if (original.is_cuda() != result.is_cuda()) {
std::stringstream ss;
ss << "hook '" << hook_name << "' has changed the type of value";
if (original.is_cuda()) {
ss << " (was CUDA tensor got CPU tensor)";
} else {
ss << " (was CPU tensor got CUDA tensor)";
}
throw std::runtime_error(ss.str());
}

TORCH_CHECK(
original.sym_sizes().vec() == result.sym_sizes().vec(),
"hook '",
hook_name,
"' has changed the size of value");
if (original.sym_sizes().vec() != result.sym_sizes().vec()) {
std::stringstream ss;
ss << "hook '" << hook_name << "' has changed the size of value";
throw std::runtime_error(ss.str());
}
}

AutogradContext::AutogradContext(PackedArgs& packed_args) {

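Editorial aside, not part of the diff: the hunk above switches between the two error-reporting styles that recur throughout these files. TORCH_CHECK concatenates its trailing arguments into the message and raises a c10::Error, while the hand-rolled form builds the message with a std::stringstream and throws std::runtime_error. A minimal in-tree sketch of the two styles side by side; check_sizes_match is a made-up helper, not part of the change.

#include <c10/util/Exception.h>
#include <cstdint>
#include <sstream>
#include <stdexcept>
#include <string>

void check_sizes_match(int64_t original, int64_t result, const std::string& hook_name) {
  // Macro style: condition first, then message fragments.
  TORCH_CHECK(
      original == result,
      "hook '", hook_name, "' has changed the size of value");

  // Equivalent hand-rolled style used by the other branch of this hunk.
  if (original != result) {
    std::stringstream ss;
    ss << "hook '" << hook_name << "' has changed the size of value";
    throw std::runtime_error(ss.str());
  }
}
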
@ -228,32 +228,30 @@ inline variable_list CppNode_apply_functional(
|
||||
}
|
||||
}
|
||||
|
||||
TORCH_CHECK(
|
||||
num_outputs == num_forward_inputs,
|
||||
"function ",
|
||||
name,
|
||||
" returned an incorrect number of gradients (expected ",
|
||||
num_forward_inputs,
|
||||
", got ",
|
||||
num_outputs,
|
||||
")");
|
||||
if (num_outputs != num_forward_inputs) {
|
||||
std::string msg("function ");
|
||||
msg += name + " returned an incorrect number of gradients (expected ";
|
||||
msg += std::to_string(num_forward_inputs) + ", got ";
|
||||
msg += std::to_string(num_outputs) + ")";
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
|
||||
variable_list results;
|
||||
results.reserve(num_outputs);
|
||||
for (const auto i : c10::irange(num_outputs)) {
|
||||
if (!is_variable_input_[i]) {
|
||||
TORCH_CHECK(
|
||||
outputs[i].defined() == false,
|
||||
"function ",
|
||||
name,
|
||||
" returned a gradient different that is defined at position ",
|
||||
i + 1,
|
||||
", std the corresponding forward input was not a Variable");
|
||||
if (outputs[i].defined()) {
|
||||
std::string msg("function ");
|
||||
msg += name +
|
||||
" returned a gradient different that is defined at position ";
|
||||
msg += std::to_string(i + 1) +
|
||||
", std the corresponding forward input was not a Variable";
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
results.emplace_back(outputs[i]);
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
|
||||
@ -707,8 +707,9 @@ void GraphTask::mark_as_completed_and_run_post_processing() {
|
||||
}
|
||||
|
||||
void GraphTask::exec_post_processing() {
|
||||
TORCH_CHECK(
|
||||
not_ready_.empty(), "could not compute gradients for some functions");
|
||||
if (!not_ready_.empty()) {
|
||||
throw std::runtime_error("could not compute gradients for some functions");
|
||||
}
|
||||
|
||||
// set the thread_local current_graph_task_ as more callbacks can be installed
|
||||
// by existing final callbacks.
|
||||
@ -1148,13 +1149,12 @@ void Engine::evaluate_function(
|
||||
for (const auto i : c10::irange(num_outputs)) {
|
||||
auto& output = outputs[i];
|
||||
at::OptionalDeviceGuard guard(device_of(output));
|
||||
TORCH_CHECK(
|
||||
!output.defined() || !isnan(output)._is_any_true().item<bool>(),
|
||||
"Function '",
|
||||
fn.name(),
|
||||
"' returned nan values in its ",
|
||||
i,
|
||||
"th output.");
|
||||
if (output.defined() && isnan(output)._is_any_true().item<bool>()) {
|
||||
std::stringstream ss;
|
||||
ss << "Function '" << fn.name() << "' returned nan values in its " << i
|
||||
<< "th output.";
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1175,7 +1175,7 @@ void Engine::evaluate_function(
|
||||
|
||||
if (it == dependencies.end()) {
|
||||
auto name = next.function->name();
|
||||
TORCH_CHECK(false, "dependency not found for ", name);
|
||||
throw std::runtime_error(std::string("dependency not found for ") + name);
|
||||
} else if (--it->second == 0) {
|
||||
dependencies.erase(it);
|
||||
is_ready = true;
|
||||
|
||||
@ -17,7 +17,7 @@ variable_list Error::apply(variable_list&& inputs) {
}

variable_list Error::apply(variable_list&& inputs) const {
TORCH_CHECK(false, msg);
throw std::runtime_error(msg);
}

void Error::compiled_args(CompiledNodeArgs& args) const {

@ -8,9 +8,7 @@
|
||||
#include <torch/csrc/autograd/python_autograd.h>
|
||||
#include <torch/csrc/autograd/python_cpp_function.h>
|
||||
#include <torch/csrc/autograd/python_variable.h>
|
||||
#ifdef USE_DISTRIBUTED
|
||||
#include <torch/csrc/distributed/autograd/functions/sendrpc_backward.h>
|
||||
#endif
|
||||
#include <torch/csrc/jit/python/python_tracer.h>
|
||||
#include <torch/csrc/utils/pybind.h>
|
||||
#include <torch/csrc/utils/python_numbers.h>
|
||||
@ -49,7 +47,7 @@ struct UndefinedGradCtor {
|
||||
|
||||
struct NoCtor {
|
||||
Node* operator()(PyObject* args) {
|
||||
TORCH_CHECK(false, "Cannot construct");
|
||||
throw std::runtime_error("Cannot construct");
|
||||
}
|
||||
};
|
||||
|
||||
@ -150,11 +148,9 @@ void THPAutograd_initFunctions() {
|
||||
static PyTypeObject CopyBackwardsClass;
|
||||
addClass<CopyBackwards, NoCtor>(module, CopyBackwardsClass, "CopyBackwards");
|
||||
|
||||
#ifdef USE_DISTRIBUTED
|
||||
static PyTypeObject SendRpcBackwardClass;
|
||||
addClass<torch::distributed::autograd::SendRpcBackward, NoCtor>(
|
||||
module, SendRpcBackwardClass, "SendRpcBackward");
|
||||
#endif
|
||||
|
||||
static PyTypeObject CopySlicesClass;
|
||||
addClass<CopySlices, NoCtor>(module, CopySlicesClass, "CopySlices");
|
||||
|
||||
@ -184,7 +184,9 @@ inline variable_list CopySlices::apply_impl(
|
||||
// see Note [Thread Safety on Autograd Node]
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
|
||||
TORCH_CHECK(fn, ERR_BACKWARD_TWICE);
|
||||
if (!fn) {
|
||||
throw std::runtime_error(ERR_BACKWARD_TWICE);
|
||||
}
|
||||
|
||||
auto result =
|
||||
grad.new_empty_strided_symint(base.sym_sizes(), base.sym_strides());
|
||||
@ -250,7 +252,9 @@ variable_list CopySlices::apply_with_saved(
|
||||
|
||||
auto results = variable_list(num_outputs());
|
||||
if (grads[0].defined()) {
|
||||
TORCH_CHECK(fn, ERR_BACKWARD_TWICE);
|
||||
if (!fn) {
|
||||
throw std::runtime_error(ERR_BACKWARD_TWICE);
|
||||
}
|
||||
update_exec_info();
|
||||
|
||||
std::vector<bool> needs_input_grad;
|
||||
|
||||
@ -53,22 +53,18 @@ void check_input_variables(
|
||||
if (required_args == -1) {
|
||||
required_args = args;
|
||||
}
|
||||
TORCH_CHECK(
|
||||
inputs.size() == static_cast<size_t>(args),
|
||||
name,
|
||||
": expected ",
|
||||
args,
|
||||
" arguments (got ",
|
||||
inputs.size(),
|
||||
")");
|
||||
|
||||
if (inputs.size() != static_cast<size_t>(args)) {
|
||||
std::stringstream ss;
|
||||
ss << name << ": expected " << args << " arguments (got " << inputs.size();
|
||||
ss << ")";
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
for (const auto i : c10::irange(required_args)) {
|
||||
TORCH_CHECK(
|
||||
inputs[i].defined() || allow_undefined,
|
||||
name,
|
||||
": expected Tensor at argument ",
|
||||
i,
|
||||
" (got None)");
|
||||
if (!inputs[i].defined() && !allow_undefined) {
|
||||
std::stringstream ss;
|
||||
ss << name << ": expected Tensor at argument " << i << " (got None)";
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace torch::autograd
|
||||
|
||||
@ -309,12 +309,8 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
|
||||
})
|
||||
.def("nbytes", [](const KinetoEvent& e) { return e.nBytes(); })
|
||||
// whether the event is hidden
|
||||
.def(
|
||||
"is_hidden_event",
|
||||
[](const KinetoEvent& e) { return e.isHiddenEvent(); })
|
||||
// KinetoEvent metadata
|
||||
.def("metadata_json", [](const KinetoEvent& e) {
|
||||
return e.metadataJson();
|
||||
.def("is_hidden_event", [](const KinetoEvent& e) {
|
||||
return e.isHiddenEvent();
|
||||
});
|
||||
|
||||
m.def("_soft_assert_raises", &setSoftAssertRaises);
|
||||
|
||||
@ -37,8 +37,7 @@ extern "C" {
|
||||
// https://github.com/pytorch/pytorch/issues/51026
|
||||
__attribute__((weak)) int acc_get_device_type();
|
||||
__attribute__((weak)) int acc_get_device_type() {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
throw std::runtime_error(
|
||||
"Dummy implementation of acc_get_device_type is not supposed to be called!");
|
||||
}
|
||||
} // extern "C"
|
||||
@ -1068,17 +1067,6 @@ void KinetoEvent::getPerfEventCounters(std::vector<uint64_t>& in) const {
|
||||
[](const auto&) -> void { return; }));
|
||||
}
|
||||
|
||||
std::string KinetoEvent::metadataJson() const {
|
||||
return result_->visit(c10::overloaded(
|
||||
[](const ExtraFields<EventType::TorchOp>& op) -> std::string {
|
||||
return op.metadata_json_;
|
||||
},
|
||||
[](const ExtraFields<EventType::Kineto>& op) -> std::string {
|
||||
return op.metadata_json_;
|
||||
},
|
||||
[](const auto&) -> std::string { return std::string(""); }));
|
||||
}
|
||||
|
||||
#define FORWARD_FROM_RESULT(method_name, result_expr) \
|
||||
decltype(std::declval<KinetoEvent>().method_name()) \
|
||||
KinetoEvent::method_name() const { \
|
||||
|
||||
@ -65,7 +65,6 @@ struct TORCH_API KinetoEvent {
|
||||
int64_t privateuse1ElapsedUs() const;
|
||||
void getPerfEventCounters(torch::profiler::perf_counters_t&) const;
|
||||
extra_meta_t extraMeta() const;
|
||||
std::string metadataJson() const;
|
||||
|
||||
private:
|
||||
torch::profiler::impl::ProfilerVoidEventStub fallbackStart() const;
|
||||
|
||||
@ -97,7 +97,7 @@ struct TORCH_API LegacyEvent {
|
||||
case EventKind::MemoryAlloc:
|
||||
return "memory_alloc";
|
||||
}
|
||||
TORCH_CHECK(false, "unknown event kind");
|
||||
throw std::runtime_error("unknown event kind");
|
||||
}
|
||||
|
||||
EventKind kind() const {
|
||||
|
||||
@ -30,7 +30,7 @@ void PyAnomalyMetadata::store_stack() {
|
||||
void PyAnomalyMetadata::print_stack(const std::string& current_node_name) {
|
||||
pybind11::gil_scoped_acquire gil;
|
||||
if (!PyDict_Check(dict())) {
|
||||
TORCH_CHECK(false, "Anomaly metadata is not a python dictionary.");
|
||||
throw std::runtime_error("Anomaly metadata is not a python dictionary.");
|
||||
}
|
||||
PyObject* trace_stack = nullptr;
|
||||
if (PyDict_GetItemStringRef(dict(), ANOMALY_TRACE_KEY, &trace_stack) < 0) {
|
||||
|
||||
@ -261,7 +261,8 @@ PyTypeObject* _initFunctionPyTypeObject(
|
||||
type.tp_traverse = THPCppFunction_traverse;
|
||||
type.tp_clear = THPCppFunction_clear;
|
||||
if (PyType_Ready(&type) < 0) {
|
||||
TORCH_CHECK(false, "Unable to instantiate PyTypeObject for ", name);
|
||||
auto msg = std::string("Unable to instantiate PyTypeObject for ") + name;
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
return &type;
|
||||
}
|
||||
|
||||
@ -501,7 +501,7 @@ static void child_atfork() {
|
||||
bool THPEngine_initModule(PyObject* module) {
|
||||
#ifndef _WIN32
|
||||
if (pthread_atfork(nullptr, nullptr, child_atfork) != 0) {
|
||||
TORCH_CHECK(false, "unable to set pthread_atfork handler");
|
||||
throw std::runtime_error("unable to set pthread_atfork handler");
|
||||
}
|
||||
#endif
|
||||
if (PyType_Ready(&THPEngineType) < 0)
|
||||
|
||||
@ -188,15 +188,13 @@ auto PyNode::apply(variable_list&& inputs) -> variable_list {
|
||||
}
|
||||
|
||||
// Now the number of gradients should match
|
||||
TORCH_CHECK(
|
||||
num_outputs == num_forward_inputs,
|
||||
"function ",
|
||||
name(),
|
||||
" returned an incorrect number of gradients (expected ",
|
||||
num_forward_inputs,
|
||||
", got ",
|
||||
num_outputs,
|
||||
")");
|
||||
if (num_outputs != num_forward_inputs) {
|
||||
std::string msg("function ");
|
||||
msg += name() + " returned an incorrect number of gradients (expected ";
|
||||
msg += std::to_string(num_forward_inputs) + ", got ";
|
||||
msg += std::to_string(num_outputs) + ")";
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
|
||||
// Massage the Python results tuple back into a C++ variable_list
|
||||
return to_variable_list(r.get(), is_variable_input);
|
||||
@ -437,24 +435,24 @@ variable_list PyNode::to_variable_list(
|
||||
PyObject* output = PyTuple_GET_ITEM(outputs, i);
|
||||
bool was_variable = is_variable_input[i];
|
||||
if (!was_variable) {
|
||||
TORCH_CHECK(
|
||||
output == Py_None,
|
||||
"function ",
|
||||
name(),
|
||||
" returned a gradient different than None at position ",
|
||||
i + 1,
|
||||
", but the corresponding forward input was not a Variable");
|
||||
if (output != Py_None) {
|
||||
std::string msg("function ");
|
||||
msg += name() + " returned a gradient different than None at position ";
|
||||
msg += std::to_string(i + 1) +
|
||||
", but the corresponding forward input was not a Variable";
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (output == Py_None) {
|
||||
results.emplace_back();
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
THPVariable_Check(output),
|
||||
"expected Variable or None (got ",
|
||||
THPUtils_typename(output),
|
||||
")");
|
||||
|
||||
if (!THPVariable_Check(output)) {
|
||||
std::string msg("expected Variable or None (got ");
|
||||
msg += THPUtils_typename(output);
|
||||
msg += ")";
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
results.emplace_back(THPVariable_Unpack(output));
|
||||
}
|
||||
}
|
||||
|
||||
@ -289,7 +289,9 @@ static variable_list unwrap_variables(PyObject* py_variables) {
|
||||
results[i] = THPVariable_Unpack(item);
|
||||
} else {
|
||||
// this should never happen, but just in case...
|
||||
TORCH_CHECK(false, "expected variable but got ", Py_TYPE(item)->tp_name);
|
||||
std::stringstream ss;
|
||||
ss << "expected variable but got " << Py_TYPE(item)->tp_name;
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
}
|
||||
return results;
|
||||
@ -306,16 +308,14 @@ static void check_result(PyObject* prev, PyObject* result, PyObject* hook) {
|
||||
|
||||
auto prev_size = PyTuple_GET_SIZE(prev);
|
||||
auto result_size = PyTuple_GET_SIZE(result);
|
||||
|
||||
TORCH_CHECK(
|
||||
prev_size == result_size,
|
||||
"hook '",
|
||||
hook_name(hook),
|
||||
"' has returned an incorrect number of values (got ",
|
||||
result_size,
|
||||
", but expected ",
|
||||
prev_size,
|
||||
")");
|
||||
if (prev_size != result_size) {
|
||||
std::stringstream ss;
|
||||
auto name = hook_name(hook);
|
||||
ss << "hook '" << name << "' has returned an incorrect number ";
|
||||
ss << "of values (got " << result_size << ", but expected ";
|
||||
ss << prev_size << ")";
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
|
||||
for (const auto i : c10::irange(prev_size)) {
|
||||
check_single_result(
|
||||
@ -330,9 +330,10 @@ static void check_single_result(
|
||||
if (_result == Py_None)
|
||||
return;
|
||||
|
||||
TORCH_CHECK(
|
||||
_original != Py_None,
|
||||
"can't replace a None gradient with a non-None value");
|
||||
if (_original == Py_None) {
|
||||
throw std::runtime_error(
|
||||
"can't replace a None gradient with a non-None value");
|
||||
}
|
||||
|
||||
if (!PyObject_IsInstance(_result, THPVariableClass)) {
|
||||
PyErr_Format(
|
||||
|
||||
@ -644,6 +644,15 @@ void initTorchFunctions(PyObject* module) {
|
||||
at::functionalization::impl::isFunctionalTensor(t));
|
||||
at::functionalization::impl::mark_mutation_hidden_from_autograd(t);
|
||||
});
|
||||
py_module.def(
|
||||
"_functionalize_apply_view_metas",
|
||||
[](const at::Tensor& tensor, const at::Tensor& base) {
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
at::functionalization::impl::isFunctionalTensor(tensor));
|
||||
auto impl =
|
||||
at::functionalization::impl::unsafeGetFunctionalWrapper(tensor);
|
||||
return impl->apply_view_metas(base);
|
||||
});
|
||||
py_module.def("_functionalize_is_symbolic", [](const at::Tensor& t) {
|
||||
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(t));
|
||||
auto impl = at::functionalization::impl::unsafeGetFunctionalWrapper(t);
|
||||
|
||||
@ -265,7 +265,8 @@ PyObject* THPVariable_Wrap(const at::TensorBase& var) {
|
||||
}
|
||||
|
||||
std::optional<PyObject*> mb_obj =
|
||||
var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
|
||||
var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false);
|
||||
if (mb_obj.has_value()) {
|
||||
auto obj = *mb_obj;
|
||||
if (obj) {
|
||||
@ -328,8 +329,8 @@ static bool isResurrectable(THPVariable* self) {
|
||||
return false;
|
||||
}
|
||||
// Check if this is hermetic. If it is, no resurrection.
|
||||
if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj() !=
|
||||
(PyObject*)self) {
|
||||
if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false) != (PyObject*)self) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@ -354,7 +355,8 @@ static bool THPVariable_tryResurrect(THPVariable* self) {
|
||||
!tensor.unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj());
|
||||
|
||||
c10::TensorImpl* tensor_impl = tensor.unsafeGetTensorImpl();
|
||||
auto maybe_pyobj = tensor_impl->pyobj_slot()->check_pyobj();
|
||||
auto maybe_pyobj = tensor_impl->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false);
|
||||
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
maybe_pyobj.has_value(),
|
||||
@ -2221,8 +2223,8 @@ static int THPVariable_subclass_clear(THPVariable* self) {
|
||||
// because Tensor asked us to (it's already destructing).
|
||||
|
||||
if (!self->cdata.unsafeIsBorrowed() &&
|
||||
tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj() ==
|
||||
(PyObject*)self) {
|
||||
tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false) == (PyObject*)self) {
|
||||
// TODO: empirically, on OS X this assert appears to be untrue
|
||||
// In test_py_tensors_multi_async_call - ProcessGroupRpcTestWithSpawn
|
||||
// distributed/rpc/test_process_group_agent.py
|
||||
@ -2408,7 +2410,8 @@ static PyObject* THPVariable_NewWithVar(
|
||||
|
||||
// This function overwrite the Tensor's pyobj field without extra checks
|
||||
// Make sure it is not set otherwise we would leak memory
|
||||
auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
|
||||
auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false);
|
||||
|
||||
// Under some circumstances, we may attempt to create a new Python
|
||||
// object for a variable that already has a Python object. The most common
|
||||
|
||||
@ -11,8 +11,8 @@ struct TORCH_API SavedVariableHooks {
|
||||
virtual ~SavedVariableHooks() = default;
|
||||
virtual std::optional<std::pair<c10::SafePyObject, c10::SafePyObject>>
|
||||
retrieve_unpack_hook_data() const {
|
||||
TORCH_CHECK(
|
||||
false, "Compiled Autograd only supports python saved tensor hooks ");
|
||||
throw std::runtime_error(
|
||||
"Compiled Autograd only supports python saved tensor hooks ");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -17,8 +17,8 @@ inline std::tuple<
|
||||
std::optional<at::MemoryFormat>>
|
||||
parse_to_conversion(PythonArgs& r, bool allow_copy) {
|
||||
if (r.idx == 0) {
|
||||
TORCH_CHECK(
|
||||
allow_copy || r.isNone(3), ".to() does not accept copy argument");
|
||||
if (!allow_copy && !r.isNone(3))
|
||||
throw std::runtime_error(".to() does not accept copy argument");
|
||||
return std::make_tuple(
|
||||
r.deviceOptional(0),
|
||||
r.scalartypeOptional(1),
|
||||
@ -26,8 +26,8 @@ parse_to_conversion(PythonArgs& r, bool allow_copy) {
|
||||
r.toBool(3),
|
||||
r.memoryformatOptional(4));
|
||||
} else if (r.idx == 1) {
|
||||
TORCH_CHECK(
|
||||
allow_copy || r.isNone(2), ".to() does not accept copy argument");
|
||||
if (!allow_copy && !r.isNone(2))
|
||||
throw std::runtime_error(".to() does not accept copy argument");
|
||||
return std::make_tuple(
|
||||
std::nullopt,
|
||||
r.scalartype(0),
|
||||
@ -36,8 +36,8 @@ parse_to_conversion(PythonArgs& r, bool allow_copy) {
|
||||
r.memoryformatOptional(3));
|
||||
} else {
|
||||
auto tensor = r.tensor(0);
|
||||
TORCH_CHECK(
|
||||
allow_copy || r.isNone(2), ".to() does not accept copy argument");
|
||||
if (!allow_copy && !r.isNone(2))
|
||||
throw std::runtime_error(".to() does not accept copy argument");
|
||||
return std::make_tuple(
|
||||
tensor.device(),
|
||||
tensor.scalar_type(),
|
||||
|
||||
@ -597,9 +597,10 @@ void VariableHooks::_backward(
|
||||
void VariableHooks::requires_grad_(
|
||||
const at::TensorBase& self,
|
||||
bool _requires_grad) const {
|
||||
TORCH_CHECK(
|
||||
self.is_leaf() || _requires_grad,
|
||||
autograd::utils::requires_grad_leaf_error(_requires_grad));
|
||||
if (!self.is_leaf() && !_requires_grad) {
|
||||
throw std::runtime_error(
|
||||
autograd::utils::requires_grad_leaf_error(_requires_grad));
|
||||
}
|
||||
self.set_requires_grad(_requires_grad);
|
||||
}
|
||||
|
||||
@ -623,7 +624,7 @@ const at::TensorBase& VariableHooks::base(const at::TensorBase& self) const {
|
||||
"Can't get base of non-backward view Tensor");
|
||||
return diff_view_meta->get_backward_view().base_;
|
||||
} else {
|
||||
TORCH_CHECK(false, "Can't get base of non-view Tensor");
|
||||
throw std::runtime_error("Can't get base of non-view Tensor");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,6 +1,5 @@
#include <torch/csrc/distributed/c10d/HashStore.hpp>

#include <unistd.h>
#include <cstdint>

#include <chrono>

@ -1,5 +1,5 @@
#include <ATen/ThreadLocalState.h>
#include <distributed/c10d/ProcessGroup.hpp>
#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
#include <torch/csrc/distributed/c10d/cuda/StreamBlock.hpp>

#include <torch/csrc/distributed/c10d/Work.hpp>

Some files were not shown because too many files have changed in this diff.