Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-21 05:34:18 +08:00)
irange for size_t (#55320)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55320

Test Plan: Sandcastle

Reviewed By: ngimel

Differential Revision: D27572577

fbshipit-source-id: 97710fd2bb1303006b05828a0d1343b0b59ccb03
Committed by: Facebook GitHub Bot
Parent: f914ab193e
Commit: 3979cb0656
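The change applied throughout the diff below is mechanical: counted index loops with a size_t induction variable are rewritten as range-based loops over c10::irange(n), which yields the integers 0, 1, ..., n-1. A minimal sketch of the before/after shape (the function below is illustrative only, not code from this commit):

    #include <c10/util/irange.h>
    #include <vector>

    // Hypothetical example, not part of the commit.
    int64_t sum_all(const std::vector<int64_t>& v) {
      int64_t total = 0;
      // Before: for (size_t i = 0; i < v.size(); i++) { total += v[i]; }
      // After: the same indices, produced by c10::irange(v.size()).
      for (const auto i : c10::irange(v.size())) {
        total += v[i];
      }
      return total;
    }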
@ -181,7 +181,7 @@ std::vector<int64_t> ConvTransposeNdImpl<D, Derived>::_output_padding(
|
|||||||
max_sizes.push_back(min_sizes[d] + (*stride)[d] - 1);
|
max_sizes.push_back(min_sizes[d] + (*stride)[d] - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < output_size_.value().size(); i++) {
|
for(const auto i : c10::irange(output_size_.value().size())) {
|
||||||
int64_t size = output_size_.value()[i];
|
int64_t size = output_size_.value()[i];
|
||||||
int64_t min_size = min_sizes[i];
|
int64_t min_size = min_sizes[i];
|
||||||
int64_t max_size = max_sizes[i];
|
int64_t max_size = max_sizes[i];
|
||||||
|
@ -127,13 +127,13 @@ void RNNImplBase<Derived>::reset() {
|
|||||||
layer_params.emplace_back(w_hr);
|
layer_params.emplace_back(w_hr);
|
||||||
param_names.emplace_back("weight_hr_l{layer}{suffix}");
|
param_names.emplace_back("weight_hr_l{layer}{suffix}");
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < param_names.size(); i++) { // NOLINT(modernize-loop-convert)
|
for(const auto i : c10::irange(param_names.size())) { // NOLINT(modernize-loop-convert)
|
||||||
std::string x = std::regex_replace(param_names[i], std::regex("\\{layer\\}"), c10::str(layer));
|
std::string x = std::regex_replace(param_names[i], std::regex("\\{layer\\}"), c10::str(layer));
|
||||||
x = std::regex_replace(x, std::regex("\\{suffix\\}"), c10::str(suffix));
|
x = std::regex_replace(x, std::regex("\\{suffix\\}"), c10::str(suffix));
|
||||||
param_names[i] = x;
|
param_names[i] = x;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < param_names.size(); i++) {
|
for(const auto i : c10::irange(param_names.size())) {
|
||||||
auto name = param_names[i];
|
auto name = param_names[i];
|
||||||
auto param = layer_params[i];
|
auto param = layer_params[i];
|
||||||
this->register_parameter(name, param);
|
this->register_parameter(name, param);
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
#include <torch/optim/serialize.h>
|
#include <torch/optim/serialize.h>
|
||||||
|
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
|
||||||
@ -136,7 +137,7 @@ void Adagrad::load(serialize::InputArchive& archive) {
|
|||||||
torch::optim::serialize(archive, "step_buffers", step_buffers);
|
torch::optim::serialize(archive, "step_buffers", step_buffers);
|
||||||
// since there were no param_groups prior to version 1.5.0, assuming all tensors are now in one param_group
|
// since there were no param_groups prior to version 1.5.0, assuming all tensors are now in one param_group
|
||||||
std::vector<Tensor> params = param_groups_.at(0).params();
|
std::vector<Tensor> params = param_groups_.at(0).params();
|
||||||
for (size_t idx = 0; idx < params.size(); idx++) {
|
for(const auto idx : c10::irange(params.size())) {
|
||||||
auto state = std::make_unique<AdagradParamState>();
|
auto state = std::make_unique<AdagradParamState>();
|
||||||
state->step(step_buffers[idx]);
|
state->step(step_buffers[idx]);
|
||||||
state->sum(sum_buffers[idx]);
|
state->sum(sum_buffers[idx]);
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
#include <torch/utils.h>
|
#include <torch/utils.h>
|
||||||
|
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
@ -161,7 +162,7 @@ void Adam::load(serialize::InputArchive& archive) {
|
|||||||
torch::optim::serialize(archive, "max_exp_average_sq_buffers", max_exp_average_sq_buffers);
|
torch::optim::serialize(archive, "max_exp_average_sq_buffers", max_exp_average_sq_buffers);
|
||||||
// since there were no param_groups prior to version 1.5.0, assuming all tensors are now in one param_group
|
// since there were no param_groups prior to version 1.5.0, assuming all tensors are now in one param_group
|
||||||
std::vector<Tensor> params = param_groups_.at(0).params();
|
std::vector<Tensor> params = param_groups_.at(0).params();
|
||||||
for (size_t idx = 0; idx < step_buffers.size(); idx++) {
|
for(const auto idx : c10::irange(step_buffers.size())) {
|
||||||
auto state = std::make_unique<AdamParamState>();
|
auto state = std::make_unique<AdamParamState>();
|
||||||
state->step(step_buffers.at(idx));
|
state->step(step_buffers.at(idx));
|
||||||
state->exp_avg(exp_average_buffers.at(idx));
|
state->exp_avg(exp_average_buffers.at(idx));
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
#include <torch/utils.h>
|
#include <torch/utils.h>
|
||||||
|
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
@ -163,7 +164,7 @@ void AdamW::load(serialize::InputArchive& archive) {
|
|||||||
torch::optim::serialize(archive, "max_exp_average_sq_buffers", max_exp_average_sq_buffers);
|
torch::optim::serialize(archive, "max_exp_average_sq_buffers", max_exp_average_sq_buffers);
|
||||||
// since there were no param_groups prior to version 1.5.0, assuming all tensors are now in one param_group
|
// since there were no param_groups prior to version 1.5.0, assuming all tensors are now in one param_group
|
||||||
std::vector<Tensor> params = param_groups_.at(0).params();
|
std::vector<Tensor> params = param_groups_.at(0).params();
|
||||||
for (size_t idx = 0; idx < step_buffers.size(); idx++) {
|
for(const auto idx : c10::irange(step_buffers.size())) {
|
||||||
auto state = std::make_unique<AdamWParamState>();
|
auto state = std::make_unique<AdamWParamState>();
|
||||||
state->step(step_buffers.at(idx));
|
state->step(step_buffers.at(idx));
|
||||||
state->exp_avg(exp_average_buffers.at(idx));
|
state->exp_avg(exp_average_buffers.at(idx));
|
||||||
|
@ -59,7 +59,7 @@ void LBFGSOptions::set_lr(const double lr) {
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
bool if_container_equal(T lhs, T rhs) {
|
bool if_container_equal(T lhs, T rhs) {
|
||||||
if (!(lhs.size() == rhs.size())) return false;
|
if (!(lhs.size() == rhs.size())) return false;
|
||||||
for (size_t i = 0; i < lhs.size(); i++) {
|
for(const auto i : c10::irange(lhs.size())) {
|
||||||
if (!torch::equal(lhs.at(i), rhs.at(i))) return false;
|
if (!torch::equal(lhs.at(i), rhs.at(i))) return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -154,7 +154,7 @@ void LBFGS::_add_grad(const double step_size, const Tensor& update) {
|
|||||||
void LBFGS::_set_param(const std::vector<Tensor>& params_data) {
|
void LBFGS::_set_param(const std::vector<Tensor>& params_data) {
|
||||||
auto& _params = param_groups_.at(0).params();
|
auto& _params = param_groups_.at(0).params();
|
||||||
TORCH_INTERNAL_ASSERT(params_data.size() == _params.size());
|
TORCH_INTERNAL_ASSERT(params_data.size() == _params.size());
|
||||||
for (size_t i = 0; i < _params.size(); i++) {
|
for(const auto i : c10::irange(_params.size())) {
|
||||||
_params.at(i).copy_(params_data.at(i));
|
_params.at(i).copy_(params_data.at(i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
#include <torch/utils.h>
|
#include <torch/utils.h>
|
||||||
|
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
|
||||||
@ -159,7 +160,7 @@ void RMSprop::load(serialize::InputArchive& archive) {
|
|||||||
torch::optim::serialize(archive, "grad_average_buffers", grad_average_buffers);
|
torch::optim::serialize(archive, "grad_average_buffers", grad_average_buffers);
|
||||||
// since there were no param_groups prior to version 1.5.0, assuming all tensors are now in one param_group
|
// since there were no param_groups prior to version 1.5.0, assuming all tensors are now in one param_group
|
||||||
std::vector<Tensor> params = param_groups_.at(0).params();
|
std::vector<Tensor> params = param_groups_.at(0).params();
|
||||||
for (size_t idx = 0; idx < square_average_buffers.size(); idx++) {
|
for(const auto idx : c10::irange(square_average_buffers.size())) {
|
||||||
auto state = std::make_unique<RMSpropParamState>();
|
auto state = std::make_unique<RMSpropParamState>();
|
||||||
state->square_avg(square_average_buffers[idx]);
|
state->square_avg(square_average_buffers[idx]);
|
||||||
if(idx < momentum_buffers.size()) {
|
if(idx < momentum_buffers.size()) {
|
||||||
|
@ -8,6 +8,7 @@
|
|||||||
#include <torch/utils.h>
|
#include <torch/utils.h>
|
||||||
|
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
|
||||||
@ -123,7 +124,7 @@ void SGD::load(serialize::InputArchive& archive) {
|
|||||||
torch::optim::serialize(archive, "momentum_buffers", momentum_buffers);
|
torch::optim::serialize(archive, "momentum_buffers", momentum_buffers);
|
||||||
// since there were no param_groups prior to version 1.5.0, assuming all tensors are now in one param_group
|
// since there were no param_groups prior to version 1.5.0, assuming all tensors are now in one param_group
|
||||||
std::vector<Tensor> params = param_groups_.at(0).params();
|
std::vector<Tensor> params = param_groups_.at(0).params();
|
||||||
for (size_t idx = 0; idx < momentum_buffers.size(); idx++) {
|
for(const auto idx : c10::irange(momentum_buffers.size())) {
|
||||||
auto state = std::make_unique<SGDParamState>();
|
auto state = std::make_unique<SGDParamState>();
|
||||||
state->momentum_buffer(momentum_buffers[idx]);
|
state->momentum_buffer(momentum_buffers[idx]);
|
||||||
state_[c10::guts::to_string(params[idx].unsafeGetTensorImpl())] = std::move(state);
|
state_[c10::guts::to_string(params[idx].unsafeGetTensorImpl())] = std::move(state);
|
||||||
|
@ -339,7 +339,7 @@ Tensor permute_backwards(const Tensor & grad, IntArrayRef fwd_dims) {
|
|||||||
// invert the permutation
|
// invert the permutation
|
||||||
auto ndims = fwd_dims.size();
|
auto ndims = fwd_dims.size();
|
||||||
std::vector<int64_t> dims(ndims);
|
std::vector<int64_t> dims(ndims);
|
||||||
for (size_t i = 0; i < ndims; i++) {
|
for(const auto i : c10::irange(ndims)) {
|
||||||
dims[at::maybe_wrap_dim(fwd_dims[i], ndims)] = i;
|
dims[at::maybe_wrap_dim(fwd_dims[i], ndims)] = i;
|
||||||
}
|
}
|
||||||
return grad.permute(dims);
|
return grad.permute(dims);
|
||||||
@ -358,7 +358,7 @@ Tensor deg2rad_backward(const Tensor& grad) {
|
|||||||
Tensor unsqueeze_multiple(const Tensor & t, IntArrayRef dim, size_t n_dims) {
|
Tensor unsqueeze_multiple(const Tensor & t, IntArrayRef dim, size_t n_dims) {
|
||||||
auto dims_to_unsqueeze = at::dim_list_to_bitset(dim, n_dims);
|
auto dims_to_unsqueeze = at::dim_list_to_bitset(dim, n_dims);
|
||||||
Tensor res = t;
|
Tensor res = t;
|
||||||
for (size_t i = 0; i < n_dims; i++){
|
for(const auto i : c10::irange(n_dims)){
|
||||||
if (dims_to_unsqueeze[i]) {
|
if (dims_to_unsqueeze[i]) {
|
||||||
res = res.unsqueeze(i);
|
res = res.unsqueeze(i);
|
||||||
}
|
}
|
||||||
|
@ -6,6 +6,8 @@
|
|||||||
#include <torch/csrc/autograd/function.h>
|
#include <torch/csrc/autograd/function.h>
|
||||||
#include <torch/csrc/autograd/functions/basic_ops.h>
|
#include <torch/csrc/autograd/functions/basic_ops.h>
|
||||||
|
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace torch {
|
namespace torch {
|
||||||
namespace autograd {
|
namespace autograd {
|
||||||
|
|
||||||
@ -74,7 +76,7 @@ variable_list run_backward(
|
|||||||
size_t num_tensors = outputs.size();
|
size_t num_tensors = outputs.size();
|
||||||
edge_list roots;
|
edge_list roots;
|
||||||
roots.reserve(num_tensors);
|
roots.reserve(num_tensors);
|
||||||
for (size_t i = 0; i < num_tensors; i++) {
|
for(const auto i : c10::irange(num_tensors)) {
|
||||||
const Variable& output = outputs[i];
|
const Variable& output = outputs[i];
|
||||||
auto gradient_edge = impl::gradient_edge(output);
|
auto gradient_edge = impl::gradient_edge(output);
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
#include <c10/core/Stream.h>
|
#include <c10/core/Stream.h>
|
||||||
#include <c10/core/Event.h>
|
#include <c10/core/Event.h>
|
||||||
#include <c10/core/DeviceGuard.h>
|
#include <c10/core/DeviceGuard.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <c10/util/Optional.h>
|
#include <c10/util/Optional.h>
|
||||||
#include <c10/util/ThreadLocal.h>
|
#include <c10/util/ThreadLocal.h>
|
||||||
#include <c10/core/StreamGuard.h>
|
#include <c10/core/StreamGuard.h>
|
||||||
@ -602,7 +603,7 @@ void set_device(int device) {
|
|||||||
// Don't use DeviceGuard here because its destructor may be called before the
|
// Don't use DeviceGuard here because its destructor may be called before the
|
||||||
// device is reset. This is fine because the device is thread local.
|
// device is reset. This is fine because the device is thread local.
|
||||||
if (device != CPU_DEVICE) {
|
if (device != CPU_DEVICE) {
|
||||||
for (size_t i = 0; i < static_cast<size_t>(c10::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES); i++) {
|
for(const auto i : c10::irange(static_cast<size_t>(c10::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES))) {
|
||||||
auto* impl = c10::impl::device_guard_impl_registry[i].load();
|
auto* impl = c10::impl::device_guard_impl_registry[i].load();
|
||||||
if (impl && device < impl->deviceCount()) {
|
if (impl && device < impl->deviceCount()) {
|
||||||
impl->setDevice(at::Device(static_cast<c10::DeviceType>(i), device));
|
impl->setDevice(at::Device(static_cast<c10::DeviceType>(i), device));
|
||||||
@ -622,7 +623,7 @@ void validate_outputs(
|
|||||||
ss << edges.size() << ", but got " << grads.size();
|
ss << edges.size() << ", but got " << grads.size();
|
||||||
AT_ERROR(format_error(ss.str()));
|
AT_ERROR(format_error(ss.str()));
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < grads.size(); i++) {
|
for(const auto i : c10::irange(grads.size())) {
|
||||||
const auto& edge = edges[i];
|
const auto& edge = edges[i];
|
||||||
if (!edge.is_valid()) continue;
|
if (!edge.is_valid()) continue;
|
||||||
|
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
#include <torch/csrc/autograd/variable.h>
|
#include <torch/csrc/autograd/variable.h>
|
||||||
|
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
@ -92,7 +93,7 @@ auto CopySlices::apply(variable_list&& inputs) -> variable_list {
|
|||||||
auto res = (*fn)({ grad_slice.clone(at::MemoryFormat::Contiguous) });
|
auto res = (*fn)({ grad_slice.clone(at::MemoryFormat::Contiguous) });
|
||||||
|
|
||||||
variable_list grad_inputs(num_outputs());
|
variable_list grad_inputs(num_outputs());
|
||||||
for (size_t i = 0; i < res.size(); i++) {
|
for(const auto i : c10::irange(res.size())) {
|
||||||
if (should_compute_output(i)) {
|
if (should_compute_output(i)) {
|
||||||
AT_ASSERT(res[i].defined());
|
AT_ASSERT(res[i].defined());
|
||||||
if (i == 0) {
|
if (i == 0) {
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
#include <torch/csrc/autograd/python_hook.h>
|
#include <torch/csrc/autograd/python_hook.h>
|
||||||
|
|
||||||
#include <sstream>
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <pybind11/pybind11.h>
|
#include <pybind11/pybind11.h>
|
||||||
#include <torch/csrc/THP.h>
|
#include <torch/csrc/THP.h>
|
||||||
#include <torch/csrc/autograd/python_variable.h>
|
#include <torch/csrc/autograd/python_variable.h>
|
||||||
@ -9,6 +8,8 @@
|
|||||||
#include <torch/csrc/utils/python_strings.h>
|
#include <torch/csrc/utils/python_strings.h>
|
||||||
#include <torch/csrc/Exceptions.h>
|
#include <torch/csrc/Exceptions.h>
|
||||||
|
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
using torch::autograd::variable_list;
|
using torch::autograd::variable_list;
|
||||||
using torch::autograd::Variable;
|
using torch::autograd::Variable;
|
||||||
|
|
||||||
@ -113,7 +114,7 @@ static PyObject *wrap_variables(const variable_list& c_variables)
|
|||||||
|
|
||||||
static variable_list unwrap_variables(PyObject* py_variables) {
|
static variable_list unwrap_variables(PyObject* py_variables) {
|
||||||
variable_list results(PyTuple_GET_SIZE(py_variables));
|
variable_list results(PyTuple_GET_SIZE(py_variables));
|
||||||
for (size_t i = 0; i < results.size(); i++) {
|
for(const auto i : c10::irange(results.size())) {
|
||||||
PyObject* item = PyTuple_GET_ITEM(py_variables, i);
|
PyObject* item = PyTuple_GET_ITEM(py_variables, i);
|
||||||
if (item == Py_None) {
|
if (item == Py_None) {
|
||||||
continue;
|
continue;
|
||||||
|
@ -11,6 +11,7 @@
|
|||||||
#include <ATen/WrapDimUtils.h>
|
#include <ATen/WrapDimUtils.h>
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <c10/util/Optional.h>
|
#include <c10/util/Optional.h>
|
||||||
#include <torch/csrc/autograd/variable.h>
|
#include <torch/csrc/autograd/variable.h>
|
||||||
|
|
||||||
@ -73,7 +74,7 @@ static inline std::vector<Tensor>& _broadcast_out_impl(
|
|||||||
std::vector<Tensor>& broadcast_out(
|
std::vector<Tensor>& broadcast_out(
|
||||||
const Tensor& tensor,
|
const Tensor& tensor,
|
||||||
std::vector<Tensor>& out_tensors) {
|
std::vector<Tensor>& out_tensors) {
|
||||||
for (size_t i = 0; i < out_tensors.size(); i++) {
|
for(const auto i : c10::irange(out_tensors.size())) {
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
out_tensors[i].is_cuda(),
|
out_tensors[i].is_cuda(),
|
||||||
"Expected all output tensors to be CUDA tensors, but output tensor at index ",
|
"Expected all output tensors to be CUDA tensors, but output tensor at index ",
|
||||||
@ -240,7 +241,7 @@ std::vector<at::Tensor>& scatter_out(
|
|||||||
int64_t total_size = 0;
|
int64_t total_size = 0;
|
||||||
std::vector<int64_t> chunk_sizes;
|
std::vector<int64_t> chunk_sizes;
|
||||||
chunk_sizes.reserve(out_tensors.size());
|
chunk_sizes.reserve(out_tensors.size());
|
||||||
for (size_t i = 0; i < out_tensors.size(); i++) {
|
for(const auto i : c10::irange(out_tensors.size())) {
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
out_tensors[i].is_cuda(),
|
out_tensors[i].is_cuda(),
|
||||||
"Expected all output tensors to be CUDA tensors, but output tensor at index ",
|
"Expected all output tensors to be CUDA tensors, but output tensor at index ",
|
||||||
@ -283,7 +284,7 @@ std::vector<at::Tensor>& scatter_out(
|
|||||||
auto chunks =
|
auto chunks =
|
||||||
tensor.split_with_sizes(/*split_sizes=*/chunk_sizes, /*dim=*/dim);
|
tensor.split_with_sizes(/*split_sizes=*/chunk_sizes, /*dim=*/dim);
|
||||||
at::cuda::OptionalCUDAStreamGuard cuda_guard;
|
at::cuda::OptionalCUDAStreamGuard cuda_guard;
|
||||||
for (size_t i = 0; i < chunks.size(); i++) {
|
for(const auto i : c10::irange(chunks.size())) {
|
||||||
if (i < (streams ? streams->size() : 0U) && (*streams)[i]) {
|
if (i < (streams ? streams->size() : 0U) && (*streams)[i]) {
|
||||||
const auto device_index =
|
const auto device_index =
|
||||||
static_cast<int16_t>(out_tensors[i].get_device());
|
static_cast<int16_t>(out_tensors[i].get_device());
|
||||||
@ -379,7 +380,7 @@ static inline at::Tensor& _gather_out_impl(
|
|||||||
}
|
}
|
||||||
auto chunks =
|
auto chunks =
|
||||||
out_tensor.split_with_sizes(/*split_sizes=*/chunk_sizes, /*dim=*/dim);
|
out_tensor.split_with_sizes(/*split_sizes=*/chunk_sizes, /*dim=*/dim);
|
||||||
for (size_t i = 0; i < tensors.size(); i++) {
|
for(const auto i : c10::irange(tensors.size())) {
|
||||||
chunks[i].copy_(tensors[i], /*non_blocking=*/out_tensor.is_cuda());
|
chunks[i].copy_(tensors[i], /*non_blocking=*/out_tensor.is_cuda());
|
||||||
}
|
}
|
||||||
return out_tensor;
|
return out_tensor;
|
||||||
@ -395,7 +396,7 @@ at::Tensor& gather_out(
|
|||||||
const auto first_size = first.sizes();
|
const auto first_size = first.sizes();
|
||||||
dim = at::maybe_wrap_dim(dim, first);
|
dim = at::maybe_wrap_dim(dim, first);
|
||||||
std::vector<int64_t> expected_size(first_size.begin(), first_size.end());
|
std::vector<int64_t> expected_size(first_size.begin(), first_size.end());
|
||||||
for (size_t i = 0; i < tensors.size(); i++) {
|
for(const auto i : c10::irange(tensors.size())) {
|
||||||
const auto& tensor = tensors[i];
|
const auto& tensor = tensors[i];
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
tensor.is_cuda(),
|
tensor.is_cuda(),
|
||||||
@ -450,7 +451,7 @@ at::Tensor gather(
|
|||||||
dim = at::maybe_wrap_dim(dim, first);
|
dim = at::maybe_wrap_dim(dim, first);
|
||||||
std::vector<int64_t> expected_size(first_size.begin(), first_size.end());
|
std::vector<int64_t> expected_size(first_size.begin(), first_size.end());
|
||||||
auto memory_format = first.suggest_memory_format();
|
auto memory_format = first.suggest_memory_format();
|
||||||
for (size_t i = 0; i < tensors.size(); i++) {
|
for(const auto i : c10::irange(tensors.size())) {
|
||||||
const auto& tensor = tensors[i];
|
const auto& tensor = tensors[i];
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
tensor.is_cuda(),
|
tensor.is_cuda(),
|
||||||
|
@ -258,7 +258,7 @@ void check_inputs(
|
|||||||
int64_t numel = inputs[0].numel();
|
int64_t numel = inputs[0].numel();
|
||||||
auto dtype = inputs[0].scalar_type();
|
auto dtype = inputs[0].scalar_type();
|
||||||
|
|
||||||
for (size_t i = 0; i < len; i++) {
|
for(const auto i : c10::irange(len)) {
|
||||||
auto input = inputs[i];
|
auto input = inputs[i];
|
||||||
auto output = outputs[i];
|
auto output = outputs[i];
|
||||||
|
|
||||||
@ -289,7 +289,7 @@ void check_inputs(
|
|||||||
int64_t numel = inputs[0].numel();
|
int64_t numel = inputs[0].numel();
|
||||||
auto dtype = inputs[0].scalar_type();
|
auto dtype = inputs[0].scalar_type();
|
||||||
|
|
||||||
for (size_t i = 0; i < len; i++) {
|
for(const auto i : c10::irange(len)) {
|
||||||
auto input = inputs[i];
|
auto input = inputs[i];
|
||||||
|
|
||||||
check_tensor(
|
check_tensor(
|
||||||
@ -465,7 +465,7 @@ void reduce(
|
|||||||
|
|
||||||
AutoNcclGroup nccl_group_guard;
|
AutoNcclGroup nccl_group_guard;
|
||||||
at::cuda::OptionalCUDAGuard device_guard;
|
at::cuda::OptionalCUDAGuard device_guard;
|
||||||
for (size_t i = 0; i < len; i++) {
|
for(const auto i : c10::irange(len)) {
|
||||||
int device = inputs[i].device().index();
|
int device = inputs[i].device().index();
|
||||||
device_guard.set_index(device);
|
device_guard.set_index(device);
|
||||||
// Default to the current stream
|
// Default to the current stream
|
||||||
@ -517,7 +517,7 @@ void all_reduce(
|
|||||||
|
|
||||||
AutoNcclGroup nccl_group_guard;
|
AutoNcclGroup nccl_group_guard;
|
||||||
at::cuda::OptionalCUDAGuard device_guard;
|
at::cuda::OptionalCUDAGuard device_guard;
|
||||||
for (size_t i = 0; i < len; i++) {
|
for(const auto i : c10::irange(len)) {
|
||||||
int device = inputs[i].device().index();
|
int device = inputs[i].device().index();
|
||||||
device_guard.set_index(device);
|
device_guard.set_index(device);
|
||||||
// Default to the current stream
|
// Default to the current stream
|
||||||
@ -559,7 +559,7 @@ void reduce_scatter(
|
|||||||
|
|
||||||
AutoNcclGroup nccl_group_guard;
|
AutoNcclGroup nccl_group_guard;
|
||||||
at::cuda::OptionalCUDAGuard device_guard;
|
at::cuda::OptionalCUDAGuard device_guard;
|
||||||
for (size_t i = 0; i < len; i++) {
|
for(const auto i : c10::irange(len)) {
|
||||||
int device = inputs[i].device().index();
|
int device = inputs[i].device().index();
|
||||||
device_guard.set_index(device);
|
device_guard.set_index(device);
|
||||||
// Default to the current stream
|
// Default to the current stream
|
||||||
@ -600,7 +600,7 @@ void all_gather(
|
|||||||
|
|
||||||
AutoNcclGroup nccl_group_guard;
|
AutoNcclGroup nccl_group_guard;
|
||||||
at::cuda::OptionalCUDAGuard device_guard;
|
at::cuda::OptionalCUDAGuard device_guard;
|
||||||
for (size_t i = 0; i < len; i++) {
|
for(const auto i : c10::irange(len)) {
|
||||||
int device = inputs[i].device().index();
|
int device = inputs[i].device().index();
|
||||||
device_guard.set_index(device);
|
device_guard.set_index(device);
|
||||||
// Default to the current stream
|
// Default to the current stream
|
||||||
@ -728,7 +728,7 @@ void all2all(std::vector<at::Tensor>& outputTensors,
|
|||||||
auto comm = to_nccl_comm(_comm);
|
auto comm = to_nccl_comm(_comm);
|
||||||
|
|
||||||
NCCL_CHECK(ncclGroupStart());
|
NCCL_CHECK(ncclGroupStart());
|
||||||
for (size_t r = 0; r < outputTensors.size(); r++) {
|
for(const auto r : c10::irange(outputTensors.size())) {
|
||||||
at::Tensor &input = inputTensors[r];
|
at::Tensor &input = inputTensors[r];
|
||||||
at::Tensor &output = outputTensors[r];
|
at::Tensor &output = outputTensors[r];
|
||||||
if (input.numel() != 0) {
|
if (input.numel() != 0) {
|
||||||
|
@ -1,8 +1,11 @@
|
|||||||
#include <ATen/Parallel.h>
|
#include <ATen/Parallel.h>
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/deploy/deploy.h>
|
#include <torch/csrc/deploy/deploy.h>
|
||||||
#include <torch/script.h>
|
#include <torch/script.h>
|
||||||
#include <torch/torch.h>
|
#include <torch/torch.h>
|
||||||
|
|
||||||
#include <future>
|
#include <future>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <string>
|
#include <string>
|
||||||
@ -82,7 +85,7 @@ TEST(TorchpyTest, MultiSerialSimpleModel) {
|
|||||||
size_t ninterp = 3;
|
size_t ninterp = 3;
|
||||||
std::vector<at::Tensor> outputs;
|
std::vector<at::Tensor> outputs;
|
||||||
|
|
||||||
for (size_t i = 0; i < ninterp; i++) {
|
for (const auto i : c10::irange(ninterp)) {
|
||||||
outputs.push_back(model({input.alias()}).toTensor());
|
outputs.push_back(model({input.alias()}).toTensor());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -90,7 +93,7 @@ TEST(TorchpyTest, MultiSerialSimpleModel) {
|
|||||||
auto ref_output = ref_model.forward({input.alias()}).toTensor();
|
auto ref_output = ref_model.forward({input.alias()}).toTensor();
|
||||||
|
|
||||||
// Compare all to reference
|
// Compare all to reference
|
||||||
for (size_t i = 0; i < ninterp; i++) {
|
for (const auto i : c10::irange(ninterp)) {
|
||||||
ASSERT_TRUE(ref_output.equal(outputs[i]));
|
ASSERT_TRUE(ref_output.equal(outputs[i]));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -121,7 +124,7 @@ TEST(TorchpyTest, ThreadedSimpleModel) {
|
|||||||
std::vector<at::Tensor> outputs;
|
std::vector<at::Tensor> outputs;
|
||||||
|
|
||||||
std::vector<std::future<at::Tensor>> futures;
|
std::vector<std::future<at::Tensor>> futures;
|
||||||
for (size_t i = 0; i < nthreads; i++) {
|
for (const auto i : c10::irange(nthreads)) {
|
||||||
futures.push_back(std::async(std::launch::async, [&model]() {
|
futures.push_back(std::async(std::launch::async, [&model]() {
|
||||||
auto input = torch::ones({10, 20});
|
auto input = torch::ones({10, 20});
|
||||||
for (int i = 0; i < 100; ++i) {
|
for (int i = 0; i < 100; ++i) {
|
||||||
@ -131,7 +134,7 @@ TEST(TorchpyTest, ThreadedSimpleModel) {
|
|||||||
return result;
|
return result;
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < nthreads; i++) {
|
for (const auto i : c10::irange(nthreads)) {
|
||||||
outputs.push_back(futures[i].get());
|
outputs.push_back(futures[i].get());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -139,7 +142,7 @@ TEST(TorchpyTest, ThreadedSimpleModel) {
|
|||||||
auto ref_output = ref_model.forward({input.alias()}).toTensor();
|
auto ref_output = ref_model.forward({input.alias()}).toTensor();
|
||||||
|
|
||||||
// Compare all to reference
|
// Compare all to reference
|
||||||
for (size_t i = 0; i < nthreads; i++) {
|
for (const auto i : c10::irange(nthreads)) {
|
||||||
ASSERT_TRUE(ref_output.equal(outputs[i]));
|
ASSERT_TRUE(ref_output.equal(outputs[i]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
#include <ATen/Parallel.h>
|
#include <ATen/Parallel.h>
|
||||||
#include <c10/core/Event.h>
|
#include <c10/core/Event.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/autograd/functions/accumulate_grad.h>
|
#include <torch/csrc/autograd/functions/accumulate_grad.h>
|
||||||
#include <torch/csrc/autograd/input_buffer.h>
|
#include <torch/csrc/autograd/input_buffer.h>
|
||||||
#include <torch/csrc/distributed/autograd/context/container.h>
|
#include <torch/csrc/distributed/autograd/context/container.h>
|
||||||
@ -97,7 +98,7 @@ void DistEngine::globalCpuThread(
|
|||||||
variables =
|
variables =
|
||||||
InputBuffer::variables(std::move(task.inputs_))]() mutable {
|
InputBuffer::variables(std::move(task.inputs_))]() mutable {
|
||||||
InputBuffer inputs(variables.size());
|
InputBuffer inputs(variables.size());
|
||||||
for (size_t i = 0; i < variables.size(); i++) {
|
for(const auto i : c10::irange(variables.size())) {
|
||||||
inputs.add(i, std::move(variables[i]), c10::nullopt, c10::nullopt);
|
inputs.add(i, std::move(variables[i]), c10::nullopt, c10::nullopt);
|
||||||
}
|
}
|
||||||
execute_graph_task_until_ready_queue_empty(
|
execute_graph_task_until_ready_queue_empty(
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include <torch/csrc/distributed/autograd/functions/recvrpc_backward.h>
|
#include <torch/csrc/distributed/autograd/functions/recvrpc_backward.h>
|
||||||
#include <ATen/core/functional.h>
|
#include <ATen/core/functional.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/distributed/autograd/rpc_messages/propagate_gradients_req.h>
|
#include <torch/csrc/distributed/autograd/rpc_messages/propagate_gradients_req.h>
|
||||||
#include <torch/csrc/distributed/rpc/rpc_agent.h>
|
#include <torch/csrc/distributed/rpc/rpc_agent.h>
|
||||||
|
|
||||||
@ -23,7 +24,7 @@ RecvRpcBackward::RecvRpcBackward(
|
|||||||
|
|
||||||
variable_list RecvRpcBackward::apply(variable_list&& grads) {
|
variable_list RecvRpcBackward::apply(variable_list&& grads) {
|
||||||
std::vector<Variable> outputGrads;
|
std::vector<Variable> outputGrads;
|
||||||
for (size_t i = 0; i < grads.size(); i++) {
|
for(const auto i : c10::irange(grads.size())) {
|
||||||
const auto& grad = grads[i];
|
const auto& grad = grads[i];
|
||||||
if (grad.defined()) {
|
if (grad.defined()) {
|
||||||
outputGrads.emplace_back(grad);
|
outputGrads.emplace_back(grad);
|
||||||
|
@ -2,6 +2,8 @@
|
|||||||
#include <torch/csrc/distributed/rpc/rpc_agent.h>
|
#include <torch/csrc/distributed/rpc/rpc_agent.h>
|
||||||
#include <torch/csrc/jit/serialization/pickle.h>
|
#include <torch/csrc/jit/serialization/pickle.h>
|
||||||
|
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace torch {
|
namespace torch {
|
||||||
namespace distributed {
|
namespace distributed {
|
||||||
namespace autograd {
|
namespace autograd {
|
||||||
@ -74,7 +76,7 @@ std::unique_ptr<PropagateGradientsReq> PropagateGradientsReq::fromMessage(
|
|||||||
|
|
||||||
// Retrieve the gradient tensors.
|
// Retrieve the gradient tensors.
|
||||||
std::vector<Variable> grads(tupleElements.size());
|
std::vector<Variable> grads(tupleElements.size());
|
||||||
for (size_t i = 0; i < tupleElements.size(); i++) {
|
for(const auto i : c10::irange(tupleElements.size())) {
|
||||||
grads[i] = tupleElements[i].toTensor();
|
grads[i] = tupleElements[i].toTensor();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -23,6 +23,8 @@
|
|||||||
#include <torch/csrc/jit/serialization/pickler.h>
|
#include <torch/csrc/jit/serialization/pickler.h>
|
||||||
#include <torch/csrc/jit/serialization/unpickler.h>
|
#include <torch/csrc/jit/serialization/unpickler.h>
|
||||||
|
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
using namespace torch::autograd::profiler;
|
using namespace torch::autograd::profiler;
|
||||||
|
|
||||||
namespace torch {
|
namespace torch {
|
||||||
@ -377,7 +379,7 @@ std::string wireSerialize(
|
|||||||
pickler.stop();
|
pickler.stop();
|
||||||
tensorData = pickler.tensorData();
|
tensorData = pickler.tensorData();
|
||||||
entries.push_back({kMeta, metaEntry.data(), metaEntry.size()});
|
entries.push_back({kMeta, metaEntry.data(), metaEntry.size()});
|
||||||
for (size_t i = 0; i < tensorData.size(); i++) {
|
for (const auto i : c10::irange(tensorData.size())) {
|
||||||
// Construct WritableTensorData for each tensor in the pickler tensorData
|
// Construct WritableTensorData for each tensor in the pickler tensorData
|
||||||
// Since tensorData is in function scope, and getWritableTensorData just
|
// Since tensorData is in function scope, and getWritableTensorData just
|
||||||
// record the tensors, the data() pointers stay valid for CPU tensors
|
// record the tensors, the data() pointers stay valid for CPU tensors
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <torch/csrc/jit/codegen/cuda/arith.h>
|
#include <torch/csrc/jit/codegen/cuda/arith.h>
|
||||||
|
|
||||||
#include <c10/util/Exception.h>
|
#include <c10/util/Exception.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
|
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
|
||||||
#include <torch/csrc/jit/codegen/cuda/type.h>
|
#include <torch/csrc/jit/codegen/cuda/type.h>
|
||||||
|
|
||||||
@ -61,7 +62,7 @@ TensorView* newOutputTV(const std::vector<Val*>& vals, DataType dtype) {
|
|||||||
dom.size(),
|
dom.size(),
|
||||||
" dimensions but expected ",
|
" dimensions but expected ",
|
||||||
out_domain.size());
|
out_domain.size());
|
||||||
for (size_t i = 0; i < dom.size(); i++) {
|
for (const auto i : c10::irange(dom.size())) {
|
||||||
if (out_domain[i] != nullptr)
|
if (out_domain[i] != nullptr)
|
||||||
continue;
|
continue;
|
||||||
if (dom[i]->isBroadcast())
|
if (dom[i]->isBroadcast())
|
||||||
@ -69,7 +70,7 @@ TensorView* newOutputTV(const std::vector<Val*>& vals, DataType dtype) {
|
|||||||
out_domain[i] = new IterDomain(dom[i]->start(), dom[i]->extent());
|
out_domain[i] = new IterDomain(dom[i]->start(), dom[i]->extent());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (size_t dim_i = 0; dim_i < out_domain.size(); dim_i++) {
|
for (const auto dim_i : c10::irange(out_domain.size())) {
|
||||||
if (out_domain[dim_i] == nullptr) {
|
if (out_domain[dim_i] == nullptr) {
|
||||||
IterType itype = IterType::BroadcastWithoutStride;
|
IterType itype = IterType::BroadcastWithoutStride;
|
||||||
for (const auto tv : tvs) {
|
for (const auto tv : tvs) {
|
||||||
@ -103,7 +104,7 @@ std::vector<Val*> maybeBroadcast(const std::vector<Val*>& vals) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < vals.size(); i++) {
|
for (const auto i : c10::irange(vals.size())) {
|
||||||
if (vals[i]->getValType().value() == ValType::TensorView) {
|
if (vals[i]->getValType().value() == ValType::TensorView) {
|
||||||
auto tv = vals[i]->as<TensorView>();
|
auto tv = vals[i]->as<TensorView>();
|
||||||
size_t tv_dims = TensorDomain::noReductions(tv->getRootDomain()).size();
|
size_t tv_dims = TensorDomain::noReductions(tv->getRootDomain()).size();
|
||||||
@ -413,7 +414,7 @@ static TensorView* newForReduction(
|
|||||||
"Error setting up reduction, reduction axis is outside nDims. Keep in mind reductions are relative to root domains, not modified views.");
|
"Error setting up reduction, reduction axis is outside nDims. Keep in mind reductions are relative to root domains, not modified views.");
|
||||||
|
|
||||||
auto axis_iter = axes_set.begin();
|
auto axis_iter = axes_set.begin();
|
||||||
for (size_t dim = 0; dim < orig_domain.size(); dim++) {
|
for (const auto dim : c10::irange(orig_domain.size())) {
|
||||||
bool isReduction = false;
|
bool isReduction = false;
|
||||||
if (axis_iter != axes_set.end() && *axis_iter == dim) {
|
if (axis_iter != axes_set.end() && *axis_iter == dim) {
|
||||||
isReduction = true;
|
isReduction = true;
|
||||||
|
@ -6,6 +6,8 @@
|
|||||||
#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
|
#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
|
||||||
#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
|
#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
|
||||||
|
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace torch {
|
namespace torch {
|
||||||
namespace jit {
|
namespace jit {
|
||||||
namespace fuser {
|
namespace fuser {
|
||||||
@ -135,7 +137,7 @@ T1 tvIterable(const T2& val_iterable) {
|
|||||||
std::deque<std::deque<TensorView*>> tvChains(
|
std::deque<std::deque<TensorView*>> tvChains(
|
||||||
std::deque<std::deque<Val*>> val_chains) {
|
std::deque<std::deque<Val*>> val_chains) {
|
||||||
std::deque<std::deque<TensorView*>> tv_chains(val_chains.size());
|
std::deque<std::deque<TensorView*>> tv_chains(val_chains.size());
|
||||||
for (size_t i = 0; i < val_chains.size(); i++) {
|
for (const auto i : c10::irange(val_chains.size())) {
|
||||||
tv_chains[i] = tvIterable<std::deque<TensorView*>>(val_chains[i]);
|
tv_chains[i] = tvIterable<std::deque<TensorView*>>(val_chains[i]);
|
||||||
}
|
}
|
||||||
return tv_chains;
|
return tv_chains;
|
||||||
|
@ -14,6 +14,7 @@
|
|||||||
#include <c10/core/DeviceGuard.h>
|
#include <c10/core/DeviceGuard.h>
|
||||||
#include <c10/cuda/CUDAFunctions.h>
|
#include <c10/cuda/CUDAFunctions.h>
|
||||||
#include <c10/cuda/CUDAStream.h>
|
#include <c10/cuda/CUDAStream.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
|
||||||
@ -413,7 +414,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
|
|||||||
at::AutoDispatchBelowADInplaceOrView non_variable_type_mode;
|
at::AutoDispatchBelowADInplaceOrView non_variable_type_mode;
|
||||||
// take the short-cut for launch if we see a recorded input set again;
|
// take the short-cut for launch if we see a recorded input set again;
|
||||||
launch_params = executor_entry->launch_params;
|
launch_params = executor_entry->launch_params;
|
||||||
for (size_t i = 0; i < executor_entry->output_sizes.size(); i++) {
|
for (const auto i : c10::irange(executor_entry->output_sizes.size())) {
|
||||||
alloced_outputs.push_back(at::native::empty_cuda(
|
alloced_outputs.push_back(at::native::empty_cuda(
|
||||||
executor_entry->output_sizes[i],
|
executor_entry->output_sizes[i],
|
||||||
executor_entry->output_types[i],
|
executor_entry->output_types[i],
|
||||||
@ -421,7 +422,8 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
|
|||||||
options_.device,
|
options_.device,
|
||||||
c10::nullopt));
|
c10::nullopt));
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < executor_entry->empty_buffer_sizes.size(); i++) {
|
for (const auto i :
|
||||||
|
c10::irange(executor_entry->empty_buffer_sizes.size())) {
|
||||||
global_buffers.empty_buffers.push_back(at::native::empty_cuda(
|
global_buffers.empty_buffers.push_back(at::native::empty_cuda(
|
||||||
executor_entry->empty_buffer_sizes[i],
|
executor_entry->empty_buffer_sizes[i],
|
||||||
executor_entry->empty_buffer_types[i],
|
executor_entry->empty_buffer_types[i],
|
||||||
@ -430,7 +432,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
|
|||||||
c10::nullopt));
|
c10::nullopt));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < executor_entry->zero_buffer_sizes.size(); i++) {
|
for (const auto i : c10::irange(executor_entry->zero_buffer_sizes.size())) {
|
||||||
auto tensor_options = at::TensorOptions()
|
auto tensor_options = at::TensorOptions()
|
||||||
.dtype(executor_entry->zero_buffer_types[i])
|
.dtype(executor_entry->zero_buffer_types[i])
|
||||||
.device(options_.device);
|
.device(options_.device);
|
||||||
|
@ -79,7 +79,7 @@ void KernelArgumentHolder::push(const uint64_t& val) {
|
|||||||
void** KernelArgumentHolder::getBuffer() {
|
void** KernelArgumentHolder::getBuffer() {
|
||||||
if (changed_) {
|
if (changed_) {
|
||||||
void_ptrs_ = std::vector<void*>(arguments_.size(), nullptr);
|
void_ptrs_ = std::vector<void*>(arguments_.size(), nullptr);
|
||||||
for (size_t i = 0; i < arguments_.size(); i++) {
|
for (const auto i : c10::irange(arguments_.size())) {
|
||||||
void_ptrs_[i] = static_cast<void*>(arguments_[i]->arg());
|
void_ptrs_[i] = static_cast<void*>(arguments_[i]->arg());
|
||||||
}
|
}
|
||||||
changed_ = false;
|
changed_ = false;
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
|
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
|
||||||
|
|
||||||
#include <c10/cuda/CUDACachingAllocator.h>
|
#include <c10/cuda/CUDACachingAllocator.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
|
#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
|
||||||
#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
|
#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
|
||||||
@ -230,7 +231,7 @@ StatefulExpressionEvaluator statefulBindInputs(
|
|||||||
|
|
||||||
// This should probably move to EvaluationContext as we may want to bind
|
// This should probably move to EvaluationContext as we may want to bind
|
||||||
// input values frequently. Bind fusion input values to runtime values.
|
// input values frequently. Bind fusion input values to runtime values.
|
||||||
for (size_t i = 0; i < fusion->inputs().size(); i++) {
|
for (const auto i : c10::irange(fusion->inputs().size())) {
|
||||||
if (fusion->inputs()[i]->getValType() == ValType::TensorView) {
|
if (fusion->inputs()[i]->getValType() == ValType::TensorView) {
|
||||||
TensorView* cg_tensor = fusion->inputs()[i]->as<TensorView>();
|
TensorView* cg_tensor = fusion->inputs()[i]->as<TensorView>();
|
||||||
|
|
||||||
@ -244,7 +245,7 @@ StatefulExpressionEvaluator statefulBindInputs(
|
|||||||
aten_tensor.ndimension() == (int64_t)root_dom.size(),
|
aten_tensor.ndimension() == (int64_t)root_dom.size(),
|
||||||
"Something went wrong configuring launch. Inputs no longer match.");
|
"Something went wrong configuring launch. Inputs no longer match.");
|
||||||
|
|
||||||
for (size_t dim = 0; dim < root_dom.size(); dim++) {
|
for (const auto dim : c10::irange(root_dom.size())) {
|
||||||
evaluator.safeBind(
|
evaluator.safeBind(
|
||||||
root_dom[dim]->extent(), aten_tensor.sizes()[dim], lower);
|
root_dom[dim]->extent(), aten_tensor.sizes()[dim], lower);
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <torch/csrc/jit/passes/cuda_graph_fuser.h>
|
#include <torch/csrc/jit/passes/cuda_graph_fuser.h>
|
||||||
|
|
||||||
#include <c10/util/Exception.h>
|
#include <c10/util/Exception.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
|
#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
|
||||||
#include <torch/csrc/jit/codegen/cuda/interface.h>
|
#include <torch/csrc/jit/codegen/cuda/interface.h>
|
||||||
#include <torch/csrc/jit/codegen/cuda/partition.h>
|
#include <torch/csrc/jit/codegen/cuda/partition.h>
|
||||||
@ -509,7 +510,7 @@ struct CudaGraphFuser {
|
|||||||
WithInsertPoint guard(bchunk->next());
|
WithInsertPoint guard(bchunk->next());
|
||||||
|
|
||||||
std::vector<Value*> producer_chunk_outputs;
|
std::vector<Value*> producer_chunk_outputs;
|
||||||
for (size_t i = 0; i < nchunks; i++) {
|
for (const auto i : c10::irange(nchunks)) {
|
||||||
producer_chunk_outputs.push_back(
|
producer_chunk_outputs.push_back(
|
||||||
bchunk->output(nchunks * producer_index + i));
|
bchunk->output(nchunks * producer_index + i));
|
||||||
}
|
}
|
||||||
@ -579,7 +580,7 @@ struct CudaGraphFuser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bchunk->removeInput(producer_index);
|
bchunk->removeInput(producer_index);
|
||||||
for (size_t i = 0; i < nchunks; i++) {
|
for (const auto i : c10::irange(nchunks)) {
|
||||||
bchunk->eraseOutput(nchunks * producer_index);
|
bchunk->eraseOutput(nchunks * producer_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <torch/csrc/jit/codegen/cuda/index_compute.h>
|
#include <torch/csrc/jit/codegen/cuda/index_compute.h>
|
||||||
|
|
||||||
#include <c10/util/Exception.h>
|
#include <c10/util/Exception.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/jit/codegen/cuda/arith.h>
|
#include <torch/csrc/jit/codegen/cuda/arith.h>
|
||||||
#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
|
#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
|
||||||
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
|
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
|
||||||
@ -181,7 +182,7 @@ class ContigIDs : public OptInDispatch {
|
|||||||
" != ",
|
" != ",
|
||||||
root_contiguity_.size());
|
root_contiguity_.size());
|
||||||
|
|
||||||
for (size_t i = 0; i < root_domain_.size(); i++) {
|
for (const auto i : c10::irange(root_domain_.size())) {
|
||||||
if (root_contiguity_[i]) {
|
if (root_contiguity_[i]) {
|
||||||
auto kir_root_domain_i =
|
auto kir_root_domain_i =
|
||||||
GpuLower::lowerValue(root_domain_[i])->as<kir::IterDomain>();
|
GpuLower::lowerValue(root_domain_[i])->as<kir::IterDomain>();
|
||||||
@ -794,7 +795,7 @@ kir::TensorIndex* Index::getGlobalProducerIndex(
|
|||||||
// Global striding
|
// Global striding
|
||||||
int64_t stride_i = 0;
|
int64_t stride_i = 0;
|
||||||
std::vector<Val*> strided_inds;
|
std::vector<Val*> strided_inds;
|
||||||
for (size_t i = 0; i < root_dom.size(); i++) {
|
for (const auto i : c10::irange(root_dom.size())) {
|
||||||
if (root_dom[i]->isReduction() ||
|
if (root_dom[i]->isReduction() ||
|
||||||
root_dom[i]->getIterType() == IterType::BroadcastWithoutStride) {
|
root_dom[i]->getIterType() == IterType::BroadcastWithoutStride) {
|
||||||
continue;
|
continue;
|
||||||
@ -918,7 +919,7 @@ kir::TensorIndex* Index::getProducerIndex_impl(
|
|||||||
|
|
||||||
std::vector<Val*> strided_inds;
|
std::vector<Val*> strided_inds;
|
||||||
|
|
||||||
for (size_t i = 0; i < root_dom.size(); i++) {
|
for (const auto i : c10::irange(root_dom.size())) {
|
||||||
if (root_dom[i]->isReduction() || root_dom[i]->isBroadcast()) {
|
if (root_dom[i]->isReduction() || root_dom[i]->isBroadcast()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -1023,7 +1024,7 @@ kir::TensorIndex* Index::getGlobalConsumerIndex(
|
|||||||
|
|
||||||
int64_t stride_i = 0;
|
int64_t stride_i = 0;
|
||||||
std::vector<Val*> strided_inds;
|
std::vector<Val*> strided_inds;
|
||||||
for (size_t i = 0; i < root_dom.size(); i++) {
|
for (const auto i : c10::irange(root_dom.size())) {
|
||||||
if (root_dom[i]->isReduction() ||
|
if (root_dom[i]->isReduction() ||
|
||||||
root_dom[i]->getIterType() == IterType::BroadcastWithoutStride) {
|
root_dom[i]->getIterType() == IterType::BroadcastWithoutStride) {
|
||||||
continue;
|
continue;
|
||||||
@ -1089,7 +1090,7 @@ kir::TensorIndex* Index::getConsumerIndex_impl(
|
|||||||
auto root_dom = consumer_tv->getMaybeRFactorDomain();
|
auto root_dom = consumer_tv->getMaybeRFactorDomain();
|
||||||
|
|
||||||
std::vector<Val*> strided_inds;
|
std::vector<Val*> strided_inds;
|
||||||
for (size_t i = 0; i < root_dom.size(); i++) {
|
for (const auto i : c10::irange(root_dom.size())) {
|
||||||
if (root_dom[i]->isReduction() || root_dom[i]->isBroadcast()) {
|
if (root_dom[i]->isReduction() || root_dom[i]->isBroadcast()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -1267,7 +1268,7 @@ std::pair<std::vector<Val*>, bool> Index::getConsumerRootPredIndices(
|
|||||||
: consumer_tv->getRootDomain();
|
: consumer_tv->getRootDomain();
|
||||||
|
|
||||||
std::vector<Val*> root_inds(root_dom.size(), ir_builder.create<kir::Int>(0));
|
std::vector<Val*> root_inds(root_dom.size(), ir_builder.create<kir::Int>(0));
|
||||||
for (size_t i = 0; i < root_dom.size(); i++) {
|
for (const auto i : c10::irange(root_dom.size())) {
|
||||||
if (root_dom[i]->isBroadcast()) {
|
if (root_dom[i]->isBroadcast()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <torch/csrc/jit/codegen/cuda/interface.h>
|
#include <torch/csrc/jit/codegen/cuda/interface.h>
|
||||||
|
|
||||||
#include <ATen/core/dispatch/OperatorOptions.h>
|
#include <ATen/core/dispatch/OperatorOptions.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/jit/runtime/custom_operator.h>
|
#include <torch/csrc/jit/runtime/custom_operator.h>
|
||||||
#include <torch/csrc/jit/runtime/register_ops_utils.h>
|
#include <torch/csrc/jit/runtime/register_ops_utils.h>
|
||||||
|
|
||||||
@ -103,7 +104,7 @@ bool complyWith(
|
|||||||
const auto& t_sizes = tensor.sizes();
|
const auto& t_sizes = tensor.sizes();
|
||||||
const auto& t_strides = tensor.strides();
|
const auto& t_strides = tensor.strides();
|
||||||
int inner_dim = -1;
|
int inner_dim = -1;
|
||||||
for (size_t j = 0; j < *guard_tensor_type->dim(); j++) {
|
for (const auto j : c10::irange(*guard_tensor_type->dim())) {
|
||||||
// check b. for stride check, we go along dimensions from fastest stride to
|
// check b. for stride check, we go along dimensions from fastest stride to
|
||||||
// slowest stride
|
// slowest stride
|
||||||
int sorted_index = stride_properties[j]->stride_index_
|
int sorted_index = stride_properties[j]->stride_index_
|
||||||
@ -210,7 +211,7 @@ RegisterOperators reg_guard({
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < num_inputs; i++) {
|
for (const auto i : c10::irange(num_inputs)) {
|
||||||
const c10::TensorTypePtr& guard_tensor_type =
|
const c10::TensorTypePtr& guard_tensor_type =
|
||||||
types[i]->cast<TensorType>();
|
types[i]->cast<TensorType>();
|
||||||
|
|
||||||
|
@ -8,6 +8,7 @@
|
|||||||
#include <torch/csrc/jit/ir/ir.h>
|
#include <torch/csrc/jit/ir/ir.h>
|
||||||
|
|
||||||
#include <c10/util/Exception.h>
|
#include <c10/util/Exception.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
@ -227,7 +228,7 @@ bool Expr::sameAs(const Expr* const other) const {
|
|||||||
if (inputs().size() != other->inputs().size() ||
|
if (inputs().size() != other->inputs().size() ||
|
||||||
outputs().size() != other->outputs().size())
|
outputs().size() != other->outputs().size())
|
||||||
return false;
|
return false;
|
||||||
for (size_t i = 0; i < inputs().size(); i++) {
|
for (const auto i : c10::irange(inputs().size())) {
|
||||||
if (!input(i)->sameAs(other->input(i)))
|
if (!input(i)->sameAs(other->input(i)))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,8 @@
|
|||||||
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
|
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
|
||||||
#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
|
#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
|
||||||
|
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace torch {
|
namespace torch {
|
||||||
namespace jit {
|
namespace jit {
|
||||||
namespace fuser {
|
namespace fuser {
|
||||||
@ -51,7 +53,7 @@ void IrPrinter::handle(const TensorDomain* td) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
os_ << "[ ";
|
os_ << "[ ";
|
||||||
for (size_t i = 0; i < td->nDims(); i++) {
|
for (const auto i : c10::irange(td->nDims())) {
|
||||||
handle(td->axis(i));
|
handle(td->axis(i));
|
||||||
if (i != td->nDims() - 1)
|
if (i != td->nDims() - 1)
|
||||||
os_ << ", ";
|
os_ << ", ";
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
#include <c10/util/irange.h>
|
|
||||||
|
|
||||||
#include <torch/csrc/jit/codegen/cuda/arith.h>
|
#include <torch/csrc/jit/codegen/cuda/arith.h>
|
||||||
#include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
|
#include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
|
||||||
#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
|
#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
|
||||||
@ -9,6 +7,8 @@
|
|||||||
#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
|
#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
|
||||||
#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
|
#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
|
||||||
|
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
|
||||||
namespace torch {
|
namespace torch {
|
||||||
@ -1239,7 +1239,7 @@ void ConcretizeDomain::concretizePwOp(Expr* e) {
|
|||||||
TensorDomain::noReductions(i->getMaybeRFactorDomain());
|
TensorDomain::noReductions(i->getMaybeRFactorDomain());
|
||||||
TORCH_INTERNAL_ASSERT(ii.size() == io.size());
|
TORCH_INTERNAL_ASSERT(ii.size() == io.size());
|
||||||
|
|
||||||
for (size_t it = 0; it < ii.size(); it++) {
|
for (const auto it : c10::irange(ii.size())) {
|
||||||
if (!canConcretize(io[it]))
|
if (!canConcretize(io[it]))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
@ -1338,7 +1338,7 @@ class ProveValEqual : private IterVisitor {
|
|||||||
std::vector<IterDomain*> ii =
|
std::vector<IterDomain*> ii =
|
||||||
TensorDomain::noReductions(i->getMaybeRFactorDomain());
|
TensorDomain::noReductions(i->getMaybeRFactorDomain());
|
||||||
|
|
||||||
for (size_t it = 0; it < ii.size(); it++)
|
for (const auto it : c10::irange(ii.size()))
|
||||||
if (cd_.canConcretize(ii[it]) && cd_.canConcretize(io[it]))
|
if (cd_.canConcretize(ii[it]) && cd_.canConcretize(io[it]))
|
||||||
proveId(cd_.concretized(ii[it]), cd_.concretized(io[it]));
|
proveId(cd_.concretized(ii[it]), cd_.concretized(io[it]));
|
||||||
}
|
}
|
||||||
|
@ -408,20 +408,20 @@ std::vector<at::Tensor> FusionExecutorCache::runFusionWithInputs(
|
|||||||
|
|
||||||
bool GraphCache::requiresPermutation() {
|
bool GraphCache::requiresPermutation() {
|
||||||
const size_t input_rank = input_permutation_.size();
|
const size_t input_rank = input_permutation_.size();
|
||||||
for (size_t i = 0; i < input_rank; i++) {
|
for (const auto i : c10::irange(input_rank)) {
|
||||||
if (input_permutation_[i] != (long)i) {
|
if (input_permutation_[i] != (long)i) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Check if output agrees
|
// Check if output agrees
|
||||||
const size_t pw_output_rank = pw_output_permutation_.size();
|
const size_t pw_output_rank = pw_output_permutation_.size();
|
||||||
for (size_t i = 0; i < pw_output_rank; i++) {
|
for (const auto i : c10::irange(pw_output_rank)) {
|
||||||
TORCH_INTERNAL_ASSERT(
|
TORCH_INTERNAL_ASSERT(
|
||||||
pw_output_permutation_[i] == (long)i,
|
pw_output_permutation_[i] == (long)i,
|
||||||
"permutation of output and input is not consistent");
|
"permutation of output and input is not consistent");
|
||||||
}
|
}
|
||||||
const size_t reduction_output_rank = reduction_output_permutation_.size();
|
const size_t reduction_output_rank = reduction_output_permutation_.size();
|
||||||
for (size_t i = 0; i < reduction_output_rank; i++) {
|
for (const auto i : c10::irange(reduction_output_rank)) {
|
||||||
TORCH_INTERNAL_ASSERT(
|
TORCH_INTERNAL_ASSERT(
|
||||||
reduction_output_permutation_[i] == (long)i,
|
reduction_output_permutation_[i] == (long)i,
|
||||||
"permutation of output and input is not consistent");
|
"permutation of output and input is not consistent");
|
||||||
@ -505,7 +505,7 @@ void GraphCache::createFusion(const std::shared_ptr<Graph>& graph) {
|
|||||||
std::vector<int64_t> adjusted_reduction_axes;
|
std::vector<int64_t> adjusted_reduction_axes;
|
||||||
for (const auto dim : dims_list->vec()) {
|
for (const auto dim : dims_list->vec()) {
|
||||||
// adjust reduction axis to be the permuted axis;
|
// adjust reduction axis to be the permuted axis;
|
||||||
for (size_t j = 0; j < input_permutation_.size(); j++) {
|
for (const auto j : c10::irange(input_permutation_.size())) {
|
||||||
// follow the permutation to resolve the new reduction axes;
|
// follow the permutation to resolve the new reduction axes;
|
||||||
if (input_permutation_[j] == dim) {
|
if (input_permutation_[j] == dim) {
|
||||||
adjusted_reduction_axes.emplace_back(j);
|
adjusted_reduction_axes.emplace_back(j);
|
||||||
|
@@ -7,6 +7,8 @@
 #include <torch/csrc/jit/codegen/cuda/lower2device.h>
 #include <torch/csrc/jit/codegen/cuda/lower_utils.h>
 
+#include <c10/util/irange.h>
+
 namespace torch {
 namespace jit {
 namespace fuser {
@@ -103,7 +105,7 @@ void avoidRedundantWritesToSmem(
 TensorView* out_tv,
 ir_utils::ParallelTypeBitmap& pred) {
 if (out_tv->getMemoryType() == MemoryType::Shared) {
-for (size_t i = 0; i < out_tv->nDims(); i++) {
+for (const auto i : c10::irange(out_tv->nDims())) {
 auto id = out_tv->getComputeAtAxis(i).first;
 if (out_tv->axis(i)->isBroadcast() && id->isThreadDim()) {
 pred.set(id->getParallelType(), true);
@@ -159,7 +161,7 @@ void ThreadPredicateMap::updateBitSet(Expr* expr) {
 }
 
 // Validate the combination of ptypes, reductions, bcasts
-for (size_t i = 0; i < ir_utils::ParallelTypeBitmap::num_p_type; i++) {
+for (const auto i : c10::irange(ir_utils::ParallelTypeBitmap::num_p_type)) {
 if (input_reductions[i]) {
 if (id_ptypes[i]) {
 TORCH_INTERNAL_ASSERT(
@@ -10,6 +10,8 @@
 #include <torch/csrc/jit/codegen/cuda/lower_utils.h>
 #include <torch/csrc/jit/codegen/cuda/transform_iter.h>
 
+#include <c10/util/irange.h>
+
 namespace torch {
 namespace jit {
 namespace fuser {
@@ -43,7 +45,7 @@ std::vector<kir::Bool*> PredicateCompute::computePredicates(
 std::vector<kir::Bool*> preds(root.size(), true_bool);
 Val* extent = nullptr;
 
-for (size_t i = 0; i < indices.size(); i++) {
+for (const auto i : c10::irange(indices.size())) {
 const bool zero_ind = indices[i]->isZeroInt();
 const bool simple_ind = indices[i]->getOrigin() == nullptr;
 
@@ -257,7 +259,7 @@ void UnrollPredicate::predicateOn(Expr* tv_expr) {
 all_preds.size() == root_dom.size(),
 "Predicates should be produced for every dimension, even if it's simply set as true.");
 
-for (size_t i = 0; i < all_preds.size(); i++) {
+for (const auto i : c10::irange(all_preds.size())) {
 if (all_preds[i]->isConst() && all_preds[i]->value().value()) {
 continue;
 }
@@ -10,6 +10,7 @@
 #include <torch/csrc/jit/codegen/cuda/parser.h>
 
 #include <ATen/cuda/CUDAContext.h>
+#include <c10/util/irange.h>
 
 namespace torch {
 namespace jit {
@@ -23,7 +24,7 @@ namespace {
 std::vector<int> reductionAxes(TensorView* tv) {
 size_t n_dims = tv->nDims();
 std::vector<int> reduction_axes;
-for (size_t i = 0; i < n_dims; i++) {
+for (const auto i : c10::irange(n_dims)) {
 if (tv->axis(i)->isReduction()) {
 reduction_axes.emplace_back(i);
 }
@@ -1,5 +1,6 @@
 #include <torch/csrc/jit/codegen/cuda/transform_iter.h>
 
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/codegen/cuda/ir_utils.h>
 
 namespace torch {
@@ -263,7 +264,7 @@ BestEffortReplay::BestEffortReplay(
 std::vector<IterDomain*>(t_inps.size(), nullptr);
 
 // Map t_expr inputs to replay domain directly
-for (size_t t_i = 0; t_i < t_inps.size(); t_i++) {
+for (const auto t_i : c10::irange(t_inps.size())) {
 // There might not be a mapping, that could be okay.
 auto it = id_map_.find(t_inps[t_i]);
 if (it != id_map_.end())
@@ -382,7 +383,7 @@ BestEffortReplay::BestEffortReplay(
 }
 
 // Add outputs to map.
-for (size_t i = 0; i < t_expr->outputs().size(); i++) {
+for (const auto i : c10::irange(t_expr->outputs().size())) {
 auto t_out = t_expr->output(i);
 auto r_out = r_expr->output(i);
 if (t_out->getValType() == ValType::IterDomain &&
@@ -420,7 +421,7 @@ int BestEffortReplay::findFirstMismatchedID(
 
 BestEffortReplay ber(td2->domain(), td1->domain(), id_map);
 
-for (size_t i = 0; i < td1->domain().size(); i++) {
+for (const auto i : c10::irange(td1->domain().size())) {
 if (ber.getReplay().find(td1->axis(i)) == ber.getReplay().end()) {
 return i;
 }
@@ -210,7 +210,7 @@ std::shared_ptr<FusedKernel> compileKernel(
 
 auto graph = spec.graph()->copy();
 
-for (size_t i = 0; i < input_desc.size(); i++) {
+for (const auto i : c10::irange(input_desc.size())) {
 const auto& desc = input_desc[i];
 
 // TODO: can't get rid of this use of TensorType
@@ -1,4 +1,6 @@
 #include <torch/csrc/jit/frontend/concrete_module_type.h>
+
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/python/pybind_utils.h>
 
 namespace torch {
@@ -56,7 +58,7 @@ std::shared_ptr<ConcreteModuleType> ConcreteModuleType::fromJitType(
 // Populate the builder metadata from the JIT type. This is to ensure
 // ConcreteModuleTypes produced from Python and ones produced from a JIT
 // type directly behave the same to the rest of the system.
-for (size_t i = 0; i < classType->numAttributes(); i++) {
+for (const auto i : c10::irange(classType->numAttributes())) {
 const auto& attrName = classType->getAttributeName(i);
 const auto& attrType = classType->getAttribute(i);
 if (attrType->is_module()) {
@@ -70,7 +72,7 @@ std::shared_ptr<ConcreteModuleType> ConcreteModuleType::fromJitType(
 }
 }
 
-for (size_t i = 0; i < classType->numConstants(); i++) {
+for (const auto i : c10::irange(classType->numConstants())) {
 builder.addConstant(
 classType->getConstantName(i), classType->getConstant(i));
 }
@@ -2,6 +2,7 @@
 
 #include <c10/util/Exception.h>
 #include <c10/util/StringUtil.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/api/function_impl.h>
 #include <torch/csrc/jit/frontend/canonicalize_modified_loop.h>
 #include <torch/csrc/jit/frontend/convert_to_ssa.h>
@@ -3985,7 +3986,7 @@ struct to_ir {
 rdim =
 handle_indexing(subscript_expr, rev_idx, rdim, /*is_reverse=*/true);
 }
-for (size_t i = 0; i < exprs.size(); i++) {
+for (const auto i : c10::irange(exprs.size())) {
 if (!exprs[i].has_value()) {
 if (subscript_exprs[i].kind() == TK_SLICE_EXPR) {
 sliceable = emitSlice(
@@ -4451,7 +4452,7 @@ std::vector<Function*> CompilationUnit::define(
 this->register_function(std::move(fn));
 };
 
-for (size_t i = 0; i < properties.size(); i++) {
+for (const auto i : c10::irange(properties.size())) {
 PropertyPair property_fns = define_property(
 prefix,
 properties[i],
@@ -4470,7 +4471,7 @@ std::vector<Function*> CompilationUnit::define(
 }
 }
 
-for (size_t i = 0; i < definitions.size(); i++) {
+for (const auto i : c10::irange(definitions.size())) {
 auto fn = define(
 prefix,
 definitions[i],
@@ -4549,7 +4550,7 @@ void CompilationUnit::define_hooks(
 };
 
 // define hooks
-for (size_t i = 0; i < hookDefs.size(); i++) {
+for (const auto i : c10::irange(hookDefs.size())) {
 // check to see if already defined this hook
 auto existing_fn = check_collisions(hookDefs[i]);
 if (existing_fn != nullptr) {
@@ -4576,7 +4577,7 @@ void CompilationUnit::define_hooks(
 }
 
 // define pre_hooks
-for (size_t i = 0; i < preHookDefs.size(); i++) {
+for (const auto i : c10::irange(preHookDefs.size())) {
 // check to see if already defined this hook
 auto existing_fn = check_collisions(preHookDefs[i]);
 if (existing_fn != nullptr) {
@@ -1,5 +1,6 @@
 #include <torch/csrc/jit/frontend/sugared_value.h>
 
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/frontend/schema_matching.h>
 #include <torch/csrc/jit/frontend/tree_views.h>
 #include <torch/csrc/jit/ir/ir.h>
@@ -137,7 +138,7 @@ std::shared_ptr<SugaredValue> SimpleValue::attr(
 if (auto tuple_type = value_->type()->cast<TupleType>()) {
 if (tuple_type->schema()) {
 auto attrs = tuple_type->schema()->arguments();
-for (size_t i = 0; i < attrs.size(); i++) {
+for (const auto i : c10::irange(attrs.size())) {
 if (attrs[i].name() == field) {
 auto idx = m.graph()->insertConstant(IValue(static_cast<int64_t>(i)));
 auto out_type = tuple_type->elements().at(i);
@@ -5,6 +5,7 @@
 #include <ATen/core/Dict.h>
 #include <ATen/core/functional.h>
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/autograd/engine.h>
 #include <torch/csrc/autograd/function.h>
 #include <torch/csrc/autograd/variable.h>
@@ -381,7 +382,7 @@ static IValue addInput(
 
 if (input.isTensorList()) {
 auto elems = input.toTensorList();
-for (size_t i = 0; i < num_elems; i++) {
+for (const auto i : c10::irange(num_elems)) {
 elems[i] = addInput(
 state,
 elems.get(i),
@@ -392,7 +393,7 @@ static IValue addInput(
 return elems;
 } else {
 auto elems = input.toList();
-for (size_t i = 0; i < num_elems; i++) {
+for (const auto i : c10::irange(num_elems)) {
 elems[i] = addInput(
 state,
 elems.get(i),
@@ -1,5 +1,6 @@
 #include <torch/csrc/jit/ir/alias_analysis.h>
 
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/passes/utils/subgraph_utils.h>
 #include <torch/csrc/jit/runtime/operator.h>
@@ -609,7 +610,7 @@ void AliasDb::analyzeImpl(Node* node) {
 case prim::TypeCheck:
 case prim::RequiresGradCheck: {
 auto num_inputs = node->inputs().size();
-for (size_t i = 0; i < num_inputs; i++) {
+for (const auto i : c10::irange(num_inputs)) {
 makePointerTo(node->outputs().at(i), node->inputs().at(i));
 }
 return;
@@ -692,7 +693,7 @@ void AliasDb::analyzeImpl(Node* node) {
 // Bind the schema's "formal" alias annotation to the actual values those
 // schema arguments represent
 std::unordered_map<Symbol, Value*> formalToActual;
-for (size_t i = 0; i < schema.arguments().size(); i++) {
+for (const auto i : c10::irange(schema.arguments().size())) {
 const auto& formal = schema.arguments()[i].alias_info();
 const auto& actualValue = node->inputs().at(i);
 // Skip if there's no alias annotation
@@ -743,7 +744,7 @@ void AliasDb::analyzeImpl(Node* node) {
 }
 
 // Use the formal-actual mapping to give aliases to the outputs
-for (size_t i = 0; i < schema.returns().size(); i++) {
+for (const auto i : c10::irange(schema.returns().size())) {
 const auto actual = node->outputs().at(i);
 const auto& formal = schema.returns()[i].alias_info();
 if (!formal) {
@@ -820,7 +821,7 @@ void AliasDb::analyzeIf(Node* node) {
 analyze(trueBlock);
 analyze(falseBlock);
 
-for (size_t i = 0; i < node->outputs().size(); i++) {
+for (const auto i : c10::irange(node->outputs().size())) {
 const auto nodeOutput = node->outputs()[i];
 
 const auto trueOutput = trueBlock->outputs().at(i);
@@ -869,7 +870,7 @@ void AliasDb::analyzeSubgraph(Node* node) {
 // subgraph block.
 TORCH_INTERNAL_ASSERT(
 subgraphBlock->outputs().size() >= node->outputs().size());
-for (size_t i = 0; i < node->outputs().size(); i++) {
+for (const auto i : c10::irange(node->outputs().size())) {
 makePointerTo(node->outputs()[i], subgraphBlock->outputs()[i]);
 }
 }
@@ -1186,7 +1187,7 @@ bool AliasDb::mayContainAlias(
 // Make each value in the `from` list point to its partner in the `to` list
 void AliasDb::mapAliases(at::ArrayRef<Value*> from, at::ArrayRef<Value*> to) {
 TORCH_INTERNAL_ASSERT(to.size() == from.size());
-for (size_t i = 0; i < to.size(); i++) {
+for (const auto i : c10::irange(to.size())) {
 makePointerTo(from[i], to[i]);
 }
 }
@@ -4,6 +4,7 @@
 #include <ATen/core/function.h>
 #include <c10/util/Exception.h>
 #include <c10/util/StringUtil.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/api/function_impl.h>
 #include <torch/csrc/jit/frontend/error_report.h>
 #include <torch/csrc/jit/frontend/schema_matching.h>
@@ -1291,7 +1292,7 @@ void Node::cloneFrom(Node* s) {
 void Node::replaceAllUsesWith(Node* n) {
 AT_ASSERT(outputs().size() == n->outputs().size());
 size_t nOutputs = outputs().size();
-for (size_t i = 0; i < nOutputs; i++) {
+for (const auto i : c10::irange(nOutputs)) {
 outputs()[i]->replaceAllUsesWith(n->outputs()[i]);
 }
 }
@@ -1615,7 +1616,7 @@ Value* Graph::insert(
 Node* Graph::create(NodeKind kind, size_t num_outputs) {
 // NB: Node constructor adds node to all_nodes
 auto n = new Node(this, kind);
-for (size_t i = 0; i < num_outputs; i++) {
+for (const auto i : c10::irange(num_outputs)) {
 n->addOutput();
 }
 return n;
@@ -1,5 +1,7 @@
+#include <c10/util/irange.h>
+
 #include <torch/csrc/jit/ir/subgraph_matcher.h>
 #include <torch/csrc/jit/jit_log.h>
 
 #include <regex>
 #include <stack>
@@ -295,12 +297,12 @@ bool SubgraphMatcher::matchNodes(const Node* n1, Node* n2) {
 // Add nodes to the map before calling matchValues to avoid infinite
 // recursion.
 nodes_map_[n1] = n2;
-for (size_t i = 0; i < n1->outputs().size(); i++) {
+for (const auto i : c10::irange(n1->outputs().size())) {
 if (!matchValues(n1->outputs()[i], n2->outputs()[i])) {
 return false;
 }
 }
-for (size_t i = 0; i < n1->inputs().size(); i++) {
+for (const auto i : c10::irange(n1->inputs().size())) {
 if (!matchValues(n1->inputs()[i], n2->inputs()[i])) {
 return false;
 }
@@ -1,6 +1,7 @@
 #include <torch/csrc/jit/passes/bailout_graph.h>
 
 #include <ATen/core/function.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/ir/alias_analysis.h>
 #include <torch/csrc/jit/ir/ir_views.h>
 #include <torch/csrc/jit/jit_log.h>
@@ -110,7 +111,7 @@ struct BailOutGraphBuilderForNode {
 const at::ArrayRef<Value*> block_outputs,
 const at::ArrayRef<Value*> carried_deps) {
 TORCH_INTERNAL_ASSERT(block_outputs.size() == carried_deps.size());
-for (size_t i = 0; i < block_outputs.size(); i++) {
+for (const auto i : c10::irange(block_outputs.size())) {
 auto nv = getOrAddInputForValue(block_outputs[i]);
 old_to_new_[carried_deps[i]] = nv;
 }
@@ -1,5 +1,6 @@
 #include <torch/csrc/jit/passes/canonicalize.h>
 
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/ir/ir_views.h>
 
 namespace torch {
@@ -33,7 +34,7 @@ std::shared_ptr<Graph> Canonicalize(
 r->appendNode(r_node);
 auto outputs = node->outputs();
 auto r_outputs = r_node->outputs();
-for (size_t i = 0; i < outputs.size(); i++) {
+for (const auto i : c10::irange(outputs.size())) {
 rn_env[outputs.at(i)] = r_outputs.at(i);
 }
 if (node->hasAttribute(attr::Subgraph)) {
@@ -1,5 +1,6 @@
 #include <torch/csrc/jit/passes/dead_code_elimination.h>
 
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/ir/alias_analysis.h>
 #include <torch/csrc/jit/ir/ir_views.h>
 #include <torch/csrc/jit/jit_log.h>
@@ -114,7 +115,7 @@ class DeadCodeEliminator {
 outerNode->kind() == c10::onnx::Loop) {
 // Special handling to deal with loop carried dependencies.
 auto loop = LoopView(outerNode);
-for (size_t i = 0; i < loop.carriedOutputs().size(); i++) {
+for (const auto i : c10::irange(loop.carriedOutputs().size())) {
 if (outerNode->kind() == c10::onnx::Loop) {
 // Special handling for onnx loop.
 // The number of body carried inputs and outputs are different.
@@ -135,7 +136,7 @@ class DeadCodeEliminator {
 liveValues_.insert(loop.nextCond());
 } else {
 AT_ASSERT(outerNode->outputs().size() == node->inputs().size());
-for (size_t i = 0; i < outerNode->outputs().size(); i++) {
+for (const auto i : c10::irange(outerNode->outputs().size())) {
 auto innerOutput = node->inputs()[i];
 auto outerOutput = outerNode->outputs()[i];
 if (liveValues_.count(outerOutput)) {
@@ -1,5 +1,6 @@
 #include <torch/csrc/jit/passes/fixup_trace_scope_blocks.h>
 
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/frontend/schema_matching.h>
 #include <torch/csrc/jit/passes/canonicalize.h>
 #include <torch/csrc/jit/passes/dead_code_elimination.h>
@@ -200,14 +201,14 @@ struct ConvertTracedAttrReferences {
 // the proper attribute.
 auto attr_atoms = attr_qualname.atoms();
 Value* replaced_value = self;
-for (size_t i = 0; i < attr_atoms.size(); i++) {
+for (const auto i : c10::irange(attr_atoms.size())) {
 if (i < prefix_atoms.size()) {
 TORCH_INTERNAL_ASSERT(attr_atoms[i] == prefix_atoms[i]);
 } else {
 replaced_value = n->owningBlock()->owningGraph()->insertGetAttr(
 replaced_value, attr_atoms[i]);
 } // if (i < prefix_atoms.size())
-} // for (size_t i = 0; i < attr_atoms.size(); i++)
+} // for(const auto i : c10::irange(attr_atoms.size()))
 n->replaceInput(inp_idx, replaced_value);
 local_remaps[inp] = replaced_value;
 } else {
@@ -2,6 +2,7 @@
 
 #include <torch/csrc/jit/jit_log.h>
 
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/ir/alias_analysis.h>
 #include <torch/csrc/jit/passes/clear_profiling.h>
 #include <torch/csrc/jit/passes/inliner.h>
@@ -312,7 +313,7 @@ class AttributePropagator {
 
 } else if (attr.isList()) {
 c10::List<IValue> elems = std::move(attr).toList();
-for (size_t i = 0; i < elems.size(); i++) {
+for (const auto i : c10::irange(elems.size())) {
 elems.set(i, overrideGradient(elems.extract(i)));
 }
 attr = std::move(elems);
@@ -1,3 +1,4 @@
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/ir/alias_analysis.h>
 #include <torch/csrc/jit/ir/ir_views.h>
 #include <torch/csrc/jit/passes/frozen_conv_folding.h>
@@ -15,7 +16,7 @@ void OptimizeFrozenGraph(
 removeDropout(graph);
 // run a couple times to capture Conv -> Mul -> Add etc
 if (optimize_numerics) {
-for (size_t i = 0; i < 2; i++) {
+for (const auto i : c10::irange(2)) {
 FoldFrozenConvBatchnorm(graph);
 FoldFrozenConvAddOrSub(graph);
 FoldFrozenConvMulOrDiv(graph);
@@ -3,6 +3,7 @@
 #include <ATen/core/interned_strings.h>
 #include <c10/core/ScalarType.h>
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/ir/alias_analysis.h>
 #include <torch/csrc/jit/ir/constants.h>
 #include <torch/csrc/jit/ir/ir.h>
@@ -193,7 +194,7 @@ void InplaceMKLDNNSubgraph(std::shared_ptr<Graph> graph) {
 // the binary operators (add/mul) are commutative and only take tensor
 // inputs, so we can inplace either the first or second input
 int64_t reusable_value_index = -1;
-for (size_t i = 0; i < 2; i++) {
+for (const auto i : c10::irange(2)) {
 TORCH_INTERNAL_ASSERT(node->inputs().at(i)->type()->cast<TensorType>());
 if (!set_liveness[alias_mapping[node->inputs().at(i)]]->isAfter(node)) {
 reusable_value_index = i;
@@ -905,7 +906,7 @@ class MKLDNNSubgraphSlicer {
 
 if (n->kind() == aten::add || n->kind() == aten::mul) {
 // mkldnn doesn't currently support Tensor-Scalar add
-for (size_t i = 0; i < 2; i++) {
+for (const auto i : c10::irange(2)) {
 if (!n->inputs().at(i)->type()->cast<TensorType>()) {
 return false;
 }
@@ -1,6 +1,7 @@
 #include <torch/csrc/jit/passes/graph_fuser.h>
 
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/codegen/fuser/interface.h>
 #include <torch/csrc/jit/frontend/ir_emitter.h>
 #include <torch/csrc/jit/ir/alias_analysis.h>
@@ -600,7 +601,7 @@ struct GraphFuser {
 // a_broadcasted, b_broadcasted = listUnpack(output_list)
 // `a_broadcasted` should receive the same aliasing info as `a`
 TORCH_INTERNAL_ASSERT(unpack_node->outputs().size() == inputs.size());
-for (size_t i = 0; i < inputs.size(); i++) {
+for (const auto i : c10::irange(inputs.size())) {
 Value* original_input = inputs[i];
 Value* broadcasted_output = unpack_node->outputs()[i];
 aliasDb_->copyValue(original_input, broadcasted_output);
@@ -753,7 +754,7 @@ struct GraphFuser {
 WithInsertPoint guard(bchunk->next());
 
 std::vector<Value*> producer_chunk_outputs;
-for (size_t i = 0; i < nchunks; i++) {
+for (const auto i : c10::irange(nchunks)) {
 producer_chunk_outputs.push_back(
 bchunk->output(nchunks * producer_index + i));
 }
@@ -828,7 +829,7 @@ struct GraphFuser {
 }
 
 bchunk->removeInput(producer_index);
-for (size_t i = 0; i < nchunks; i++) {
+for (const auto i : c10::irange(nchunks)) {
 bchunk->eraseOutput(nchunks * producer_index);
 }
 
@@ -2,6 +2,7 @@
 
 #include <ATen/core/functional.h>
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/ir/constants.h>
 #include <torch/csrc/jit/passes/dead_code_elimination.h>
 
@@ -229,7 +230,7 @@ static void flattenOutputs(Node* n, Node* insert_point) {
 // is placed at the current insertion point
 if (TupleTypePtr tt = output->type()->cast<TupleType>()) {
 if (supported_ops.count(n->kind()) > 0) {
-for (size_t j = 0; j < tt->elements().size(); j++) {
+for (const auto j : c10::irange(tt->elements().size())) {
 n->insertOutput(i + 1 + j)->setType(tt->elements()[j]);
 }
 auto new_tup =
@@ -2,6 +2,7 @@
 
 #include <ATen/core/functional.h>
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/autograd/function.h>
 #include <torch/csrc/autograd/symbolic.h>
 #include <torch/csrc/jit/ir/constants.h>
@@ -316,7 +317,7 @@ void NodeToONNX(
 auto cloneNode = [&](Node* node) {
 auto n_ = new_block->appendNode(
 new_block->owningGraph()->createClone(node, envFn));
-for (size_t i = 0; i < node->outputs().size(); i++) {
+for (const auto i : c10::irange(node->outputs().size())) {
 // n_->outputs()[i]->setType(node->outputs()[i]->type());
 env[node->output(i)] = n_->output(i);
 }
@@ -3,6 +3,7 @@
 #include <torch/torch.h>
 
 #include <c10/util/Optional.h>
+#include <c10/util/irange.h>
 #include <algorithm>
 
 namespace torch {
@@ -88,8 +89,7 @@ static void fuseConvBatchNorm(Block* b, ValueToParamPairMap& valsToParamsMap) {
 bn_scale = bn_scale.div(bn_var);
 
 // Calculate weight
-// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-for (size_t i = 0; i < w_conv.size(0); i++) {
+for (const auto i : c10::irange(w_conv.size(0))) {
 w_conv[i] = w_conv[i].mul(bn_scale[i]);
 }
 
@@ -1,6 +1,7 @@
 #include <torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.h>
 
 #include <aten/src/ATen/InitialTensorOptions.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/passes/dead_code_elimination.h>
 #include <torch/csrc/jit/passes/onnx/peephole.h>
@@ -328,7 +329,7 @@ void ONNXFixupUninitializedOutput(Node* node) {
 // Infer shape and type for subblock outputs
 TORCH_INTERNAL_ASSERT(
 then_block->outputs().size() == else_block->outputs().size())
-for (size_t i = 0; i < else_block->outputs().size(); i++) {
+for (const auto i : c10::irange(else_block->outputs().size())) {
 Value* then_block_output = then_block->outputs()[i];
 Value* else_block_output = else_block->outputs()[i];
 
@@ -34,7 +34,7 @@ static bool isStaticCondition(Node* node) {
 compare_node->kind() == onnx::Less ||
 compare_node->kind() == onnx::GreaterOrEqual ||
 compare_node->kind() == onnx::LessOrEqual) {
-for (size_t i = 0; i < compare_node->inputs().size(); i++) {
+for (const auto i : c10::irange(compare_node->inputs().size())) {
 auto sym = compare_node->inputs()[i]
 ->type()
 ->castRaw<TensorType>()
@@ -74,7 +74,7 @@ static c10::optional<int> findIndex(
 c10::ArrayRef<torch::jit::Value*> outputs,
 Value* input) {
 c10::optional<int> idx = c10::nullopt;
-for (size_t i = 0; i < outputs.size(); i++) {
+for (const auto i : c10::irange(outputs.size())) {
 if (input == outputs[i]) {
 idx = i;
 break;
@@ -122,7 +122,7 @@ static bool constantFoldedConditionValue(Node* node) {
 TORCH_INTERNAL_ASSERT(compare_node != nullptr);
 ScalarTypeAnalysisNodeForONNX(compare_node);
 std::vector<at::Tensor> inputs;
-for (size_t i = 0; i < compare_node->inputs().size(); i++) {
+for (const auto i : c10::irange(compare_node->inputs().size())) {
 auto input_node = compare_node->inputs()[i]->node();
 if (input_node->kind() == onnx::Constant) {
 const at::Tensor& val = input_node->t(attr::value);
@@ -1,6 +1,7 @@
 #include <torch/csrc/jit/passes/onnx/peephole.h>
 
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/passes/dead_code_elimination.h>
 #include <torch/csrc/jit/passes/onnx/helper.h>
 
@@ -94,7 +95,7 @@ c10::optional<size_t> fusibleExpandTo(
 return c10::nullopt;
 }
 
-for (size_t i = 0; i < from.size(); i++) {
+for (const auto i : c10::irange(from.size())) {
 auto fdim = from[from.size() - 1 - i];
 auto tdim = to[to.size() - 1 - i];
 if (fdim != 1 && fdim != tdim) {
@@ -717,7 +718,7 @@ static void fuseListConstructListUnpack(Block* b) {
 }
 if (it->kind() == prim::ListUnpack &&
 it->input()->node()->kind() == prim::ListConstruct) {
-for (size_t i = 0; i < it->outputs().size(); i++) {
+for (const auto i : c10::irange(it->outputs().size())) {
 auto output = it->outputs().at(i);
 output->replaceAllUsesWith(it->input()->node()->inputs().at(i));
 }
@@ -1,5 +1,6 @@
 #include <torch/csrc/jit/passes/onnx/preprocess_for_onnx.h>
 
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/passes/onnx/helper.h>
 
@@ -192,7 +193,7 @@ static void fuseListAndListUnpack(Block* b) {
 fuseListAndListUnpack(child_block);
 }
 if (it->kind() == prim::ListUnpack) {
-for (size_t i = 0; i < it->outputs().size(); i++) {
+for (const auto i : c10::irange(it->outputs().size())) {
 auto output = it->outputs().at(i);
 if (it->inputs().size() == 1 &&
 it->input()->node()->kind() != prim::ListConstruct &&
@@ -9,6 +9,8 @@
 #include <torch/csrc/jit/passes/onnx/helper.h>
 #include <torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.h>
 
+#include <c10/util/irange.h>
+
 #include <limits>
 
 namespace torch {
@@ -373,7 +373,7 @@ bool IsBlockReturnTypeSame(Node* n) {
 TORCH_INTERNAL_ASSERT(n->kind() == ::c10::onnx::If);
 auto then_block = n->blocks()[0];
 auto else_block = n->blocks()[1];
-for (size_t i = 0; i < n->outputs().size(); i++) {
+for (const auto i : c10::irange(n->outputs().size())) {
 // check the type
 auto then_block_type = then_block->outputs()[i]->type();
 auto else_block_type = else_block->outputs()[i]->type();
@@ -598,7 +598,7 @@ c10::optional<std::vector<int64_t>> GetValueFromListConstructNode(
 Node* lc_node) {
 auto rank = lc_node->inputs().size();
 std::vector<int64_t> shape_size;
-for (size_t i = 0; i < rank; i++) {
+for (const auto i : c10::irange(rank)) {
 if (TensorTypePtr shape_type =
 lc_node->input(i)->type()->cast<TensorType>()) {
 if (ConstantValueMap::HasValue(lc_node->input(i)->debugName())) {
@@ -1157,7 +1157,7 @@ void SpecialPostProcess(Node* n) {
 if (!IsBlockReturnTypeSame(n) && IsStaticConditionONNX(n)) {
 auto cond = ConditionValueONNX(n);
 auto block_idx = cond ? 0 : 1;
-for (size_t i = 0; i < n->outputs().size(); i++) {
+for (const auto i : c10::irange(n->outputs().size())) {
 n->outputs()[i]->setType(
 n->blocks()[block_idx]->outputs()[i]->type());
 }
@@ -1,6 +1,7 @@
 #include <torch/csrc/jit/passes/peephole.h>
 
 #include <ATen/core/jit_type.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/ir/alias_analysis.h>
 #include <torch/csrc/jit/ir/ir_views.h>
 #include <torch/csrc/jit/jit_log.h>
@@ -375,7 +376,7 @@ bool FuseAddMM(Block* block) {
 // == 0 for it).
 if (node->get<at::Scalar>(attr::alpha).value().toDouble() == 1.) {
 // Look for mm from both sides of the add
-for (size_t mm_side = 0; mm_side < 2; mm_side++) {
+for (const auto mm_side : c10::irange(2)) {
 // Add will accept tensors of mismatched scalar types, as long as
 // one of them is a scalar. Addmm will throw in that case, so we can
 // only perform this fusion if we're sure that it is correct, and
@@ -1,6 +1,7 @@
 #include <torch/csrc/jit/passes/shape_analysis.h>
 
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/jit/frontend/error_report.h>
 #include <torch/csrc/jit/ir/alias_analysis.h>
 #include <torch/csrc/jit/ir/constants.h>
@@ -238,7 +239,7 @@ class ShapePropagator {
 c10::ScalarType dimmed = c10::ScalarType::Undefined;
 c10::ScalarType zerodim = c10::ScalarType::Undefined;
 // binary arithmetic ops, more than 2 args is alpha.
-for (size_t i = 0; i < 2; i++) {
+for (const auto i : c10::irange(2)) {
 auto dtt = node->inputs()[i]->type()->expect<TensorType>();
 auto inputDtype = dtt->scalarType();
 if (!dtt || !inputDtype) {
@@ -8,6 +8,7 @@
 #include <torch/csrc/jit/runtime/profiling_record.h>
 
 #include <ATen/core/interned_strings.h>
+#include <c10/util/irange.h>
 
 namespace torch {
 namespace jit {
@@ -25,7 +26,7 @@ void insertProfileNodesForSpecializeAutogradZero(
 ProfilingRecord* pr) {
 for (auto it = block->nodes().begin(); it != block->nodes().end(); ++it) {
 auto n = *it;
-for (size_t offset = 0; offset < n->inputs().size(); offset++) {
+for (const auto offset : c10::irange(n->inputs().size())) {
 auto i = n->input(offset);
 if (i->type()->cast<OptionalType>() && hasGradSumToSizeUses(i)) {
 // here we are profile the definition instead of the use,
@@ -293,7 +294,7 @@ struct AutogradZeroSpecializer {
 graph_->insertNode(versioning_if);
 
 auto ret = graph_->return_node();
-for (size_t i = 0; i < ret->inputs().size(); i++) {
+for (const auto i : c10::irange(ret->inputs().size())) {
 auto ogo = ret->input(i);
 auto ngo = versioning_if->output(i);
 ngo->copyMetadata(ogo);
@@ -3,6 +3,8 @@
 #include <torch/csrc/jit/ir/irparser.h>
 #include <torch/csrc/jit/ir/subgraph_matcher.h>
 
+#include <c10/util/irange.h>
+
 namespace torch {
 namespace jit {
 
@@ -171,7 +173,7 @@ void SubgraphRewriter::rewriteSinglePatternOnGraph(
 
 // Record all planned rewritings
 AT_ASSERT(outputs.size() == new_outputs.size());
-for (size_t idx = 0; idx < outputs.size(); idx++) {
+for (const auto idx : c10::irange(outputs.size())) {
 values_to_rewrite.push_back(outputs[idx]);
 rewrite_map[outputs[idx]] = new_outputs[idx];
 }
@@ -4,6 +4,8 @@
 #include <torch/csrc/jit/passes/normalize_ops.h>
 #include <torch/csrc/jit/runtime/operator.h>
 
+#include <c10/util/irange.h>
+
 namespace torch {
 namespace jit {
 namespace {
@@ -91,8 +93,8 @@ struct AliasAndIValue {
 
 // No inputs should alias each other
 void checkInputPreconditions(const Stack& inputs) {
-for (size_t i = 0; i < inputs.size(); i++) {
-for (size_t j = 0; j < inputs.size(); j++) {
+for (const auto i : c10::irange(inputs.size())) {
+for (const auto j : c10::irange(inputs.size())) {
 if (i == j) {
 continue;
 }
@@ -133,7 +135,7 @@ void checkWrites(
 const std::vector<AliasAndIValue>& inputs,
 const std::vector<IValue>& deepCopiedInputs) {
 AT_ASSERT(inputs.size() == deepCopiedInputs.size());
-for (size_t i = 0; i < inputs.size(); i++) {
+for (const auto i : c10::irange(inputs.size())) {
 const auto& input = inputs[i];
 const auto& deepCopiedInput = deepCopiedInputs[i];
 if (!input.aliasInfo || !input.aliasInfo->isWrite()) {
@@ -242,7 +244,7 @@ void checkAliasAnnotation(
 const auto schema = node->schema();
 
 std::vector<AliasAndIValue> inputsToCheck;
-for (size_t i = 0; i < schema.arguments().size(); i++) {
+for (const auto i : c10::irange(schema.arguments().size())) {
 inputsToCheck.emplace_back(
 schema.arguments().at(i).alias_info(), stack.at(i));
 }
@@ -257,7 +259,7 @@ void checkAliasAnnotation(
 const auto outputs = std::move(stack);
 
 std::vector<AliasAndIValue> outputsToCheck;
-for (size_t i = 0; i < schema.returns().size(); i++) {
+for (const auto i : c10::irange(schema.returns().size())) {
 outputsToCheck.emplace_back(
 schema.returns().at(i).alias_info(), outputs.at(i));
 }
@@ -2,6 +2,8 @@
 
 #include <torch/csrc/jit/passes/canonicalize.h>
 
+#include <c10/util/irange.h>
+
 namespace torch {
 namespace jit {
 namespace SubgraphUtils {
@@ -300,7 +302,7 @@ void mergeNodeIntoSubgraph(
 }
 
 // Add n's outputs to the group node and inner subgraph outputs.
-for (size_t i = 0; i < toMerge->outputs().size(); i++) {
+for (const auto i : c10::irange(toMerge->outputs().size())) {
 auto oldOutput = toMerge->outputs()[i];
 auto newOutput = mergedNode->outputs()[i];
 subgraph->registerOutput(newOutput);
@@ -2,6 +2,8 @@
 #include <torch/csrc/jit/python/python_dict.h>
 #include <torch/csrc/jit/python/python_ivalue.h>
 
+#include <c10/util/irange.h>
+
 namespace torch {
 namespace jit {
 
@@ -182,7 +184,7 @@ IValue toIValue(py::handle obj, const TypePtr& type, c10::optional<int32_t> N) {
 c10::StrongTypePtr(cu, classType), numAttrs);
 
 // 2. copy all the contained types
-for (size_t slot = 0; slot < numAttrs; slot++) {
+for (const auto slot : c10::irange(numAttrs)) {
 const auto& attrType = classType->getAttribute(slot);
 const auto& attrName = classType->getAttributeName(slot);
 
@@ -2,6 +2,7 @@
 
 #include <ATen/core/ivalue.h>
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 #include <torch/csrc/autograd/grad_mode.h>
 #include <torch/csrc/jit/frontend/tracer.h>
 #include <torch/csrc/jit/ir/ir.h>
@@ -160,7 +161,7 @@ struct CaptureList {
 case CAPTURE_LIST: {
 c10::List<at::Tensor> lst;
 auto size = *size_it++;
-for (size_t i = 0; i < size; i++) {
+for (const auto i : c10::irange(size)) {
 lst.emplace_back(var_capture_it->unpack(saved_for));
 var_capture_it++;
 }
@@ -980,12 +981,12 @@ Node* replaceBlockWithFallbackGraph(Block* b, ArrayRef<Value*> inputs) {
 fallback->g_(attr::Subgraph, graph);
 b->prependNode(fallback);
 
-for (size_t i = 0; i < inputs.size(); i++) {
+for (const auto i : c10::irange(inputs.size())) {
 graph->inputs()[i]->setType(inputs[i]->type());
 graph->inputs()[i]->copyMetadata(inputs[i]);
 }
 
-for (size_t i = 0; i < b->outputs().size(); i++) {
+for (const auto i : c10::irange(b->outputs().size())) {
 fallback->output(i)->setType(b->outputs()[i]->type());
 fallback->output(i)->copyMetadata(b->outputs()[i]);
 b->replaceOutput(i, fallback->output(i));
@ -5,6 +5,7 @@
|
|||||||
#include <ATen/record_function.h>
|
#include <ATen/record_function.h>
|
||||||
#include <c10/core/thread_pool.h>
|
#include <c10/core/thread_pool.h>
|
||||||
#include <c10/util/Exception.h>
|
#include <c10/util/Exception.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/autograd/edge.h>
|
#include <torch/csrc/autograd/edge.h>
|
||||||
#include <torch/csrc/autograd/grad_mode.h>
|
#include <torch/csrc/autograd/grad_mode.h>
|
||||||
#include <torch/csrc/autograd/profiler.h>
|
#include <torch/csrc/autograd/profiler.h>
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include <torch/csrc/jit/runtime/profiling_graph_executor_impl.h>
|
#include <torch/csrc/jit/runtime/profiling_graph_executor_impl.h>
|
||||||
|
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/jit/jit_log.h>
|
#include <torch/csrc/jit/jit_log.h>
|
||||||
#include <torch/csrc/jit/passes/bailout_graph.h>
|
#include <torch/csrc/jit/passes/bailout_graph.h>
|
||||||
#include <torch/csrc/jit/passes/batch_mm.h>
|
#include <torch/csrc/jit/passes/batch_mm.h>
|
||||||
@ -132,7 +133,7 @@ static bool needsGradientInProfilingMode(Block* b) {
|
|||||||
bool guardDifferentiableGraph(Node* dnode) {
|
bool guardDifferentiableGraph(Node* dnode) {
|
||||||
auto gi = dnode->g(attr::Subgraph)->inputs();
|
auto gi = dnode->g(attr::Subgraph)->inputs();
|
||||||
bool all_inputs_seen = true;
|
bool all_inputs_seen = true;
|
||||||
for (size_t i = 0; i < gi.size(); i++) {
|
for (const auto i : c10::irange(gi.size())) {
|
||||||
auto ty = gi[i]->type()->cast<TensorType>();
|
auto ty = gi[i]->type()->cast<TensorType>();
|
||||||
if (ty) {
|
if (ty) {
|
||||||
auto n = gi[i]->uses().at(0).user;
|
auto n = gi[i]->uses().at(0).user;
|
||||||
@ -706,7 +707,7 @@ void ProfilingGraphExecutorImpl::replaceFallbackGraphWithFallbackFunction(
|
|||||||
WithInsertPoint wip{*it};
|
WithInsertPoint wip{*it};
|
||||||
auto function_call = insertFallbackFunctionCall(
|
auto function_call = insertFallbackFunctionCall(
|
||||||
b->owningGraph(), fallback_func, it->inputs());
|
b->owningGraph(), fallback_func, it->inputs());
|
||||||
for (size_t i = 0; i < function_call->outputs().size(); i++) {
|
for (const auto i : c10::irange(function_call->outputs().size())) {
|
||||||
it->output(i)->replaceAllUsesWith(function_call->output(i));
|
it->output(i)->replaceAllUsesWith(function_call->output(i));
|
||||||
}
|
}
|
||||||
it.destroyCurrent();
|
it.destroyCurrent();
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <torch/csrc/jit/runtime/profiling_record.h>
|
#include <torch/csrc/jit/runtime/profiling_record.h>
|
||||||
|
|
||||||
#include <ATen/core/interned_strings.h>
|
#include <ATen/core/interned_strings.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/jit/jit_log.h>
|
#include <torch/csrc/jit/jit_log.h>
|
||||||
#include <torch/csrc/jit/passes/clear_profiling.h>
|
#include <torch/csrc/jit/passes/clear_profiling.h>
|
||||||
#include <torch/csrc/jit/passes/constant_propagation.h>
|
#include <torch/csrc/jit/passes/constant_propagation.h>
|
||||||
@ -61,7 +62,7 @@ bool ShapeSymbolTable::bindSymbolicShapes(
|
|||||||
if (*sym_shapes.rank() != new_sizes.size()) {
|
if (*sym_shapes.rank() != new_sizes.size()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < new_sizes.size(); i++) {
|
for (const auto i : c10::irange(new_sizes.size())) {
|
||||||
auto symbol = (*sym_shapes.sizes())[i];
|
auto symbol = (*sym_shapes.sizes())[i];
|
||||||
if (!symbol.is_static()) {
|
if (!symbol.is_static()) {
|
||||||
continue;
|
continue;
|
||||||
@ -137,7 +138,7 @@ c10::SymbolicShape ProfilingRecord::mergeSymbolicShapes(
|
|||||||
new_sizes.rank().has_value() && sym_shapes.rank().has_value() &&
|
new_sizes.rank().has_value() && sym_shapes.rank().has_value() &&
|
||||||
*new_sizes.rank() == *sym_shapes.rank());
|
*new_sizes.rank() == *sym_shapes.rank());
|
||||||
|
|
||||||
for (size_t i = 0; i < *new_sizes.rank(); i++) {
|
for (const auto i : c10::irange(*new_sizes.rank())) {
|
||||||
if (!(*sym_shapes.sizes())[i].is_static() ||
|
if (!(*sym_shapes.sizes())[i].is_static() ||
|
||||||
!(*new_sizes.sizes())[i].is_static()) {
|
!(*new_sizes.sizes())[i].is_static()) {
|
||||||
new_symbols.emplace_back();
|
new_symbols.emplace_back();
|
||||||
@ -260,7 +261,7 @@ void ProfilingRecord::removeProfileCounter(Block* b) {
|
|||||||
void ProfilingRecord::instrumentBlock(Block* block) {
|
void ProfilingRecord::instrumentBlock(Block* block) {
|
||||||
for (auto it = block->nodes().begin(); it != block->nodes().end(); ++it) {
|
for (auto it = block->nodes().begin(); it != block->nodes().end(); ++it) {
|
||||||
auto n = *it;
|
auto n = *it;
|
||||||
for (size_t offset = 0; offset < n->inputs().size(); offset++) {
|
for (const auto offset : c10::irange(n->inputs().size())) {
|
||||||
auto i = n->input(offset);
|
auto i = n->input(offset);
|
||||||
if (i->type()->kind() == c10::TypeKind::TensorType &&
|
if (i->type()->kind() == c10::TypeKind::TensorType &&
|
||||||
(needsProfiledInputs(n) || needsProfiledOutput(i->node()))) {
|
(needsProfiledInputs(n) || needsProfiledOutput(i->node()))) {
|
||||||
|
@ -398,7 +398,7 @@ void listMulIntLeftInPlace(Stack* stack) {
|
|||||||
} else if (n > 1) {
|
} else if (n > 1) {
|
||||||
size_t list_size = list.size();
|
size_t list_size = list.size();
|
||||||
for (int64_t i = 1; i < n; i++) {
|
for (int64_t i = 1; i < n; i++) {
|
||||||
for (size_t j = 0; j < list_size; j++) {
|
for (const auto j : c10::irange(list_size)) {
|
||||||
list.push_back(list.get(j));
|
list.push_back(list.get(j));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -69,7 +69,7 @@ RegisterOperators reg(
|
|||||||
return [rg_props](Stack* stack) {
|
return [rg_props](Stack* stack) {
|
||||||
auto num_inputs = rg_props.size();
|
auto num_inputs = rg_props.size();
|
||||||
// Check every input's shape against profiled (expected) shape.
|
// Check every input's shape against profiled (expected) shape.
|
||||||
for (size_t i = 0; i < num_inputs; i++) {
|
for (const auto i : c10::irange(num_inputs)) {
|
||||||
auto& input = peek(stack, i, num_inputs);
|
auto& input = peek(stack, i, num_inputs);
|
||||||
const auto& t = input.toTensor();
|
const auto& t = input.toTensor();
|
||||||
if (rg_props[i] != t.requires_grad()) {
|
if (rg_props[i] != t.requires_grad()) {
|
||||||
|
@ -689,7 +689,7 @@ std::vector<at::Tensor> StaticRuntime::operator()(
|
|||||||
const std::vector<at::Tensor>& inps) {
|
const std::vector<at::Tensor>& inps) {
|
||||||
std::vector<c10::IValue> stack;
|
std::vector<c10::IValue> stack;
|
||||||
stack.resize(inps.size());
|
stack.resize(inps.size());
|
||||||
for (size_t i = 0; i < inps.size(); i++) {
|
for (const auto i : c10::irange(inps.size())) {
|
||||||
stack[i] = inps[i];
|
stack[i] = inps[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -730,11 +730,11 @@ c10::IValue StaticRuntime::operator()(
|
|||||||
"with StaticModule(const torch::jit::Module& m) instead.");
|
"with StaticModule(const torch::jit::Module& m) instead.");
|
||||||
std::vector<c10::IValue> s = args;
|
std::vector<c10::IValue> s = args;
|
||||||
static_module_.schema()->checkAndNormalizeInputs(s, kwargs);
|
static_module_.schema()->checkAndNormalizeInputs(s, kwargs);
|
||||||
for (size_t i = 0; i < s.size(); i++) {
|
for (const auto i : c10::irange(s.size())) {
|
||||||
Input(i) = std::move(s[i]);
|
Input(i) = std::move(s[i]);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (size_t i = 0; i < args.size(); i++) {
|
for (const auto i : c10::irange(args.size())) {
|
||||||
Input(i) = args[i];
|
Input(i) = args[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -797,7 +797,7 @@ void StaticRuntime::benchmark(
|
|||||||
IndividualMetrics results =
|
IndividualMetrics results =
|
||||||
benchmark_individual_ops(args, kwargs, warmup_runs, main_runs);
|
benchmark_individual_ops(args, kwargs, warmup_runs, main_runs);
|
||||||
|
|
||||||
for (size_t i = 0; i < nodes_.size(); i++) {
|
for (const auto i : c10::irange(nodes_.size())) {
|
||||||
const Node* node = nodes_[i].node();
|
const Node* node = nodes_[i].node();
|
||||||
std::cout << "Node #" << i << ": " << results.time_per_node[i]
|
std::cout << "Node #" << i << ": " << results.time_per_node[i]
|
||||||
<< " ms/iter, ";
|
<< " ms/iter, ";
|
||||||
@ -895,7 +895,7 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
|
|||||||
"with StaticModule(const torch::jit::Module& m) instead.");
|
"with StaticModule(const torch::jit::Module& m) instead.");
|
||||||
static_module_.schema()->checkAndNormalizeInputs(stack, kwargs);
|
static_module_.schema()->checkAndNormalizeInputs(stack, kwargs);
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < stack.size(); i++) {
|
for (const auto i : c10::irange(stack.size())) {
|
||||||
Input(i) = stack[i];
|
Input(i) = stack[i];
|
||||||
}
|
}
|
||||||
results.setup_time = timer.MilliSeconds();
|
results.setup_time = timer.MilliSeconds();
|
||||||
@ -906,8 +906,8 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// main runs
|
// main runs
|
||||||
for (int k = 0; k < main_runs; k++) {
|
for (const auto k : c10::irange(main_runs)) {
|
||||||
for (size_t i = 0; i < stack.size(); i++) {
|
for (const auto i : c10::irange(stack.size())) {
|
||||||
Input(i) = stack[i];
|
Input(i) = stack[i];
|
||||||
}
|
}
|
||||||
timer.Start();
|
timer.Start();
|
||||||
@ -917,7 +917,7 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
|
|||||||
float millis = timer.MilliSeconds();
|
float millis = timer.MilliSeconds();
|
||||||
results.memory_alloc_time += millis;
|
results.memory_alloc_time += millis;
|
||||||
|
|
||||||
for (size_t i = 0; i < nodes_.size(); i++) {
|
for (const auto i : c10::irange(nodes_.size())) {
|
||||||
timer.Start();
|
timer.Start();
|
||||||
nodes_[i].run();
|
nodes_[i].run();
|
||||||
millis = timer.MilliSeconds();
|
millis = timer.MilliSeconds();
|
||||||
@ -969,7 +969,7 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// post processing
|
// post processing
|
||||||
for (size_t i = 0; i < nodes_.size(); i++) {
|
for (const auto i : c10::irange(nodes_.size())) {
|
||||||
const Node* node = nodes_[i].node();
|
const Node* node = nodes_[i].node();
|
||||||
std::string kind = std::string(node->kind().toQualString());
|
std::string kind = std::string(node->kind().toQualString());
|
||||||
results.time_per_node[i] /= static_cast<float>(main_runs);
|
results.time_per_node[i] /= static_cast<float>(main_runs);
|
||||||
@ -998,15 +998,15 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// check for inputs
|
// check for inputs
|
||||||
for (size_t i = 0; i < inputs_.size(); i++) {
|
for (const auto i : c10::irange(inputs_.size())) {
|
||||||
TORCH_CHECK(inputs_[i].isNone(), "Input ", i, " was not cleaned up");
|
TORCH_CHECK(inputs_[i].isNone(), "Input ", i, " was not cleaned up");
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unordered_set<const IValue*> output_ivalues(
|
std::unordered_set<const IValue*> output_ivalues(
|
||||||
outputs_.begin(), outputs_.end());
|
outputs_.begin(), outputs_.end());
|
||||||
for (size_t n = 0; n < nodes_.size(); n++) {
|
for (const auto n : c10::irange(nodes_.size())) {
|
||||||
auto& pnode = nodes_[n];
|
auto& pnode = nodes_[n];
|
||||||
for (size_t i = 0; i < pnode.outputs().size(); i++) {
|
for (const auto i : c10::irange(pnode.outputs().size())) {
|
||||||
const IValue* ival = &pnode.Output(i);
|
const IValue* ival = &pnode.Output(i);
|
||||||
const Value* val = pnode.node()->output(i);
|
const Value* val = pnode.node()->output(i);
|
||||||
const std::string error_msg = "Output " + c10::to_string(i) + ", %" +
|
const std::string error_msg = "Output " + c10::to_string(i) + ", %" +
|
||||||
@ -1261,7 +1261,7 @@ void ProcessedNode::run() {
|
|||||||
std::vector<IValue> stack;
|
std::vector<IValue> stack;
|
||||||
const size_t size = node_->inputs().size();
|
const size_t size = node_->inputs().size();
|
||||||
stack.reserve(size);
|
stack.reserve(size);
|
||||||
for (size_t i = 0; i < size; i++) {
|
for (const auto i : c10::irange(size)) {
|
||||||
stack.emplace_back(Input(i));
|
stack.emplace_back(Input(i));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -243,7 +243,7 @@ REGISTER_OPERATOR_FUNCTOR(
|
|||||||
const size_t size = p_node->inputs().size();
|
const size_t size = p_node->inputs().size();
|
||||||
c10::List<IValue> vals(type.getElementType());
|
c10::List<IValue> vals(type.getElementType());
|
||||||
vals.reserve(size);
|
vals.reserve(size);
|
||||||
for (size_t i = 0; i < size; i++) {
|
for (const auto i : c10::irange(size)) {
|
||||||
vals.push_back(p_node->Input(i));
|
vals.push_back(p_node->Input(i));
|
||||||
}
|
}
|
||||||
p_node->Output(0) = std::move(vals);
|
p_node->Output(0) = std::move(vals);
|
||||||
@ -265,7 +265,7 @@ REGISTER_OPERATOR_FUNCTOR(
|
|||||||
const size_t size = p_node->inputs().size();
|
const size_t size = p_node->inputs().size();
|
||||||
std::vector<IValue> vals;
|
std::vector<IValue> vals;
|
||||||
vals.reserve(size);
|
vals.reserve(size);
|
||||||
for (size_t i = 0; i < size; i++) {
|
for (const auto i : c10::irange(size)) {
|
||||||
vals.push_back(p_node->Input(i));
|
vals.push_back(p_node->Input(i));
|
||||||
}
|
}
|
||||||
p_node->Output(0) = c10::ivalue::Tuple::create(std::move(vals));
|
p_node->Output(0) = c10::ivalue::Tuple::create(std::move(vals));
|
||||||
@ -1035,7 +1035,7 @@ std::function<void(ProcessedNode*)> getNativeOperation(Node* n) {
|
|||||||
std::vector<IValue> stack;
|
std::vector<IValue> stack;
|
||||||
const size_t size = p_node->inputs().size();
|
const size_t size = p_node->inputs().size();
|
||||||
stack.reserve(size);
|
stack.reserve(size);
|
||||||
for (size_t i = 0; i < size; i++) {
|
for (const auto i : c10::irange(size)) {
|
||||||
stack.emplace_back(p_node->Input(i));
|
stack.emplace_back(p_node->Input(i));
|
||||||
}
|
}
|
||||||
// run op
|
// run op
|
||||||
@ -1055,7 +1055,7 @@ std::function<void(ProcessedNode*)> getNativeOperation(Node* n) {
|
|||||||
std::vector<IValue> stack;
|
std::vector<IValue> stack;
|
||||||
const size_t size = p_node->inputs().size();
|
const size_t size = p_node->inputs().size();
|
||||||
stack.reserve(size);
|
stack.reserve(size);
|
||||||
for (size_t i = 0; i < size; i++) {
|
for (const auto i : c10::irange(size)) {
|
||||||
stack.emplace_back(p_node->Input(i));
|
stack.emplace_back(p_node->Input(i));
|
||||||
}
|
}
|
||||||
// run op
|
// run op
|
||||||
@ -1088,7 +1088,7 @@ std::function<void(ProcessedNode*)> getNativeOperation(Node* n) {
|
|||||||
std::vector<IValue> stack;
|
std::vector<IValue> stack;
|
||||||
const size_t size = p_node->inputs().size();
|
const size_t size = p_node->inputs().size();
|
||||||
stack.reserve(size);
|
stack.reserve(size);
|
||||||
for (size_t i = 0; i < size; i++) {
|
for (const auto i : c10::irange(size)) {
|
||||||
stack.emplace_back(p_node->Input(i));
|
stack.emplace_back(p_node->Input(i));
|
||||||
}
|
}
|
||||||
// run op
|
// run op
|
||||||
@ -1105,7 +1105,7 @@ std::function<void(ProcessedNode*)> getNativeOperation(Node* n) {
|
|||||||
std::vector<IValue> stack;
|
std::vector<IValue> stack;
|
||||||
const size_t size = p_node->inputs().size();
|
const size_t size = p_node->inputs().size();
|
||||||
stack.reserve(size);
|
stack.reserve(size);
|
||||||
for (size_t i = 0; i < size; i++) {
|
for (const auto i : c10::irange(size)) {
|
||||||
stack.emplace_back(p_node->Input(i));
|
stack.emplace_back(p_node->Input(i));
|
||||||
}
|
}
|
||||||
// run op
|
// run op
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
#include <c10/util/Exception.h>
|
#include <c10/util/Exception.h>
|
||||||
#include <c10/util/Optional.h>
|
#include <c10/util/Optional.h>
|
||||||
#include <c10/util/accumulate.h>
|
#include <c10/util/accumulate.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/autograd/symbolic.h>
|
#include <torch/csrc/autograd/symbolic.h>
|
||||||
#include <torch/csrc/jit/jit_log.h>
|
#include <torch/csrc/jit/jit_log.h>
|
||||||
#include <torch/csrc/jit/passes/dead_code_elimination.h>
|
#include <torch/csrc/jit/passes/dead_code_elimination.h>
|
||||||
@ -27,6 +28,7 @@
|
|||||||
#include <set>
|
#include <set>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
namespace torch {
|
namespace torch {
|
||||||
namespace jit {
|
namespace jit {
|
||||||
|
|
||||||
@ -347,7 +349,7 @@ void EncoderBase::EncodeValueInfo(
|
|||||||
if (t->dim()) {
|
if (t->dim()) {
|
||||||
onnx::TensorShapeProto* shape = tensor_type->mutable_shape();
|
onnx::TensorShapeProto* shape = tensor_type->mutable_shape();
|
||||||
auto sizes = t->symbolic_sizes().sizes().value();
|
auto sizes = t->symbolic_sizes().sizes().value();
|
||||||
for (size_t i = 0; i < sizes.size(); i++) {
|
for (const auto i : c10::irange(sizes.size())) {
|
||||||
shape->add_dim();
|
shape->add_dim();
|
||||||
if ((dynamic_axes.find(name) != dynamic_axes.end()) &&
|
if ((dynamic_axes.find(name) != dynamic_axes.end()) &&
|
||||||
(dynamic_axes.at(name).find(i) != dynamic_axes.at(name).end())) {
|
(dynamic_axes.at(name).find(i) != dynamic_axes.at(name).end())) {
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
#include <ATen/core/functional.h>
|
#include <ATen/core/functional.h>
|
||||||
#include <ATen/core/ivalue_inl.h>
|
#include <ATen/core/ivalue_inl.h>
|
||||||
#include <c10/util/Exception.h>
|
#include <c10/util/Exception.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/jit/serialization/import_export_helpers.h>
|
#include <torch/csrc/jit/serialization/import_export_helpers.h>
|
||||||
#if !defined(C10_MOBILE) && !defined(C10_DISABLE_LEGACY_IMPORT)
|
#if !defined(C10_MOBILE) && !defined(C10_DISABLE_LEGACY_IMPORT)
|
||||||
#include <torch/csrc/jit/serialization/import_legacy.h>
|
#include <torch/csrc/jit/serialization/import_legacy.h>
|
||||||
@ -39,7 +40,7 @@ using caffe2::serialize::ReadAdapterInterface;
|
|||||||
void postSetStateValidate(const IValue& v) {
|
void postSetStateValidate(const IValue& v) {
|
||||||
auto obj = v.toObject();
|
auto obj = v.toObject();
|
||||||
const auto& objType = obj->type();
|
const auto& objType = obj->type();
|
||||||
for (size_t i = 0; i < objType->numAttributes(); i++) {
|
for (const auto i : c10::irange(objType->numAttributes())) {
|
||||||
const auto& attrType = objType->getAttribute(i);
|
const auto& attrType = objType->getAttribute(i);
|
||||||
const auto& attrName = objType->getAttributeName(i);
|
const auto& attrName = objType->getAttributeName(i);
|
||||||
const auto& slot = obj->getSlot(i);
|
const auto& slot = obj->getSlot(i);
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
#include <caffe2/serialize/inline_container.h>
|
#include <caffe2/serialize/inline_container.h>
|
||||||
|
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace torch {
|
namespace torch {
|
||||||
namespace jit {
|
namespace jit {
|
||||||
@ -378,7 +379,7 @@ Module ScriptModuleDeserializer::LEGACY_convertModule(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < numPushed; i++) {
|
for (const auto i : c10::irange(numPushed)) {
|
||||||
LEGACY_moduleStack_.pop_back();
|
LEGACY_moduleStack_.pop_back();
|
||||||
}
|
}
|
||||||
return module;
|
return module;
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
#include <ATen/core/qualified_name.h>
|
#include <ATen/core/qualified_name.h>
|
||||||
#include <c10/util/Exception.h>
|
#include <c10/util/Exception.h>
|
||||||
#include <c10/util/StringUtil.h>
|
#include <c10/util/StringUtil.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/jit/api/module.h>
|
#include <torch/csrc/jit/api/module.h>
|
||||||
#include <torch/csrc/jit/frontend/error_report.h>
|
#include <torch/csrc/jit/frontend/error_report.h>
|
||||||
#include <torch/csrc/jit/frontend/versioned_symbols.h>
|
#include <torch/csrc/jit/frontend/versioned_symbols.h>
|
||||||
@ -1336,7 +1337,7 @@ struct PythonPrintImpl {
|
|||||||
std::vector<std::string> buffers;
|
std::vector<std::string> buffers;
|
||||||
// Populate the __parameters__ field. This tells the importer which
|
// Populate the __parameters__ field. This tells the importer which
|
||||||
// attributes are parameters.
|
// attributes are parameters.
|
||||||
for (size_t i = 0; i < numAttrs; i++) {
|
for (const auto i : c10::irange(numAttrs)) {
|
||||||
if (classType->is_parameter(i)) {
|
if (classType->is_parameter(i)) {
|
||||||
params.push_back(classType->getAttributeName(i));
|
params.push_back(classType->getAttributeName(i));
|
||||||
}
|
}
|
||||||
@ -1378,7 +1379,7 @@ struct PythonPrintImpl {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < numAttrs; i++) {
|
for (const auto i : c10::irange(numAttrs)) {
|
||||||
const auto& name = classType->getAttributeName(i);
|
const auto& name = classType->getAttributeName(i);
|
||||||
const auto& type = classType->getAttribute(i);
|
const auto& type = classType->getAttribute(i);
|
||||||
registerClassDependencies(type);
|
registerClassDependencies(type);
|
||||||
@ -1406,7 +1407,7 @@ struct PythonPrintImpl {
|
|||||||
}
|
}
|
||||||
|
|
||||||
size_t numConstants = classType->numConstants();
|
size_t numConstants = classType->numConstants();
|
||||||
for (size_t i = 0; i < numConstants; i++) {
|
for (const auto i : c10::irange(numConstants)) {
|
||||||
const auto& name = classType->getConstantName(i);
|
const auto& name = classType->getConstantName(i);
|
||||||
IValue v = classType->getConstant(i);
|
IValue v = classType->getConstant(i);
|
||||||
|
|
||||||
|
@ -8,6 +8,8 @@
|
|||||||
#include <torch/csrc/jit/tensorexpr/ir_visitor.h>
|
#include <torch/csrc/jit/tensorexpr/ir_visitor.h>
|
||||||
#include <torch/csrc/jit/tensorexpr/stmt.h>
|
#include <torch/csrc/jit/tensorexpr/stmt.h>
|
||||||
|
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace torch {
|
namespace torch {
|
||||||
namespace jit {
|
namespace jit {
|
||||||
namespace tensorexpr {
|
namespace tensorexpr {
|
||||||
@ -163,7 +165,7 @@ std::vector<const Expr*> getBoundExtents(
|
|||||||
// Find the safe size of the temprorary buffer by determining the outer
|
// Find the safe size of the temprorary buffer by determining the outer
|
||||||
// extents of a union of all bounds.
|
// extents of a union of all bounds.
|
||||||
for (const TensorAccessBoundsInfo& p : infos) {
|
for (const TensorAccessBoundsInfo& p : infos) {
|
||||||
for (size_t i = 0; i < p.start.size(); i++) {
|
for (const auto i : c10::irange(p.start.size())) {
|
||||||
if (starts.size() <= i) {
|
if (starts.size() <= i) {
|
||||||
starts.push_back(p.start[i]);
|
starts.push_back(p.start[i]);
|
||||||
} else {
|
} else {
|
||||||
|
@ -158,7 +158,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
|
|||||||
std::vector<T> lhs_v = lhs.as_vec<T>();
|
std::vector<T> lhs_v = lhs.as_vec<T>();
|
||||||
std::vector<T> rhs_v = rhs.as_vec<T>();
|
std::vector<T> rhs_v = rhs.as_vec<T>();
|
||||||
std::vector<T> result_v(lhs_v.size());
|
std::vector<T> result_v(lhs_v.size());
|
||||||
for (size_t i = 0; i < lhs_v.size(); i++) {
|
for (const auto i : c10::irange(lhs_v.size())) {
|
||||||
switch (op_type) {
|
switch (op_type) {
|
||||||
case IRNodeType::kAdd:
|
case IRNodeType::kAdd:
|
||||||
result_v[i] = lhs_v[i] + rhs_v[i];
|
result_v[i] = lhs_v[i] + rhs_v[i];
|
||||||
@ -197,7 +197,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
|
|||||||
std::vector<T> lhs_v = lhs.as_vec<T>();
|
std::vector<T> lhs_v = lhs.as_vec<T>();
|
||||||
std::vector<T> rhs_v = rhs.as_vec<T>();
|
std::vector<T> rhs_v = rhs.as_vec<T>();
|
||||||
std::vector<T> result_v(lhs_v.size());
|
std::vector<T> result_v(lhs_v.size());
|
||||||
for (size_t i = 0; i < lhs_v.size(); i++) {
|
for (const auto i : c10::irange(lhs_v.size())) {
|
||||||
switch (op_type) {
|
switch (op_type) {
|
||||||
case IRNodeType::kAnd:
|
case IRNodeType::kAnd:
|
||||||
result_v[i] = lhs_v[i] & rhs_v[i];
|
result_v[i] = lhs_v[i] & rhs_v[i];
|
||||||
@ -224,7 +224,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
|
|||||||
std::vector<T> lhs_v = lhs.as_vec<T>();
|
std::vector<T> lhs_v = lhs.as_vec<T>();
|
||||||
std::vector<T> rhs_v = rhs.as_vec<T>();
|
std::vector<T> rhs_v = rhs.as_vec<T>();
|
||||||
std::vector<T> result_v(lhs_v.size());
|
std::vector<T> result_v(lhs_v.size());
|
||||||
for (size_t i = 0; i < lhs_v.size(); i++) {
|
for (const auto i : c10::irange(lhs_v.size())) {
|
||||||
switch (op_type) {
|
switch (op_type) {
|
||||||
case IRNodeType::kLshift: {
|
case IRNodeType::kLshift: {
|
||||||
typename std::make_unsigned<T>::type a =
|
typename std::make_unsigned<T>::type a =
|
||||||
@ -255,7 +255,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
|
|||||||
std::vector<R> ret_val1_v = retval1.as_vec<R>();
|
std::vector<R> ret_val1_v = retval1.as_vec<R>();
|
||||||
std::vector<R> ret_val2_v = retval2.as_vec<R>();
|
std::vector<R> ret_val2_v = retval2.as_vec<R>();
|
||||||
std::vector<R> result_v(lhs_v.size());
|
std::vector<R> result_v(lhs_v.size());
|
||||||
for (size_t i = 0; i < lhs_v.size(); i++) {
|
for (const auto i : c10::irange(lhs_v.size())) {
|
||||||
switch (cmp_op) {
|
switch (cmp_op) {
|
||||||
case CompareSelectOperation::kEQ:
|
case CompareSelectOperation::kEQ:
|
||||||
result_v[i] = (lhs_v[i] == rhs_v[i]) ? ret_val1_v[i] : ret_val2_v[i];
|
result_v[i] = (lhs_v[i] == rhs_v[i]) ? ret_val1_v[i] : ret_val2_v[i];
|
||||||
@ -619,14 +619,14 @@ class SimpleIREvaluatorImpl : public IRVisitor {
|
|||||||
std::vector<int> index = value().as_vec<int>();
|
std::vector<int> index = value().as_vec<int>();
|
||||||
ScalarType v_sdtype = v->dtype().scalar_type();
|
ScalarType v_sdtype = v->dtype().scalar_type();
|
||||||
switch (v_sdtype) {
|
switch (v_sdtype) {
|
||||||
#define TYPE_CASE(Type, Name) \
|
#define TYPE_CASE(Type, Name) \
|
||||||
case ScalarType::Name: { \
|
case ScalarType::Name: { \
|
||||||
Type* ptr##Name = static_cast<Type*>(ptr); \
|
Type* ptr##Name = static_cast<Type*>(ptr); \
|
||||||
std::vector<Type> v(index.size()); \
|
std::vector<Type> v(index.size()); \
|
||||||
for (size_t i = 0; i < index.size(); i++) { \
|
for (const auto i : c10::irange(index.size())) { \
|
||||||
v[i] = ptr##Name[index[i]]; \
|
v[i] = ptr##Name[index[i]]; \
|
||||||
} \
|
} \
|
||||||
value_ = Value(v); \
|
value_ = Value(v); \
|
||||||
} break;
|
} break;
|
||||||
AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
|
AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
|
||||||
#undef TYPE_CASE
|
#undef TYPE_CASE
|
||||||
@ -657,7 +657,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
|
|||||||
throw malformed_input("value size mismatch in Store", v); \
|
throw malformed_input("value size mismatch in Store", v); \
|
||||||
} \
|
} \
|
||||||
Type* ptr##Name = static_cast<Type*>(ptr); \
|
Type* ptr##Name = static_cast<Type*>(ptr); \
|
||||||
for (size_t i = 0; i < index.size(); i++) { \
|
for (const auto i : c10::irange(index.size())) { \
|
||||||
ptr##Name[index[i]] = value[i]; \
|
ptr##Name[index[i]] = value[i]; \
|
||||||
} \
|
} \
|
||||||
} break;
|
} break;
|
||||||
@ -748,11 +748,11 @@ class SimpleIREvaluatorImpl : public IRVisitor {
|
|||||||
|
|
||||||
std::vector<TReturn> result(v1.size(), -1);
|
std::vector<TReturn> result(v1.size(), -1);
|
||||||
if (values.size() == 1ULL) {
|
if (values.size() == 1ULL) {
|
||||||
for (size_t i = 0; i < v1.size(); i++) {
|
for (const auto i : c10::irange(v1.size())) {
|
||||||
result[i] = compute_intrinsics<TReturn>(v->op_type(), v1[i]);
|
result[i] = compute_intrinsics<TReturn>(v->op_type(), v1[i]);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (size_t i = 0; i < v1.size(); i++) {
|
for (const auto i : c10::irange(v1.size())) {
|
||||||
result[i] = compute_intrinsics<TReturn>(v->op_type(), v1[i], v2[i]);
|
result[i] = compute_intrinsics<TReturn>(v->op_type(), v1[i], v2[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -987,7 +987,7 @@ void SimpleIREvaluator::call_raw(const std::vector<void*>& args) {
|
|||||||
if (args.size() != buffer_args().size()) {
|
if (args.size() != buffer_args().size()) {
|
||||||
throw malformed_input("bad args in IREvaluator call");
|
throw malformed_input("bad args in IREvaluator call");
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < args.size(); i++) {
|
for (const auto i : c10::irange(args.size())) {
|
||||||
bindArg(buffer_args()[i], args[i]);
|
bindArg(buffer_args()[i], args[i]);
|
||||||
}
|
}
|
||||||
stmt()->accept(&*impl_);
|
stmt()->accept(&*impl_);
|
||||||
|
@ -30,7 +30,7 @@ std::vector<at::Tensor> constructTensors(
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<at::Tensor> tensors;
|
std::vector<at::Tensor> tensors;
|
||||||
for (size_t i = 0; i < buf_data_vec.size(); i++) {
|
for (const auto i : c10::irange(buf_data_vec.size())) {
|
||||||
auto options = at::TensorOptions()
|
auto options = at::TensorOptions()
|
||||||
.dtype(buf_dtypes_vec[i])
|
.dtype(buf_dtypes_vec[i])
|
||||||
.layout(at::kStrided)
|
.layout(at::kStrided)
|
||||||
|
@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
#include <torch/csrc/jit/tensorexpr/tensor.h>
|
#include <torch/csrc/jit/tensorexpr/tensor.h>
|
||||||
|
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
namespace torch {
|
namespace torch {
|
||||||
namespace jit {
|
namespace jit {
|
||||||
namespace tensorexpr {
|
namespace tensorexpr {
|
||||||
@ -93,7 +95,7 @@ const Expr* flatten_index(
|
|||||||
}
|
}
|
||||||
|
|
||||||
const Expr* total_index = new IntImm(0);
|
const Expr* total_index = new IntImm(0);
|
||||||
for (size_t i = 0; i < ndim; i++) {
|
for (const auto i : c10::irange(ndim)) {
|
||||||
total_index = new Add(total_index, new Mul(indices[i], strides[i]));
|
total_index = new Add(total_index, new Mul(indices[i], strides[i]));
|
||||||
}
|
}
|
||||||
return total_index;
|
return total_index;
|
||||||
@ -187,7 +189,7 @@ ExternalCall* ExternalCall::make(
|
|||||||
std::vector<const Expr*> ExprHandleVectorToExprVector(
|
std::vector<const Expr*> ExprHandleVectorToExprVector(
|
||||||
const std::vector<ExprHandle>& v) {
|
const std::vector<ExprHandle>& v) {
|
||||||
std::vector<const Expr*> result(v.size());
|
std::vector<const Expr*> result(v.size());
|
||||||
for (size_t i = 0; i < v.size(); i++) {
|
for (const auto i : c10::irange(v.size())) {
|
||||||
result[i] = v[i].node();
|
result[i] = v[i].node();
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
@ -196,7 +198,7 @@ std::vector<const Expr*> ExprHandleVectorToExprVector(
|
|||||||
std::vector<ExprHandle> ExprVectorToExprHandleVector(
|
std::vector<ExprHandle> ExprVectorToExprHandleVector(
|
||||||
const std::vector<const Expr*>& v) {
|
const std::vector<const Expr*>& v) {
|
||||||
std::vector<ExprHandle> result(v.size());
|
std::vector<ExprHandle> result(v.size());
|
||||||
for (size_t i = 0; i < v.size(); i++) {
|
for (const auto i : c10::irange(v.size())) {
|
||||||
result[i] = ExprHandle(v[i]);
|
result[i] = ExprHandle(v[i]);
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
@ -205,7 +207,7 @@ std::vector<ExprHandle> ExprVectorToExprHandleVector(
|
|||||||
std::vector<const Var*> VarHandleVectorToVarVector(
|
std::vector<const Var*> VarHandleVectorToVarVector(
|
||||||
const std::vector<VarHandle>& v) {
|
const std::vector<VarHandle>& v) {
|
||||||
std::vector<const Var*> result(v.size());
|
std::vector<const Var*> result(v.size());
|
||||||
for (size_t i = 0; i < v.size(); i++) {
|
for (const auto i : c10::irange(v.size())) {
|
||||||
result[i] = v[i].node();
|
result[i] = v[i].node();
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
@ -214,7 +216,7 @@ std::vector<const Var*> VarHandleVectorToVarVector(
|
|||||||
std::vector<VarHandle> VarVectorToVarHandleVector(
|
std::vector<VarHandle> VarVectorToVarHandleVector(
|
||||||
const std::vector<const Var*>& v) {
|
const std::vector<const Var*>& v) {
|
||||||
std::vector<VarHandle> result(v.size());
|
std::vector<VarHandle> result(v.size());
|
||||||
for (size_t i = 0; i < v.size(); i++) {
|
for (const auto i : c10::irange(v.size())) {
|
||||||
result[i] = VarHandle(v[i]);
|
result[i] = VarHandle(v[i]);
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
|
@ -195,7 +195,7 @@ const Expr* IRMutator::mutate(Buf* v) {
|
|||||||
|
|
||||||
std::vector<const Expr*> dims_old = v->dims();
|
std::vector<const Expr*> dims_old = v->dims();
|
||||||
std::vector<const Expr*> dims_new(dims_old.size());
|
std::vector<const Expr*> dims_new(dims_old.size());
|
||||||
for (size_t i = 0; i < dims_old.size(); i++) {
|
for (const auto i : c10::irange(dims_old.size())) {
|
||||||
dims_new[i] = dims_old[i]->accept_mutator(this);
|
dims_new[i] = dims_old[i]->accept_mutator(this);
|
||||||
any_change |= (dims_new[i] != dims_old[i]);
|
any_change |= (dims_new[i] != dims_old[i]);
|
||||||
}
|
}
|
||||||
|
@ -413,7 +413,7 @@ void IRPrinter::visit(const Allocate* v) {
|
|||||||
<< "); // dtype=" << v->dtype().ToCppString();
|
<< "); // dtype=" << v->dtype().ToCppString();
|
||||||
os() << ", dims=[";
|
os() << ", dims=[";
|
||||||
const std::vector<const Expr*>& dims = v->dims();
|
const std::vector<const Expr*>& dims = v->dims();
|
||||||
for (size_t i = 0; i < dims.size(); i++) {
|
for (const auto i : c10::irange(dims.size())) {
|
||||||
if (i != 0) {
|
if (i != 0) {
|
||||||
os() << ", ";
|
os() << ", ";
|
||||||
}
|
}
|
||||||
@ -583,7 +583,7 @@ std::string to_string(const Tensor* t) {
|
|||||||
std::ostringstream oss;
|
std::ostringstream oss;
|
||||||
// TODO: move this to Buf printer
|
// TODO: move this to Buf printer
|
||||||
oss << "Tensor " << t->buf()->name_hint() << "[";
|
oss << "Tensor " << t->buf()->name_hint() << "[";
|
||||||
for (size_t i = 0; i < t->buf()->ndim(); i++) {
|
for (const auto i : c10::irange(t->buf()->ndim())) {
|
||||||
if (i != 0) {
|
if (i != 0) {
|
||||||
oss << ", ";
|
oss << ", ";
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
#include <ATen/ExpandUtils.h>
|
#include <ATen/ExpandUtils.h>
|
||||||
#include <ATen/TensorGeometry.h>
|
#include <ATen/TensorGeometry.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <c10/util/string_utils.h>
|
#include <c10/util/string_utils.h>
|
||||||
#include <torch/csrc/jit/jit_log.h>
|
#include <torch/csrc/jit/jit_log.h>
|
||||||
#include <torch/csrc/jit/passes/utils/subgraph_utils.h>
|
#include <torch/csrc/jit/passes/utils/subgraph_utils.h>
|
||||||
@ -502,7 +503,7 @@ ArgValue TensorExprKernel::toArg(const torch::jit::Value* v) const {
|
|||||||
std::vector<ExprHandle> TensorExprKernel::sizesFromVaryingShape(
|
std::vector<ExprHandle> TensorExprKernel::sizesFromVaryingShape(
|
||||||
const c10::VaryingShape<int64_t>& shape) {
|
const c10::VaryingShape<int64_t>& shape) {
|
||||||
std::vector<ExprHandle> dims;
|
std::vector<ExprHandle> dims;
|
||||||
for (size_t i = 0; i < *shape.size(); i++) {
|
for (const auto i : c10::irange(*shape.size())) {
|
||||||
dims.push_back(IntImm::make(*shape[i]));
|
dims.push_back(IntImm::make(*shape[i]));
|
||||||
}
|
}
|
||||||
return dims;
|
return dims;
|
||||||
@ -603,7 +604,7 @@ std::vector<ExprHandle> TensorExprKernel::inferSizesForValue(
|
|||||||
case aten::remainder:
|
case aten::remainder:
|
||||||
case aten::atan2: {
|
case aten::atan2: {
|
||||||
std::vector<std::vector<ExprHandle>> shapes;
|
std::vector<std::vector<ExprHandle>> shapes;
|
||||||
for (size_t idx = 0; idx < 2; idx++) {
|
for (const auto idx : c10::irange(2)) {
|
||||||
torch::jit::Value* inp = v->node()->input(idx);
|
torch::jit::Value* inp = v->node()->input(idx);
|
||||||
shapes.push_back(sizesForValue(inp));
|
shapes.push_back(sizesForValue(inp));
|
||||||
}
|
}
|
||||||
@ -614,7 +615,7 @@ std::vector<ExprHandle> TensorExprKernel::inferSizesForValue(
|
|||||||
case aten::threshold:
|
case aten::threshold:
|
||||||
case aten::where: {
|
case aten::where: {
|
||||||
std::vector<std::vector<ExprHandle>> shapes;
|
std::vector<std::vector<ExprHandle>> shapes;
|
||||||
for (size_t idx = 0; idx < 3; idx++) {
|
for (const auto idx : c10::irange(3)) {
|
||||||
torch::jit::Value* inp = v->node()->input(idx);
|
torch::jit::Value* inp = v->node()->input(idx);
|
||||||
shapes.push_back(sizesForValue(inp));
|
shapes.push_back(sizesForValue(inp));
|
||||||
}
|
}
|
||||||
@ -623,7 +624,7 @@ std::vector<ExprHandle> TensorExprKernel::inferSizesForValue(
|
|||||||
|
|
||||||
case aten::addcmul: {
|
case aten::addcmul: {
|
||||||
std::vector<std::vector<ExprHandle>> shapes;
|
std::vector<std::vector<ExprHandle>> shapes;
|
||||||
for (size_t idx = 0; idx < 4; idx++) {
|
for (const auto idx : c10::irange(4)) {
|
||||||
torch::jit::Value* inp = v->node()->input(idx);
|
torch::jit::Value* inp = v->node()->input(idx);
|
||||||
shapes.push_back(sizesForValue(inp));
|
shapes.push_back(sizesForValue(inp));
|
||||||
}
|
}
|
||||||
|
@ -401,7 +401,7 @@ LLVMCodeGenImpl::LLVMCodeGenImpl(
|
|||||||
// Emit prototype and bind argument Vars to parameter indices.
|
// Emit prototype and bind argument Vars to parameter indices.
|
||||||
llvm::Type* retTy = dtypeToLLVM(dtype);
|
llvm::Type* retTy = dtypeToLLVM(dtype);
|
||||||
std::vector<llvm::Type*> params;
|
std::vector<llvm::Type*> params;
|
||||||
for (size_t i = 0; i < args.size(); i++) {
|
for (const auto i : c10::irange(args.size())) {
|
||||||
auto const& arg = args[i];
|
auto const& arg = args[i];
|
||||||
if (arg.isVar()) {
|
if (arg.isVar()) {
|
||||||
params.push_back(dtypeToLLVM(arg.dtype()));
|
params.push_back(dtypeToLLVM(arg.dtype()));
|
||||||
@ -416,7 +416,7 @@ LLVMCodeGenImpl::LLVMCodeGenImpl(
|
|||||||
fn_->addAttribute(
|
fn_->addAttribute(
|
||||||
llvm::AttributeList::AttrIndex::FunctionIndex,
|
llvm::AttributeList::AttrIndex::FunctionIndex,
|
||||||
llvm::Attribute::AlwaysInline);
|
llvm::Attribute::AlwaysInline);
|
||||||
for (size_t i = 0; i < args.size(); i++) {
|
for (const auto i : c10::irange(args.size())) {
|
||||||
if (!args[i].isVar()) {
|
if (!args[i].isVar()) {
|
||||||
fn_->addParamAttr(i, llvm::Attribute::NoAlias);
|
fn_->addParamAttr(i, llvm::Attribute::NoAlias);
|
||||||
}
|
}
|
||||||
@ -465,7 +465,7 @@ void LLVMCodeGenImpl::emitWrapper(const std::vector<llvm::Type*>& params) {
|
|||||||
auto wrapBB = llvm::BasicBlock::Create(getContext(), "wrapBB", wrapper);
|
auto wrapBB = llvm::BasicBlock::Create(getContext(), "wrapBB", wrapper);
|
||||||
irb_.SetInsertPoint(wrapBB);
|
irb_.SetInsertPoint(wrapBB);
|
||||||
llvm::SmallVector<llvm::Value*, 6> wrappedArgs;
|
llvm::SmallVector<llvm::Value*, 6> wrappedArgs;
|
||||||
for (size_t i = 0; i < params.size(); i++) {
|
for (const auto i : c10::irange(params.size())) {
|
||||||
auto argp = irb_.CreateGEP(
|
auto argp = irb_.CreateGEP(
|
||||||
wrapper->arg_begin(), llvm::ConstantInt::getSigned(IntTy_, i));
|
wrapper->arg_begin(), llvm::ConstantInt::getSigned(IntTy_, i));
|
||||||
if (params[i]->isPointerTy()) {
|
if (params[i]->isPointerTy()) {
|
||||||
|
@ -8,6 +8,7 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include <c10/util/Logging.h>
|
#include <c10/util/Logging.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <c10/util/string_utils.h>
|
#include <c10/util/string_utils.h>
|
||||||
|
|
||||||
#include <ATen/core/functional.h>
|
#include <ATen/core/functional.h>
|
||||||
@ -22,6 +23,11 @@
|
|||||||
#include <torch/csrc/jit/tensorexpr/ir_verifier.h>
|
#include <torch/csrc/jit/tensorexpr/ir_verifier.h>
|
||||||
#include <torch/csrc/jit/tensorexpr/tensor.h>
|
#include <torch/csrc/jit/tensorexpr/tensor.h>
|
||||||
|
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <unordered_set>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
namespace torch {
|
namespace torch {
|
||||||
namespace jit {
|
namespace jit {
|
||||||
namespace tensorexpr {
|
namespace tensorexpr {
|
||||||
@ -501,7 +507,7 @@ class FunctionInliner : public IRMutator {
|
|||||||
const Expr* mutate_loads(const Buf* buf, std::vector<const Expr*> dims) {
|
const Expr* mutate_loads(const Buf* buf, std::vector<const Expr*> dims) {
|
||||||
std::vector<const Var*> index_vars;
|
std::vector<const Var*> index_vars;
|
||||||
TORCH_INTERNAL_ASSERT(buf->ndim() == producer_index_vars_.size());
|
TORCH_INTERNAL_ASSERT(buf->ndim() == producer_index_vars_.size());
|
||||||
for (size_t i = 0; i < buf->ndim(); i++) {
|
for (const auto i : c10::irange(buf->ndim())) {
|
||||||
const Var* func_callee_arg = producer_index_vars_.at(i);
|
const Var* func_callee_arg = producer_index_vars_.at(i);
|
||||||
const Expr* func_caller_param = dims.at(i);
|
const Expr* func_caller_param = dims.at(i);
|
||||||
if (func_callee_arg == nullptr) {
|
if (func_callee_arg == nullptr) {
|
||||||
@ -2348,7 +2354,7 @@ class LoopComputeAtRewriter : public IRMutator {
|
|||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
std::vector<const Expr*> new_indices(v->indices().size());
|
std::vector<const Expr*> new_indices(v->indices().size());
|
||||||
for (size_t i = 0; i < v->indices().size(); i++) {
|
for (const auto i : c10::irange(v->indices().size())) {
|
||||||
new_indices[i] =
|
new_indices[i] =
|
||||||
IRSimplifier::simplify(new Sub(v->indices()[i], offsets_[i]));
|
IRSimplifier::simplify(new Sub(v->indices()[i], offsets_[i]));
|
||||||
}
|
}
|
||||||
@ -2713,7 +2719,7 @@ void LoopNest::computeAt(Stmt* s, For* f) {
|
|||||||
|
|
||||||
// Generate index variables for 'temp'
|
// Generate index variables for 'temp'
|
||||||
std::vector<const Expr*> temp_indices(dims.size());
|
std::vector<const Expr*> temp_indices(dims.size());
|
||||||
for (size_t i = 0; i < dims.size(); i++) {
|
for (const auto i : c10::irange(dims.size())) {
|
||||||
// TODO: Use name-hint of the producer indices instead of 'idx'
|
// TODO: Use name-hint of the producer indices instead of 'idx'
|
||||||
temp_indices[i] = new Var(std::string("idx") + c10::to_string(i), kInt);
|
temp_indices[i] = new Var(std::string("idx") + c10::to_string(i), kInt);
|
||||||
}
|
}
|
||||||
@ -2729,7 +2735,7 @@ void LoopNest::computeAt(Stmt* s, For* f) {
|
|||||||
std::vector<std::pair<const Var*, const Expr*>> rewrite_indices_map;
|
std::vector<std::pair<const Var*, const Expr*>> rewrite_indices_map;
|
||||||
std::vector<const Expr*> offsets;
|
std::vector<const Expr*> offsets;
|
||||||
for (const TensorAccessBoundsInfo& p : bounds_it->second) {
|
for (const TensorAccessBoundsInfo& p : bounds_it->second) {
|
||||||
for (size_t i = 0; i < p.start.size(); i++) {
|
for (const auto i : c10::irange(p.start.size())) {
|
||||||
if (offsets.size() <= i) {
|
if (offsets.size() <= i) {
|
||||||
offsets.push_back(p.start[i]);
|
offsets.push_back(p.start[i]);
|
||||||
} else {
|
} else {
|
||||||
@ -2739,7 +2745,7 @@ void LoopNest::computeAt(Stmt* s, For* f) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < prod_indices.size(); i++) {
|
for (const auto i : c10::irange(prod_indices.size())) {
|
||||||
rewrite_indices_map.push_back(
|
rewrite_indices_map.push_back(
|
||||||
{prod_indices[i], new Add(temp_indices[i], offsets[i])});
|
{prod_indices[i], new Add(temp_indices[i], offsets[i])});
|
||||||
}
|
}
|
||||||
@ -2749,7 +2755,7 @@ void LoopNest::computeAt(Stmt* s, For* f) {
|
|||||||
temp_buf, temp_indices, Substitute(st->value(), rewrite_indices_map));
|
temp_buf, temp_indices, Substitute(st->value(), rewrite_indices_map));
|
||||||
|
|
||||||
// Construct the loop nest for the temp computation
|
// Construct the loop nest for the temp computation
|
||||||
for (size_t i = 0; i < dims.size(); i++) {
|
for (const auto i : c10::irange(dims.size())) {
|
||||||
// We're creating loops from innermost to outermost, so we need to access
|
// We're creating loops from innermost to outermost, so we need to access
|
||||||
// dimensions in reversed order.
|
// dimensions in reversed order.
|
||||||
size_t dim_idx = dims.size() - 1 - i;
|
size_t dim_idx = dims.size() - 1 - i;
|
||||||
|
@ -1,4 +1,7 @@
|
|||||||
#include <torch/csrc/jit/tensorexpr/mem_dependency_checker.h>
|
#include <torch/csrc/jit/tensorexpr/mem_dependency_checker.h>
|
||||||
|
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
|
||||||
namespace torch {
|
namespace torch {
|
||||||
@ -725,7 +728,7 @@ void MemDependencyChecker::visit(const For* v) {
|
|||||||
loopIndicesStride.resize(indices.size());
|
loopIndicesStride.resize(indices.size());
|
||||||
|
|
||||||
// index expr must depend on the loop var in some way to have a stride.
|
// index expr must depend on the loop var in some way to have a stride.
|
||||||
for (size_t i = 0; i < indices.size(); i++) {
|
for (const auto i : c10::irange(indices.size())) {
|
||||||
VarFinder vf;
|
VarFinder vf;
|
||||||
if (vf.find(indices[i]).count(var) == 0) {
|
if (vf.find(indices[i]).count(var) == 0) {
|
||||||
loopIndicesStride[i] = new IntImm(0);
|
loopIndicesStride[i] = new IntImm(0);
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <torch/csrc/jit/tensorexpr/tensor.h>
|
#include <torch/csrc/jit/tensorexpr/tensor.h>
|
||||||
|
|
||||||
#include <c10/util/Logging.h>
|
#include <c10/util/Logging.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/jit/tensorexpr/dim_arg.h>
|
#include <torch/csrc/jit/tensorexpr/dim_arg.h>
|
||||||
#include <torch/csrc/jit/tensorexpr/reduction.h>
|
#include <torch/csrc/jit/tensorexpr/reduction.h>
|
||||||
|
|
||||||
@ -27,7 +28,7 @@ Stmt* Tensor::constructStmt(
|
|||||||
const Expr* init_expr = buf()->initializer();
|
const Expr* init_expr = buf()->initializer();
|
||||||
|
|
||||||
if (reduce_ndim > 0) {
|
if (reduce_ndim > 0) {
|
||||||
for (size_t i = 0; i < reduce_ndim; i++) {
|
for (const auto i : c10::irange(reduce_ndim)) {
|
||||||
// Going in reverse order: from innermost loop to the outermost
|
// Going in reverse order: from innermost loop to the outermost
|
||||||
size_t dim_index = reduce_ndim - i - 1;
|
size_t dim_index = reduce_ndim - i - 1;
|
||||||
s = new For(
|
s = new For(
|
||||||
@ -39,7 +40,7 @@ Stmt* Tensor::constructStmt(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < ndim; i++) {
|
for (const auto i : c10::irange(ndim)) {
|
||||||
// Going in reverse order: from innermost loop to the outermost
|
// Going in reverse order: from innermost loop to the outermost
|
||||||
size_t dim_index = ndim - i - 1;
|
size_t dim_index = ndim - i - 1;
|
||||||
s = new For(args[dim_index], new IntImm(0), buf()->dim(dim_index), s);
|
s = new For(args[dim_index], new IntImm(0), buf()->dim(dim_index), s);
|
||||||
|
@ -12,16 +12,17 @@
|
|||||||
#include <c10/util/Exception.h>
|
#include <c10/util/Exception.h>
|
||||||
#include <c10/util/Optional.h>
|
#include <c10/util/Optional.h>
|
||||||
#include <c10/util/StringUtil.h>
|
#include <c10/util/StringUtil.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/WindowsTorchApiMacro.h>
|
#include <torch/csrc/WindowsTorchApiMacro.h>
|
||||||
#include <torch/csrc/jit/frontend/source_range.h>
|
#include <torch/csrc/jit/frontend/source_range.h>
|
||||||
|
#include <torch/csrc/jit/ir/ir.h>
|
||||||
|
#include <torch/csrc/jit/testing/file_check.h>
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include <torch/csrc/jit/ir/ir.h>
|
|
||||||
#include <torch/csrc/jit/testing/file_check.h>
|
|
||||||
|
|
||||||
namespace torch {
|
namespace torch {
|
||||||
namespace jit {
|
namespace jit {
|
||||||
|
|
||||||
|
@ -1,10 +1,5 @@
|
|||||||
|
#include <c10/util/irange.h>
|
||||||
#include <torch/csrc/python_headers.h>
|
#include <torch/csrc/python_headers.h>
|
||||||
#include <cstdarg>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <sstream>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <unordered_map>
|
|
||||||
#include <torch/csrc/THP.h>
|
#include <torch/csrc/THP.h>
|
||||||
#include <torch/csrc/utils/python_strings.h>
|
#include <torch/csrc/utils/python_strings.h>
|
||||||
#include <torch/csrc/utils/invalid_arguments.h>
|
#include <torch/csrc/utils/invalid_arguments.h>
|
||||||
@ -32,6 +27,13 @@
|
|||||||
#include <torch/csrc/generic/utils.cpp>
|
#include <torch/csrc/generic/utils.cpp>
|
||||||
#include <TH/THGenerateBoolType.h>
|
#include <TH/THGenerateBoolType.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cstdarg>
|
||||||
|
#include <sstream>
|
||||||
|
#include <string>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
int THPUtils_getCallable(PyObject *arg, PyObject **result) {
|
int THPUtils_getCallable(PyObject *arg, PyObject **result) {
|
||||||
if (!PyCallable_Check(arg))
|
if (!PyCallable_Check(arg))
|
||||||
return 0;
|
return 0;
|
||||||
@ -200,7 +202,7 @@ void THPUtils_invalidArguments(PyObject *given_args, PyObject *given_kwargs,
|
|||||||
std::vector<std::string> option_strings;
|
std::vector<std::string> option_strings;
|
||||||
va_list option_list;
|
va_list option_list;
|
||||||
va_start(option_list, num_options);
|
va_start(option_list, num_options);
|
||||||
for (size_t i = 0; i < num_options; i++)
|
for(const auto i : c10::irange(num_options))
|
||||||
option_strings.emplace_back(va_arg(option_list, const char*));
|
option_strings.emplace_back(va_arg(option_list, const char*));
|
||||||
va_end(option_list);
|
va_end(option_list);
|
||||||
|
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
#include <torch/csrc/utils/byte_order.h>
|
#include <torch/csrc/utils/byte_order.h>
|
||||||
#include <c10/util/BFloat16.h>
|
#include <c10/util/BFloat16.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
@ -121,7 +123,7 @@ THPByteOrder THP_nativeByteOrder()
|
|||||||
|
|
||||||
void THP_decodeInt16Buffer(int16_t* dst, const uint8_t* src, THPByteOrder order, size_t len)
|
void THP_decodeInt16Buffer(int16_t* dst, const uint8_t* src, THPByteOrder order, size_t len)
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < len; i++) {
|
for(const auto i : c10::irange(len)) {
|
||||||
dst[i] = (int16_t)(
|
dst[i] = (int16_t)(
|
||||||
order == THP_BIG_ENDIAN ? decodeUInt16BE(src) : decodeUInt16LE(src));
|
order == THP_BIG_ENDIAN ? decodeUInt16BE(src) : decodeUInt16LE(src));
|
||||||
src += sizeof(int16_t);
|
src += sizeof(int16_t);
|
||||||
@ -130,7 +132,7 @@ void THP_decodeInt16Buffer(int16_t* dst, const uint8_t* src, THPByteOrder order,
|
|||||||
|
|
||||||
void THP_decodeInt32Buffer(int32_t* dst, const uint8_t* src, THPByteOrder order, size_t len)
|
void THP_decodeInt32Buffer(int32_t* dst, const uint8_t* src, THPByteOrder order, size_t len)
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < len; i++) {
|
for(const auto i : c10::irange(len)) {
|
||||||
dst[i] = (int32_t)(
|
dst[i] = (int32_t)(
|
||||||
order == THP_BIG_ENDIAN ? decodeUInt32BE(src) : decodeUInt32LE(src));
|
order == THP_BIG_ENDIAN ? decodeUInt32BE(src) : decodeUInt32LE(src));
|
||||||
src += sizeof(int32_t);
|
src += sizeof(int32_t);
|
||||||
@ -139,7 +141,7 @@ void THP_decodeInt32Buffer(int32_t* dst, const uint8_t* src, THPByteOrder order,
|
|||||||
|
|
||||||
void THP_decodeInt64Buffer(int64_t* dst, const uint8_t* src, THPByteOrder order, size_t len)
|
void THP_decodeInt64Buffer(int64_t* dst, const uint8_t* src, THPByteOrder order, size_t len)
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < len; i++) {
|
+  for(const auto i : c10::irange(len)) {
     dst[i] = (int64_t)(
         order == THP_BIG_ENDIAN ? decodeUInt64BE(src) : decodeUInt64LE(src));
     src += sizeof(int64_t);
@@ -148,7 +150,7 @@ void THP_decodeInt64Buffer(int64_t* dst, const uint8_t* src, THPByteOrder order,

 void THP_decodeHalfBuffer(THHalf* dst, const uint8_t* src, THPByteOrder order, size_t len)
 {
-  for (size_t i = 0; i < len; i++) {
+  for(const auto i : c10::irange(len)) {
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
     union { uint16_t x; THHalf f; };
     x = (order == THP_BIG_ENDIAN ? decodeUInt16BE(src) : decodeUInt16LE(src));
@@ -159,7 +161,7 @@ void THP_decodeHalfBuffer(THHalf* dst, const uint8_t* src, THPByteOrder order, s

 void THP_decodeBFloat16Buffer(at::BFloat16* dst, const uint8_t* src, THPByteOrder order, size_t len)
 {
-  for (size_t i = 0; i < len; i++) {
+  for(const auto i : c10::irange(len)) {
     uint16_t x =
         (order == THP_BIG_ENDIAN ? decodeUInt16BE(src) : decodeUInt16LE(src));
     std::memcpy(&dst[i], &x, sizeof(dst[i]));
@@ -169,14 +171,14 @@ void THP_decodeBFloat16Buffer(at::BFloat16* dst, const uint8_t* src, THPByteOrde

 void THP_decodeBoolBuffer(bool* dst, const uint8_t* src, THPByteOrder order, size_t len)
 {
-  for (size_t i = 0; i < len; i++) {
+  for(const auto i : c10::irange(len)) {
     dst[i] = (int)src[i] != 0 ? true : false;
   }
 }

 void THP_decodeFloatBuffer(float* dst, const uint8_t* src, THPByteOrder order, size_t len)
 {
-  for (size_t i = 0; i < len; i++) {
+  for(const auto i : c10::irange(len)) {
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
     union { uint32_t x; float f; };
     x = (order == THP_BIG_ENDIAN ? decodeUInt32BE(src) : decodeUInt32LE(src));
@@ -187,7 +189,7 @@ void THP_decodeFloatBuffer(float* dst, const uint8_t* src, THPByteOrder order, s

 void THP_decodeDoubleBuffer(double* dst, const uint8_t* src, THPByteOrder order, size_t len)
 {
-  for (size_t i = 0; i < len; i++) {
+  for(const auto i : c10::irange(len)) {
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
     union { uint64_t x; double d; };
     x = (order == THP_BIG_ENDIAN ? decodeUInt64BE(src) : decodeUInt64LE(src));
@@ -198,7 +200,7 @@ void THP_decodeDoubleBuffer(double* dst, const uint8_t* src, THPByteOrder order,

 void THP_decodeComplexFloatBuffer(c10::complex<float>* dst, const uint8_t* src, THPByteOrder order, size_t len)
 {
-  for (size_t i = 0; i < len; i++) {
+  for(const auto i : c10::irange(len)) {
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
     union { uint32_t x; float re; };
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
@@ -215,7 +217,7 @@ void THP_decodeComplexFloatBuffer(c10::complex<float>* dst, const uint8_t* src,

 void THP_decodeComplexDoubleBuffer(c10::complex<double>* dst, const uint8_t* src, THPByteOrder order, size_t len)
 {
-  for (size_t i = 0; i < len; i++) {
+  for(const auto i : c10::irange(len)) {
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
     union { uint32_t x; double re; };
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
@@ -234,7 +236,7 @@ void THP_encodeInt16Buffer(uint8_t* dst, const int16_t* src, THPByteOrder order,
 {
   memcpy(dst, src, sizeof(int16_t) * len);
   if (order != THP_nativeByteOrder()) {
-    for (size_t i = 0; i < len; i++) {
+    for(const auto i : c10::irange(len)) {
       swapBytes16(dst);
       dst += sizeof(int16_t);
     }
@@ -245,7 +247,7 @@ void THP_encodeInt32Buffer(uint8_t* dst, const int32_t* src, THPByteOrder order,
 {
   memcpy(dst, src, sizeof(int32_t) * len);
   if (order != THP_nativeByteOrder()) {
-    for (size_t i = 0; i < len; i++) {
+    for(const auto i : c10::irange(len)) {
       swapBytes32(dst);
       dst += sizeof(int32_t);
     }
@@ -256,7 +258,7 @@ void THP_encodeInt64Buffer(uint8_t* dst, const int64_t* src, THPByteOrder order,
 {
   memcpy(dst, src, sizeof(int64_t) * len);
   if (order != THP_nativeByteOrder()) {
-    for (size_t i = 0; i < len; i++) {
+    for(const auto i : c10::irange(len)) {
       swapBytes64(dst);
       dst += sizeof(int64_t);
     }
@@ -267,7 +269,7 @@ void THP_encodeFloatBuffer(uint8_t* dst, const float* src, THPByteOrder order, s
 {
   memcpy(dst, src, sizeof(float) * len);
   if (order != THP_nativeByteOrder()) {
-    for (size_t i = 0; i < len; i++) {
+    for(const auto i : c10::irange(len)) {
       swapBytes32(dst);
       dst += sizeof(float);
     }
@@ -278,7 +280,7 @@ void THP_encodeDoubleBuffer(uint8_t* dst, const double* src, THPByteOrder order,
 {
   memcpy(dst, src, sizeof(double) * len);
   if (order != THP_nativeByteOrder()) {
-    for (size_t i = 0; i < len; i++) {
+    for(const auto i : c10::irange(len)) {
       swapBytes64(dst);
       dst += sizeof(double);
     }
@@ -289,7 +291,7 @@ template <typename T>
 std::vector<T> complex_to_float(const c10::complex<T>* src, size_t len) {
   std::vector<T> new_src;
   new_src.reserve(2 * len);
-  for (size_t i = 0; i < len; i++) {
+  for(const auto i : c10::irange(len)) {
     auto elem = src[i];
     new_src.emplace_back(elem.real());
     new_src.emplace_back(elem.imag());
@@ -251,7 +251,7 @@ std::string _formattedArgDesc(

   auto num_args = arguments.size() + kwargs.size();
   std::string result = "(";
-  for (size_t i = 0; i < num_args; i++) {
+  for(const auto i : c10::irange(num_args)) {
     bool is_kwarg = i >= arguments.size();
     PyObject *arg = is_kwarg ? kwargs.at(option.arguments[i].name) : arguments[i];

@@ -174,7 +174,7 @@ auto combine_self_args(PyObject *self, PyObject *args) -> py::tuple {
   size_t n = py_args.size();
   auto args_ = py::tuple(n + 1);
   args_[0] = py::handle(self);
-  for (size_t i = 0; i < n; i++) {
+  for(const auto i : c10::irange(n)) {
     args_[i+1] = py_args[i];
   }
   return args_;
@@ -384,8 +384,7 @@ bool is_scalar_list(PyObject* obj) {
   }
   // NOLINTNEXTLINE(bugprone-branch-clone)
   auto size = tuple ? PyTuple_GET_SIZE(obj) : PyList_GET_SIZE(obj);
-  // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-  for (size_t idx = 0; idx < size; idx++) {
+  for (const auto idx : c10::irange(size)) {
     PyObject* iobj = tuple ? PyTuple_GET_ITEM(obj, idx) : PyList_GET_ITEM(obj, idx);
     if (!THPUtils_checkScalar(iobj)) {
       return false;
@@ -34,7 +34,7 @@ static void recursive_apply(IntArrayRef sizes, ScalarType scalarType, int64_t di
   if (dim == ndim) {
     auto args = THPObjectPtr(PyTuple_New(N));
     if (!args) throw python_error();
-    for (size_t i = 0; i < N; i++) {
+    for(const auto i : c10::irange(N)) {
       PyObject* arg = load_scalar(strided_data[i].data, scalarType);
       if (!arg) throw python_error();
       PyTuple_SET_ITEM(args.get(), i, arg);
@@ -74,7 +74,7 @@ static std::vector<npy_intp> to_numpy_shape(IntArrayRef x) {
   // shape and stride conversion from int64_t to npy_intp
   auto nelem = x.size();
   auto result = std::vector<npy_intp>(nelem);
-  for (size_t i = 0; i < nelem; i++) {
+  for(const auto i : c10::irange(nelem)) {
     result[i] = static_cast<npy_intp>(x[i]);
   }
   return result;
@@ -1,3 +1,4 @@
+#include <c10/util/irange.h>
 #include <c10d/ProcessGroupGloo.hpp>

 #include <c10d/GlooDeviceFactory.hpp>
@@ -211,7 +212,7 @@ void band(void* c, const void* a, const void* b, size_t n) {
   auto tc = static_cast<T*>(c);
   auto ta = static_cast<const T*>(a);
   auto tb = static_cast<const T*>(b);
-  for (size_t i = 0; i < n; i++) {
+  for(const auto i : c10::irange(n)) {
     tc[i] = ta[i] & tb[i];
   }
 }
@@ -224,7 +225,7 @@ void bor(void* c, const void* a, const void* b, size_t n) {
   auto tc = static_cast<T*>(c);
   auto ta = static_cast<const T*>(a);
   auto tb = static_cast<const T*>(b);
-  for (size_t i = 0; i < n; i++) {
+  for(const auto i : c10::irange(n)) {
     tc[i] = ta[i] | tb[i];
   }
 }
@@ -237,7 +238,7 @@ void bxor(void* c, const void* a, const void* b, size_t n) {
   auto tc = static_cast<T*>(c);
   auto ta = static_cast<const T*>(a);
   auto tb = static_cast<const T*>(b);
-  for (size_t i = 0; i < n; i++) {
+  for(const auto i : c10::irange(n)) {
     tc[i] = ta[i] ^ tb[i];
   }
 }
@@ -334,7 +335,7 @@ void initializeStreamsEvents(
   at::cuda::OptionalCUDAGuard guard;
   streams.reserve(tensors.size());
   events.resize(tensors.size());
-  for (size_t i = 0; i < tensors.size(); i++) {
+  for(const auto i : c10::irange(tensors.size())) {
     guard.set_index(tensors[i].device().index());
     // Record event on current stream
     events[i].record(at::cuda::getCurrentCUDAStream());
@@ -390,7 +391,7 @@ void initializeStreamsEvents(
   at::cuda::OptionalCUDAGuard guard;
   streams.reserve(tensors.size());
   events.resize(tensors.size());
-  for (size_t i = 0; i < tensors.size(); i++) {
+  for(const auto i : c10::irange(tensors.size())) {
     guard.set_index(tensors[i][0].device().index());
     // Record event on current stream
     events[i].record(at::cuda::getCurrentCUDAStream());
@@ -714,7 +715,7 @@ ProcessGroupGloo::ProcessGroupGloo(
   // by a single I/O thread.
   //
   contexts_.reserve(options->devices.size());
-  for (size_t i = 0; i < options->devices.size(); i++) {
+  for(const auto i : c10::irange(options->devices.size())) {
     auto context = std::make_shared<::gloo::rendezvous::Context>(rank_, size_);
     auto store = ::gloo::rendezvous::PrefixStore(std::to_string(i), *store_);
     context->setTimeout(options->timeout);
@@ -729,7 +730,7 @@ ProcessGroupGloo::ProcessGroupGloo(
   workInProgress_.resize(options->threads);

   threads_.resize(options->threads);
-  for (size_t i = 0; i < threads_.size(); i++) {
+  for(const auto i : c10::irange(threads_.size())) {
     threads_[i] = std::thread(&ProcessGroupGloo::runLoop, this, i);
   }
 }
@@ -834,7 +835,7 @@ class AsyncBroadcastWork : public ProcessGroupGloo::AsyncWork {
     broadcast(inputs[rootTensor]);

     // Copy to non-root tensors
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       if (i == static_cast<size_t>(rootTensor)) {
         continue;
       }
@@ -878,7 +879,7 @@ class AsyncBroadcastCUDAWork : public AsyncBroadcastWork {
     broadcast(tmp);

     // Kick off copy back to the CUDA tensors.
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       guard.reset_stream(streams[i]);
       inputs[i].copy_(tmp, /* non_blocking */ true);
       events[i].record(streams[i]);
@@ -889,7 +890,7 @@ class AsyncBroadcastCUDAWork : public AsyncBroadcastWork {
     at::cuda::OptionalCUDAGuard guard;

     // Synchronize with the copy back to CUDA tensors.
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
      guard.set_index(inputs[i].device().index());
       events[i].block(at::cuda::getCurrentCUDAStream());
     }
@@ -1210,7 +1211,7 @@ class AsyncSparseAllreduceWork : public ProcessGroupGloo::AsyncWork {

     std::vector<size_t> counts(context->size);
     int64_t totalSize = 0;
-    for (size_t i = 0; i < metadata.size(); i++) {
+    for(const auto i : c10::irange(metadata.size())) {
       counts[i] = metadata[i].nnz() * sparseDim;
       totalSize += counts[i];
     }
@@ -1255,7 +1256,7 @@ class AsyncSparseAllreduceWork : public ProcessGroupGloo::AsyncWork {

     std::vector<size_t> counts(context->size);
     int64_t totalSize = 0;
-    for (size_t i = 0; i < metadata.size(); i++) {
+    for(const auto i : c10::irange(metadata.size())) {
       counts[i] = metadata[i].nnz() * denseNumel;
       totalSize += counts[i];
     }
@@ -1308,7 +1309,7 @@ class AsyncAllreduceCUDAWork : public AsyncAllreduceWork {
     // Kick off copy from CUDA tensors to pinned CPU tensors.
     tmp.reserve(inputs.size());
     at::cuda::OptionalCUDAStreamGuard guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       guard.reset_stream(streams[i]);
       tmp.push_back(pinnedLike(inputs[i]).copy_(inputs[i], true));
     }
@@ -1317,7 +1318,7 @@ class AsyncAllreduceCUDAWork : public AsyncAllreduceWork {
   void run() override {
     // Synchronize with copy operations.
     at::cuda::OptionalCUDAGuard device_guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       device_guard.set_index(inputs[i].device().index());
       AT_CUDA_CHECK(cudaStreamSynchronize(streams[i]));
     }
@@ -1326,7 +1327,7 @@ class AsyncAllreduceCUDAWork : public AsyncAllreduceWork {
     allreduce(tmp);

     at::cuda::OptionalCUDAStreamGuard stream_guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       stream_guard.reset_stream(streams[i]);
       inputs[i].copy_(tmp[i], /* non_blocking */ true);
       events[i].record(streams[i]);
@@ -1336,7 +1337,7 @@ class AsyncAllreduceCUDAWork : public AsyncAllreduceWork {
   void synchronize() override {
     // Synchronize with the copy back to CUDA tensors.
     at::cuda::OptionalCUDAGuard guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       guard.set_index(inputs[i].device().index());
       events[i].block(at::cuda::getCurrentCUDAStream());
     }
@@ -1361,7 +1362,7 @@ class AsyncSparseAllreduceCUDAWork : public AsyncSparseAllreduceWork {
     // memory must be performed asynchronously, or we block the caller.
     tmp.reserve(inputs.size());
     at::cuda::OptionalCUDAStreamGuard guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       guard.reset_stream(streams[i]);
       tmp.push_back(
           inputs[i].coalesce().to(at::DeviceType::CPU, /*non_blocking=*/true));
@@ -1371,7 +1372,7 @@ class AsyncSparseAllreduceCUDAWork : public AsyncSparseAllreduceWork {
   void run() override {
     // Synchronize with copy operations.
     at::cuda::OptionalCUDAGuard device_guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       device_guard.set_index(inputs[i].device().index());
       AT_CUDA_CHECK(cudaStreamSynchronize(streams[i]));
     }
@@ -1381,7 +1382,7 @@ class AsyncSparseAllreduceCUDAWork : public AsyncSparseAllreduceWork {

     // Kick off copy back to the CUDA tensors.
     at::cuda::OptionalCUDAStreamGuard stream_guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       stream_guard.reset_stream(streams[i]);
       inputs[i].copy_(output, /*non_blocking=*/true);
       events[i].record(streams[i]);
@@ -1391,7 +1392,7 @@ class AsyncSparseAllreduceCUDAWork : public AsyncSparseAllreduceWork {
   void synchronize() override {
     // Synchronize with the copy back to CUDA tensors.
     at::cuda::OptionalCUDAGuard guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       guard.set_index(inputs[i].device().index());
       events[i].block(at::cuda::getCurrentCUDAStream());
     }
@@ -1600,7 +1601,7 @@ class AsyncReduceCUDAWork : public AsyncReduceWork {
     // Kick off copy from CUDA tensors to pinned CPU tensors.
     tmp.reserve(inputs.size());
     at::cuda::OptionalCUDAStreamGuard guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       guard.reset_stream(streams[i]);
       tmp.push_back(pinnedLike(inputs[i]).copy_(inputs[i], true));
     }
@@ -1609,7 +1610,7 @@ class AsyncReduceCUDAWork : public AsyncReduceWork {
   void run() override {
     // Synchronize with copy operations.
     at::cuda::OptionalCUDAGuard device_guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       device_guard.set_index(inputs[i].device().index());
       AT_CUDA_CHECK(cudaStreamSynchronize(streams[i]));
     }
@@ -1619,7 +1620,7 @@ class AsyncReduceCUDAWork : public AsyncReduceWork {

     // Kick off copy back to the CUDA tensors.
     at::cuda::OptionalCUDAStreamGuard stream_guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       stream_guard.reset_stream(streams[i]);
       inputs[i].copy_(tmp[i], /* non_blocking */ true);
       events[i].record(streams[i]);
@@ -1629,7 +1630,7 @@ class AsyncReduceCUDAWork : public AsyncReduceWork {
   void synchronize() override {
     // Synchronize with the copy back to CUDA tensors.
     at::cuda::OptionalCUDAGuard guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       guard.set_index(inputs[i].device().index());
       events[i].block(at::cuda::getCurrentCUDAStream());
     }
@@ -1764,15 +1765,15 @@ class AsyncAllgatherCUDAWork : public AsyncAllgatherWork {
     // Kick off copy from CUDA tensors to pinned CPU tensors.
     tmpInputs.reserve(inputs.size());
     at::cuda::OptionalCUDAStreamGuard guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       guard.reset_stream(inputStreams[i]);
       tmpInputs.push_back(pinnedLike(inputs[i]).copy_(inputs[i], true));
     }

     tmpOutputs.resize(outputs.size());
-    for (size_t i = 0; i < outputs.size(); i++) {
+    for(const auto i : c10::irange(outputs.size())) {
       tmpOutputs[i].reserve(outputs[i].size());
-      for (size_t j = 0; j < outputs[i].size(); j++) {
+      for(const auto j : c10::irange(outputs[i].size())) {
         tmpOutputs[i].push_back(pinnedLike(outputs[i][j]));
       }
     }
@@ -1781,12 +1782,12 @@ class AsyncAllgatherCUDAWork : public AsyncAllgatherWork {
   void run() override {
     // Synchronize with copy operations.
     at::cuda::OptionalCUDAGuard device_guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       device_guard.set_index(inputs[i].device().index());
       AT_CUDA_CHECK(cudaStreamSynchronize(inputStreams[i]));
     }

-    for (size_t i = 0; i < outputs.size(); i++) {
+    for(const auto i : c10::irange(outputs.size())) {
       device_guard.set_index(outputs[i][0].device().index());
       AT_CUDA_CHECK(cudaStreamSynchronize(outputStreams[i]));
     }
@@ -1796,9 +1797,9 @@ class AsyncAllgatherCUDAWork : public AsyncAllgatherWork {

     // Kick off copy back to the CUDA tensors.
     at::cuda::OptionalCUDAStreamGuard stream_guard;
-    for (size_t i = 0; i < outputs.size(); i++) {
+    for(const auto i : c10::irange(outputs.size())) {
       stream_guard.reset_stream(outputStreams[i]);
-      for (size_t j = 0; j < outputs[i].size(); j++) {
+      for(const auto j : c10::irange(outputs[i].size())) {
         outputs[i][j].copy_(tmpOutputs[i][j], /* non_blocking */ true);
       }
       outputEvents[i].record(outputStreams[i]);
@@ -1808,7 +1809,7 @@ class AsyncAllgatherCUDAWork : public AsyncAllgatherWork {
   void synchronize() override {
     // Synchronize with the copy back to CUDA tensors.
     at::cuda::OptionalCUDAGuard guard;
-    for (size_t i = 0; i < outputs.size(); i++) {
+    for(const auto i : c10::irange(outputs.size())) {
       guard.set_index(outputs[i][0].device().index());
       outputEvents[i].block(at::cuda::getCurrentCUDAStream());
     }
@@ -1846,7 +1847,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupGloo::allgather(
         "requires input/output tensor lists to have the same length");
   }

-  for (size_t i = 0; i < outputs.size(); i++) {
+  for(const auto i : c10::irange(outputs.size())) {
     const auto expected = inputs.size() * getSize();
     const auto actual = outputs[i].size();
     if (actual != expected) {
@@ -2073,7 +2074,7 @@ class AsyncGatherWork : public ProcessGroupGloo::AsyncWork {

     // Unflatten into output tensors on root process.
     if (context->rank == root) {
-      for (size_t i = 0; i < outputs[0].size(); i++) {
+      for(const auto i : c10::irange(outputs[0].size())) {
         outputs[0][i].copy_(flatOutputTensor[i]);
       }
     }
@@ -2106,15 +2107,15 @@ class AsyncGatherCUDAWork : public AsyncGatherWork {
     // Kick off copy from CUDA tensors to pinned CPU tensors.
     tmpInputs.reserve(inputs.size());
     at::cuda::OptionalCUDAStreamGuard guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       guard.reset_stream(inputStreams[i]);
       tmpInputs.push_back(pinnedLike(inputs[i]).copy_(inputs[i], true));
     }

     tmpOutputs.resize(outputs.size());
-    for (size_t i = 0; i < outputs.size(); i++) {
+    for(const auto i : c10::irange(outputs.size())) {
       tmpOutputs[i].reserve(outputs[i].size());
-      for (size_t j = 0; j < outputs[i].size(); j++) {
+      for(const auto j : c10::irange(outputs[i].size())) {
         tmpOutputs[i].push_back(pinnedLike(outputs[i][j]));
       }
     }
@@ -2123,12 +2124,12 @@ class AsyncGatherCUDAWork : public AsyncGatherWork {
   void run() override {
     // Synchronize with copy operations.
     at::cuda::OptionalCUDAGuard device_guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       device_guard.set_index(inputs[i].get_device());
       AT_CUDA_CHECK(cudaStreamSynchronize(inputStreams[i]));
     }

-    for (size_t i = 0; i < outputs.size(); i++) {
+    for(const auto i : c10::irange(outputs.size())) {
       device_guard.set_index(outputs[i][0].get_device());
       AT_CUDA_CHECK(cudaStreamSynchronize(outputStreams[i]));
     }
@@ -2138,9 +2139,9 @@ class AsyncGatherCUDAWork : public AsyncGatherWork {

     // Kick off copy back to the CUDA tensors.
     at::cuda::OptionalCUDAStreamGuard stream_guard;
-    for (size_t i = 0; i < outputs.size(); i++) {
+    for(const auto i : c10::irange(outputs.size())) {
       stream_guard.reset_stream(outputStreams[i]);
-      for (size_t j = 0; j < outputs[i].size(); j++) {
+      for(const auto j : c10::irange(outputs[i].size())) {
         outputs[i][j].copy_(tmpOutputs[i][j], /* non_blocking */ true);
       }
       outputEvents[i].record(outputStreams[i]);
@@ -2150,7 +2151,7 @@ class AsyncGatherCUDAWork : public AsyncGatherWork {
   void synchronize() override {
     // Synchronize with the copy back to CUDA tensors.
     at::cuda::OptionalCUDAGuard guard;
-    for (size_t i = 0; i < outputs.size(); i++) {
+    for(const auto i : c10::irange(outputs.size())) {
       guard.set_index(static_cast<at::DeviceIndex>(outputs[i][0].get_device()));
       outputEvents[i].block(at::cuda::getCurrentCUDAStream());
     }
@@ -2301,10 +2302,10 @@ class AsyncScatterCUDAWork : public AsyncScatterWork {
     // Kick off copy from CUDA tensors to pinned CPU tensors.
     tmpInputs.resize(inputs.size());
     at::cuda::OptionalCUDAStreamGuard guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       guard.reset_stream(inputStreams[i]);
       tmpInputs[i].reserve(inputs[i].size());
-      for (size_t j = 0; j < inputs[i].size(); j++) {
+      for(const auto j : c10::irange(inputs[i].size())) {
        tmpInputs[i].push_back(
             pinnedLike(inputs[i][j]).copy_(inputs[i][j], true));
       }
@@ -2319,11 +2320,11 @@ class AsyncScatterCUDAWork : public AsyncScatterWork {
   void run() override {
     // Synchronize with copy operations.
     at::cuda::OptionalCUDAGuard device_guard;
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for(const auto i : c10::irange(inputs.size())) {
       device_guard.set_index(inputs[i][0].get_device());
       AT_CUDA_CHECK(cudaStreamSynchronize(inputStreams[i]));
     }
-    for (size_t i = 0; i < outputs.size(); i++) {
+    for(const auto i : c10::irange(outputs.size())) {
       device_guard.set_index(outputs[i].get_device());
       AT_CUDA_CHECK(cudaStreamSynchronize(outputStreams[i]));
     }
@@ -2333,7 +2334,7 @@ class AsyncScatterCUDAWork : public AsyncScatterWork {

     // Kick off copy back to the CUDA tensors.
     at::cuda::OptionalCUDAStreamGuard stream_guard;
-    for (size_t i = 0; i < outputs.size(); i++) {
+    for(const auto i : c10::irange(outputs.size())) {
       stream_guard.reset_stream(outputStreams[i]);
       outputs[i].copy_(tmpOutputs[i], /* non_blocking */ true);
       outputEvents[i].record(outputStreams[i]);
@@ -2343,7 +2344,7 @@ class AsyncScatterCUDAWork : public AsyncScatterWork {
   void synchronize() override {
     // Synchronize with the copy back to CUDA tensors.
     at::cuda::OptionalCUDAGuard guard;
-    for (size_t i = 0; i < outputs.size(); i++) {
+    for(const auto i : c10::irange(outputs.size())) {
       guard.set_index(static_cast<at::DeviceIndex>(outputs[i].get_device()));
       outputEvents[i].block(at::cuda::getCurrentCUDAStream());
     }
@@ -1,3 +1,4 @@
+#include <c10/util/irange.h>
 #include <c10/util/Optional.h>
 #include <c10d/ProcessGroupNCCL.hpp>

@@ -162,7 +163,7 @@ void syncStreams(
 std::string buildNcclUniqueIdStr(const ncclUniqueId& ncclID) {
   const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&ncclID);
   std::ostringstream oss;
-  for (size_t i = 0; i < NCCL_UNIQUE_ID_BYTES; i++) {
+  for(const auto i : c10::irange(NCCL_UNIQUE_ID_BYTES)) {
     oss << std::hex << static_cast<int>(bytes[i]);
   }
   return oss.str();
@@ -1696,7 +1697,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupNCCL::alltoall(
     std::vector<at::Tensor>& inputTensors,
     const AllToAllOptions& /* unused */) {
   auto device = outputTensors[0].device();
-  for (size_t r = 0; r < outputTensors.size(); r++) {
+  for(const auto r : c10::irange(outputTensors.size())) {
     check_gpu_single_tensor(outputTensors[r]);
     check_gpu_single_tensor(inputTensors[r]);
     TORCH_CHECK(
Some files were not shown because too many files have changed in this diff.
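For context, every hunk above applies the same mechanical rewrite: a manual size_t index loop becomes a range-based loop over c10::irange from <c10/util/irange.h>. The sketch below is illustrative only; it is not part of the commit, and it assumes a translation unit built against PyTorch's c10 headers. The vector and its contents are made up for the example.

#include <c10/util/irange.h>

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical data, used only to show the two loop forms side by side.
  std::vector<int64_t> values = {3, 1, 4, 1, 5};

  // Old form removed throughout the diff: manually managed size_t index.
  for (size_t i = 0; i < values.size(); i++) {
    std::cout << i << ": " << values[i] << "\n";
  }

  // New form introduced by the diff: c10::irange yields 0, 1, ..., n-1,
  // with the index type taken from the bound (size_t here) and the loop
  // variable held const for the duration of the body.
  for (const auto i : c10::irange(values.size())) {
    std::cout << i << ": " << values[i] << "\n";
  }
  return 0;
}

The two loops visit the same indices; the irange form sidesteps the signed/unsigned comparison warnings that motivated the NOLINT(clang-diagnostic-sign-compare) annotation dropped in is_scalar_list above.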