diff --git a/aten/src/ATen/core/op_registration/infer_schema.h b/aten/src/ATen/core/op_registration/infer_schema.h
index 2266442451fa..57409442950f 100644
--- a/aten/src/ATen/core/op_registration/infer_schema.h
+++ b/aten/src/ATen/core/op_registration/infer_schema.h
@@ -37,8 +37,8 @@ constexpr int checkStaticTypes() {
   // Give nice error messages for some of the common error cases.
   // Use a LOUD ERROR MESSAGE SO USERS SEE THE STATIC_ASSERT
   static_assert(std::conjunction<
-      bool_t<!std::is_integral<Types>::value || std::is_same<Types, int16_t>::value || std::is_same<Types, int64_t>::value || std::is_same<Types, bool>::value>...
-    >::value, "INVALID TYPE: Only int16_t, int64_t and bool are supported as an integral argument type");
+      bool_t<!std::is_integral<Types>::value || std::is_same<Types, int8_t>::value || std::is_same<Types, int64_t>::value || std::is_same<Types, bool>::value>...
+    >::value, "INVALID TYPE: Only int8_t, int64_t and bool are supported as an integral argument type");
   static_assert(std::conjunction<
       bool_t<!std::is_same<Types, float>::value>...
     >::value, "INVALID TYPE: float is not supported as an argument type, use double instead");
diff --git a/aten/src/ATen/cuda/jiterator.cu b/aten/src/ATen/cuda/jiterator.cu
index 8db51cd6b9af..0a4ac757b1ad 100644
--- a/aten/src/ATen/cuda/jiterator.cu
+++ b/aten/src/ATen/cuda/jiterator.cu
@@ -43,7 +43,8 @@ static inline void launch_jitted_vectorized_kernel_dynamic(
   ss << static_cast<int>(at::cuda::jit::BinaryFuncVariant::NoScalar);
   ss << extra_args_types;
   ss << vec_size;
-  ss << dev_idx;
+  // DeviceIndex, e.g. int8_t, is not treated as a number by the stream; cast to int as a workaround
+  ss << static_cast<int>(dev_idx);
   const std::string cache_key = ss.str();
 
   static std::mutex _jiterator_mutex;
diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h
index 7a4360f1ece5..9c22c35ee940 100644
--- a/aten/src/ATen/native/ForeachUtils.h
+++ b/aten/src/ATen/native/ForeachUtils.h
@@ -252,17 +252,10 @@ using IndicesT = std::vector<size_t>;
 using nested_optional_tensorvec_t =
     std::vector<std::vector<c10::optional<at::Tensor>>>;
 using TensorsAndIndicesT = std::pair<nested_optional_tensorvec_t, IndicesT>;
-
-// Warning: Do not use ParamsHash for keys with potentially uninitialized
-// padding bytes!
-struct _DeviceDtypeHasher {
-  std::size_t operator()(const DeviceDtypeKey& k) const noexcept {
-    return std::hash{}(k.first) ^
-        std::hash{}(k.second);
-  }
-};
-using FlatMap =
-    std::unordered_map;
+using FlatMap = std::unordered_map<
+    DeviceDtypeKey,
+    TensorsAndIndicesT,
+    ParamsHash<DeviceDtypeKey>>;
 
 inline FlatMap _group_tensors_by_first_tensors_device_and_dtype(
     const nested_optional_tensorvec_t& nested_tensorlist,
diff --git a/aten/src/ATen/native/utils/ParamsHash.h b/aten/src/ATen/native/utils/ParamsHash.h
index 7c1820ca87ee..6b7894cb8549 100644
--- a/aten/src/ATen/native/utils/ParamsHash.h
+++ b/aten/src/ATen/native/utils/ParamsHash.h
@@ -10,8 +10,6 @@ namespace at::native {
 // Fowler–Noll–Vo hash function
 // see
 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
-// WARNING: This hash function will produce unexpected results for `Params` with uninitialized padding values, as the
-// padding is also part of the hash. Use with caution.
 template <typename Params>
 struct ParamsHash {
   // Params must be a POD because we read out its memory
diff --git a/c10/core/Device.cpp b/c10/core/Device.cpp
index c9bc2a26f16b..7cc97d1a33ac 100644
--- a/c10/core/Device.cpp
+++ b/c10/core/Device.cpp
@@ -125,29 +125,19 @@ Device::Device(const std::string& device_string) : Device(Type::CPU) {
   TORCH_CHECK(!has_error, "Invalid device string: '", device_string, "'");
 
-  if (!device_index_str.empty()) {
-    // If the user passed an index in the device string, check if it is a valid
-    // int between 0 and c10::Device::MAX_NUM_DEVICES - 1 inclusively
-    int full_index = -1;
-    try {
-      full_index = std::stoi(device_index_str);
-    } catch (const std::exception&) {
-      TORCH_CHECK(
-          false,
-          "Could not parse device index '",
-          device_index_str,
-          "' in device string '",
-          device_string,
-          "'");
+  try {
+    if (!device_index_str.empty()) {
+      index_ = static_cast<c10::DeviceIndex>(std::stoi(device_index_str));
     }
+  } catch (const std::exception&) {
     TORCH_CHECK(
-        0 <= full_index && full_index < c10::Device::MAX_NUM_DEVICES,
-        "Device index must be between 0 and ",
-        c10::Device::MAX_NUM_DEVICES - 1,
-        " inclusively.");
-    index_ = static_cast<DeviceIndex>(full_index);
+        false,
+        "Could not parse device index '",
+        device_index_str,
+        "' in device string '",
+        device_string,
+        "'");
   }
-
   type_ = parse_type(device_name);
   validate();
 }
diff --git a/c10/core/Device.h b/c10/core/Device.h
index de1388c1261f..c58c03c9b9ad 100644
--- a/c10/core/Device.h
+++ b/c10/core/Device.h
@@ -16,7 +16,7 @@ namespace c10 {
 /// A DeviceIndex is not independently meaningful without knowing
 /// the DeviceType it is associated; try to use Device rather than
 /// DeviceIndex directly.
-using DeviceIndex = int16_t;
+using DeviceIndex = int8_t;
 
 /// Represents a compute device on which a tensor is located. A device is
 /// uniquely identified by a type, which specifies the type of machine it is
@@ -29,18 +29,6 @@ using DeviceIndex = int8_t;
 /// represents a specific, concrete device,
 /// 2. When the device type is CPU, the device index must be zero.
 struct C10_API Device final {
-  /// The maximum number of devices that we recognize (formerly known as
-  /// C10_COMPILE_TIME_MAX_GPUS). This value cannot be more than 32767 because
-  /// our DeviceIndex is a int16_t. Note that this does not include the default
-  /// device index -1, but instead defines the range from 0 to MAX_NUM_DEVICES-1
-  /// inclusively.
-#ifdef FBCODE_CAFFE2
-  // fbcode depends on this value being 16
-  static constexpr DeviceIndex MAX_NUM_DEVICES = 16;
-#else
-  static constexpr DeviceIndex MAX_NUM_DEVICES = 512;
-#endif
-
   using Type = DeviceType;
 
   /// Constructs a new `Device` from a `DeviceType` and an optional device
@@ -72,7 +60,6 @@ struct C10_API Device final {
   /// Sets the device index.
   void set_index(DeviceIndex index) {
     index_ = index;
-    validate();
   }
 
   /// Returns the type of device this is.
@@ -188,10 +175,8 @@ struct C10_API Device final {
     // This is safe to do, because backends that use the DeviceIndex
     // have a later check when we actually try to switch to that device.
     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-        index_ >= -1 && index_ < MAX_NUM_DEVICES,
-        "Device index must be between -1 and ",
-        MAX_NUM_DEVICES - 1,
-        " inclusively, got ",
+        index_ >= -1,
+        "Device index must be -1 or non-negative, got ",
         static_cast<int>(index_));
     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
         !is_cpu() || index_ <= 0,
@@ -211,7 +196,7 @@ struct hash<c10::Device> {
     // Are you here because this static assert failed? Make sure you ensure
     // that the bitmasking code below is updated accordingly!
     static_assert(sizeof(c10::DeviceType) == 1, "DeviceType is not 8-bit");
-    static_assert(sizeof(c10::DeviceIndex) == 2, "DeviceIndex is not 16-bit");
+    static_assert(sizeof(c10::DeviceIndex) == 1, "DeviceIndex is not 8-bit");
     // Note [Hazard when concatenating signed integers]
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     // We must first convert to a same-sized unsigned type, before promoting to
@@ -224,7 +209,7 @@ struct hash<c10::Device> {
     // sake.
     uint32_t bits = static_cast<uint32_t>(static_cast<uint8_t>(d.type()))
             << 16 |
-        static_cast<uint32_t>(static_cast<uint16_t>(d.index()));
+        static_cast<uint32_t>(static_cast<uint8_t>(d.index()));
     return std::hash<uint32_t>{}(bits);
   }
 };
diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h
index 692c7b82cf15..1653094ddb00 100644
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -3169,7 +3169,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
 #if UINTPTR_MAX == 0xFFFFFFFF
   // This is a 32-bit system
   static constexpr bool check_sizes() {
-    constexpr size_t tsize = 21 * sizeof(int64_t);
+    constexpr size_t tsize = 20 * sizeof(int64_t);
 
     // clang-format off
     are_equal();
@@ -3181,7 +3181,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
     are_equal();
    are_equal();
     are_equal();
-    are_equal();
+    are_equal();
     are_equal();
     is_le();
     // clang-format on
@@ -3206,7 +3206,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
     are_equal();
     are_equal();
     are_equal();
-    are_equal();
+    are_equal();
     are_equal();
     is_le();
     // clang-format on
diff --git a/c10/cuda/CUDAFunctions.cpp b/c10/cuda/CUDAFunctions.cpp
index 7f9a5aa51450..f5bb7bb4486e 100644
--- a/c10/cuda/CUDAFunctions.cpp
+++ b/c10/cuda/CUDAFunctions.cpp
@@ -99,7 +99,7 @@ DeviceIndex device_count() noexcept {
   try {
     auto result = device_count_impl(/*fail_if_no_driver=*/false);
     TORCH_INTERNAL_ASSERT(
-        result <= c10::Device::MAX_NUM_DEVICES,
+        result <= std::numeric_limits<DeviceIndex>::max(),
         "Too many CUDA devices, DeviceIndex overflowed");
     return result;
   } catch (const c10::Error& ex) {
@@ -118,7 +118,7 @@ DeviceIndex device_count_ensure_non_zero() {
   // Zero gpus doesn't produce a warning in `device_count` but we fail here
   TORCH_CHECK(count, "No CUDA GPUs are available");
   TORCH_INTERNAL_ASSERT(
-      count <= c10::Device::MAX_NUM_DEVICES,
+      count <= std::numeric_limits<DeviceIndex>::max(),
       "Too many CUDA devices, DeviceIndex overflowed");
   return static_cast<DeviceIndex>(count);
 }
@@ -219,7 +219,8 @@ cudaError_t GetDevice(DeviceIndex* device) {
   auto err = cudaGetDevice(&tmp_device);
   if (err == cudaSuccess) {
     TORCH_INTERNAL_ASSERT(
-        tmp_device >= 0 && tmp_device < c10::Device::MAX_NUM_DEVICES,
+        tmp_device >= 0 &&
+            tmp_device <= std::numeric_limits<DeviceIndex>::max(),
         "cudaGetDevice returns invalid device ",
         tmp_device);
     *device = static_cast<DeviceIndex>(tmp_device);
@@ -269,7 +270,8 @@ DeviceIndex MaybeExchangeDevice(DeviceIndex to_device) {
   int tmp_cur_device = -1;
   C10_CUDA_CHECK(cudaGetDevice(&tmp_cur_device));
   TORCH_INTERNAL_ASSERT(
-      tmp_cur_device >= 0 && tmp_cur_device < c10::Device::MAX_NUM_DEVICES,
+      tmp_cur_device >= 0 &&
+          tmp_cur_device <= std::numeric_limits<DeviceIndex>::max(),
       "cudaGetDevice returns invalid device ",
       tmp_cur_device);
   auto cur_device = static_cast<DeviceIndex>(tmp_cur_device);
@@ -295,7 +297,8 @@ cudaError_t GetDevice(DeviceIndex* device) {
   auto err = cudaGetDevice(&tmp_device);
   if (err == cudaSuccess) {
     TORCH_INTERNAL_ASSERT(
-        tmp_device >= 0 && tmp_device < c10::Device::MAX_NUM_DEVICES,
+        tmp_device >= 0 &&
+            tmp_device <= std::numeric_limits<DeviceIndex>::max(),
         "cudaGetDevice returns invalid device ",
         tmp_device);
     *device = static_cast<DeviceIndex>(tmp_device);
diff --git a/c10/cuda/CUDAMacros.h b/c10/cuda/CUDAMacros.h
index f6f195fb3cc4..bd36477d5036 100644
--- a/c10/cuda/CUDAMacros.h
+++ b/c10/cuda/CUDAMacros.h
@@ -37,3 +37,15 @@
 #else
 #define C10_CUDA_API C10_CUDA_IMPORT
 #endif
+
+/**
+ * The maximum number of GPUs that we recognize. Increasing this beyond the
+ * initial limit of 16 broke Caffe2 testing, hence the ifdef guards.
+ * This value cannot be more than 255 because our DeviceIndex is a uint8_t.
+ */
+#ifdef FBCODE_CAFFE2
+// fbcode depends on this value being 16
+#define C10_COMPILE_TIME_MAX_GPUS 16
+#else
+#define C10_COMPILE_TIME_MAX_GPUS 64
+#endif
diff --git a/c10/cuda/CUDAStream.cpp b/c10/cuda/CUDAStream.cpp
index f6b635cf00ef..0569e355d3f5 100644
--- a/c10/cuda/CUDAStream.cpp
+++ b/c10/cuda/CUDAStream.cpp
@@ -38,18 +38,18 @@ static int max_stream_priorities;
 // the destruction.
 #if !defined(USE_ROCM)
 // CUDA-only: used to initialize the stream pools (once)
-static c10::once_flag device_flags[c10::Device::MAX_NUM_DEVICES];
+static c10::once_flag device_flags[C10_COMPILE_TIME_MAX_GPUS];
 #endif
 static std::atomic<uint32_t>
     priority_counters[c10::cuda::max_compile_time_stream_priorities]
-                     [c10::Device::MAX_NUM_DEVICES];
+                     [C10_COMPILE_TIME_MAX_GPUS];
 
 static cudaStream_t streams[c10::cuda::max_compile_time_stream_priorities]
-                           [c10::Device::MAX_NUM_DEVICES][kStreamsPerPool];
+                           [C10_COMPILE_TIME_MAX_GPUS][kStreamsPerPool];
 #ifdef USE_ROCM
 static c10::once_flag
     stream_flags[c10::cuda::max_compile_time_stream_priorities]
-                [c10::Device::MAX_NUM_DEVICES][kStreamsPerPool];
+                [C10_COMPILE_TIME_MAX_GPUS][kStreamsPerPool];
 #endif
 
 // Note [HIP Lazy Streams]
@@ -168,10 +168,10 @@ static void initGlobalStreamState() {
   // Check if the number of GPUs matches the expected compile-time max number
   // of GPUs.
   TORCH_CHECK(
-      num_gpus <= c10::Device::MAX_NUM_DEVICES,
+      num_gpus <= C10_COMPILE_TIME_MAX_GPUS,
       "Number of CUDA devices on the machine is larger than the compiled "
       "max number of gpus expected (",
-      c10::Device::MAX_NUM_DEVICES,
+      C10_COMPILE_TIME_MAX_GPUS,
Increase that and recompile."); int leastPriority = -1, greatestPriority = -1; C10_CUDA_CHECK( diff --git a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc index 19342ff8fb24..4ffe5aedcfe4 100644 --- a/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc +++ b/caffe2/contrib/nccl/cuda_nccl_op_gpu.cc @@ -224,8 +224,8 @@ std::pair, std::vector> ncclOpDevInfer( REGISTER_CUDA_OPERATOR(NCCLAllreduce, NCCLAllreduceOp); OPERATOR_SCHEMA(NCCLAllreduce) - .NumInputs(1, c10::Device::MAX_NUM_DEVICES) - .NumOutputs(1, c10::Device::MAX_NUM_DEVICES) + .NumInputs(1, C10_COMPILE_TIME_MAX_GPUS) + .NumOutputs(1, C10_COMPILE_TIME_MAX_GPUS) .CostInferenceFunction(NCCLAllreduceOp::CostInference) .TensorInferenceFunction(NCCLAllreduceOp::ShapeInference) .IdenticalTypeAndShape() @@ -236,8 +236,8 @@ SHOULD_NOT_DO_GRADIENT(NCCLAllreduce); REGISTER_CUDA_OPERATOR(NCCLBroadcast, NCCLBroadcastOp); OPERATOR_SCHEMA(NCCLBroadcast) - .NumInputs(1, c10::Device::MAX_NUM_DEVICES) - .NumOutputs(1, c10::Device::MAX_NUM_DEVICES) + .NumInputs(1, C10_COMPILE_TIME_MAX_GPUS) + .NumOutputs(1, C10_COMPILE_TIME_MAX_GPUS) .IdenticalTypeAndShape() .InputsCanCrossDevices() .EnforceOneToOneInplace() @@ -247,7 +247,7 @@ SHOULD_NOT_DO_GRADIENT(NCCLBroadcast); REGISTER_CUDA_OPERATOR(NCCLReduce, NCCLReduceOp); OPERATOR_SCHEMA(NCCLReduce) - .NumInputs(1, c10::Device::MAX_NUM_DEVICES) + .NumInputs(1, C10_COMPILE_TIME_MAX_GPUS) .NumOutputs(1) .IdenticalTypeAndShapeOfInput(0) .InputsCanCrossDevices() @@ -257,16 +257,16 @@ SHOULD_NOT_DO_GRADIENT(NCCLReduce); REGISTER_CUDA_OPERATOR(NCCLAllGather, NCCLAllGatherOp); OPERATOR_SCHEMA(NCCLAllGather) - .NumInputs(1, c10::Device::MAX_NUM_DEVICES) - .NumOutputs(1, c10::Device::MAX_NUM_DEVICES) + .NumInputs(1, C10_COMPILE_TIME_MAX_GPUS) + .NumOutputs(1, C10_COMPILE_TIME_MAX_GPUS) .InputsCanCrossDevices() .DeviceInferenceFunction(ncclOpDevInfer); SHOULD_NOT_DO_GRADIENT(NCCLAllGather); REGISTER_CUDA_OPERATOR(NCCLReduceScatter, NCCLReduceScatterOp); OPERATOR_SCHEMA(NCCLReduceScatter) - .NumInputs(1, c10::Device::MAX_NUM_DEVICES) - .NumOutputs(1, c10::Device::MAX_NUM_DEVICES) + .NumInputs(1, C10_COMPILE_TIME_MAX_GPUS) + .NumOutputs(1, C10_COMPILE_TIME_MAX_GPUS) .InputsCanCrossDevices() .DeviceInferenceFunction(ncclOpDevInfer); SHOULD_NOT_DO_GRADIENT(NCCLReduceScatter); diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index 4558b5644916..6555b9732c9a 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -178,8 +178,8 @@ static std::unordered_map g_cuda_device_affiliation; // Data structures for optional memory tracking. Access to these structures // is guarded by the CUDAContext::mutex. static std::unordered_map g_size_map; -static std::vector g_total_by_gpu_map(c10::Device::MAX_NUM_DEVICES, 0); -static std::vector g_max_by_gpu_map(c10::Device::MAX_NUM_DEVICES, 0); +static std::vector g_total_by_gpu_map(C10_COMPILE_TIME_MAX_GPUS, 0); +static std::vector g_max_by_gpu_map(C10_COMPILE_TIME_MAX_GPUS, 0); static long g_total_mem = 0; static long g_last_rep = 0; @@ -208,10 +208,10 @@ static void Caffe2InitializeCuda() { // of GPUs. CAFFE_ENFORCE_LE( NumCudaDevices(), - c10::Device::MAX_NUM_DEVICES, + C10_COMPILE_TIME_MAX_GPUS, "Number of CUDA devices on the machine is larger than the compiled " "max number of gpus expected (", - c10::Device::MAX_NUM_DEVICES, + C10_COMPILE_TIME_MAX_GPUS, "). 
Increase that and recompile."); for (DeviceIndex i = 0; i < NumCudaDevices(); ++i) { diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 4da3d85526ed..8490a5002e5f 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -58,7 +58,7 @@ class CAFFE2_CUDA_API ThreadLocalCUDAObjects { private: ThreadLocalCUDAObjects() { - for (DeviceIndex i = 0; i < c10::Device::MAX_NUM_DEVICES; ++i) { + for (DeviceIndex i = 0; i < C10_COMPILE_TIME_MAX_GPUS; ++i) { cuda_streams_[i] = vector(); } } @@ -164,7 +164,7 @@ class CAFFE2_CUDA_API ThreadLocalCUDAObjects { // WARNING: mapping from logical stream ID to c10::cuda::CUDAStream // is NOT bijective; multiple logical stream IDs may map to the // same underlying stream ID. - vector cuda_streams_[c10::Device::MAX_NUM_DEVICES]; + vector cuda_streams_[C10_COMPILE_TIME_MAX_GPUS]; std::unordered_map cublas_handles_; #ifdef CAFFE2_USE_CUDNN std::unordered_map cudnn_handles_; diff --git a/caffe2/core/cudnn_wrappers.h b/caffe2/core/cudnn_wrappers.h index f31314198565..ce3d297bab65 100644 --- a/caffe2/core/cudnn_wrappers.h +++ b/caffe2/core/cudnn_wrappers.h @@ -188,7 +188,7 @@ class CuDNNWrapper { using PerGPUCuDNNStates = std::array< std::array, - c10::Device::MAX_NUM_DEVICES>; + C10_COMPILE_TIME_MAX_GPUS>; static PerGPUCuDNNStates& cudnn_states(); C10_DISABLE_COPY_AND_ASSIGN(CuDNNWrapper); diff --git a/caffe2/core/hip/miopen_wrapper.h b/caffe2/core/hip/miopen_wrapper.h index 653a1e49eb99..f60bed6c277d 100644 --- a/caffe2/core/hip/miopen_wrapper.h +++ b/caffe2/core/hip/miopen_wrapper.h @@ -155,7 +155,7 @@ class MIOPENWrapper using PerGPUMIOPENStates = std::array< std::array, - c10::Device::MAX_NUM_DEVICES>; + C10_COMPILE_TIME_MAX_GPUS>; static PerGPUMIOPENStates& miopen_states(); C10_DISABLE_COPY_AND_ASSIGN(MIOPENWrapper); diff --git a/test/test_jit.py b/test/test_jit.py index 863ebd9e68fb..4d4fae3bdaf8 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -3440,21 +3440,6 @@ def foo(x): else: cu.define(full) - def test_int16_device_index(self): - # This used to fail after the switch from int8 to int16 DeviceIndex as the ArgumentInfo struct hardcoded - # the bit width. Thus, the default device (-1) wrapped around to 255. 
-        # See https://github.com/pytorch/pytorch/issues/115331
-        tensor = torch.tensor([1.])
-        code_template = """
-        def fn(x):
-            return x.device
-        """
-
-        cu = torch.jit.CompilationUnit()
-        cu.define(code_template)
-        res = cu.fn(tensor)
-        self.assertEqual(tensor.device, res)
-
-
     def test_namedtuple_python(self):
         global MyTuple, MyMod  # see [local resolution in python]
         MyTuple = namedtuple('MyTuple', ['a'])
diff --git a/test/test_utils.py b/test/test_utils.py
index a53e4c7370e3..a642353100a5 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1017,22 +1017,6 @@ class TestDeviceUtils(TestCase):
             tree_all_only(torch.Tensor, lambda x: x.device.type == 'meta', r)
         )
 
-    def test_int16_device_index(self):
-        # Test if index does not wrap around when larger than int8
-        large_index = 500
-        x = torch.device('meta', large_index)
-        self.assertEqual(x.index, large_index)
-
-    def test_raise_on_device_index_out_of_bounds(self):
-        # Tests if an error is raised when the device index is out of bounds
-        index_larger_than_max = 100000
-        error_msg_regex = "^Device index must be.*"
-        # Explicit index
-        with self.assertRaisesRegex(RuntimeError, error_msg_regex):
-            x = torch.device('meta', index=index_larger_than_max)
-        # Index in device string
-        with self.assertRaisesRegex(RuntimeError, error_msg_regex):
-            x = torch.device(f'meta:{index_larger_than_max}')
 
 
 instantiate_device_type_tests(TestDeviceUtils, globals())
diff --git a/torch/csrc/Device.cpp b/torch/csrc/Device.cpp
index 50be4e95e5d1..aaf04fb4b33d 100644
--- a/torch/csrc/Device.cpp
+++ b/torch/csrc/Device.cpp
@@ -31,7 +31,10 @@ PyObject* THPDevice_repr(THPDevice* self) {
   std::ostringstream oss;
   oss << "device(type=\'" << self->device.type() << "\'";
   if (self->device.has_index()) {
-    oss << ", index=" << self->device.index();
+    // `self->device.index()` returns uint8_t which is treated as ascii while
+    // printing, hence casting it to uint16_t.
+    // https://stackoverflow.com/questions/19562103/uint8-t-cant-be-printed-with-cout
+    oss << ", index=" << static_cast<uint16_t>(self->device.index());
   }
   oss << ")";
   return THPUtils_packString(oss.str().c_str());
@@ -74,11 +77,7 @@ PyObject* THPDevice_pynew(
     device_index = r.toInt64(1);
     // -1 is allowed in ATen/C++, to mean the default device, but not in
     // Python.
-    TORCH_CHECK(
-        device_index >= 0 && device_index < c10::Device::MAX_NUM_DEVICES,
-        "Device index must be between 0 and ",
-        c10::Device::MAX_NUM_DEVICES - 1,
-        " inclusively.");
+    TORCH_CHECK(device_index >= 0, "Device index must not be negative");
   }
   at::Device device(
       as_device.type(), static_cast<c10::DeviceIndex>(device_index));
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index b36e7d501b08..1f7a7e47ac3d 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -2028,10 +2028,23 @@ Call this whenever a new thread is created in order to propagate values from
   // torch/csrc/pybind.h` would solve this but it caused segmentation fault in
   // my environment.
   using _DeviceDtypeKey = std::pair;
+  // Custom hasher is necessary to make unordered_map compilable for Windows
+  // debug targets. As `at::native::ParamsHash` only works on structs with
+  // standard layout, but std::string isn't one in Visual C++ debug builds,
+  // which one can easily verify by running something like:
+  //   #define _DEBUG
+  //   #include <type_traits>
+  //   #include <string>
+  //   static_assert(std::is_standard_layout_v<std::string>, "Oh noes");
+  // If above condition is not met, VC++ raises a very cryptic compilation
+  // error. See
+  // https://github.com/pytorch/pytorch/pull/100007#discussion_r1227116292 for
+  // more detail
   struct _DeviceDtypeHasher {
     std::size_t operator()(const _DeviceDtypeKey& k) const noexcept {
-      return std::hash{}(k.first) ^
-          std::hash{}(k.second);
+      static at::native::ParamsHash<at::Device> device_hasher;
+      static std::hash<std::string> string_hasher;
+      return device_hasher(k.first) ^ string_hasher(k.second);
     }
   };
   using _FlatMap = std::unordered_map<
diff --git a/torch/csrc/cuda/device_set.h b/torch/csrc/cuda/device_set.h
index 936d9d481c8d..c533dae3baad 100644
--- a/torch/csrc/cuda/device_set.h
+++ b/torch/csrc/cuda/device_set.h
@@ -1,12 +1,11 @@
 #pragma once
 
-#include
 #include
 #include
 #include
 
 namespace torch {
 
-using device_set = std::bitset<c10::Device::MAX_NUM_DEVICES>;
+using device_set = std::bitset<C10_COMPILE_TIME_MAX_GPUS>;
 
 } // namespace torch
diff --git a/torch/csrc/jit/runtime/argument_spec.h b/torch/csrc/jit/runtime/argument_spec.h
index dd860bad551d..06c77edca718 100644
--- a/torch/csrc/jit/runtime/argument_spec.h
+++ b/torch/csrc/jit/runtime/argument_spec.h
@@ -2,7 +2,6 @@
 
 #include
 #include
-#include
 #include
 #include
 #include
@@ -57,10 +56,12 @@ struct ArgumentInfo {
 
  private:
   unsigned defined_ : 1;
   unsigned requires_grad_ : 1;
+  unsigned : 5;
   unsigned dim_ : 8;
-  signed device_ : sizeof(c10::DeviceIndex) * 8;
+  unsigned device_ : 8;
   unsigned type_ : 8;
   unsigned dev_type_ : 16;
+  unsigned : 16;
 };
 
@@ -68,7 +69,7 @@ static_assert(
     "ArgumentInfo is to be a POD struct");
 static_assert(
     sizeof(ArgumentInfo) == sizeof(ArgumentInfo::plain_data_type),
-    "ArgumentInfo is expected to be a 64-bit struct");
+    "ArgumentInfo is expected to be a 32-bit struct");
 
 struct ArgumentSpec {
   ArgumentSpec(size_t num_flat_tensor_inputs, size_t num_flat_optional_inputs)
@@ -222,8 +223,8 @@ struct CompleteArgumentInfoPOD {
   unsigned type : 8; // scalar type
   unsigned defined : 1;
   unsigned requires_grad : 1;
-  signed dev_type : sizeof(c10::DeviceType) * 8;
-  signed device : sizeof(c10::DeviceIndex) * 8;
+  signed device : 14;
+  unsigned dev_type : 16;
   unsigned
       total_dims : 16; // all TensorInfoPODs are in CompleteArgumentSpec's
                        // tensor_info() array. total_dims is the total number of
diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h
index 9cfb663b1d4f..cec99a843301 100644
--- a/torch/csrc/utils/python_arg_parser.h
+++ b/torch/csrc/utils/python_arg_parser.h
@@ -807,11 +807,7 @@ inline at::Device toDevice(PyObject* obj) {
   }
   if (THPUtils_checkLong(obj)) {
     const auto device_index = THPUtils_unpackLong(obj);
-    TORCH_CHECK(
-        device_index >= 0 && device_index < c10::Device::MAX_NUM_DEVICES,
-        "Device index must be between 0 and ",
-        c10::Device::MAX_NUM_DEVICES - 1,
-        " inclusively.");
+    TORCH_CHECK(device_index >= 0, "Device index must not be negative");
     if (c10::is_privateuse1_backend_registered()) {
       return at::Device(
           c10::DeviceType::PrivateUse1,
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
index 47a58129455c..25495f0bf888 100644
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -5258,12 +5258,12 @@ class TensorPipeAgentCudaRpcTest(RpcAgentTestFixture, RpcTestCommon):
 
         options = self.rpc_backend_options
         dst = worker_name((self.rank + 1) % self.world_size)
         with self.assertRaisesRegex(
-            RuntimeError, "Device index must .*"
+            RuntimeError, "Device index must not be negative"
         ):
             options.set_device_map(dst, {-1: 0})
 
         with self.assertRaisesRegex(
-            RuntimeError, "Device index must .*"
+            RuntimeError, "Device index must not be negative"
        ):
             options.set_device_map(dst, {0: -1})
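
A minimal standalone sketch (not part of the patch) of the stream behaviour that motivates the static_cast<int> in jiterator.cu and the static_cast<uint16_t> in torch/csrc/Device.cpp above: with DeviceIndex narrowed back to int8_t (an alias for signed char), iostreams format the value as a character rather than a number unless it is widened first.

#include <cstdint>
#include <iostream>
#include <sstream>

int main() {
  std::int8_t dev_idx = 65; // stand-in for a c10::DeviceIndex value
  std::ostringstream ss;
  ss << dev_idx;                   // streamed as the character 'A'
  ss << ' ';
  ss << static_cast<int>(dev_idx); // streamed as the number 65
  std::cout << ss.str() << '\n';   // prints "A 65"
  return 0;
}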