From a6630e25afb49642cf517ee948f5a0c06a9cba6d Mon Sep 17 00:00:00 2001
From: Christian Puhrsch
Date: Sat, 22 Sep 2018 18:07:38 -0700
Subject: [PATCH] Remove many caffe2::TIndex and replace them with int64_t
 (#11943)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11943

See title

Reviewed By: ezyang

Differential Revision: D9992645

fbshipit-source-id: e8f80d6ea762971513e5e8072975ceea53e1f11a
---
 binaries/core_overhead_benchmark_gpu.cc | 2 +-
 caffe2/contrib/aten/aten_op_template.h | 2 +-
 caffe2/contrib/gloo/allgather_ops.h | 2 +-
 caffe2/contrib/nccl/cuda_nccl_gpu.cc | 4 +-
 caffe2/contrib/tensorrt/tensorrt_op_trt.cc | 14 +-
 caffe2/contrib/tensorrt/tensorrt_op_trt.h | 4 +-
 caffe2/core/blob_serialization.cc | 6 +-
 caffe2/core/blob_test.cc | 10 +-
 caffe2/core/logging.h | 2 +-
 caffe2/core/operator.cc | 4 +-
 caffe2/core/operator.h | 6 +-
 caffe2/core/operator_schema.cc | 14 +-
 caffe2/core/operator_schema.h | 12 +-
 caffe2/core/qtensor.h | 8 +-
 caffe2/core/tensor.cc | 2 +-
 caffe2/core/tensor.h | 30 +--
 caffe2/core/tensor_impl.h | 70 +++---
 caffe2/cuda_rtc/pool_op_rtc_gpu.cc | 4 +-
 .../operators/fully_connected_op_prune.h | 12 +-
 .../operators/fully_connected_op_sparse.h | 8 +-
 caffe2/experiments/operators/funhash_op.h | 60 +++---
 .../experiments/operators/sparse_funhash_op.h | 70 +++---
 .../operators/sparse_matrix_reshape_op.h | 20 +-
 .../experiments/operators/tt_contraction_op.h | 54 ++---
 caffe2/experiments/operators/tt_pad_op.h | 12 +-
 caffe2/ideep/operators/concat_split_op.cc | 2 +-
 caffe2/ideep/operators/conv_pool_base_op.h | 2 +-
 caffe2/ideep/operators/squeeze_op.cc | 2 +-
 caffe2/image/image_input_op.h | 14 +-
 caffe2/mkl/mkl_utils_test.cc | 10 +-
 caffe2/mkl/mklmemory_serialization.cc | 4 +-
 caffe2/mkl/operators/concat_op.cc | 2 +-
 caffe2/mkl/operators/conv_op.cc | 16 +-
 caffe2/mkl/operators/conv_op_mkldnn.cc | 4 +-
 caffe2/mkl/operators/elementwise_sum_op.cc | 2 +-
 caffe2/mkl/operators/fully_connected_op.cc | 4 +-
 .../local_response_normalization_op.cc | 2 +-
 caffe2/mkl/operators/packed_fc_op.cc | 2 +-
 caffe2/mkl/operators/pool_op.cc | 4 +-
 caffe2/mkl/operators/relu_op.cc | 2 +-
 caffe2/mkl/operators/spatial_batch_norm_op.cc | 2 +-
 caffe2/mkl/operators/squeeze_op.cc | 2 +-
 caffe2/mkl/utils/mkl_memory.cc | 2 +-
 caffe2/mkl/utils/mkl_memory.h | 18 +-
 caffe2/mkl/utils/mkl_operator.h | 2 +-
 .../mobile/contrib/arm-compute/core/context.h | 18 +-
 .../operators/fully_connected_op.cc | 2 +-
 .../contrib/arm-compute/operators/pool_op.cc | 4 +-
 .../arm-compute/operators/resize_op.cc | 2 +-
 caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm | 8 +-
 .../mobile/contrib/ios/mpscnn/mpscnn_test.mm | 2 +-
 caffe2/mobile/contrib/ios/pool_test.cc | 4 +-
 caffe2/mobile/contrib/ios/resize_test.cc | 4 +-
 .../contrib/opengl/test/TestGLConvolution.cc | 18 +-
 .../mobile/contrib/snpe/snpe_op_benchmark.cc | 8 +-
 caffe2/mobile/contrib/ulp2/ulp_neon.cc | 2 +-
 caffe2/mobile/contrib/ulp2/ulp_test.cc | 8 +-
 caffe2/mpi/mpi_ops.h | 2 +-
 caffe2/operators/accuracy_op.cc | 2 +-
 caffe2/operators/accuracy_op.cu | 2 +-
 caffe2/operators/arg_ops.cc | 10 +-
 caffe2/operators/arg_ops.cu | 8 +-
 caffe2/operators/arg_ops.h | 6 +-
 caffe2/operators/assert_op.h | 2 +-
 caffe2/operators/atomic_ops.cc | 4 +-
 caffe2/operators/batch_box_cox_op.cc | 26 +--
 caffe2/operators/batch_box_cox_op.h | 8 +-
 caffe2/operators/batch_bucketize_op.cc | 12 +-
 caffe2/operators/batch_gather_ops.cu | 2 +-
 caffe2/operators/batch_gather_ops.h | 2 +-
 caffe2/operators/batch_matmul_op.cc | 10 +-
 caffe2/operators/batch_matmul_op.h | 2 +-
 caffe2/operators/batch_matmul_op_gpu_test.cc | 16 +-
 caffe2/operators/batch_matmul_op_test.cc | 16 +-
 caffe2/operators/batch_sparse_to_dense_op.cc | 36 ++--
 caffe2/operators/batch_sparse_to_dense_op.h | 6 +-
 caffe2/operators/bbox_transform_op.cc | 2 +-
 caffe2/operators/boolean_mask_ops.cc | 8 +-
 caffe2/operators/boolean_mask_ops.cu | 34 ++--
 caffe2/operators/boolean_unmask_ops_test.cc | 4 +-
 caffe2/operators/cast_op.cc | 2 +-
 caffe2/operators/cast_op.h | 2 +-
 caffe2/operators/concat_split_op.h | 8 +-
 caffe2/operators/conditional_op.cc | 2 +-
 caffe2/operators/conv_op_cache_cudnn.h | 10 +-
 caffe2/operators/conv_op_cache_cudnn_test.cc | 14 +-
 caffe2/operators/conv_op_cudnn.cc | 4 +-
 caffe2/operators/conv_op_eigen.cc | 16 +-
 caffe2/operators/conv_op_impl.h | 6 +-
 caffe2/operators/conv_pool_op_base.h | 6 +-
 caffe2/operators/conv_transpose_op_cudnn.cc | 4 +-
 caffe2/operators/conv_transpose_op_impl.h | 12 +-
 .../conv_transpose_op_mobile_test.cc | 10 +-
 caffe2/operators/cross_entropy_op.cc | 18 +-
 caffe2/operators/cross_entropy_op.cu | 10 +-
 .../operators/ctc_beam_search_decoder_op.cc | 4 +-
 caffe2/operators/ctc_greedy_decoder_op.cc | 4 +-
 caffe2/operators/dataset_ops.cc | 16 +-
 caffe2/operators/dataset_ops.h | 8 +-
 caffe2/operators/deform_conv_op.cu | 16 +-
 caffe2/operators/deform_conv_op.h | 12 +-
 caffe2/operators/deform_conv_op_impl.h | 8 +-
 caffe2/operators/distance_op.cc | 2 +-
 caffe2/operators/distance_op.cu | 4 +-
 caffe2/operators/dropout_op_cudnn.cc | 4 +-
 caffe2/operators/elementwise_op_test.h | 2 +-
 caffe2/operators/elementwise_ops_schema.cc | 2 +-
 caffe2/operators/expand_squeeze_dims_op.h | 2 +-
 .../experimental/c10/cpu/averaged_loss_cpu.cc | 3 +-
 .../experimental/c10/cpu/batch_gather_cpu.cc | 3 +-
 .../experimental/c10/cpu/batch_matmul_cpu.cc | 3 +-
 .../experimental/c10/cpu/cast_cpu.cc | 3 +-
 .../experimental/c10/cpu/concat_cpu.cc | 5 +-
 .../experimental/c10/cpu/filler_cpu.cc | 7 +-
 .../sigmoid_cross_entropy_with_logits_cpu.cc | 5 +-
 .../c10/cpu/sparse_lengths_sum_cpu.cc | 7 +-
 .../operators/experimental/c10/schemas/fc.h | 2 +-
 caffe2/operators/extend_tensor_op.cc | 2 +-
 caffe2/operators/filler_op.cc | 2 +-
 caffe2/operators/filler_op.cu | 4 +-
 caffe2/operators/filler_op.h | 24 +--
 caffe2/operators/flatten_op.cc | 4 +-
 caffe2/operators/flexible_top_k.cc | 46 ++---
 caffe2/operators/fully_connected_op.h | 2 +-
 .../fused_rowwise_8bit_conversion_ops.h | 4 +-
 .../fused_rowwise_random_quantization_ops.cc | 8 +-
 .../operators/gather_fused_8bit_rowwise_op.h | 2 +-
 caffe2/operators/gather_ranges_to_dense_op.h | 2 +-
 caffe2/operators/generate_proposals_op.cc | 6 +-
 .../operators/generate_proposals_op_test.cc | 50 ++---
 caffe2/operators/glu_op.h | 2 +-
 caffe2/operators/half_float_ops.h | 4 +-
 .../local_response_normalization_op_miopen.cc | 4 +-
 caffe2/operators/hip/relu_op_miopen.cc | 4 +-
 caffe2/operators/hip/softmax_op_miopen.cc | 4 +-
 .../hip/spatial_batch_norm_op_miopen.cc | 4 +-
 caffe2/operators/im2col_op.h | 4 +-
 caffe2/operators/index_hash_ops.cc | 2 +-
 caffe2/operators/index_ops.cc | 24 +--
 caffe2/operators/integral_image_op.cc | 2 +-
 caffe2/operators/integral_image_op.cu | 4 +-
 caffe2/operators/is_empty_op.h | 2 +-
 caffe2/operators/layer_norm_op.cc | 2 +-
 caffe2/operators/layer_norm_op.cu | 4 +-
 caffe2/operators/lengths_pad_op.h | 2 +-
 .../lengths_reducer_fused_8bit_rowwise_ops.h | 2 +-
 caffe2/operators/lengths_reducer_ops.h | 6 +-
 .../lengths_reducer_rowwise_8bit_ops.h | 14 +-
 caffe2/operators/lengths_tile_op.cc | 2 +-
 caffe2/operators/lengths_tile_op.cu | 2 +-
 caffe2/operators/lengths_top_k_op.cc | 12 +-
 .../local_response_normalization_op.cc | 10 +-
 .../local_response_normalization_op_cudnn.cc | 4 +-
 caffe2/operators/lpnorm_op.cc | 4 +-
 caffe2/operators/map_ops.h | 2 +-
 caffe2/operators/matmul_op.h | 2 +-
 caffe2/operators/numpy_tile_op.h | 2 +-
 caffe2/operators/one_hot_ops.cc | 42 ++--
 caffe2/operators/one_hot_ops.cu | 12 +-
 caffe2/operators/one_hot_ops.h | 12 +-
 caffe2/operators/onnx_while_op.h | 6 +-
 caffe2/operators/onnxifi_op.cc | 2 +-
 caffe2/operators/onnxifi_op.h | 6 +-
 .../operators/operator_fallback_gpu_test.cc | 4 +-
 caffe2/operators/order_switch_ops.cc | 4 +-
 caffe2/operators/order_switch_ops.cu | 4 +-
 caffe2/operators/pack_rnn_sequence_op.h | 4 +-
 caffe2/operators/pack_segments.cc | 14 +-
 caffe2/operators/pack_segments.h | 4 +-
 caffe2/operators/partition_ops.h | 20 +-
 caffe2/operators/perplexity_op.cc | 2 +-
 caffe2/operators/perplexity_op.cu | 2 +-
 .../piecewise_linear_transform_op.cu | 34 ++--
 .../operators/piecewise_linear_transform_op.h | 58 +++---
 caffe2/operators/pool_op_cudnn.cu | 4 +-
 caffe2/operators/reducer_functors.h | 80 ++++----
 caffe2/operators/reduction_front_back_ops.h | 12 +-
 caffe2/operators/reduction_ops.h | 6 +-
 caffe2/operators/replace_nan_op.cc | 4 +-
 caffe2/operators/replace_nan_op.cu | 4 +-
 caffe2/operators/replace_nan_op.h | 2 +-
 caffe2/operators/reshape_op_gpu_test.cc | 4 +-
 caffe2/operators/reverse_packed_segs_op.h | 4 +-
 .../operators/rnn/hip/recurrent_op_miopen.h | 2 +-
 .../rnn/recurrent_network_blob_fetcher_op.h | 2 +-
 caffe2/operators/rnn/recurrent_network_op.h | 2 +-
 caffe2/operators/rnn/recurrent_op_cudnn.h | 2 +-
 caffe2/operators/roi_align_op_gpu_test.cc | 16 +-
 caffe2/operators/segment_reduction_op.h | 190 +++++++++---------
 caffe2/operators/segment_reduction_op_gpu.cu | 26 +--
 caffe2/operators/sequence_ops.cc | 2 +-
 caffe2/operators/sequence_ops.cu | 2 +-
 caffe2/operators/sequence_ops.h | 10 +-
 caffe2/operators/shape_op.h | 10 +-
 caffe2/operators/slice_op.cu | 24 +--
 caffe2/operators/slice_op.h | 24 +--
 caffe2/operators/softmax_op_cudnn.cc | 4 +-
 caffe2/operators/softmax_ops.cu | 4 +-
 caffe2/operators/softmax_with_loss_op.cc | 2 +-
 caffe2/operators/sparse_to_dense_mask_op.h | 8 +-
 caffe2/operators/sparse_to_dense_op.cu | 2 +-
 .../operators/spatial_softmax_with_loss_op.cc | 2 +-
 caffe2/operators/text_file_reader.cc | 2 +-
 caffe2/operators/tile_op.h | 4 +-
 caffe2/operators/top_k.cc | 112 +++++------
 caffe2/operators/top_k.cu | 114 +++++------
 caffe2/operators/transpose_op.h | 2 +-
 caffe2/operators/utility_ops.cu | 38 ++--
 caffe2/operators/utility_ops.h | 34 ++--
 caffe2/operators/utility_ops_gpu_test.cc | 4 +-
 caffe2/operators/utility_ops_test.cc | 4 +-
 caffe2/opt/onnxifi_transformer.cc | 2 +-
 caffe2/perfkernels/embedding_lookup.cc | 28 +--
 caffe2/perfkernels/embedding_lookup.h | 8 +-
 caffe2/perfkernels/embedding_lookup_avx2.cc | 156 +++++++-------
 ...mbedding_lookup_fused_8bit_rowwise_avx2.cc | 156 +++++++-------
 .../fused_8bit_rowwise_embedding_lookup.cc | 30 +--
 .../fused_8bit_rowwise_embedding_lookup.h | 8 +-
 caffe2/perfkernels/hp_emblookup_codegen.py | 10 +-
 caffe2/predictor/predictor_test.cc | 2 +-
 caffe2/python/pybind_state.cc | 10 +-
 caffe2/python/pybind_state.h | 2 +-
 caffe2/python/pybind_state_dlpack.h | 2 +-
 caffe2/python/pybind_state_gpu.cc | 2 +-
 caffe2/python/pybind_state_hip.cc | 2 +-
 caffe2/python/pybind_state_mkl.cc | 2 +-
 caffe2/queue/rebatching_queue.cc | 2 +-
 caffe2/sgd/ftrl_op.cc | 12 +-
 caffe2/sgd/lars_op.h | 4 +-
 caffe2/sgd/learning_rate_op.h | 2 +-
.../depthwise/depthwise3x3_conv_op_test.cc | 8 +- caffe2/share/contrib/nnpack/nnpack_test.cc | 8 +- .../contrib/zstd/quant_decomp_zstd_op.cc | 2 +- caffe2/utils/filler.h | 10 +- caffe2/utils/hip/math_hip.cc | 12 +- caffe2/utils/math_cpu.cc | 6 +- caffe2/utils/math_gpu.cu | 14 +- caffe2/utils/math_gpu_test.cc | 6 +- caffe2/utils/math_test.cc | 6 +- caffe2/utils/smart_tensor_printer_test.cc | 2 +- caffe2/video/video_input_op.h | 8 +- modules/detectron/sample_as_op.cu | 2 +- modules/detectron/select_smooth_l1_loss_op.cu | 2 +- .../sigmoid_cross_entropy_loss_op.cu | 6 +- modules/detectron/sigmoid_focal_loss_op.cu | 2 +- modules/detectron/smooth_l1_loss_op.cu | 2 +- modules/detectron/softmax_focal_loss_op.cu | 2 +- modules/detectron/upsample_nearest_op.cu | 2 +- 248 files changed, 1446 insertions(+), 1454 deletions(-) diff --git a/binaries/core_overhead_benchmark_gpu.cc b/binaries/core_overhead_benchmark_gpu.cc index 018880432d4b..e024e4ddc9fa 100644 --- a/binaries/core_overhead_benchmark_gpu.cc +++ b/binaries/core_overhead_benchmark_gpu.cc @@ -139,7 +139,7 @@ BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize); static void BM_CudaPointerAffinity(benchmark::State& state) { CAFFE2_SKIP_IF_NO_GPU; - Tensor tensor(vector{1, 2, 3, 4}, CUDA); + Tensor tensor(vector{1, 2, 3, 4}, CUDA); float* ptr = tensor.mutable_data(); while (state.KeepRunning()) { volatile int id = GetGPUIDForPointer(ptr); diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index d01c1240aae6..b6d31268db0f 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -144,7 +144,7 @@ private: } template void assignToValue(Tensor* dst, T v) { - dst->Resize(std::vector()); + dst->Resize(std::vector()); math::Set(1, v, dst->template mutable_data(), &context_); } int findImplementation(const OperatorDef& operator_def) { diff --git a/caffe2/contrib/gloo/allgather_ops.h b/caffe2/contrib/gloo/allgather_ops.h index 044357cd06ae..1f55233a095c 100644 --- a/caffe2/contrib/gloo/allgather_ops.h +++ b/caffe2/contrib/gloo/allgather_ops.h @@ -75,7 +75,7 @@ class AllgatherOp final : public Operator { auto comm_size = OperatorBase::Input>(0)->size; const auto dims = - std::vector(1, (InputSize() - 1) * Input(1).size() * comm_size); + std::vector(1, (InputSize() - 1) * Input(1).size() * comm_size); Output(0)->Resize(dims); // Store which inputs/outputs this instance initialized with diff --git a/caffe2/contrib/nccl/cuda_nccl_gpu.cc b/caffe2/contrib/nccl/cuda_nccl_gpu.cc index 603281b30be8..b544445a2687 100644 --- a/caffe2/contrib/nccl/cuda_nccl_gpu.cc +++ b/caffe2/contrib/nccl/cuda_nccl_gpu.cc @@ -269,7 +269,7 @@ void NCCL::AllGather(const NCCLExecution& ex) { ex, [n](const NCCLElement& ctx) { CAFFE_ENFORCE_NE(ctx.src, ctx.dst); - std::vector dims; + std::vector dims; dims.reserve(ctx.src->ndim() + 1); dims.push_back(n); for (auto d : ctx.src->dims()) { @@ -307,7 +307,7 @@ void NCCL::ReduceScatter(const NCCLExecution& ex) { [](const NCCLElement& ctx) { CAFFE_ENFORCE_NE(ctx.src, ctx.dst); const auto& srcDims = ctx.src->dims(); - std::vector dstDims(srcDims.begin() + 1, srcDims.end()); + std::vector dstDims(srcDims.begin() + 1, srcDims.end()); ctx.dst->Resize(dstDims); ctx.dst->template mutable_data(); }, diff --git a/caffe2/contrib/tensorrt/tensorrt_op_trt.cc b/caffe2/contrib/tensorrt/tensorrt_op_trt.cc index 0d0ddc49b6cb..260d2efbe8cd 100644 --- a/caffe2/contrib/tensorrt/tensorrt_op_trt.cc +++ b/caffe2/contrib/tensorrt/tensorrt_op_trt.cc @@ -15,7 +15,7 @@ namespace { 
// Otherwise, return the product of CHW dimensions int64_t CheckDims( const nvinfer1::Dims& nv_dims, - const std::vector& c2_dims) { + const std::vector& c2_dims) { if (nv_dims.nbDims + 1 != c2_dims.size()) { CAFFE_THROW( "Mismatched dimensions between TRT input (", @@ -115,7 +115,7 @@ TensorRTOp::TensorRTOp(const OperatorDef& operator_def, Workspace* ws) const std::string key = MakeString("output_size_hint_", output_idx); auto output_size_hint = OperatorBase::GetRepeatedArgument(key); if (!output_size_hint.empty()) { - std::vector dims; + std::vector dims; for (const auto v : output_size_hint) { dims.push_back(v); } @@ -130,17 +130,17 @@ TensorRTOp::TensorRTOp(const OperatorDef& operator_def, Workspace* ws) void TensorRTOp::MaybeAdjustOutputShape( int output_idx, - std::vector* dims) { + std::vector* dims) { const auto it = output_size_hints_.find(output_idx); if (it != output_size_hints_.end()) { const auto& dims_hint = it->second; auto total_trt = std::accumulate( - dims->begin(), dims->end(), (TIndex)(1), std::multiplies()); + dims->begin(), dims->end(), (int64_t)(1), std::multiplies()); auto total_c2 = std::accumulate( dims_hint.begin(), dims_hint.end(), - (TIndex)(1), - std::multiplies()); + (int64_t)(1), + std::multiplies()); CAFFE_ENFORCE_EQ( total_trt, total_c2, @@ -204,7 +204,7 @@ bool TensorRTOp::RunOnDevice() { } else { // output, we need to allocate the output tensor at first batch run auto* output_tensor = Output(output_idx); - std::vector tensor_dims; + std::vector tensor_dims; tensor_dims.push_back(N); int64_t chw = 1; for (int i = 0; i < dims.nbDims; ++i) { diff --git a/caffe2/contrib/tensorrt/tensorrt_op_trt.h b/caffe2/contrib/tensorrt/tensorrt_op_trt.h index cd0700f96a26..a98b8a33a331 100644 --- a/caffe2/contrib/tensorrt/tensorrt_op_trt.h +++ b/caffe2/contrib/tensorrt/tensorrt_op_trt.h @@ -17,13 +17,13 @@ class TensorRTOp final : public Operator { virtual ~TensorRTOp() noexcept {} private: - void MaybeAdjustOutputShape(int output_idx, std::vector* dims); + void MaybeAdjustOutputShape(int output_idx, std::vector* dims); tensorrt::TrtLogger logger_; int max_batch_size_; std::vector nv_dims_; std::vector is_input_; - std::unordered_map> output_size_hints_; + std::unordered_map> output_size_hints_; std::shared_ptr trt_engine_{nullptr}; std::shared_ptr trt_executor_{nullptr}; bool batch_warning_issued_{false}; diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index 780897b1b532..7ff5a2b25eac 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -139,7 +139,7 @@ void TensorSerializer::SerializeWithChunkSize( // Serialize whole vector. 
If vector is empty, it's shape still needs to be // serialized in empty proto for (size_t chunkBegin = 0; - chunkBegin < std::max(tensor.size(), static_cast(1)); + chunkBegin < std::max(tensor.size(), static_cast(1)); chunkBegin += chunk_size) { VLOG(2) << "Starting a chunk at " << chunkBegin; #ifndef __ANDROID__ @@ -374,8 +374,8 @@ void TensorDeserializer::Deserialize(const TensorProto& proto, Tensor* tensor) { tensor->GetStaticContext()->CreateContext(proto.device_detail()); auto context = uniq_ptr.get(); context->SwitchToDevice(0); - vector dims; - for (const TIndex d : proto.dims()) { + vector dims; + for (const int64_t d : proto.dims()) { dims.push_back(d); } tensor->Resize(dims); diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index 628731d31bc8..24b2a2d0593d 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -557,9 +557,9 @@ TEST(TensorTest, TensorNonFundamentalTypeClone) { TEST(TensorTest, Tensor64BitDimension) { // Initialize a large tensor. - TIndex large_number = + int64_t large_number = static_cast(std::numeric_limits::max()) + 1; - Tensor tensor(vector{large_number}, CPU); + Tensor tensor(vector{large_number}, CPU); EXPECT_EQ(tensor.ndim(), 1); EXPECT_EQ(tensor.dim(0), large_number); EXPECT_EQ(tensor.size(), large_number); @@ -589,9 +589,9 @@ TEST(TensorTest, Tensor64BitDimension) { } TEST(TensorDeathTest, CannotCastDownLargeDims) { - TIndex large_number = + int64_t large_number = static_cast(std::numeric_limits::max()) + 1; - Tensor tensor(vector{large_number}, CPU); + Tensor tensor(vector{large_number}, CPU); EXPECT_EQ(tensor.ndim(), 1); EXPECT_EQ(tensor.dim(0), large_number); ASSERT_THROW(tensor.dim32(0), EnforceNotMet); @@ -694,7 +694,7 @@ TEST(TensorTest, TensorSerialization_CustomType) { } TEST(TensorTest, Half) { - const TIndex kSize = 3000000; + const int64_t kSize = 3000000; Blob blob; TensorCPU* tensor = blob.GetMutableTensor(CPU); tensor->Resize(kSize); diff --git a/caffe2/core/logging.h b/caffe2/core/logging.h index 67428df833dd..37fcd939c4d6 100644 --- a/caffe2/core/logging.h +++ b/caffe2/core/logging.h @@ -145,7 +145,7 @@ using EnforceNotMet = at::Error; * functions to caffe2::enforce_detail namespace. 
For example: * * namespace caffe2 { namespace enforce_detail { - * inline EnforceFailMessage IsVector(const vector& shape) { + * inline EnforceFailMessage IsVector(const vector& shape) { * if (shape.size() == 1) { return EnforceOK(); } * return MakeString("Shape ", shape, " is not a vector"); * } diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index 51f614546431..5f3f653b5a4b 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -581,7 +581,7 @@ TensorShapes InferBlobShapesAndTypesFromWorkspace( } TensorShapes InferBlobShapesAndTypesFromMap( - const CaffeMap>& blob_dimensions, + const CaffeMap>& blob_dimensions, const vector& nets) { CaffeMap blob_desc; // Populate shapes from known blobs @@ -597,7 +597,7 @@ TensorShapes InferBlobShapesAndTypesFromMap( } TensorShapes InferBlobShapesAndTypesFromMap( - const CaffeMap>& blob_dimensions, + const CaffeMap>& blob_dimensions, const CaffeMap& blob_types, const vector& nets) { CaffeMap blob_desc; diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 9b428f9003d9..25aa801d265d 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -700,7 +700,7 @@ struct DispatchHelper, ExtraArgs...> { template struct DispatchHelper, ExtraArgs...> { template - static bool call(Op* op, TIndex /*size*/) { + static bool call(Op* op, int64_t /*size*/) { return op->template DoRunWithValue(); } }; @@ -973,11 +973,11 @@ CAFFE2_API TensorShapes InferBlobShapesAndTypesFromWorkspace( const vector& nets); CAFFE2_API TensorShapes InferBlobShapesAndTypesFromMap( - const CaffeMap>& blob_dimensions, + const CaffeMap>& blob_dimensions, const vector& nets); CAFFE2_API TensorShapes InferBlobShapesAndTypesFromMap( - const CaffeMap>& blob_dimensions, + const CaffeMap>& blob_dimensions, const CaffeMap& blob_types, const vector& nets); diff --git a/caffe2/core/operator_schema.cc b/caffe2/core/operator_schema.cc index 5d9b640a4039..a76a0df9bd00 100644 --- a/caffe2/core/operator_schema.cc +++ b/caffe2/core/operator_schema.cc @@ -331,7 +331,7 @@ int OpSchema::CalculateOutput(int num_input) const { } static void SparseLengthsFillerHelper( - const std::vector>& shapes, + const std::vector>& shapes, size_t value_index, size_t length_index, std::vector* fillers) { @@ -341,7 +341,7 @@ static void SparseLengthsFillerHelper( } static void SparseSegmentsFillerHelper( - const std::vector>& shapes, + const std::vector>& shapes, size_t value_index, size_t segment_index, std::vector* fillers) { @@ -364,7 +364,7 @@ OpSchema& OpSchema::ValueKeyLengthInputFillers( size_t key_index, size_t length_index) { filler_supplier_ = [this, value_index, key_index, length_index]( - const std::vector>& shapes) { + const std::vector>& shapes) { auto fillers = SupplyDenseFillers(shapes); // fill in the length (value_index is used to get the correct shape) SparseLengthsFillerHelper(shapes, key_index, length_index, &fillers); @@ -383,7 +383,7 @@ OpSchema& OpSchema::ValueLengthInputFillers( size_t value_index, size_t length_index) { filler_supplier_ = [this, value_index, length_index]( - const std::vector>& shapes) { + const std::vector>& shapes) { auto fillers = SupplyDenseFillers(shapes); // fill in the length (value_index is used to get the correct shape) SparseLengthsFillerHelper(shapes, value_index, length_index, &fillers); @@ -394,7 +394,7 @@ OpSchema& OpSchema::ValueLengthInputFillers( OpSchema& OpSchema::DisallowInputFillers() { filler_supplier_ = - [this](const std::vector>& /* unused */) { + [this](const std::vector>& /* unused */) { throw 
std::invalid_argument(type_ + " does not have input fillers"); return std::vector(); }; @@ -402,12 +402,12 @@ OpSchema& OpSchema::DisallowInputFillers() { } std::vector OpSchema::InputFillers( - const std::vector>& shapes) const { + const std::vector>& shapes) const { return filler_supplier_(shapes); } std::vector OpSchema::SupplyDenseFillers( - const std::vector>& shapes) { + const std::vector>& shapes) { std::vector fillers; for (const auto& shape : shapes) { fillers.emplace_back(shape); diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h index 0653de28c68b..e0b6495647eb 100644 --- a/caffe2/core/operator_schema.h +++ b/caffe2/core/operator_schema.h @@ -383,11 +383,11 @@ class CAFFE2_API OpSchema { OpSchema& DisallowInputFillers(); std::vector InputFillers( - const std::vector>& shapes) const; + const std::vector>& shapes) const; private: std::vector SupplyDenseFillers( - const std::vector>& shapes); + const std::vector>& shapes); private: string type_; @@ -438,9 +438,9 @@ class CAFFE2_API OpSchema { }; std::function( - const std::vector>&)> + const std::vector>&)> filler_supplier_ = - [this](const std::vector>& shapes) { + [this](const std::vector>& shapes) { return SupplyDenseFillers(shapes); }; }; @@ -508,8 +508,8 @@ inline TensorShape CreateTensorShape( } // Helper function -inline vector GetDimsVector(const TensorShape& shape) { - vector dims; +inline vector GetDimsVector(const TensorShape& shape) { + vector dims; for (auto d : shape.dims()) { dims.push_back(d); } diff --git a/caffe2/core/qtensor.h b/caffe2/core/qtensor.h index e4f373ab3722..f277ffdbdd0a 100644 --- a/caffe2/core/qtensor.h +++ b/caffe2/core/qtensor.h @@ -212,8 +212,8 @@ class CAFFE2_EXPORT QTensor { /** * Return product of all dimensions starting from K. */ - inline TIndex size_from_dim(int k) const { - TIndex r = 1; + inline int64_t size_from_dim(int k) const { + int64_t r = 1; for (int i = k; i < dims_.size(); ++i) { r *= dims_[i]; } @@ -223,9 +223,9 @@ class CAFFE2_EXPORT QTensor { /** * Product of all dims up to. */ - inline TIndex size_to_dim(int k) const { + inline int64_t size_to_dim(int k) const { CAFFE_ENFORCE(k < dims_.size()); - TIndex r = 1; + int64_t r = 1; for (int i = 0; i < k; ++i) { r *= dims_[i]; } diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index 58b4c4b75e91..e142e1a6b6a9 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -77,7 +77,7 @@ void RegisterTypeCallFunction(TypeIdentifier id, TypeCall c) { int GetGPUIDForPointer(const void* ptr); -vector GetTensorInfo( +vector GetTensorInfo( const void* c, size_t* capacity, DeviceOption* device) { diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 932cbc2587e2..286718d4268c 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -59,7 +59,7 @@ class CAFFE2_API Tensor final { * Note that the actual data allocation is not going to be carried out until * the first time mutable_data() is called. 
*/ - explicit Tensor(const vector& dims, DeviceType type) + explicit Tensor(const vector& dims, DeviceType type) : Tensor(Storage(type)) { // TODO: here, we create a Storage // and immediately discard it in Resize() since @@ -96,7 +96,7 @@ class CAFFE2_API Tensor final { */ template Tensor( - const vector& dims, + const vector& dims, const vector& values, BaseContext* context) : Tensor(Storage(context->device_type(), TypeMeta::Make())) { @@ -115,7 +115,7 @@ class CAFFE2_API Tensor final { typename = typename std::enable_if::value>::type> Tensor(const T& value, BaseContext* context) : Tensor(Storage(context->device_type(), TypeMeta::Make())) { - Resize(std::vector{}); + Resize(std::vector{}); context->CopyItemsFromCPU( storage().dtype(), size(), &value, mutable_data()); } @@ -142,15 +142,15 @@ class CAFFE2_API Tensor final { impl_.get()->CopyFrom(*src.impl_.get(), context); } - void ExtendTo(TIndex num, float growthPct, BaseContext* context) const { + void ExtendTo(int64_t num, float growthPct, BaseContext* context) const { impl_.get()->ExtendTo(num, growthPct, context); } - void Extend(TIndex num, float growthPct, BaseContext* context) const { + void Extend(int64_t num, float growthPct, BaseContext* context) const { impl_.get()->Extend(num, growthPct, context); } - void ShrinkTo(TIndex outer_dim) const { + void ShrinkTo(int64_t outer_dim) const { impl_.get()->ShrinkTo(outer_dim); } @@ -168,7 +168,7 @@ class CAFFE2_API Tensor final { impl_.get()->ResizeLike(*src_tensor.impl_.get()); } - inline void Reshape(const vector& dims) const { + inline void Reshape(const vector& dims) const { impl_.get()->Reshape(dims); } @@ -250,7 +250,7 @@ class CAFFE2_API Tensor final { return impl_.get()->ndim(); } - inline TIndex size() const { + inline int64_t size() const { return impl_.get()->size(); } @@ -266,19 +266,19 @@ class CAFFE2_API Tensor final { return impl_.get()->capacity_nbytes(); } - inline const vector& dims() const { + inline const vector& dims() const { return impl_.get()->dims(); } - inline TIndex size_from_dim(int k) const { + inline int64_t size_from_dim(int k) const { return impl_.get()->size_from_dim(k); } - inline TIndex size_to_dim(int k) const { + inline int64_t size_to_dim(int k) const { return impl_.get()->size_to_dim(k); } - inline TIndex size_between_dim(int k, int l) const { + inline int64_t size_between_dim(int k, int l) const { return impl_.get()->size_between_dim(k, l); } @@ -311,7 +311,7 @@ class CAFFE2_API Tensor final { return impl_.get()->dim32(i); } - inline TIndex dim(const int i) const { + inline int64_t dim(const int i) const { return impl_.get()->dim(i); } @@ -337,7 +337,7 @@ TypeCall GetTypeCallFunction(TypeIdentifier id); void RegisterTypeCallFunction(TypeIdentifier id, TypeCall c); // Shape call registry -typedef vector (*TensorInfoCall)( +typedef vector (*TensorInfoCall)( const void*, size_t* capacity, DeviceOption* device); @@ -377,7 +377,7 @@ void TensorPrinter::Print(const Tensor& tensor) { std::stringstream values_stream; // One most likely doesn't want to print int64-number of items for visual // inspection, so we cast down to int here. 
- int total_count = static_cast(std::min(tensor.size(), TIndex(limit_))); + int total_count = static_cast(std::min(tensor.size(), int64_t(limit_))); const T* tensor_data = tensor.template data(); for (int i = 0; i < total_count - 1; ++i) { values_stream << tensor_data[i] << ","; diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 5a2d2c821d2b..20c398f7e4c8 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -26,17 +26,17 @@ namespace caffe2 { class DeviceOption; /** - * A utility function to convert vector to vector. + * A utility function to convert vector to vector. */ -inline std::vector ToVectorTIndex(const std::vector& src) { - return std::vector(src.begin(), src.end()); +inline std::vector ToVectorint64_t(const std::vector& src) { + return std::vector(src.begin(), src.end()); } /** * Return product of all dimensions starting from k */ -inline TIndex size_from_dim_(int k, const std::vector& dims) { - TIndex r = 1; +inline int64_t size_from_dim_(int k, const std::vector& dims) { + int64_t r = 1; for (size_t i = k; i < dims.size(); ++i) { r *= dims[i]; } @@ -44,9 +44,9 @@ inline TIndex size_from_dim_(int k, const std::vector& dims) { } // Product of all dims up to k (not including dims[k]) -inline TIndex size_to_dim_(int k, const std::vector& dims) { +inline int64_t size_to_dim_(int k, const std::vector& dims) { CAFFE_ENFORCE((unsigned)k <= dims.size()); - TIndex r = 1; + int64_t r = 1; for (int i = 0; i < k; ++i) { r *= dims[i]; } @@ -54,9 +54,9 @@ inline TIndex size_to_dim_(int k, const std::vector& dims) { } // Product of all dims between k and l (not including dims[k] and dims[l]) -inline TIndex size_between_dim_(int k, int l, const std::vector& dims) { +inline int64_t size_between_dim_(int k, int l, const std::vector& dims) { CAFFE_ENFORCE((unsigned)l < dims.size()); - TIndex r = 1; + int64_t r = 1; if (k < l) { for (int i = k + 1; i < l; ++i) { r *= dims[i]; @@ -191,7 +191,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * @brief Extend the outer-most dimension of this tensor * to dimension of `num`. */ - void ExtendTo(TIndex num, float growthPct, at::BaseContext* context) { + void ExtendTo(int64_t num, float growthPct, at::BaseContext* context) { CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); CAFFE_ENFORCE(context != nullptr, "Context must be provided."); @@ -207,7 +207,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * growthPct. This ensures that Extend runs on an amortized O(1) time * complexity. */ - void Extend(TIndex num, float growthPct, at::BaseContext* context) { + void Extend(int64_t num, float growthPct, at::BaseContext* context) { CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); CAFFE_ENFORCE_GE_WITH_CALLER( num, 0, "`num` must be non-negative for Extend"); @@ -223,8 +223,8 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { auto newNumel = std::accumulate( newDims.begin(), newDims.end(), - static_cast(1), - std::multiplies()); + static_cast(1), + std::multiplies()); if (newNumel * storage_.itemsize() <= storage_.capacity()) { dims_ = newDims; numel_ = newNumel; @@ -253,7 +253,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * This method guarantees that no re-allocations are carried out, which means * that the extra capacity after the end of the shurnk tensor is maintained. 
*/ - void ShrinkTo(TIndex outer_dim) { + void ShrinkTo(int64_t outer_dim) { CAFFE_ENFORCE_WITH_CALLER( is_contiguous_, "Right now ShrinkTo is only supported on contiguous Tensor."); @@ -268,8 +268,8 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { numel_ = std::accumulate( dims_.begin(), dims_.end(), - static_cast(1), - std::multiplies()); + static_cast(1), + std::multiplies()); } /** @@ -292,8 +292,8 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { auto newNumel = std::accumulate( newCapacity.begin(), newCapacity.end(), - static_cast(1), - std::multiplies()); + static_cast(1), + std::multiplies()); if (newNumel * storage_.itemsize() <= storage_.capacity()) { return; } @@ -365,11 +365,11 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * Resizes the tensor without touching underlying storage. * This requires the total size of the tensor to remains constant. */ - inline void Reshape(const std::vector& dims) { + inline void Reshape(const std::vector& dims) { CAFFE_ENFORCE_WITH_CALLER( is_contiguous_, "Right now Reshape is only supported for contiguous Tensor."); - TIndex new_size = 1; + int64_t new_size = 1; for (auto d : dims) { CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); new_size *= d; @@ -387,7 +387,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { } inline void Reshape(const std::vector& dims) { - Reshape(ToVectorTIndex(dims)); + Reshape(ToVectorint64_t(dims)); } /** @@ -674,7 +674,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { /** * Returns the size (i.e. the number of items) of the tensor. */ - inline TIndex size() const { + inline int64_t size() const { return numel_; } /** @@ -701,19 +701,19 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { /** * Returns the dimensions of the tensor as a vector. */ - inline const std::vector& dims() const { + inline const std::vector& dims() const { return dims_; } - inline TIndex size_from_dim(int k) const { + inline int64_t size_from_dim(int k) const { return size_from_dim_(k, dims_); } - inline TIndex size_to_dim(int k) const { + inline int64_t size_to_dim(int k) const { return size_to_dim_(k, dims_); } - inline TIndex size_between_dim(int k, int l) const { + inline int64_t size_between_dim(int k, int l) const { return size_between_dim_(k, l, dims_); } @@ -772,7 +772,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { /** * Returns the i-th dimension of the tensor in int. * - * This function returns an int value instead of TIndex, which depending on + * This function returns an int value instead of int64_t, which depending on * the typedef could be int64. If you want int64 dim values, make sure you * call dim() instead. */ @@ -790,7 +790,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * must be between 0 (inclusive) and the number of dimensions, otherwise * this function will produce a fatal message. 
*/ - inline TIndex dim(const int i) const { + inline int64_t dim(const int i) const { #ifndef NDEBUG CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); @@ -818,9 +818,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { protected: // TODO: change to DimVector - std::vector dims_; // sizes_ + std::vector dims_; // sizes_ at::DimVector strides_; - TIndex numel_ = -1; // numel_ + int64_t numel_ = -1; // numel_ bool is_contiguous_ = true; // we decide to keep reserved_ and it will // live in Tensor after the split @@ -838,7 +838,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { bool SetDims(const std::vector& src) { auto old_numel = numel_; dims_.resize(src.size()); - TIndex new_numel = 1; + int64_t new_numel = 1; for (size_t i = 0; i < src.size(); ++i) { new_numel *= src[i]; dims_[i] = src[i]; @@ -859,7 +859,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // TODO(jiayq): maybe rewrite the following functions with initializer list. // NVCC does not play well with initializer lists last time, but worth // another shot. - bool SetDims(const TIndex d0) { + bool SetDims(const int64_t d0) { auto old_numel = numel_; dims_.resize(1); dims_[0] = d0; @@ -868,7 +868,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return numel_ != old_numel; } - bool SetDims(const TIndex d0, const TIndex d1) { + bool SetDims(const int64_t d0, const int64_t d1) { auto old_numel = numel_; dims_.resize(2); dims_[0] = d0; @@ -878,7 +878,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return numel_ != old_numel; } - bool SetDims(const TIndex d0, const TIndex d1, const TIndex d2) { + bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2) { auto old_numel = numel_; dims_.resize(3); dims_[0] = d0; @@ -890,7 +890,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { } bool - SetDims(const TIndex d0, const TIndex d1, const TIndex d2, const TIndex d3) { + SetDims(const int64_t d0, const int64_t d1, const int64_t d2, const int64_t d3) { auto old_numel = numel_; dims_.resize(4); dims_[0] = d0; diff --git a/caffe2/cuda_rtc/pool_op_rtc_gpu.cc b/caffe2/cuda_rtc/pool_op_rtc_gpu.cc index 4dc8d598044b..0362829eaf3f 100644 --- a/caffe2/cuda_rtc/pool_op_rtc_gpu.cc +++ b/caffe2/cuda_rtc/pool_op_rtc_gpu.cc @@ -232,7 +232,7 @@ class MaxPoolRTCOp final : public ConvPoolOpBase { private: MaxPoolRTCFunction func_; - vector input_dims_; + vector input_dims_; }; class MaxPoolGradientRTCOp final : public ConvPoolOpBase { @@ -285,7 +285,7 @@ class MaxPoolGradientRTCOp final : public ConvPoolOpBase { private: MaxPoolGradientRTCFunction func_; - vector input_dims_; + vector input_dims_; }; namespace { diff --git a/caffe2/experiments/operators/fully_connected_op_prune.h b/caffe2/experiments/operators/fully_connected_op_prune.h index 05d5bc10d5a4..c1995845f49b 100644 --- a/caffe2/experiments/operators/fully_connected_op_prune.h +++ b/caffe2/experiments/operators/fully_connected_op_prune.h @@ -29,8 +29,8 @@ namespace caffe2 { using Shape = std::array; template - const std::vector& shape(Shape vs) { - static thread_local std::vector cache; + const std::vector& shape(Shape vs) { + static thread_local std::vector cache; cache.resize(vs.size()); for (auto i = 0; i < vs.size(); ++i) { cache[i] = vs[i]; @@ -38,11 +38,11 @@ namespace caffe2 { return cache; } - inline const std::vector& shape(int i) { + inline const std::vector& shape(int i) 
{ return shape<1>(Shape<1>({i})); } - inline const std::vector& shape(int i, int j) { + inline const std::vector& shape(int i, int j) { return shape<2>(Shape<2>({i, j})); } @@ -177,7 +177,7 @@ namespace caffe2 { Y->template mutable_data(), &context_); if (OutputSize() == 2){ auto* Comp_rate = Output(1); - Comp_rate->Resize(vector()); + Comp_rate->Resize(vector()); T* comp_data = Comp_rate->template mutable_data(); math::Sum( Mask.size(), Mask.template data(), comp_data, &context_); @@ -262,7 +262,7 @@ namespace caffe2 { 0, dW->template mutable_data(), &context_); - comp_r_buf_.Resize(vector()); + comp_r_buf_.Resize(vector()); T* comp_data = comp_r_buf_.template mutable_data(); math::Sum( Mask.size(), Mask.template data(), comp_data, &context_); diff --git a/caffe2/experiments/operators/fully_connected_op_sparse.h b/caffe2/experiments/operators/fully_connected_op_sparse.h index 6f19c1bacdc5..4c13e51dde8c 100644 --- a/caffe2/experiments/operators/fully_connected_op_sparse.h +++ b/caffe2/experiments/operators/fully_connected_op_sparse.h @@ -32,8 +32,8 @@ template using Shape = std::array; template -const std::vector& shape(Shape vs) { - static thread_local std::vector cache; +const std::vector& shape(Shape vs) { + static thread_local std::vector cache; cache.resize(vs.size()); for (auto i = 0; i < vs.size(); ++i) { cache[i] = vs[i]; @@ -41,11 +41,11 @@ const std::vector& shape(Shape vs) { return cache; } -inline const std::vector& shape(int i) { +inline const std::vector& shape(int i) { return shape<1>(Shape<1>({i})); } -inline const std::vector& shape(int i, int j) { +inline const std::vector& shape(int i, int j) { return shape<2>(Shape<2>({i, j})); } diff --git a/caffe2/experiments/operators/funhash_op.h b/caffe2/experiments/operators/funhash_op.h index 76bd37aee08e..98ffc83bb424 100644 --- a/caffe2/experiments/operators/funhash_op.h +++ b/caffe2/experiments/operators/funhash_op.h @@ -37,9 +37,9 @@ class FunHashOp : public Operator { FunHashOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), num_outputs_( - OperatorBase::GetSingleArgument("num_outputs", -1)), + OperatorBase::GetSingleArgument("num_outputs", -1)), num_segments_( - OperatorBase::GetSingleArgument("num_segments", -1)), + OperatorBase::GetSingleArgument("num_segments", -1)), seed_(OperatorBase::GetSingleArgument("seed", 0)) { CAFFE_ENFORCE( OperatorBase::HasArgument("num_outputs"), @@ -54,7 +54,7 @@ class FunHashOp : public Operator { const auto& seg = Input(2); const auto& weight = Input(3); - TIndex num_alpha = 1; + int64_t num_alpha = 1; if (adaptive_) { const auto& alpha = Input(4); num_alpha = alpha.dim(0); @@ -62,12 +62,12 @@ class FunHashOp : public Operator { const auto* seg_data = seg.template data(); - TIndex num_weight = weight.dim(0); - TIndex num_nz_ent = seg.dim(0); + int64_t num_weight = weight.dim(0); + int64_t num_nz_ent = seg.dim(0); - TIndex n_segments = num_segments_; + int64_t n_segments = num_segments_; if (num_segments_ == -1) { - for (TIndex i = 0; i < num_nz_ent; ++i) { + for (int64_t i = 0; i < num_nz_ent; ++i) { if (seg_data[i] > n_segments) { n_segments = seg_data[i]; } @@ -85,16 +85,16 @@ class FunHashOp : public Operator { const auto* weight_data = weight.template data(); const auto* alpha_data = adaptive_ ? 
Input(4).template data() : 0; const auto* val_data = val.template data(); - const auto* key_data = key.template data(); + const auto* key_data = key.template data(); - for (TIndex j = 0; j < num_nz_ent; ++j) { - TIndex cur_seg = seg_data[j]; - TIndex cur_key = key_data[j]; + for (int64_t j = 0; j < num_nz_ent; ++j) { + int64_t cur_seg = seg_data[j]; + int64_t cur_key = key_data[j]; T cur_val = val_data[j]; - TIndex output_stride = cur_seg * num_outputs_; - for (TIndex i = 0; i < num_outputs_; ++i) { + int64_t output_stride = cur_seg * num_outputs_; + for (int64_t i = 0; i < num_outputs_; ++i) { T sum = 0; - for (TIndex k = 0; k < num_alpha; ++k) { + for (int64_t k = 0; k < num_alpha; ++k) { uint64_t hash; // The hash function takes as input four integers: // 1. feature index @@ -108,7 +108,7 @@ class FunHashOp : public Operator { hash_data[3] = INDEX_MAGIC; hash = XXH64(hash_data.data(), hash_data.size(), seed_); - TIndex index = hash % num_weight; + int64_t index = hash % num_weight; T cur_weight = weight_data[index]; #ifdef USE_SIGN @@ -133,8 +133,8 @@ class FunHashOp : public Operator { } protected: - TIndex num_outputs_; - TIndex num_segments_; + int64_t num_outputs_; + int64_t num_segments_; uint64_t seed_; std::array hash_data; bool adaptive_; @@ -147,7 +147,7 @@ class FunHashGradientOp : public Operator { FunHashGradientOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), num_outputs_( - OperatorBase::GetSingleArgument("num_outputs", -1)), + OperatorBase::GetSingleArgument("num_outputs", -1)), seed_(OperatorBase::GetSingleArgument("seed", 0)) { adaptive_ = (InputSize() == 6); } @@ -159,7 +159,7 @@ class FunHashGradientOp : public Operator { const auto& seg = Input(3); const auto& weight = Input(4); - TIndex num_alpha = 1; + int64_t num_alpha = 1; T* grad_alpha_data = 0; if (adaptive_) { @@ -173,8 +173,8 @@ class FunHashGradientOp : public Operator { const auto* seg_data = seg.template data(); - TIndex num_weight = weight.dim(0); - TIndex num_nz_ent = seg.dim(0); + int64_t num_weight = weight.dim(0); + int64_t num_nz_ent = seg.dim(0); auto* grad_weight = Output(0); grad_weight->ResizeLike(weight); @@ -184,18 +184,18 @@ class FunHashGradientOp : public Operator { const auto* weight_data = weight.template data(); const auto* alpha_data = adaptive_ ? 
Input(5).template data() : 0; const auto* val_data = val.template data(); - const auto* key_data = key.template data(); + const auto* key_data = key.template data(); memset(grad_weight_data, 0, sizeof(T) * num_weight); - for (TIndex j = 0; j < num_nz_ent; ++j) { - TIndex cur_seg = seg_data[j]; - TIndex cur_key = key_data[j]; + for (int64_t j = 0; j < num_nz_ent; ++j) { + int64_t cur_seg = seg_data[j]; + int64_t cur_key = key_data[j]; T cur_val = val_data[j]; - TIndex grad_out_stride = cur_seg * num_outputs_; - for (TIndex i = 0; i < num_outputs_; ++i) { + int64_t grad_out_stride = cur_seg * num_outputs_; + for (int64_t i = 0; i < num_outputs_; ++i) { T grad_out_scale = grad_out_data[grad_out_stride + i] * cur_val; - for (TIndex k = 0; k < num_alpha; ++k) { + for (int64_t k = 0; k < num_alpha; ++k) { uint64_t hash; hash_data[0] = cur_key; hash_data[1] = i; @@ -203,7 +203,7 @@ class FunHashGradientOp : public Operator { hash_data[3] = INDEX_MAGIC; hash = XXH64(hash_data.data(), hash_data.size(), seed_); - TIndex index = hash % num_weight; + int64_t index = hash % num_weight; T cur_grad_out_scale = grad_out_scale; #ifdef USE_SIGN @@ -227,7 +227,7 @@ class FunHashGradientOp : public Operator { } protected: - TIndex num_outputs_; + int64_t num_outputs_; uint64_t seed_; std::array hash_data; bool adaptive_; diff --git a/caffe2/experiments/operators/sparse_funhash_op.h b/caffe2/experiments/operators/sparse_funhash_op.h index 04c2441f297b..d4febbc8fa6d 100644 --- a/caffe2/experiments/operators/sparse_funhash_op.h +++ b/caffe2/experiments/operators/sparse_funhash_op.h @@ -36,9 +36,9 @@ class SparseFunHashOp : public Operator { SparseFunHashOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), num_outputs_( - OperatorBase::GetSingleArgument("num_outputs", -1)), + OperatorBase::GetSingleArgument("num_outputs", -1)), num_segments_( - OperatorBase::GetSingleArgument("num_segments", -1)), + OperatorBase::GetSingleArgument("num_segments", -1)), seed_(OperatorBase::GetSingleArgument("seed", 0)) { CAFFE_ENFORCE( OperatorBase::HasArgument("num_outputs"), @@ -53,7 +53,7 @@ class SparseFunHashOp : public Operator { const auto& seg = Input(2); const auto& weight = Input(3); - TIndex num_alpha = 1; + int64_t num_alpha = 1; if (adaptive_) { const auto& alpha = Input(4); num_alpha = alpha.dim(0); @@ -61,12 +61,12 @@ class SparseFunHashOp : public Operator { const auto* seg_data = seg.template data(); - TIndex num_weight = weight.dim(0); - TIndex num_nz_ent = seg.dim(0); + int64_t num_weight = weight.dim(0); + int64_t num_nz_ent = seg.dim(0); - TIndex n_segments = num_segments_; + int64_t n_segments = num_segments_; if (num_segments_ == -1) { - for (TIndex i = 0; i < num_nz_ent; ++i) { + for (int64_t i = 0; i < num_nz_ent; ++i) { if (seg_data[i] > n_segments) { n_segments = seg_data[i]; } @@ -84,16 +84,16 @@ class SparseFunHashOp : public Operator { const auto* weight_data = weight.template data(); const auto* alpha_data = adaptive_ ? 
Input(4).template data() : 0; const auto* val_data = val.template data(); - const auto* key_data = key.template data(); + const auto* key_data = key.template data(); - for (TIndex j = 0; j < num_nz_ent; ++j) { - TIndex cur_seg = seg_data[j]; - TIndex cur_key = key_data[j]; + for (int64_t j = 0; j < num_nz_ent; ++j) { + int64_t cur_seg = seg_data[j]; + int64_t cur_key = key_data[j]; T cur_val = val_data[j]; - TIndex output_stride = cur_seg * num_outputs_; - for (TIndex i = 0; i < num_outputs_; ++i) { + int64_t output_stride = cur_seg * num_outputs_; + for (int64_t i = 0; i < num_outputs_; ++i) { T sum = 0; - for (TIndex k = 0; k < num_alpha; ++k) { + for (int64_t k = 0; k < num_alpha; ++k) { // The hash function takes as input three integers: // 1. feature index // 2. output index @@ -108,13 +108,13 @@ class SparseFunHashOp : public Operator { #ifdef USE_SIGN // Use the least significant bit for sign, the rest for weights. - TIndex index = (hash >> 1) % num_weight; + int64_t index = (hash >> 1) % num_weight; T cur_weight = weight_data[index]; if (hash & 1) { cur_weight = -cur_weight; } #else - TIndex index = hash % num_weight; + int64_t index = hash % num_weight; T cur_weight = weight_data[index]; #endif @@ -132,8 +132,8 @@ class SparseFunHashOp : public Operator { } protected: - TIndex num_outputs_; - TIndex num_segments_; + int64_t num_outputs_; + int64_t num_segments_; uint64_t seed_; std::array hash_data; bool adaptive_; @@ -146,7 +146,7 @@ class SparseFunHashGradientOp : public Operator { SparseFunHashGradientOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), num_outputs_( - OperatorBase::GetSingleArgument("num_outputs", -1)), + OperatorBase::GetSingleArgument("num_outputs", -1)), seed_(OperatorBase::GetSingleArgument("seed", 0)) { adaptive_ = (InputSize() == 6); } @@ -158,7 +158,7 @@ class SparseFunHashGradientOp : public Operator { const auto& seg = Input(3); const auto& weight = Input(4); - TIndex num_alpha = 1; + int64_t num_alpha = 1; T* grad_alpha_data = 0; if (adaptive_) { @@ -172,10 +172,10 @@ class SparseFunHashGradientOp : public Operator { const auto* seg_data = seg.template data(); - TIndex num_weight = weight.dim(0); - TIndex num_nz_ent = seg.dim(0); + int64_t num_weight = weight.dim(0); + int64_t num_nz_ent = seg.dim(0); - TIndex grad_weight_size = num_nz_ent * num_outputs_ * num_alpha; + int64_t grad_weight_size = num_nz_ent * num_outputs_ * num_alpha; auto* grad_weight_val = Output(0); grad_weight_val->Resize(grad_weight_size); T* grad_weight_val_data = grad_weight_val->template mutable_data(); @@ -183,23 +183,23 @@ class SparseFunHashGradientOp : public Operator { auto* grad_weight_ind = Output(1); grad_weight_ind->Resize(grad_weight_size); auto* grad_weight_ind_data = - grad_weight_ind->template mutable_data(); + grad_weight_ind->template mutable_data(); const auto* grad_out_data = grad_out.template data(); const auto* weight_data = weight.template data(); const auto* alpha_data = adaptive_ ? 
Input(5).template data() : 0; const auto* val_data = val.template data(); - const auto* key_data = key.template data(); + const auto* key_data = key.template data(); - TIndex w_ind = 0; - for (TIndex j = 0; j < num_nz_ent; ++j) { - TIndex cur_seg = seg_data[j]; - TIndex cur_key = key_data[j]; + int64_t w_ind = 0; + for (int64_t j = 0; j < num_nz_ent; ++j) { + int64_t cur_seg = seg_data[j]; + int64_t cur_key = key_data[j]; T cur_val = val_data[j]; - TIndex grad_out_stride = cur_seg * num_outputs_; - for (TIndex i = 0; i < num_outputs_; ++i) { + int64_t grad_out_stride = cur_seg * num_outputs_; + for (int64_t i = 0; i < num_outputs_; ++i) { T grad_out_scale = grad_out_data[grad_out_stride + i] * cur_val; - for (TIndex k = 0; k < num_alpha; ++k) { + for (int64_t k = 0; k < num_alpha; ++k) { hash_data[0] = cur_key; hash_data[1] = i; hash_data[2] = k; @@ -209,12 +209,12 @@ class SparseFunHashGradientOp : public Operator { T cur_grad_out_scale = grad_out_scale; #ifdef USE_SIGN - TIndex index = (hash >> 1) % num_weight; + int64_t index = (hash >> 1) % num_weight; if (hash & 1) { cur_grad_out_scale = -cur_grad_out_scale; } #else - TIndex index = hash % num_weight; + int64_t index = hash % num_weight; #endif if (adaptive_) { @@ -232,7 +232,7 @@ class SparseFunHashGradientOp : public Operator { } protected: - TIndex num_outputs_; + int64_t num_outputs_; uint64_t seed_; std::array hash_data; bool adaptive_; diff --git a/caffe2/experiments/operators/sparse_matrix_reshape_op.h b/caffe2/experiments/operators/sparse_matrix_reshape_op.h index b2026a866ff8..f35f0cbbe6df 100644 --- a/caffe2/experiments/operators/sparse_matrix_reshape_op.h +++ b/caffe2/experiments/operators/sparse_matrix_reshape_op.h @@ -36,10 +36,10 @@ class SparseMatrixReshapeOp : public Operator { OperatorBase::HasArgument("new_shape"), "Argument `new_shape` is missing."); - vector old_shape = - OperatorBase::GetRepeatedArgument("old_shape"); - vector new_shape = - OperatorBase::GetRepeatedArgument("new_shape"); + vector old_shape = + OperatorBase::GetRepeatedArgument("old_shape"); + vector new_shape = + OperatorBase::GetRepeatedArgument("new_shape"); CAFFE_ENFORCE( old_shape.size() == 2, @@ -63,7 +63,7 @@ class SparseMatrixReshapeOp : public Operator { old_shape[0] > 0, "The first dimension in `old_shape` must be positive."); - TIndex matrix_size = old_shape[0] * old_shape[1]; + int64_t matrix_size = old_shape[0] * old_shape[1]; if (new_shape[0] == -1) { CAFFE_ENFORCE( @@ -106,14 +106,14 @@ class SparseMatrixReshapeOp : public Operator { new_col->Resize(nnz); new_row->Resize(nnz); - const auto* old_col_data = old_col.template data(); + const auto* old_col_data = old_col.template data(); const auto* old_row_data = old_row.template data(); - auto* new_col_data = new_col->template mutable_data(); + auto* new_col_data = new_col->template mutable_data(); auto* new_row_data = new_row->template mutable_data(); for (int i = 0; i < nnz; ++i) { - TIndex offset = old_row_data[i] * old_stride_ + old_col_data[i]; + int64_t offset = old_row_data[i] * old_stride_ + old_col_data[i]; new_row_data[i] = offset / new_stride_; new_col_data[i] = offset % new_stride_; } @@ -122,8 +122,8 @@ class SparseMatrixReshapeOp : public Operator { } private: - TIndex old_stride_; - TIndex new_stride_; + int64_t old_stride_; + int64_t new_stride_; }; } // namespace caffe2 diff --git a/caffe2/experiments/operators/tt_contraction_op.h b/caffe2/experiments/operators/tt_contraction_op.h index 11ef35bd235a..7f42d1f68d0b 100644 --- 
a/caffe2/experiments/operators/tt_contraction_op.h +++ b/caffe2/experiments/operators/tt_contraction_op.h @@ -29,9 +29,9 @@ class TTContractionOp final : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; TTContractionOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - K_(OperatorBase::GetSingleArgument("K", 0)), - M_(OperatorBase::GetSingleArgument("M", 0)), - N_(OperatorBase::GetSingleArgument("N", 0)) { + K_(OperatorBase::GetSingleArgument("K", 0)), + M_(OperatorBase::GetSingleArgument("M", 0)), + N_(OperatorBase::GetSingleArgument("N", 0)) { CAFFE_ENFORCE(OperatorBase::HasArgument("K"), "Argument `K` is missing."); CAFFE_ENFORCE(OperatorBase::HasArgument("M"), "Argument `M` is missing."); CAFFE_ENFORCE(OperatorBase::HasArgument("N"), "Argument `N` is missing."); @@ -44,8 +44,8 @@ class TTContractionOp final : public Operator { CAFFE_ENFORCE(A.ndim() == 2, A.ndim()); - TIndex A_size = A.size_from_dim(0); - TIndex B_size = B.size_from_dim(0); + int64_t A_size = A.size_from_dim(0); + int64_t B_size = B.size_from_dim(0); CAFFE_ENFORCE( K_ * M_ == A_size, @@ -55,19 +55,19 @@ class TTContractionOp final : public Operator { B_size % (K_ * N_) == 0, "Argument `K` and `N` do not agree with the size of B."); - TIndex D_ = B_size / (K_ * N_); + int64_t D_ = B_size / (K_ * N_); - TIndex C_size = D_ * M_ * N_; - C->Resize(vector{C_size}); + int64_t C_size = D_ * M_ * N_; + C->Resize(vector{C_size}); - TIndex B_stride = K_ * N_; - TIndex C_stride = M_ * N_; + int64_t B_stride = K_ * N_; + int64_t C_stride = M_ * N_; const T* A_data = A.template data(); const T* B_data = B.template data(); T* C_data = C->template mutable_data(); - for (TIndex B_index = 0; B_index < B_size; B_index += B_stride) { + for (int64_t B_index = 0; B_index < B_size; B_index += B_stride) { math::Gemm( CblasTrans, CblasNoTrans, @@ -84,9 +84,9 @@ class TTContractionOp final : public Operator { } protected: - TIndex K_; - TIndex M_; - TIndex N_; + int64_t K_; + int64_t M_; + int64_t N_; }; template @@ -95,9 +95,9 @@ class TTContractionGradientOp final : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; TTContractionGradientOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - K_(OperatorBase::GetSingleArgument("K", 0)), - M_(OperatorBase::GetSingleArgument("M", 0)), - N_(OperatorBase::GetSingleArgument("N", 0)) {} + K_(OperatorBase::GetSingleArgument("K", 0)), + M_(OperatorBase::GetSingleArgument("M", 0)), + N_(OperatorBase::GetSingleArgument("N", 0)) {} bool RunOnDevice() override { const auto& G = Input(0); @@ -106,16 +106,16 @@ class TTContractionGradientOp final : public Operator { auto* dA = Output(0); auto* dB = Output(1); - TIndex G_size = G.size_from_dim(0); - TIndex D_ = G_size / (M_ * N_); + int64_t G_size = G.size_from_dim(0); + int64_t D_ = G_size / (M_ * N_); - TIndex dB_size = D_ * K_ * N_; + int64_t dB_size = D_ * K_ * N_; dA->Resize(A.dims()); dB->Resize(B.dims()); - TIndex B_stride = K_ * N_; - TIndex G_stride = M_ * N_; + int64_t B_stride = K_ * N_; + int64_t G_stride = M_ * N_; const T* G_data = G.template data(); const T* A_data = A.template data(); @@ -125,7 +125,7 @@ class TTContractionGradientOp final : public Operator { T* dB_data = dB->template mutable_data(); const T* G_ptr = G_data; - for (TIndex B_index = 0; B_index < dB_size; B_index += B_stride) { + for (int64_t B_index = 0; B_index < dB_size; B_index += B_stride) { math::Gemm( CblasNoTrans, CblasTrans, @@ -139,7 +139,7 @@ class TTContractionGradientOp final : public Operator { } G_ptr = 
G_data; - for (TIndex B_index = 0; B_index < dB_size; B_index += B_stride) { + for (int64_t B_index = 0; B_index < dB_size; B_index += B_stride) { math::Gemm( CblasNoTrans, CblasNoTrans, @@ -156,9 +156,9 @@ class TTContractionGradientOp final : public Operator { } protected: - TIndex K_; - TIndex M_; - TIndex N_; + int64_t K_; + int64_t M_; + int64_t N_; }; } // namespace caffe2 diff --git a/caffe2/experiments/operators/tt_pad_op.h b/caffe2/experiments/operators/tt_pad_op.h index 83d197782790..c78df9c5f29f 100644 --- a/caffe2/experiments/operators/tt_pad_op.h +++ b/caffe2/experiments/operators/tt_pad_op.h @@ -29,7 +29,7 @@ class TTPadOp final : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; TTPadOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - scale_(OperatorBase::GetSingleArgument("scale", 0)) { + scale_(OperatorBase::GetSingleArgument("scale", 0)) { CAFFE_ENFORCE( OperatorBase::HasArgument("scale"), "Argument `scale` is missing."); } @@ -46,16 +46,16 @@ class TTPadOp final : public Operator { auto* X_orig_dim0 = Output(1); X_orig_dim0->Resize(1); - *X_orig_dim0->template mutable_data() = X_dim0; + *X_orig_dim0->template mutable_data() = X_dim0; if (X_dim0 % scale_ != 0) { - TIndex padded_dim0 = (X_dim0 / scale_ + 1) * scale_; + int64_t padded_dim0 = (X_dim0 / scale_ + 1) * scale_; auto dim0_diff = padded_dim0 - X_dim0; // set growthPct to the upper bound percentage: (100 * scale_ / X_dim0) X_pad->Extend(dim0_diff, 100 * scale_ / X_dim0, &context_); auto* X_pad_data = X_pad->template mutable_data(); - TIndex X_size = X_dim0 * X_dim1; + int64_t X_size = X_dim0 * X_dim1; memset(X_pad_data + X_size, 0, dim0_diff * X_dim1 * sizeof(T)); } @@ -63,7 +63,7 @@ class TTPadOp final : public Operator { } protected: - TIndex scale_; + int64_t scale_; }; template @@ -78,7 +78,7 @@ class TTPadGradientOp final : public Operator { auto* output = Output(0); CAFFE_ENFORCE(&G == output); - auto old_dim0 = *Input(1).template data(); + auto old_dim0 = *Input(1).template data(); auto new_dim0 = G.dim(0); auto dim1 = G.dim(1); diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc index 311c6446184a..8d011cd3be8b 100644 --- a/caffe2/ideep/operators/concat_split_op.cc +++ b/caffe2/ideep/operators/concat_split_op.cc @@ -43,7 +43,7 @@ class IDEEPConcatOp final : public IDEEPOperator { } auto axis_vdata = ideep::concat::compute(inputs, axis_, add_axis_, *output); - axis_info->Resize(vector(1, InputSize())); + axis_info->Resize(vector(1, InputSize())); int* axis_data = axis_info->template mutable_data(); for (int i = 0; i < axis_vdata.size(); i++) { axis_data[i] = axis_vdata[i]; diff --git a/caffe2/ideep/operators/conv_pool_base_op.h b/caffe2/ideep/operators/conv_pool_base_op.h index 03da0792acb3..5f026efde9a3 100644 --- a/caffe2/ideep/operators/conv_pool_base_op.h +++ b/caffe2/ideep/operators/conv_pool_base_op.h @@ -39,7 +39,7 @@ class IDEEPConvPoolOpBase : public ConvPoolOpBase { ideep::tensor::dims output_dims; auto input_dims = input.get_dims(); - vector input_Tdims (input_dims.begin(), input_dims.end()); + vector input_Tdims (input_dims.begin(), input_dims.end()); InferOutputSize( input_Tdims, output_channel, diff --git a/caffe2/ideep/operators/squeeze_op.cc b/caffe2/ideep/operators/squeeze_op.cc index fe78e30a7d33..4cf73fe9c70d 100644 --- a/caffe2/ideep/operators/squeeze_op.cc +++ b/caffe2/ideep/operators/squeeze_op.cc @@ -35,7 +35,7 @@ class IDEEPSqueezeOp final : public IDEEPOperator { (dims_.back() + 1), " dimensions."); const 
auto& ideep_dims = X.get_dims(); - vector dims(ideep_dims.begin(), ideep_dims.end()); + vector dims(ideep_dims.begin(), ideep_dims.end()); const auto& new_dims = SqueezeOp::ComputeDims(dims, dims_); itensor::dims new_dims_ideep(new_dims.begin(), new_dims.end()); if (&X != Y) { diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h index 8e9a81190017..2ce313758589 100644 --- a/caffe2/image/image_input_op.h +++ b/caffe2/image/image_input_op.h @@ -372,14 +372,14 @@ ImageInputOp::ImageInputOp( randgen_per_thread_.emplace_back(meta_randgen()); } prefetched_image_.Resize( - TIndex(batch_size_), - TIndex(crop_), - TIndex(crop_), - TIndex(color_ ? 3 : 1)); + int64_t(batch_size_), + int64_t(crop_), + int64_t(crop_), + int64_t(color_ ? 3 : 1)); if (label_type_ != SINGLE_LABEL && label_type_ != SINGLE_LABEL_WEIGHTED) { - prefetched_label_.Resize(TIndex(batch_size_), TIndex(num_labels_)); + prefetched_label_.Resize(int64_t(batch_size_), int64_t(num_labels_)); } else { - prefetched_label_.Resize(vector(1, batch_size_)); + prefetched_label_.Resize(vector(1, batch_size_)); } for (int i = 0; i < additional_output_sizes.size(); ++i) { @@ -387,7 +387,7 @@ ImageInputOp::ImageInputOp( Context::GetDeviceType()); prefetched_additional_outputs_.emplace_back(CPU); prefetched_additional_outputs_[i].Resize( - TIndex(batch_size_), TIndex(additional_output_sizes[i])); + int64_t(batch_size_), int64_t(additional_output_sizes[i])); } } diff --git a/caffe2/mkl/mkl_utils_test.cc b/caffe2/mkl/mkl_utils_test.cc index a16224bbe270..622bbca6613c 100644 --- a/caffe2/mkl/mkl_utils_test.cc +++ b/caffe2/mkl/mkl_utils_test.cc @@ -23,10 +23,10 @@ TEST(MKLDNNTest, SimpleConvolutionTest) { int pads[2] = {0, 0}; // Creating Input and output tensors - Tensor X(vector{16, 8, 32, 32}, CPU); - Tensor W(vector{64, 8, 3, 3}, CPU); - Tensor b(vector{64}, CPU); - Tensor Y(vector{16, 64, 30, 30}, CPU); + Tensor X(vector{16, 8, 32, 32}, CPU); + Tensor W(vector{64, 8, 3, 3}, CPU); + Tensor b(vector{64}, CPU); + Tensor Y(vector{16, 64, 30, 30}, CPU); float* data = X.mutable_data(); for (int i = 0; i < X.size(); ++i) { @@ -91,7 +91,7 @@ TEST(MKLDNNTest, MKLMemoryCopyTest) { // the buffer size being empty for both - former in dnnAllocateBuffer and // the latter in dnnConversionExecute (likely due to some difference in // layout?). Test both cases. 
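// --- Editorial sketch, not part of the patch --------------------------------
// Why every "-"/"+" pair in this patch is behavior-preserving: caffe2::TIndex
// was only an alias for int64_t (declared in caffe2/core/tensor.h, per the
// "// for TIndex" include comments later in this patch), so the rename changes
// spelling, not arithmetic or ABI. The alias below is a stand-in to make that
// assumption checkable; the namespace name is illustrative only.
#include <cstdint>
#include <type_traits>
namespace caffe2_sketch {
using TIndex = int64_t;  // assumption: mirrors the historical caffe2 typedef
static_assert(std::is_same<TIndex, int64_t>::value,
              "TIndex and int64_t are interchangeable, so the rename is safe");
}  // namespace caffe2_sketch
// -----------------------------------------------------------------------------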
- vector> dims_list{{10, 3, 20, 20}, {0}, {0, 10}}; + vector> dims_list{{10, 3, 20, 20}, {0}, {0, 10}}; for (const auto& dims : dims_list) { auto X_cpu_in = caffe2::make_unique(dims, CPU); CPUContext ctx; diff --git a/caffe2/mkl/mklmemory_serialization.cc b/caffe2/mkl/mklmemory_serialization.cc index e59a9a15f422..a613623aa2f9 100644 --- a/caffe2/mkl/mklmemory_serialization.cc +++ b/caffe2/mkl/mklmemory_serialization.cc @@ -84,8 +84,8 @@ class MKLMemoryDeserializer : public BlobDeserializerBase { "MKLMemory only supports either float or double formats."); CAFFE_ENFORCE( !proto.has_segment(), "MKLMemory does not support segment right now."); - vector dims; - for (const TIndex d : proto.dims()) { + vector dims; + for (const int64_t d : proto.dims()) { dims.push_back(d); } // TODO: right now, every time we do a deserializer we create a new MKL diff --git a/caffe2/mkl/operators/concat_op.cc b/caffe2/mkl/operators/concat_op.cc index 204f1c1cda93..b8a1c18950a3 100644 --- a/caffe2/mkl/operators/concat_op.cc +++ b/caffe2/mkl/operators/concat_op.cc @@ -96,7 +96,7 @@ class MKLConcatOp final : public MKLOperator { private: int axis_; - vector cached_output_dims_; + vector cached_output_dims_; }; } // namespace mkl diff --git a/caffe2/mkl/operators/conv_op.cc b/caffe2/mkl/operators/conv_op.cc index 2678f4c37e17..87c8522f1a5a 100644 --- a/caffe2/mkl/operators/conv_op.cc +++ b/caffe2/mkl/operators/conv_op.cc @@ -37,7 +37,7 @@ class MKLConvOp final : public ConvPoolOpBase { math::Set( M, 0.0, cpu_zero_bias.template mutable_data(), &ctx); - zero_bias_.reset(new MKLMemory(std::vector{M})); + zero_bias_.reset(new MKLMemory(std::vector{M})); zero_bias_->CopyFrom(cpu_zero_bias); } const auto& bias = InputSize() == 2 @@ -130,11 +130,11 @@ class MKLConvOp final : public ConvPoolOpBase { if (group_ > 1) { // Explicitly reformat the buffer. 
MKLMemory group_filter( - std::vector{TIndex(group_), - TIndex(filter.dim32(0) / group_), - TIndex(filter.dim32(1)), - TIndex(filter.dim32(2)), - TIndex(filter.dim32(3))}, + std::vector{int64_t(group_), + int64_t(filter.dim32(0) / group_), + int64_t(filter.dim32(1)), + int64_t(filter.dim32(2)), + int64_t(filter.dim32(3))}, nullptr, dnnResourceFilter, /*share_memory_if_possible=*/true); @@ -168,8 +168,8 @@ class MKLConvOp final : public ConvPoolOpBase { // Input: X, W, b // Output: Y std::unique_ptr> zero_bias_; - vector cached_input_dims_; - vector cached_filter_dims_; + vector cached_input_dims_; + vector cached_filter_dims_; PrimitiveWrapper primitive_; LayoutWrapper input_layout_; LayoutWrapper filter_layout_; diff --git a/caffe2/mkl/operators/conv_op_mkldnn.cc b/caffe2/mkl/operators/conv_op_mkldnn.cc index 80edf1332d06..cb0fe8eae06d 100644 --- a/caffe2/mkl/operators/conv_op_mkldnn.cc +++ b/caffe2/mkl/operators/conv_op_mkldnn.cc @@ -106,8 +106,8 @@ class ConvMKLDNNOp final : public ConvPoolOpBase { private: // Input: X, W, b // Output: Y - vector cached_input_dims_; - vector cached_filter_dims_; + vector cached_input_dims_; + vector cached_filter_dims_; PrimitiveWrapper primitive_; unique_ptr> X_wrapper_ = nullptr; unique_ptr> filter_wrapper_ = nullptr; diff --git a/caffe2/mkl/operators/elementwise_sum_op.cc b/caffe2/mkl/operators/elementwise_sum_op.cc index 7827e874716e..bcd095df9716 100644 --- a/caffe2/mkl/operators/elementwise_sum_op.cc +++ b/caffe2/mkl/operators/elementwise_sum_op.cc @@ -64,7 +64,7 @@ class MKLSumOp final : public MKLOperator { private: std::vector coefficients_; - vector cached_input_dims_; + vector cached_input_dims_; vector> input_views_; }; diff --git a/caffe2/mkl/operators/fully_connected_op.cc b/caffe2/mkl/operators/fully_connected_op.cc index 404a67a6b7ac..5d21823f8646 100644 --- a/caffe2/mkl/operators/fully_connected_op.cc +++ b/caffe2/mkl/operators/fully_connected_op.cc @@ -90,8 +90,8 @@ class MKLFullyConnectedOp final : public MKLOperator { // Input: X, W, b // Output: Y size_t axis_{1}; - vector cached_input_dims_; - vector cached_filter_dims_; + vector cached_input_dims_; + vector cached_filter_dims_; PrimitiveWrapper primitive_; LayoutWrapper input_layout_; LayoutWrapper filter_layout_; diff --git a/caffe2/mkl/operators/local_response_normalization_op.cc b/caffe2/mkl/operators/local_response_normalization_op.cc index f57b4b48b7e7..a57398933f62 100644 --- a/caffe2/mkl/operators/local_response_normalization_op.cc +++ b/caffe2/mkl/operators/local_response_normalization_op.cc @@ -19,7 +19,7 @@ class MKLLRNOp final : public LRNOpBase { bool RunOnDeviceWithOrderNHWC() override; private: - vector cached_input_dims_; + vector cached_input_dims_; LayoutWrapper workspace_layout_; std::unique_ptr> workspace_buffer_; PrimitiveWrapper primitive_; diff --git a/caffe2/mkl/operators/packed_fc_op.cc b/caffe2/mkl/operators/packed_fc_op.cc index 0ed93cf06107..5e7b0931c390 100644 --- a/caffe2/mkl/operators/packed_fc_op.cc +++ b/caffe2/mkl/operators/packed_fc_op.cc @@ -141,7 +141,7 @@ class PackedFCOp final : public Operator { } size_t axis_{1}; uint32_t hash_{0}; - vector Y_shape_cache_; + vector Y_shape_cache_; Tensor bias_multiplier_{CPU}; std::unique_ptr local_packed_matrix_; }; diff --git a/caffe2/mkl/operators/pool_op.cc b/caffe2/mkl/operators/pool_op.cc index 284e7f80b8c3..281c9db22a2c 100644 --- a/caffe2/mkl/operators/pool_op.cc +++ b/caffe2/mkl/operators/pool_op.cc @@ -41,8 +41,8 @@ class MKLPoolOp final : public ConvPoolOpBase { // Input: X // Output: Y private: - 
vector cached_input_dims_; - // vector cached_avgpool_input_dims_; + vector cached_input_dims_; + // vector cached_avgpool_input_dims_; LayoutWrapper workspace_layout_; std::unique_ptr> workspace_buffer_; PrimitiveWrapper primitive_; diff --git a/caffe2/mkl/operators/relu_op.cc b/caffe2/mkl/operators/relu_op.cc index ef734eda72f8..98443d42f2e2 100644 --- a/caffe2/mkl/operators/relu_op.cc +++ b/caffe2/mkl/operators/relu_op.cc @@ -43,7 +43,7 @@ class MKLReluOp : public MKLOperator { } private: - vector cached_input_dims_; + vector cached_input_dims_; }; template diff --git a/caffe2/mkl/operators/spatial_batch_norm_op.cc b/caffe2/mkl/operators/spatial_batch_norm_op.cc index 7d9856f1adfd..13f83dcff7e4 100644 --- a/caffe2/mkl/operators/spatial_batch_norm_op.cc +++ b/caffe2/mkl/operators/spatial_batch_norm_op.cc @@ -146,7 +146,7 @@ class MKLBNOp final : public Operator { const StorageOrder order_; const int num_batches_; - vector cached_input_dims_; + vector cached_input_dims_; LayoutWrapper scale_bias_layout_; LayoutWrapper saved_mean_layout_; LayoutWrapper saved_var_layout_; diff --git a/caffe2/mkl/operators/squeeze_op.cc b/caffe2/mkl/operators/squeeze_op.cc index fb71be56f774..c89258773520 100644 --- a/caffe2/mkl/operators/squeeze_op.cc +++ b/caffe2/mkl/operators/squeeze_op.cc @@ -57,7 +57,7 @@ class MKLSqueezeOp final : public MKLOperator { private: vector dims_; - vector cached_input_dims_; + vector cached_input_dims_; }; } // namespace mkl diff --git a/caffe2/mkl/utils/mkl_memory.cc b/caffe2/mkl/utils/mkl_memory.cc index 26e423220e3f..3f05f9c5d24b 100644 --- a/caffe2/mkl/utils/mkl_memory.cc +++ b/caffe2/mkl/utils/mkl_memory.cc @@ -19,7 +19,7 @@ CAFFE_KNOWN_TYPE(mkl::MKLMemory); CAFFE_KNOWN_TYPE(mkl::MKLMemory); template -static vector GetMKLTensorInfo( +static vector GetMKLTensorInfo( const void* c, size_t* capacity, DeviceOption* device) { diff --git a/caffe2/mkl/utils/mkl_memory.h b/caffe2/mkl/utils/mkl_memory.h index 9d9e91a565eb..bd0ad4042207 100644 --- a/caffe2/mkl/utils/mkl_memory.h +++ b/caffe2/mkl/utils/mkl_memory.h @@ -5,8 +5,8 @@ #include #include -#include "caffe2/core/flags.h" // for TIndex -#include "caffe2/core/tensor.h" // for TIndex +#include "caffe2/core/flags.h" // for int64_t +#include "caffe2/core/tensor.h" // for int64_t #include "caffe2/mkl/utils/mkl_dnn_cppwrapper.h" // A global boolean variable that controls the behavior when we call View() on @@ -270,7 +270,7 @@ class MKLMemory { "Reshape is not allowed for custom layouts. " "Convert to plain layout before invoking Reshape()."); - TIndex new_size = 1; + int64_t new_size = 1; for (auto i = 0; i < dims.size(); ++i) { CAFFE_ENFORCE_GE_WITH_CALLER(dims[i], 0); new_size *= dims[i]; @@ -279,7 +279,7 @@ class MKLMemory { new_size == size_, "New size and old size are not equal. Reshape is not possible."); - vector new_dims(dims.size()); + vector new_dims(dims.size()); vector size(dims.size()); vector strides(dims.size()); for (int i = 0; i < dims.size(); ++i) { @@ -456,7 +456,7 @@ class MKLMemory { return buffer_.get(); } - inline const vector& dims() const { + inline const vector& dims() const { return dims_; } @@ -470,7 +470,7 @@ class MKLMemory { /** * Returns the size (i.e., the number of items) in the buffer. */ - inline TIndex size() const { + inline int64_t size() const { return size_; } @@ -479,7 +479,7 @@ class MKLMemory { * must be between 0 (inclusive) and the number of dimensions, otherwise * this function will produce a fatal message. 
*/ - inline TIndex dim(const int i) const { + inline int64_t dim(const int i) const { return dims_.at(i); } @@ -545,9 +545,9 @@ class MKLMemory { mutable std::mutex buffer_lock_; // The dimensions in the same order as Caffe2 does. This is used to // interface with C2. - vector dims_; + vector dims_; // Number of items in the buffer. - TIndex size_ = -1; + int64_t size_ = -1; // The user dnn layout. LayoutWrapper user_layout_; // The internal dnn layout. diff --git a/caffe2/mkl/utils/mkl_operator.h b/caffe2/mkl/utils/mkl_operator.h index 1b91788c89c0..2236e9267af5 100644 --- a/caffe2/mkl/utils/mkl_operator.h +++ b/caffe2/mkl/utils/mkl_operator.h @@ -97,7 +97,7 @@ class MKLOperator : public OperatorBase { // The primitive used in the operator. PrimitiveWrapper primitive_; // Size cache for all the input sizes. - vector> input_size_cache_; + vector> input_size_cache_; // An internal MKLMemory buffer. This is usually handy when we have a // single output from the operator. If your operator has multiple outputs // then you should allocate your own buffer. diff --git a/caffe2/mobile/contrib/arm-compute/core/context.h b/caffe2/mobile/contrib/arm-compute/core/context.h index 4085e4983cc8..5ec668cb6d49 100644 --- a/caffe2/mobile/contrib/arm-compute/core/context.h +++ b/caffe2/mobile/contrib/arm-compute/core/context.h @@ -249,7 +249,7 @@ public: const int32_t ndim() const { return dims_.size(); } - vector dims() const { return dims_; } + vector dims() const { return dims_; } const int32_t dim32(const int index) const { return dims_.at(index); } @@ -283,7 +283,7 @@ private: bool SetDims(const vector &src) { auto old_size = size_; dims_.resize(src.size()); - TIndex new_size = 1; + int64_t new_size = 1; for (unsigned int i = 0; i < src.size(); ++i) { new_size *= src[i]; dims_[i] = src[i]; @@ -299,7 +299,7 @@ private: return size_ > old_size; } - bool SetDims(const TIndex d0) { + bool SetDims(const int64_t d0) { auto old_size = size_; dims_.resize(1); dims_[0] = d0; @@ -307,7 +307,7 @@ private: return size_ > old_size; } - bool SetDims(const TIndex d0, const TIndex d1) { + bool SetDims(const int64_t d0, const int64_t d1) { auto old_size = size_; dims_.resize(2); dims_[0] = d0; @@ -316,7 +316,7 @@ private: return size_ > old_size; } - bool SetDims(const TIndex d0, const TIndex d1, const TIndex d2) { + bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2) { auto old_size = size_; dims_.resize(3); dims_[0] = d0; @@ -326,8 +326,8 @@ private: return size_ > old_size; } - bool SetDims(const TIndex d0, const TIndex d1, const TIndex d2, - const TIndex d3) { + bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2, + const int64_t d3) { auto old_size = size_; dims_.resize(4); dims_[0] = d0; @@ -338,8 +338,8 @@ private: return size_ > old_size; } - vector dims_; - TIndex size_ = -1; + vector dims_; + int64_t size_ = -1; arm_compute::TensorShape shape_; unique_ptr tensor_; }; diff --git a/caffe2/mobile/contrib/arm-compute/operators/fully_connected_op.cc b/caffe2/mobile/contrib/arm-compute/operators/fully_connected_op.cc index ac36118054cc..448d84315cc9 100644 --- a/caffe2/mobile/contrib/arm-compute/operators/fully_connected_op.cc +++ b/caffe2/mobile/contrib/arm-compute/operators/fully_connected_op.cc @@ -40,7 +40,7 @@ bool GLFullyConnectedOp::RunOnDevice() { CAFFE_ENFORCE_EQ(1, B_->ndim()); CAFFE_ENFORCE_EQ(N, B_->dim32(0)); - vector output_dims = {M, N}; + vector output_dims = {M, N}; GLTensor *Y = OperatorBase::Outputs()[0]->template GetMutable>(); if (first_run_) { diff --git 
a/caffe2/mobile/contrib/arm-compute/operators/pool_op.cc b/caffe2/mobile/contrib/arm-compute/operators/pool_op.cc index 19aede792d01..cc9af9c7359e 100644 --- a/caffe2/mobile/contrib/arm-compute/operators/pool_op.cc +++ b/caffe2/mobile/contrib/arm-compute/operators/pool_op.cc @@ -53,7 +53,7 @@ bool GLAveragePoolOp::RunOnDeviceWithOrderNCHW() { int height = X_->dim32(2); int width = X_->dim32(3); - vector output_dims = {N, channels, 1, 1}; + vector output_dims = {N, channels, 1, 1}; if (!global_pooling_) { output_dims[2] = (height + pad_t() + pad_b() - kernel_h()) / stride_h() + 1; output_dims[3] = (width + pad_l() + pad_r() - kernel_w()) / stride_w() + 1; @@ -116,7 +116,7 @@ template <> bool GLMaxPoolOp::RunOnDeviceWithOrderNCHW() { int height = X_->dim32(2); int width = X_->dim32(3); - vector output_dims = {N, channels, 1, 1}; + vector output_dims = {N, channels, 1, 1}; if (!global_pooling_) { output_dims[2] = (height + pad_t() + pad_b() - kernel_h()) / stride_h() + 1; output_dims[3] = (width + pad_l() + pad_r() - kernel_w()) / stride_w() + 1; diff --git a/caffe2/mobile/contrib/arm-compute/operators/resize_op.cc b/caffe2/mobile/contrib/arm-compute/operators/resize_op.cc index ed9f672ce52b..e15663b674c4 100644 --- a/caffe2/mobile/contrib/arm-compute/operators/resize_op.cc +++ b/caffe2/mobile/contrib/arm-compute/operators/resize_op.cc @@ -45,7 +45,7 @@ bool GLResizeNearestOp::RunOnDevice() { GLTensor *Y = OperatorBase::Outputs()[0]->template GetMutable>(); - vector output_dims = {N, C, H * height_scale_, W * width_scale_}; + vector output_dims = {N, C, H * height_scale_, W * width_scale_}; if (first_run_) { Y->Resize(output_dims); diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 2238d7af08dd..52f746f63f31 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -329,7 +329,7 @@ class CopyToMPSCNNOp final : public Operator { for (auto i = 0; i < Inputs().size(); ++i) { const auto& X = Input(i); CAFFE_ENFORCE(X.ndim() > 0 && X.ndim() <= 4); - std::vector XDims = {1, 1, 1, 1}; + std::vector XDims = {1, 1, 1, 1}; XDims.assign(X.dims().begin(), X.dims().end()); caffe2::Timer t; @@ -2259,15 +2259,15 @@ class MPSCNNGenerateProposalsCPPOp final : public Operator { // bbox_deltas: (num_images, A * 4, H, W) CAFFE_ENFORCE_EQ( - bbox_deltas.dims(), (vector{num_images, 4 * A, height, width})); + bbox_deltas.dims(), (vector{num_images, 4 * A, height, width})); // im_info_tensor: (num_images, 3), format [height, width, scale; ...] 
- CAFFE_ENFORCE_EQ(im_info_tensor.dims(), (vector{num_images, 3})); + CAFFE_ENFORCE_EQ(im_info_tensor.dims(), (vector{num_images, 3})); CAFFE_ENFORCE( im_info_tensor.template IsType(), im_info_tensor.meta().name()); // anchors: (A, 4) - CAFFE_ENFORCE_EQ(anchors.dims(), (vector{A, 4})); + CAFFE_ENFORCE_EQ(anchors.dims(), (vector{A, 4})); CAFFE_ENFORCE(anchors.template IsType(), anchors.meta().name()); // Broadcast the anchors to all pixels auto all_anchors_vec = diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm index bcf588d8a384..7216b16611aa 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm @@ -640,7 +640,7 @@ void testMPSCNN() { CAFFE_ENFORCE_EQ(t1.ndim(), 2); CAFFE_ENFORCE(t2.dim32(2) == 1 && t2.dim32(3) == 1); const_cast(t2).Reshape( - std::vector{TIndex(batchSize), TIndex(COut)}); + std::vector{int64_t(batchSize), int64_t(COut)}); // Note dims do not match, as Metal leaves a 1x1 spatial // dimension. CAFFE_ENFORCE_EQ(t1.dims(), t2.dims()); diff --git a/caffe2/mobile/contrib/ios/pool_test.cc b/caffe2/mobile/contrib/ios/pool_test.cc index c4f6ff4d6a3a..47fd405eef01 100644 --- a/caffe2/mobile/contrib/ios/pool_test.cc +++ b/caffe2/mobile/contrib/ios/pool_test.cc @@ -12,7 +12,7 @@ namespace caffe2 { namespace { -void AddNoiseInput(const vector& shape, const string& name, Workspace* ws) { +void AddNoiseInput(const vector& shape, const string& name, Workspace* ws) { DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); @@ -58,7 +58,7 @@ void compareMaxPooling(int N, def1.add_arg()->CopyFrom(MakeArgument("pad_b", padB)); def1.add_arg()->CopyFrom(MakeArgument("pad_r", padR)); - AddNoiseInput(vector{N, C, H, W}, "X", &ws); + AddNoiseInput(vector{N, C, H, W}, "X", &ws); unique_ptr op1(CreateOperator(def1, &ws)); EXPECT_NE(nullptr, op1.get()); diff --git a/caffe2/mobile/contrib/ios/resize_test.cc b/caffe2/mobile/contrib/ios/resize_test.cc index 90e672397b82..1c08df0f32a1 100644 --- a/caffe2/mobile/contrib/ios/resize_test.cc +++ b/caffe2/mobile/contrib/ios/resize_test.cc @@ -12,7 +12,7 @@ namespace caffe2 { namespace { -void AddNoiseInput(const vector& shape, const string& name, Workspace* ws) { +void AddNoiseInput(const vector& shape, const string& name, Workspace* ws) { DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); @@ -44,7 +44,7 @@ void compareResizeNeareast(int N, def1.add_arg()->CopyFrom(MakeArgument("width_scale", wscale)); def1.add_arg()->CopyFrom(MakeArgument("height_scale", hscale)); - AddNoiseInput(vector{N, C, H, W}, "X", &ws); + AddNoiseInput(vector{N, C, H, W}, "X", &ws); unique_ptr op1(CreateOperator(def1, &ws)); EXPECT_NE(nullptr, op1.get()); diff --git a/caffe2/mobile/contrib/opengl/test/TestGLConvolution.cc b/caffe2/mobile/contrib/opengl/test/TestGLConvolution.cc index 13c5a72dcd8d..cb175c5b0537 100644 --- a/caffe2/mobile/contrib/opengl/test/TestGLConvolution.cc +++ b/caffe2/mobile/contrib/opengl/test/TestGLConvolution.cc @@ -12,7 +12,7 @@ #include -void AddNoiseInput(const std::vector& shape, +void AddNoiseInput(const std::vector& shape, const std::string& name, caffe2::Workspace* ws) { caffe2::CPUContext context; @@ -60,13 +60,13 @@ double BenchOp(const std::string& typ, def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_r", 0)); def1.add_arg()->CopyFrom(caffe2::MakeArgument("convolution_transform_strategy", std::string("PRECOMPUTE"))); - AddNoiseInput(std::vector{1, inputC, inH, inW}, 
"X", ws); + AddNoiseInput(std::vector{1, inputC, inH, inW}, "X", ws); if (transposed) { - AddNoiseInput(std::vector{inputC, outputC, kH, kW}, "W", ws); + AddNoiseInput(std::vector{inputC, outputC, kH, kW}, "W", ws); } else { - AddNoiseInput(std::vector{outputC, inputC, kH, kW}, "W", ws); + AddNoiseInput(std::vector{outputC, inputC, kH, kW}, "W", ws); } - AddNoiseInput(std::vector{outputC}, "B", ws); + AddNoiseInput(std::vector{outputC}, "B", ws); std::unique_ptr op1(CreateOperator(def1, ws)); @@ -131,19 +131,19 @@ static double BenchGLConvolution(int input_channels, } AddNoiseInput( - std::vector{1, input_channels, input_height, input_width}, "X_cpu", ws); + std::vector{1, input_channels, input_height, input_width}, "X_cpu", ws); if (transposed) { AddNoiseInput( - std::vector{input_channels, output_channels, kernel_height, kernel_width}, + std::vector{input_channels, output_channels, kernel_height, kernel_width}, "W", ws); } else { AddNoiseInput( - std::vector{output_channels, input_channels, kernel_height, kernel_width}, + std::vector{output_channels, input_channels, kernel_height, kernel_width}, "W", ws); } - AddNoiseInput(std::vector{output_channels}, "b", ws); + AddNoiseInput(std::vector{output_channels}, "b", ws); caffe2::NetDef netdef; { diff --git a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc index 1bbe303ef777..deced7196449 100644 --- a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc +++ b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc @@ -36,7 +36,7 @@ namespace caffe2 { -void AddConstInput(const vector& shape, +void AddConstInput(const vector& shape, const float value, const string& name, Workspace* ws) { @@ -50,7 +50,7 @@ void AddConstInput(const vector& shape, &context); } -void AddNoiseInput(const vector& shape, +void AddNoiseInput(const vector& shape, const string& name, Workspace* ws) { DeviceOption option; @@ -72,7 +72,7 @@ float snpe_run(int iters, Workspace& ws) { const int W = 227; const int C = 3; - POPULATE_DATA("X_snpe", (caffe2::vector{H, W, C}), hwc); + POPULATE_DATA("X_snpe", (caffe2::vector{H, W, C}), hwc); OperatorDef def; def.set_name("snpe_test"); @@ -108,7 +108,7 @@ float caffe2_run(int iters, Workspace& ws) { ReadProtoFromBinaryFile("/data/local/tmp/squeeze_init_net.pb", &init_net); ReadProtoFromBinaryFile("/data/local/tmp/squeeze_predict_net.pb", &predict_net); ws.RunNetOnce(init_net); - POPULATE_DATA("data", (caffe2::vector{N, C, H, W}), chw); + POPULATE_DATA("data", (caffe2::vector{N, C, H, W}), chw); predict_net.set_name("SqueezeNet"); ws.CreateNet(predict_net); diff --git a/caffe2/mobile/contrib/ulp2/ulp_neon.cc b/caffe2/mobile/contrib/ulp2/ulp_neon.cc index 4c8e668775e5..7d84662d20c5 100644 --- a/caffe2/mobile/contrib/ulp2/ulp_neon.cc +++ b/caffe2/mobile/contrib/ulp2/ulp_neon.cc @@ -538,7 +538,7 @@ void run2b1bConvIm2ColGEMM(QConvState* state, CAFFE_ENFORCE_EQ(Y->dim32(0), divRoundUp(X.dim32(0) * OH * OW, kGEMMTileSize) * kGEMMTileSize); CAFFE_ENFORCE_EQ(Y->dim32(1), OC); Y->ShrinkTo(X.dim32(0) * OH * OW); - Y->Reshape(std::vector{{TIndex(X.dim(0)), TIndex(OH), TIndex(OW), TIndex(OC)}}); + Y->Reshape(std::vector{{int64_t(X.dim(0)), int64_t(OH), int64_t(OW), int64_t(OC)}}); } } diff --git a/caffe2/mobile/contrib/ulp2/ulp_test.cc b/caffe2/mobile/contrib/ulp2/ulp_test.cc index f6705e638dda..a1c1af0f6dfb 100644 --- a/caffe2/mobile/contrib/ulp2/ulp_test.cc +++ b/caffe2/mobile/contrib/ulp2/ulp_test.cc @@ -62,7 +62,7 @@ int randInt(int a, int b) { return std::uniform_int_distribution(a, b)(gen); } 
-TensorCPU genTensor11(std::vector shape) { +TensorCPU genTensor11(std::vector shape) { Tensor r(CPU); r.Resize(shape); @@ -76,7 +76,7 @@ TensorCPU genTensor11(std::vector shape) { return r; } -TensorCPU genTensorUniform11(std::vector shape) { +TensorCPU genTensorUniform11(std::vector shape) { Tensor r(CPU); r.Resize(shape); @@ -90,7 +90,7 @@ TensorCPU genTensorUniform11(std::vector shape) { return r; } -TensorCPU genTensor0123(std::vector shape) { +TensorCPU genTensor0123(std::vector shape) { Tensor r(CPU); r.Resize(shape); @@ -171,7 +171,7 @@ inline void qgemmNT(int M, int N, int K, const uint8_t* A, const uint8_t* B, flo } } -void gemmTest(TIndex M, TIndex N, TIndex K) { +void gemmTest(int64_t M, int64_t N, int64_t K) { auto X = genTensor11({M, K}); auto W = genTensor11({N, K}); Tensor XQ(CPU), WQ(CPU), YQ(CPU), Y(CPU); diff --git a/caffe2/mpi/mpi_ops.h b/caffe2/mpi/mpi_ops.h index 8657c107ed0f..911b51b96039 100644 --- a/caffe2/mpi/mpi_ops.h +++ b/caffe2/mpi/mpi_ops.h @@ -98,7 +98,7 @@ class MPIAllgatherOp final : public Operator { MPI_Comm comm = OperatorBase::Input(0).comm(); auto& input = Input(1); auto* output = Output(0); - vector output_dims = input.dims(); + vector output_dims = input.dims(); output_dims[0] *= OperatorBase::Input(0).size(); output->Resize(output_dims); MPI_CHECK(MPI_Allgather( diff --git a/caffe2/operators/accuracy_op.cc b/caffe2/operators/accuracy_op.cc index 8c1273eca209..2ee730aa9b44 100644 --- a/caffe2/operators/accuracy_op.cc +++ b/caffe2/operators/accuracy_op.cc @@ -12,7 +12,7 @@ bool AccuracyOp::RunOnDevice() { int D = X.dim32(1); CAFFE_ENFORCE_EQ(label.ndim(), 1); CAFFE_ENFORCE_EQ(label.dim32(0), N); - Y->Resize(vector()); + Y->Resize(vector()); const auto* Xdata = X.data(); const auto* labelData = label.data(); const int top_k = top_k_; diff --git a/caffe2/operators/accuracy_op.cu b/caffe2/operators/accuracy_op.cu index 5d27707662c7..b1f6f137831e 100644 --- a/caffe2/operators/accuracy_op.cu +++ b/caffe2/operators/accuracy_op.cu @@ -53,7 +53,7 @@ bool AccuracyOp::RunOnDevice() { int D = X.dim32(1); CAFFE_ENFORCE_EQ(label.ndim(), 1); CAFFE_ENFORCE_EQ(label.dim32(0), N); - Y->Resize(vector()); + Y->Resize(vector()); float* Ydata = Y->template mutable_data(); math::Set(1, 0, Ydata, &context_); AccuracyKernel<<< diff --git a/caffe2/operators/arg_ops.cc b/caffe2/operators/arg_ops.cc index aeedbd5f1437..c381509bc8ba 100644 --- a/caffe2/operators/arg_ops.cc +++ b/caffe2/operators/arg_ops.cc @@ -15,14 +15,14 @@ void ComputeArgImpl( const int n, const Compare& comp, const T* X, - TIndex* Y, + int64_t* Y, Context* context) { - math::Set(prev_size * next_size, TIndex(0), Y, context); + math::Set(prev_size * next_size, int64_t(0), Y, context); for (int i = 0; i < prev_size; ++i) { const T* cur_X = X + i * n * next_size + next_size; for (int k = 1; k < n; ++k) { for (int j = 0; j < next_size; ++j) { - TIndex* cur_Y = Y + i * next_size + j; + int64_t* cur_Y = Y + i * next_size + j; if (comp(*cur_X, X[i * n * next_size + *cur_Y * next_size + j])) { *cur_Y = k; } @@ -41,7 +41,7 @@ bool ArgMaxReducer::operator()( const int next_size, const int n, const T* X, - TIndex* Y, + int64_t* Y, CPUContext* context) const { ComputeArgImpl(prev_size, next_size, n, std::greater(), X, Y, context); return true; @@ -54,7 +54,7 @@ bool ArgMinReducer::operator()( const int next_size, const int n, const T* X, - TIndex* Y, + int64_t* Y, CPUContext* context) const { ComputeArgImpl(prev_size, next_size, n, std::less(), X, Y, context); return true; diff --git a/caffe2/operators/arg_ops.cu 
b/caffe2/operators/arg_ops.cu index 1735e4268e3b..fdc6331dccc1 100644 --- a/caffe2/operators/arg_ops.cu +++ b/caffe2/operators/arg_ops.cu @@ -28,7 +28,7 @@ __global__ void ComputeArgCUDAKernel( const Reducer reducer, const T init, const T* X, - TIndex* Y) { + int64_t* Y) { __shared__ typename BlockReduce::TempStorage temp_storage; const int d = stride.d(); for (int idx = blockIdx.x; idx < outer_size; idx += gridDim.x) { @@ -41,7 +41,7 @@ __global__ void ComputeArgCUDAKernel( } kv = BlockReduce(temp_storage).Reduce(kv, reducer); if (threadIdx.x == 0) { - Y[idx] = static_cast(kv.key); + Y[idx] = static_cast(kv.key); } __syncthreads(); } @@ -56,7 +56,7 @@ bool ArgMaxReducer::operator()( const int next_size, const int n, const T* X, - TIndex* Y, + int64_t* Y, CUDAContext* context) const { const int outer_size = prev_size * next_size; const FixedDivisor stride(next_size); @@ -82,7 +82,7 @@ bool ArgMinReducer::operator()( const int next_size, const int n, const T* X, - TIndex* Y, + int64_t* Y, CUDAContext* context) const { const int outer_size = prev_size * next_size; const FixedDivisor stride(next_size); diff --git a/caffe2/operators/arg_ops.h b/caffe2/operators/arg_ops.h index 98917a350dcf..f29c0d5aa50a 100644 --- a/caffe2/operators/arg_ops.h +++ b/caffe2/operators/arg_ops.h @@ -60,7 +60,7 @@ class ArgOp final : public Operator { next_size, n, X.template data(), - Y->template mutable_data(), + Y->template mutable_data(), &context_); } @@ -78,7 +78,7 @@ struct ArgMaxReducer { const int next_size, const int n, const T* X, - TIndex* Y, + int64_t* Y, Context* context) const; }; @@ -90,7 +90,7 @@ struct ArgMinReducer { const int next_size, const int n, const T* X, - TIndex* Y, + int64_t* Y, Context* context) const; }; diff --git a/caffe2/operators/assert_op.h b/caffe2/operators/assert_op.h index 3e74c5afc647..796f8af257bd 100644 --- a/caffe2/operators/assert_op.h +++ b/caffe2/operators/assert_op.h @@ -22,7 +22,7 @@ class AssertOp final : public Operator { cmp_tensor_.CopyFrom(Input(0)); auto* cmp_data = cmp_tensor_.template data(); - for (TIndex i = 0; i < cmp_tensor_.size(); ++i) { + for (int64_t i = 0; i < cmp_tensor_.size(); ++i) { CAFFE_ENFORCE((bool)cmp_data[i], [&]() { std::stringstream ss; ss << "Assert failed for element " << i diff --git a/caffe2/operators/atomic_ops.cc b/caffe2/operators/atomic_ops.cc index 73c4196b6e9b..2ce97b0d58c5 100644 --- a/caffe2/operators/atomic_ops.cc +++ b/caffe2/operators/atomic_ops.cc @@ -29,8 +29,8 @@ class AtomicFetchAddOp final : public Operator { auto& b = Input(2); auto* c = Output(0); auto* d = Output(1); - c->Resize(std::vector()); - d->Resize(std::vector()); + c->Resize(std::vector()); + d->Resize(std::vector()); auto* aPtr = a.data(); auto* bPtr = b.data(); auto* cPtr = c->template mutable_data(); diff --git a/caffe2/operators/batch_box_cox_op.cc b/caffe2/operators/batch_box_cox_op.cc index e35c726d185a..aad1daf91f3e 100644 --- a/caffe2/operators/batch_box_cox_op.cc +++ b/caffe2/operators/batch_box_cox_op.cc @@ -105,7 +105,7 @@ bool BatchBoxCoxOp::DoRunWithType() { zeros_.clear(); nonzeros_.reserve(D); zeros_.reserve(D); - for (TIndex j = 0; j < D; j++) { + for (int64_t j = 0; j < D; j++) { if (lambda1_ptr[j] == 0) { zeros_.push_back(j); } else { @@ -121,7 +121,7 @@ bool BatchBoxCoxOp::DoRunWithType() { // rows by replicating the input parameters K times. Then finish row-by-row. 
TypedCachedBuffers& b = GetBuffers(); if (nonzeros_.size() == D) { - TIndex i = 0; + int64_t i = 0; if (K > 1) { TileArrayIntoVector(lambda1_ptr, D, K, &b.lambda1_); TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_); @@ -142,7 +142,7 @@ bool BatchBoxCoxOp::DoRunWithType() { D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr); } } else if (zeros_.size() == D) { - TIndex i = 0; + int64_t i = 0; if (K > 1) { TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_z_); DCHECK_EQ(K * D, b.lambda2_z_.size()); @@ -169,7 +169,7 @@ bool BatchBoxCoxOp::DoRunWithType() { PackV(nonzeros_.size(), lambda2_ptr, nonzeros_.data(), b.lambda2_.data()); PackV(zeros_.size(), lambda2_ptr, zeros_.data(), b.lambda2_z_.data()); - TIndex i = 0; + int64_t i = 0; b.accumulator_.resize(std::max(nonzeros_.size(), zeros_.size())); if (K > 1) { // Truncate to original size, and re-tile with offsets this time. @@ -219,15 +219,15 @@ bool BatchBoxCoxOp::DoRunWithType() { template <> template void BatchBoxCoxOp::BoxCoxNaive( - TIndex N, - TIndex D, + int64_t N, + int64_t D, const T* data_ptr, const T* lambda1_ptr, const T* lambda2_ptr, T k_eps, T* output_ptr) { - for (TIndex i = 0; i < N; i++) { - for (TIndex j = 0; j < D; j++, data_ptr++, output_ptr++) { + for (int64_t i = 0; i < N; i++) { + for (int64_t j = 0; j < D; j++, data_ptr++, output_ptr++) { T lambda1_v = lambda1_ptr[j]; T lambda2_v = lambda2_ptr[j]; T tmp = std::max(*data_ptr + lambda2_v, k_eps); @@ -245,18 +245,18 @@ void BatchBoxCoxOp::BoxCoxNaive( template <> template void BatchBoxCoxOp::BoxCoxNonzeroLambda( - TIndex D, + int64_t D, const T* data_ptr, const T* lambda1, const T* lambda2, T k_eps, T* out) { caffe2::math::Add(D, data_ptr, lambda2, out, &context_); - for (TIndex j = 0; j < D; j++) { + for (int64_t j = 0; j < D; j++) { out[j] = std::max(out[j], k_eps); } Pow(D, out, lambda1, out); - for (TIndex j = 0; j < D; j++) { + for (int64_t j = 0; j < D; j++) { out[j] -= 1.0; } caffe2::math::Div(D, out, lambda1, out, &context_); @@ -265,13 +265,13 @@ void BatchBoxCoxOp::BoxCoxNonzeroLambda( template <> template void BatchBoxCoxOp::BoxCoxZeroLambda( - TIndex D, + int64_t D, const T* data_ptr, const T* lambda2, T k_eps, T* output_ptr) { caffe2::math::Add(D, data_ptr, lambda2, output_ptr, &context_); - for (TIndex j = 0; j < D; j++) { + for (int64_t j = 0; j < D; j++) { output_ptr[j] = std::max(output_ptr[j], k_eps); } caffe2::math::Log(D, output_ptr, output_ptr, &context_); diff --git a/caffe2/operators/batch_box_cox_op.h b/caffe2/operators/batch_box_cox_op.h index 7f7b2dd8da03..6bdc3c759370 100644 --- a/caffe2/operators/batch_box_cox_op.h +++ b/caffe2/operators/batch_box_cox_op.h @@ -27,8 +27,8 @@ class BatchBoxCoxOp final : public Operator { protected: template void BoxCoxNaive( - TIndex N, - TIndex D, + int64_t N, + int64_t D, const T* data_ptr, const T* lambda1_ptr, const T* lambda2_ptr, @@ -38,7 +38,7 @@ class BatchBoxCoxOp final : public Operator { #ifdef CAFFE2_USE_MKL template void BoxCoxNonzeroLambda( - TIndex D, + int64_t D, const T* data_ptr, const T* lambda1, const T* lambda2, @@ -47,7 +47,7 @@ class BatchBoxCoxOp final : public Operator { template void BoxCoxZeroLambda( - TIndex D, + int64_t D, const T* data_ptr, const T* lambda2, T k_eps, diff --git a/caffe2/operators/batch_bucketize_op.cc b/caffe2/operators/batch_bucketize_op.cc index dbbd56d75f46..21f3029de4d4 100644 --- a/caffe2/operators/batch_bucketize_op.cc +++ b/caffe2/operators/batch_bucketize_op.cc @@ -26,21 +26,21 @@ bool BatchBucketizeOp::RunOnDevice() { auto feature_dim = 
feature.dim(1); auto output_dim = indices.size(); - TIndex length_sum = 0; - for (TIndex i = 0; i < lengths.size(); i++) { + int64_t length_sum = 0; + for (int64_t i = 0; i < lengths.size(); i++) { CAFFE_ENFORCE_GE(feature_dim, indices_data[i]); length_sum += lengths_data[i]; } CAFFE_ENFORCE_EQ(length_sum, boundaries.size()); - TIndex lower_bound = 0; + int64_t lower_bound = 0; output->Resize(batch_size, output_dim); auto* output_data = output->template mutable_data(); - for (TIndex i = 0; i < batch_size; i++) { + for (int64_t i = 0; i < batch_size; i++) { lower_bound = 0; - for (TIndex j = 0; j < output_dim; j++) { - for (TIndex k = 0; k <= lengths_data[j]; k++) { + for (int64_t j = 0; j < output_dim; j++) { + for (int64_t k = 0; k <= lengths_data[j]; k++) { if (k == lengths_data[j] || feature_data[i * feature_dim + indices_data[j]] <= boundaries_data[lower_bound + k]) { diff --git a/caffe2/operators/batch_gather_ops.cu b/caffe2/operators/batch_gather_ops.cu index 2d047660491b..d1559dc6d9a3 100644 --- a/caffe2/operators/batch_gather_ops.cu +++ b/caffe2/operators/batch_gather_ops.cu @@ -41,7 +41,7 @@ bool BatchGatherOp::DoRunWithType() { auto& indices = Input(INDICES); auto* output = Output(0); - vector shape; + vector shape; shape.push_back(data.dim(0)); shape.insert(shape.end(), indices.dims().begin(), indices.dims().end()); shape.insert(shape.end(), data.dims().begin() + 2, data.dims().end()); diff --git a/caffe2/operators/batch_gather_ops.h b/caffe2/operators/batch_gather_ops.h index 9478150265dd..2b9e4d6d5e6e 100644 --- a/caffe2/operators/batch_gather_ops.h +++ b/caffe2/operators/batch_gather_ops.h @@ -26,7 +26,7 @@ class BatchGatherOp final : public Operator { CAFFE_ENFORCE_GE(data.ndim(), 2, "DATA should be at least 2-D"); - vector shape; + vector shape; shape.push_back(data.dim(0)); shape.insert(shape.end(), indices.dims().begin(), indices.dims().end()); shape.insert(shape.end(), data.dims().begin() + 2, data.dims().end()); diff --git a/caffe2/operators/batch_matmul_op.cc b/caffe2/operators/batch_matmul_op.cc index eda519586e78..431e1f4a0a75 100644 --- a/caffe2/operators/batch_matmul_op.cc +++ b/caffe2/operators/batch_matmul_op.cc @@ -27,16 +27,16 @@ vector TensorInferenceForBatchMatMul( b_dim1 = in[1].dims(ndim - 1); } - auto output_dims = vector{in[0].dims().begin(), in[0].dims().end()}; + auto output_dims = vector{in[0].dims().begin(), in[0].dims().end()}; output_dims[ndim - 2] = a_dim0; output_dims[ndim - 1] = b_dim1; return vector{ - CreateTensorShape(vector{output_dims}, in[0].data_type())}; + CreateTensorShape(vector{output_dims}, in[0].data_type())}; } else { auto ndims_A = in[0].dims_size(); auto ndims_B = in[1].dims_size(); - std::vector dims_A(ndims_A), dims_B(ndims_B); + std::vector dims_A(ndims_A), dims_B(ndims_B); for (int i = 0; i < ndims_A; ++i) { dims_A[i] = in[0].dims(i); } @@ -66,7 +66,7 @@ vector TensorInferenceForBatchMatMul( N = dims_B[ndims_B - 1]; } - std::vector new_dims; + std::vector new_dims; if (ndims_A >= ndims_B) { new_dims.assign(dims_A.begin(), dims_A.end() - 2); } else { @@ -82,7 +82,7 @@ vector TensorInferenceForBatchMatMul( new_dims.push_back(1); } return vector{ - CreateTensorShape(vector{new_dims}, in[0].data_type())}; + CreateTensorShape(vector{new_dims}, in[0].data_type())}; } } diff --git a/caffe2/operators/batch_matmul_op.h b/caffe2/operators/batch_matmul_op.h index f4d32e5e3782..e4d5d01fada9 100644 --- a/caffe2/operators/batch_matmul_op.h +++ b/caffe2/operators/batch_matmul_op.h @@ -175,7 +175,7 @@ class BatchMatMulOp final : public 
Operator { // Calculate output tensor shapes [B..., (M), (N)] // Batch dimensions will be broadcasted out to those of the longer tensor // A or B. Either M or N are optional if A or B, respectively are 1-D. - std::vector new_dims; + std::vector new_dims; if (ndims_A >= ndims_B) { new_dims.assign(dims_A.begin(), dims_A.end() - 2); } else { diff --git a/caffe2/operators/batch_matmul_op_gpu_test.cc b/caffe2/operators/batch_matmul_op_gpu_test.cc index 57a09e3e60c7..804296307d6e 100644 --- a/caffe2/operators/batch_matmul_op_gpu_test.cc +++ b/caffe2/operators/batch_matmul_op_gpu_test.cc @@ -26,7 +26,7 @@ class BatchMatMulOpGPUTest : public testing::Test { } void AddConstInput( - const std::vector& dims, + const std::vector& dims, const float value, const string& name) { Blob* blob = ws_.CreateBlob(name); @@ -39,7 +39,7 @@ class BatchMatMulOpGPUTest : public testing::Test { cuda_context_.get()); } - void VerifyOutput(const std::vector& dims, const float value) const { + void VerifyOutput(const std::vector& dims, const float value) const { const Blob* Y_blob = ws_.GetBlob("Y"); ASSERT_NE(nullptr, Y_blob); const auto& Y = Y_blob->Get(); @@ -64,12 +64,12 @@ TEST_F(BatchMatMulOpGPUTest, BatchMatMulOpGPUNormalTest) { if (!HasCudaGPU()) { return; } - AddConstInput(std::vector{3, 5, 10}, 1.0f, "A"); - AddConstInput(std::vector{3, 10, 6}, 1.0f, "B"); + AddConstInput(std::vector{3, 5, 10}, 1.0f, "A"); + AddConstInput(std::vector{3, 10, 6}, 1.0f, "B"); std::unique_ptr op(CreateOperator(def_, &ws_)); ASSERT_NE(nullptr, op); ASSERT_TRUE(op->Run()); - VerifyOutput(std::vector{3, 5, 6}, 10.0f); + VerifyOutput(std::vector{3, 5, 6}, 10.0f); } TEST_F(BatchMatMulOpGPUTest, BatchMatMulOpGPUBroadcastTest) { @@ -79,12 +79,12 @@ TEST_F(BatchMatMulOpGPUTest, BatchMatMulOpGPUBroadcastTest) { auto* arg = def_.add_arg(); arg->set_name("broadcast"); arg->set_i(1); - AddConstInput(std::vector{3, 5, 10}, 1.0f, "A"); - AddConstInput(std::vector{2, 3, 10, 6}, 1.0f, "B"); + AddConstInput(std::vector{3, 5, 10}, 1.0f, "A"); + AddConstInput(std::vector{2, 3, 10, 6}, 1.0f, "B"); std::unique_ptr op(CreateOperator(def_, &ws_)); ASSERT_NE(nullptr, op); ASSERT_TRUE(op->Run()); - VerifyOutput(std::vector{2, 3, 5, 6}, 10.0f); + VerifyOutput(std::vector{2, 3, 5, 6}, 10.0f); } } // namespace diff --git a/caffe2/operators/batch_matmul_op_test.cc b/caffe2/operators/batch_matmul_op_test.cc index 28fa8c1a9086..45db7dd5b848 100644 --- a/caffe2/operators/batch_matmul_op_test.cc +++ b/caffe2/operators/batch_matmul_op_test.cc @@ -20,7 +20,7 @@ class BatchMatMulOpTest : public testing::Test { } void AddConstInput( - const std::vector& dims, + const std::vector& dims, const float value, const string& name) { Blob* blob = ws_.CreateBlob(name); @@ -33,7 +33,7 @@ class BatchMatMulOpTest : public testing::Test { cpu_context_.get()); } - void VerifyOutput(const std::vector& dims, const float value) const { + void VerifyOutput(const std::vector& dims, const float value) const { const Blob* Y_blob = ws_.GetBlob("Y"); ASSERT_NE(nullptr, Y_blob); const auto& Y = Y_blob->Get(); @@ -54,24 +54,24 @@ class BatchMatMulOpTest : public testing::Test { }; TEST_F(BatchMatMulOpTest, BatchMatMulOpNormalTest) { - AddConstInput(std::vector{3, 5, 10}, 1.0f, "A"); - AddConstInput(std::vector{3, 10, 6}, 1.0f, "B"); + AddConstInput(std::vector{3, 5, 10}, 1.0f, "A"); + AddConstInput(std::vector{3, 10, 6}, 1.0f, "B"); std::unique_ptr op(CreateOperator(def_, &ws_)); ASSERT_NE(nullptr, op); ASSERT_TRUE(op->Run()); - VerifyOutput(std::vector{3, 5, 6}, 10.0f); + 
VerifyOutput(std::vector{3, 5, 6}, 10.0f); } TEST_F(BatchMatMulOpTest, BatchMatMulOpBroadcastTest) { auto* arg = def_.add_arg(); arg->set_name("broadcast"); arg->set_i(1); - AddConstInput(std::vector{3, 5, 10}, 1.0f, "A"); - AddConstInput(std::vector{2, 3, 10, 6}, 1.0f, "B"); + AddConstInput(std::vector{3, 5, 10}, 1.0f, "A"); + AddConstInput(std::vector{2, 3, 10, 6}, 1.0f, "B"); std::unique_ptr op(CreateOperator(def_, &ws_)); ASSERT_NE(nullptr, op); ASSERT_TRUE(op->Run()); - VerifyOutput(std::vector{2, 3, 5, 6}, 10.0f); + VerifyOutput(std::vector{2, 3, 5, 6}, 10.0f); } } // namespace diff --git a/caffe2/operators/batch_sparse_to_dense_op.cc b/caffe2/operators/batch_sparse_to_dense_op.cc index b02e4992889e..8d191c63ce93 100644 --- a/caffe2/operators/batch_sparse_to_dense_op.cc +++ b/caffe2/operators/batch_sparse_to_dense_op.cc @@ -14,15 +14,15 @@ bool BatchSparseToDenseOp::RunOnDevice() { CAFFE_ENFORCE_EQ(lengths.ndim(), 1); CAFFE_ENFORCE_EQ(indices.ndim(), 1); - const TIndex* lengths_data = lengths.template data(); - const TIndex* indices_data = indices.template data(); + const int64_t* lengths_data = lengths.template data(); + const int64_t* indices_data = indices.template data(); const T* values_data = values.template data(); - TIndex batch_size = lengths.size(); - TIndex lengths_sum = 0; - math::Sum(batch_size, lengths_data, &lengths_sum, &context_); + int64_t batch_size = lengths.size(); + int64_t lengths_sum = 0; + math::Sum(batch_size, lengths_data, &lengths_sum, &context_); CAFFE_ENFORCE_EQ(lengths_sum, indices.size()); - vector output_shape = {batch_size}; + vector output_shape = {batch_size}; if (InputSize() == 4) { auto& shaper = Input(3); CAFFE_ENFORCE_EQ(shaper.ndim(), 2); @@ -42,9 +42,9 @@ bool BatchSparseToDenseOp::RunOnDevice() { math::Set( output->size(), static_cast(default_value_), output_data, &context_); - TIndex k = 0; - for (TIndex i = 0; i < batch_size; ++i) { - for (TIndex j = 0; j < lengths_data[i]; ++j) { + int64_t k = 0; + for (int64_t i = 0; i < batch_size; ++i) { + for (int64_t j = 0; j < lengths_data[i]; ++j) { CAFFE_ENFORCE( indices_data[k] < dense_last_dim_, "An indice (", @@ -69,24 +69,24 @@ bool BatchDenseToSparseOp::RunOnDevice() { CAFFE_ENFORCE_EQ(lengths.ndim(), 1); CAFFE_ENFORCE_EQ(indices.ndim(), 1); CAFFE_ENFORCE_EQ(dense.ndim(), 2); - const TIndex* lengths_data = lengths.template data(); - const TIndex* indices_data = indices.template data(); + const int64_t* lengths_data = lengths.template data(); + const int64_t* indices_data = indices.template data(); const T* dense_data = dense.template data(); - TIndex batch_size = lengths.size(); - TIndex lengths_sum = 0; - math::Sum(batch_size, lengths_data, &lengths_sum, &context_); + int64_t batch_size = lengths.size(); + int64_t lengths_sum = 0; + math::Sum(batch_size, lengths_data, &lengths_sum, &context_); CAFFE_ENFORCE_EQ(lengths_sum, indices.size()); CAFFE_ENFORCE_EQ(batch_size, dense.dim(0)); dense_last_dim_ = dense.dim(1); - vector output_shape = indices.dims(); + vector output_shape = indices.dims(); output->Resize(output_shape); T* output_data = output->template mutable_data(); - TIndex k = 0; - for (TIndex i = 0; i < batch_size; ++i) { - for (TIndex j = 0; j < lengths_data[i]; ++j) { + int64_t k = 0; + for (int64_t i = 0; i < batch_size; ++i) { + for (int64_t j = 0; j < lengths_data[i]; ++j) { CAFFE_ENFORCE( indices_data[k] < dense.dim(1), "An indice (", diff --git a/caffe2/operators/batch_sparse_to_dense_op.h b/caffe2/operators/batch_sparse_to_dense_op.h index de6c69b795d1..8dc9da3dcf70 100644 
--- a/caffe2/operators/batch_sparse_to_dense_op.h +++ b/caffe2/operators/batch_sparse_to_dense_op.h @@ -15,12 +15,12 @@ class BatchSparseToDenseOp : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; BatchSparseToDenseOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - OP_SINGLE_ARG(TIndex, "dense_last_dim", dense_last_dim_, -1), + OP_SINGLE_ARG(int64_t, "dense_last_dim", dense_last_dim_, -1), OP_SINGLE_ARG(T, "default_value", default_value_, static_cast(0)) {} bool RunOnDevice() override; private: - TIndex dense_last_dim_; + int64_t dense_last_dim_; T default_value_; INPUT_TAGS(LENGTHS, INDICES, VALUES); }; @@ -34,7 +34,7 @@ class BatchDenseToSparseOp : public Operator { bool RunOnDevice() override; private: - TIndex dense_last_dim_; + int64_t dense_last_dim_; INPUT_TAGS(LENGTHS, INDICES, DENSE); }; diff --git a/caffe2/operators/bbox_transform_op.cc b/caffe2/operators/bbox_transform_op.cc index 79520face8c0..6dbea350960a 100644 --- a/caffe2/operators/bbox_transform_op.cc +++ b/caffe2/operators/bbox_transform_op.cc @@ -138,7 +138,7 @@ bool BBoxTransformOp::RunOnDevice() { } } - CAFFE_ENFORCE_EQ(iminfo_in.dims(), (vector{batch_size, 3})); + CAFFE_ENFORCE_EQ(iminfo_in.dims(), (vector{batch_size, 3})); Eigen::Map iminfo( iminfo_in.data(), iminfo_in.dim(0), iminfo_in.dim(1)); diff --git a/caffe2/operators/boolean_mask_ops.cc b/caffe2/operators/boolean_mask_ops.cc index 2d1deb0badc5..c2ab55891a48 100644 --- a/caffe2/operators/boolean_mask_ops.cc +++ b/caffe2/operators/boolean_mask_ops.cc @@ -62,7 +62,7 @@ bool BooleanMaskOp::RunOnDevice() { ++numOutputs; } } - std::vector outShape; + std::vector outShape; outShape.push_back(numOutputs); outShape.insert(outShape.end(), data.dims().begin() + 1, data.dims().end()); dataOut->Resize(outShape); @@ -81,11 +81,11 @@ bool BooleanMaskOp::RunOnDevice() { const auto innerSize = data.size_from_dim(1); const auto innerSizeBytes = innerSize * data.meta().itemsize(); - TIndex lastStart = -1; + int64_t lastStart = -1; const auto* inPtr = (char*)data.raw_data(); - TIndex outStart = 0; + int64_t outStart = 0; - for (TIndex i = 0;; ++i) { + for (int64_t i = 0;; ++i) { // mask was true and either a) became false, or b) sequence finished if (lastStart != -1 && ((i >= outerSize) || !maskPtr[i])) { const auto* src = inPtr + lastStart * innerSizeBytes; diff --git a/caffe2/operators/boolean_mask_ops.cu b/caffe2/operators/boolean_mask_ops.cu index 855cf6202bc9..2dcc28a064ca 100644 --- a/caffe2/operators/boolean_mask_ops.cu +++ b/caffe2/operators/boolean_mask_ops.cu @@ -7,15 +7,15 @@ namespace caffe2 { namespace { __global__ void BooleanMaskCopyKernel( - const TIndex numOfOutput, - const TIndex numBytes, - const TIndex* indices, + const int64_t numOfOutput, + const int64_t numBytes, + const int64_t* indices, const uint8_t* src, uint8_t* dest) { - for (TIndex i = blockIdx.x; i < numOfOutput; i += gridDim.x) { + for (int64_t i = blockIdx.x; i < numOfOutput; i += gridDim.x) { const auto srcBase = indices[i] * numBytes; const auto destBase = i * numBytes; - for (TIndex j = threadIdx.x; j < numBytes; j += blockDim.x) { + for (int64_t j = threadIdx.x; j < numBytes; j += blockDim.x) { dest[destBase + j] = src[srcBase + j]; } } @@ -40,7 +40,7 @@ class BooleanMaskOp final : public Operator { const auto* maskData = mask.data(); const auto outerSize = mask.dims()[0]; indices_.Resize(outerSize); - auto* indicesData = indices_.mutable_data(); + auto* indicesData = indices_.mutable_data(); size_t numBytes = 0; cub::CountingInputIterator itr(0); @@ -50,16 
+50,16 @@ class BooleanMaskOp final : public Operator { itr, maskData, indicesData, - static_cast(nullptr), + static_cast(nullptr), outerSize, context_.cuda_stream()); - auto numTIndex = - static_cast((numBytes + sizeof(TIndex) - 1) / sizeof(TIndex)); - // allocate one more TIndex at the end of scratch for storing numOfOutput - scratch_.Resize(numTIndex + 1); - auto* scratchData = scratch_.mutable_data(); - auto* numOfOutputData = scratchData + numTIndex; + auto numint64_t = + static_cast((numBytes + sizeof(int64_t) - 1) / sizeof(int64_t)); + // allocate one more int64_t at the end of scratch for storing numOfOutput + scratch_.Resize(numint64_t + 1); + auto* scratchData = scratch_.mutable_data(); + auto* numOfOutputData = scratchData + numint64_t; cub::DeviceSelect::Flagged( static_cast(scratchData), @@ -72,11 +72,11 @@ class BooleanMaskOp final : public Operator { context_.cuda_stream()); // Copy numOfOutput from gpu to cpu - TIndex numOfOutput; + int64_t numOfOutput; context_.CopyToCPU(1, numOfOutputData, &numOfOutput); indices_.Resize(numOfOutput); - std::vector dims = src.dims(); + std::vector dims = src.dims(); dims[0] = numOfOutput; dest->Resize(dims); auto* destData = (uint8_t*)dest->raw_mutable_data(src.meta()); @@ -84,12 +84,12 @@ class BooleanMaskOp final : public Operator { if (OutputSize() == 2) { auto* indicesOut = Output(1); indicesOut->Resize(numOfOutput); - indicesOut->template mutable_data(); + indicesOut->template mutable_data(); } if (numOfOutput > 0) { BooleanMaskCopyKernel<<< - min(numOfOutput, static_cast(CAFFE_MAXIMUM_NUM_BLOCKS)), + min(numOfOutput, static_cast(CAFFE_MAXIMUM_NUM_BLOCKS)), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>( diff --git a/caffe2/operators/boolean_unmask_ops_test.cc b/caffe2/operators/boolean_unmask_ops_test.cc index 2972cee49574..8814be17153d 100644 --- a/caffe2/operators/boolean_unmask_ops_test.cc +++ b/caffe2/operators/boolean_unmask_ops_test.cc @@ -18,10 +18,10 @@ static void AddScalarInput( Blob* blob = ws->CreateBlob(name); auto* tensor = blob->GetMutableTensor(CPU); if (!isEmpty) { - tensor->Resize(vector{1}); + tensor->Resize(vector{1}); *(tensor->template mutable_data()) = value; } else { - tensor->Resize(vector{0}); + tensor->Resize(vector{0}); tensor->template mutable_data(); } return; diff --git a/caffe2/operators/cast_op.cc b/caffe2/operators/cast_op.cc index de3345b832cc..eb7ba0d86857 100644 --- a/caffe2/operators/cast_op.cc +++ b/caffe2/operators/cast_op.cc @@ -11,7 +11,7 @@ bool CastOp::DoRunWithType() { const auto* data = input.template data(); auto* out = output->template mutable_data(); auto N = input.size(); - for (TIndex i = 0; i < N; ++i) { + for (int64_t i = 0; i < N; ++i) { out[i] = static_cast(data[i]); } return true; diff --git a/caffe2/operators/cast_op.h b/caffe2/operators/cast_op.h index 491028c8105a..e880a10a5ff1 100644 --- a/caffe2/operators/cast_op.h +++ b/caffe2/operators/cast_op.h @@ -42,7 +42,7 @@ class CastOp : public Operator { const auto* data = input.template data(); auto* out = output->template mutable_data(); auto N = input.size(); - for (TIndex i = 0; i < N; ++i) { + for (int64_t i = 0; i < N; ++i) { out[i] = static_cast(data[i]); } return true; diff --git a/caffe2/operators/concat_split_op.h b/caffe2/operators/concat_split_op.h index d62017152070..1b5a2430ff41 100644 --- a/caffe2/operators/concat_split_op.h +++ b/caffe2/operators/concat_split_op.h @@ -161,7 +161,7 @@ bool SplitOp::RunOnDevice() { input_channels, "Sum of split dimensions do not match: should be ", input_channels); - vector 
output_dims(input.dims()); + vector output_dims(input.dims()); int before = 1, after = 1; for (int i = 0; i < canonical_axis; ++i) { before *= input.dim32(i); @@ -215,7 +215,7 @@ bool SplitByLengthsOp::RunOnDevice() { input_channels, "Sum of split dimensions do not match: should be ", input_channels); - vector output_dims(input.dims()); + vector output_dims(input.dims()); int before = input.size_to_dim(canonical_axis); int after = input.size_from_dim(canonical_axis + 1); size_t input_offset = 0; @@ -245,7 +245,7 @@ template bool ConcatOp::RunOnDevice() { auto* output = Output(0); Tensor* split = this->template Output(1, CPU); - split->Resize(vector(1, InputSize())); + split->Resize(vector(1, InputSize())); int* axis_data = split->template mutable_data(); auto& input_zero = Input(0); int adj_size = input_zero.ndim() + (add_axis_ ? 1 : 0); @@ -263,7 +263,7 @@ bool ConcatOp::RunOnDevice() { } int before = 1, after = 1; - vector output_dims(input_zero.dims()); + vector output_dims(input_zero.dims()); for (int i = 0; i < input_zero.ndim(); ++i) { if (i == canonical_axis && !add_axis_) { continue; diff --git a/caffe2/operators/conditional_op.cc b/caffe2/operators/conditional_op.cc index e202ea2e9881..3cb301cc66f7 100644 --- a/caffe2/operators/conditional_op.cc +++ b/caffe2/operators/conditional_op.cc @@ -31,7 +31,7 @@ bool ConditionalOp::RunOnDevice() { // perform conditional op along first dimension const auto* ptrT = (char*)dataT.raw_data(); const auto* ptrF = (char*)dataF.raw_data(); - for (TIndex i = 0; i < condition.size(); i++) { + for (int64_t i = 0; i < condition.size(); i++) { auto* dst = outPtr + i * innerSizeBytes; if (condPtr[i]) { context_.CopyItemsSameDevice( diff --git a/caffe2/operators/conv_op_cache_cudnn.h b/caffe2/operators/conv_op_cache_cudnn.h index ee3bae2363bc..aefb1b61205f 100644 --- a/caffe2/operators/conv_op_cache_cudnn.h +++ b/caffe2/operators/conv_op_cache_cudnn.h @@ -16,8 +16,8 @@ class AlgorithmsCache { // combination of tensor dimensions & compute data type. // TAlgorithm getAlgorithm( - const std::vector& tensorDimensions1, - const std::vector& tensorDimensions2, + const std::vector& tensorDimensions1, + const std::vector& tensorDimensions2, int algorithmFlags, // Differentiate between algorithms with different // parameters in a generic way std::function generatingFunc); @@ -28,14 +28,14 @@ class AlgorithmsCache { template TAlgorithm AlgorithmsCache::getAlgorithm( - const std::vector& tensorDimensions1, - const std::vector& tensorDimensions2, + const std::vector& tensorDimensions1, + const std::vector& tensorDimensions2, int algorithmFlags, std::function generatingFunc) { int64_t seed = 0; // Hash all of the inputs, which we wiill then use to try and look up // a previously discovered algorithm, or fall back to generating a new one. - std::hash hashFn; + std::hash hashFn; for (const auto num : tensorDimensions1) { // Copied from boost::hash_combine. // Adding 1 to differentiate between first and second vector. 
diff --git a/caffe2/operators/conv_op_cache_cudnn_test.cc b/caffe2/operators/conv_op_cache_cudnn_test.cc
index 2d2da0d465f6..5867e487de75 100644
--- a/caffe2/operators/conv_op_cache_cudnn_test.cc
+++ b/caffe2/operators/conv_op_cache_cudnn_test.cc
@@ -12,11 +12,11 @@ namespace caffe2 {
 
 TEST(AlgorithmsCacheTest, CachesCorrectly) {
   AlgorithmsCache<int> cache;
   int result = cache.getAlgorithm(
-      std::vector<TIndex>(1), std::vector<TIndex>(1), 0, []() { return 5; });
+      std::vector<int64_t>(1), std::vector<int64_t>(1), 0, []() { return 5; });
   EXPECT_EQ(result, 5);
 
   int res2 = cache.getAlgorithm(
-      std::vector<TIndex>(1), std::vector<TIndex>(1), 0, []() { return 10; });
+      std::vector<int64_t>(1), std::vector<int64_t>(1), 0, []() { return 10; });
   EXPECT_EQ(res2, 5);
 }
 
@@ -24,11 +24,11 @@ TEST(AlgorithmsCacheTest, CachesCorrectly) {
 TEST(AlgorithmsCacheTest, KeysDifferIfOneVectorIsEmpty) {
   AlgorithmsCache<int> cache;
   int result = cache.getAlgorithm(
-      std::vector<TIndex>(1, 10), std::vector<TIndex>(), 0, []() { return 5; });
+      std::vector<int64_t>(1, 10), std::vector<int64_t>(), 0, []() { return 5; });
   EXPECT_EQ(result, 5);
 
   int res2 = cache.getAlgorithm(
-      std::vector<TIndex>(), std::vector<TIndex>(1, 10), 0, []() {
+      std::vector<int64_t>(), std::vector<int64_t>(1, 10), 0, []() {
         return 10;
       });
 
@@ -38,20 +38,20 @@ TEST(AlgorithmsCacheTest, KeysDifferIfOneVectorIsEmpty) {
 TEST(AlgorithmsCacheTest, KeysDifferIfFlagsAreDifferent) {
   AlgorithmsCache<int> cache;
   int result = cache.getAlgorithm(
-      std::vector<TIndex>{2, 3, 4}, std::vector<TIndex>{5, 6}, 123, []() {
+      std::vector<int64_t>{2, 3, 4}, std::vector<int64_t>{5, 6}, 123, []() {
         return 5;
       });
 
   EXPECT_EQ(result, 5);
 
   int res2 = cache.getAlgorithm(
-      std::vector<TIndex>{2, 3, 4}, std::vector<TIndex>{5, 6}, 456, []() {
+      std::vector<int64_t>{2, 3, 4}, std::vector<int64_t>{5, 6}, 456, []() {
         return 10;
       });
 
   EXPECT_EQ(res2, 10);
 
   int res3 = cache.getAlgorithm(
-      std::vector<TIndex>{2, 3, 4}, std::vector<TIndex>{5, 6}, 456, []() {
+      std::vector<int64_t>{2, 3, 4}, std::vector<int64_t>{5, 6}, 456, []() {
         return 15;
       });
diff --git a/caffe2/operators/conv_op_cudnn.cc b/caffe2/operators/conv_op_cudnn.cc
index 7a983bf4e629..fa52a5bd5a48 100644
--- a/caffe2/operators/conv_op_cudnn.cc
+++ b/caffe2/operators/conv_op_cudnn.cc
@@ -411,8 +411,8 @@ class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
     }
   }
 
-  vector<TIndex> cudnn_input_dims_;
-  vector<TIndex> cudnn_filter_dims_;
+  vector<int64_t> cudnn_input_dims_;
+  vector<int64_t> cudnn_filter_dims_;
 
   CuDNNWrapper cudnn_wrapper_;
   cudnnTensorDescriptor_t bottom_desc_;
diff --git a/caffe2/operators/conv_op_eigen.cc b/caffe2/operators/conv_op_eigen.cc
index b565b567ab29..a559a7c574db 100644
--- a/caffe2/operators/conv_op_eigen.cc
+++ b/caffe2/operators/conv_op_eigen.cc
@@ -42,10 +42,10 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
   CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
   CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
   ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
-  Eigen::array<TIndex, 4> kernel_shuffles
-      { {TIndex(2), TIndex(3), TIndex(1), TIndex(0)} };
-  Eigen::array<TIndex, 4> input_shuffles
-      { {TIndex(0), TIndex(2), TIndex(3), TIndex(1)} };
+  Eigen::array<int64_t, 4> kernel_shuffles
+      { {int64_t(2), int64_t(3), int64_t(1), int64_t(0)} };
+  Eigen::array<int64_t, 4> input_shuffles
+      { {int64_t(0), int64_t(2), int64_t(3), int64_t(1)} };
 
   Eigen::Tensor<T, 4, Eigen::RowMajor> filter_tensor =
       Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
@@ -109,14 +109,14 @@ bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
     // It seems that the bias broadcast is still slower so let's do the
     // following for now.
     EigenArrayMap<T> Y_arr(
-        Y_tensor.data(), static_cast<TIndex>(M), Y->size() / M);
+        Y_tensor.data(), static_cast<int64_t>(M), Y->size() / M);
     ConstEigenVectorArrayMap<T> bias_arr(bias.template data<T>(), M);
     Y_arr = Y_arr.colwise() + bias_arr;
   }
 
   // Do a last transpose.
- Eigen::array output_shuffles - { {TIndex(0), TIndex(3), TIndex(1), TIndex(2) } }; + Eigen::array output_shuffles + { {int64_t(0), int64_t(3), int64_t(1), int64_t(2) } }; Eigen::TensorMap>( Y->template mutable_data(), N, M, Y->dim32(2), Y->dim32(3)) = @@ -204,7 +204,7 @@ bool EigenConvOp::RunOnDeviceWithOrderNHWC() { // It seems that the bias broadcast is still slower so let's do the // following for now. EigenArrayMap Y_arr( - Y->template mutable_data(), static_cast(M), Y->size() / M); + Y->template mutable_data(), static_cast(M), Y->size() / M); ConstEigenVectorArrayMap bias_arr(bias.template data(), M); Y_arr = Y_arr.colwise() + bias_arr; } diff --git a/caffe2/operators/conv_op_impl.h b/caffe2/operators/conv_op_impl.h index 8fea61f7e627..7455bfc29cf0 100644 --- a/caffe2/operators/conv_op_impl.h +++ b/caffe2/operators/conv_op_impl.h @@ -240,7 +240,7 @@ bool ConvOp::RunOnDeviceWithOrderNHWC() { } auto f = [&](Tensor* col_buffer) { col_buffer->Resize( - vector{Y->dim32(1), Y->dim32(2), kernel_h(), kernel_w(), C}); + vector{Y->dim32(1), Y->dim32(2), kernel_h(), kernel_w(), C}); T* col_buffer_data = col_buffer->template mutable_data(); // Im2Col, followed by gemm. for (int image_id = 0; image_id < N; ++image_id) { @@ -504,7 +504,7 @@ bool ConvGradientOp::RunOnDeviceWithOrderNCHW() { dbias->Resize(M); if (bias_multiplier_.size() != output_image_size) { // If the helper bias multiplier is not M, reshape and fill it with one. - bias_multiplier_.Resize(vector(1, output_image_size)); + bias_multiplier_.Resize(vector(1, output_image_size)); math::Set( output_image_size, static_cast(1), @@ -689,7 +689,7 @@ bool ConvGradientOp::RunOnDeviceWithOrderNHWC() { math::Set(dbias->size(), 0, dbias_data, &context_); if (bias_multiplier_.size() != output_image_size) { // If the helper bias multiplier is not M, reshape and fill it with one. - bias_multiplier_.Resize(vector(1, output_image_size)); + bias_multiplier_.Resize(vector(1, output_image_size)); math::Set( output_image_size, static_cast(1), diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h index 9b2fa02fa712..43b0bee665f1 100644 --- a/caffe2/operators/conv_pool_op_base.h +++ b/caffe2/operators/conv_pool_op_base.h @@ -246,7 +246,7 @@ class ConvPoolOpBase : public Operator { // Helper function that is also called from OperatorSchema. Modified // kernel parameters and output output_dims and channel_first. static inline void InferOutputSize( - vector input_dims, + vector input_dims, int /*output_channel*/, StorageOrder order, bool global_pooling, @@ -259,7 +259,7 @@ class ConvPoolOpBase : public Operator { vector& pads, bool& channel_first) { channel_first = false; // initialized to suppress compiler warning. - vector dims; + vector dims; switch (order) { case StorageOrder::NHWC: channel_first = false; @@ -358,7 +358,7 @@ class ConvPoolOpBase : public Operator { if (bias_multiplier_->size() != size) { // If the helper bias multiplier is not image size, reshape and fill it // with one. 
- bias_multiplier_->Resize(std::vector{size}); + bias_multiplier_->Resize(std::vector{size}); math::Set( size, static_cast(1), diff --git a/caffe2/operators/conv_transpose_op_cudnn.cc b/caffe2/operators/conv_transpose_op_cudnn.cc index 28435325f231..b02210f192e6 100644 --- a/caffe2/operators/conv_transpose_op_cudnn.cc +++ b/caffe2/operators/conv_transpose_op_cudnn.cc @@ -64,8 +64,8 @@ class CudnnConvTransposeOpBase : public ConvTransposeUnpoolBase { } protected: - vector cudnn_input_dims_; - vector cudnn_filter_dims_; + vector cudnn_input_dims_; + vector cudnn_filter_dims_; CuDNNWrapper cudnn_wrapper_; cudnnTensorDescriptor_t bottom_desc_; diff --git a/caffe2/operators/conv_transpose_op_impl.h b/caffe2/operators/conv_transpose_op_impl.h index 23def95ea9bd..a5f85303b2d9 100644 --- a/caffe2/operators/conv_transpose_op_impl.h +++ b/caffe2/operators/conv_transpose_op_impl.h @@ -45,7 +45,7 @@ bool ConvTransposeOp::RunOnDeviceWithOrderNCHW() { bias.dim32(0) == C, "bias dimension must be equal to output channel number"); if (bias_multiplier_.size() != output_image_size) { - bias_multiplier_.Resize(vector(1, output_image_size)); + bias_multiplier_.Resize(vector(1, output_image_size)); T* bm_data = bias_multiplier_.template mutable_data(); math::Set( output_image_size, @@ -61,7 +61,7 @@ bool ConvTransposeOp::RunOnDeviceWithOrderNCHW() { auto f = [&](Tensor* col_buffer) { col_buffer->Resize( - vector{C, this->kernel_h(), this->kernel_w(), H, W}); + vector{C, this->kernel_h(), this->kernel_w(), H, W}); T* col_buffer_data = col_buffer->template mutable_data(); for (auto image_id = 0; image_id < N; ++image_id) { // Weight term @@ -167,7 +167,7 @@ bool ConvTransposeOp::RunOnDeviceWithOrderNHWC() { bias.dim32(0) == C, "bias dimension must be equal to output channel number"); if (bias_multiplier_.size() != output_image_size) { - bias_multiplier_.Resize(vector(1, output_image_size)); + bias_multiplier_.Resize(vector(1, output_image_size)); T* bm_data = bias_multiplier_.template mutable_data(); math::Set( output_image_size, @@ -182,7 +182,7 @@ bool ConvTransposeOp::RunOnDeviceWithOrderNHWC() { auto f = [&](Tensor* /*col_buffer*/) { col_buffer_.Resize( - vector{H, W, this->kernel_h(), this->kernel_w(), C}); + vector{H, W, this->kernel_h(), this->kernel_w(), C}); T* col_buffer_data = col_buffer_.template mutable_data(); for (auto image_id = 0; image_id < N; ++image_id) { // Weight term @@ -270,7 +270,7 @@ bool ConvTransposeGradientOp::RunOnDeviceWithOrderNCHW() { const int output_image_size = dY.dim32(2) * dY.dim32(3); // The col buffer is stored in CHW order as well col_buffer_.Resize( - vector{C, this->kernel_h(), this->kernel_w(), H, W}); + vector{C, this->kernel_h(), this->kernel_w(), H, W}); if (!no_bias_) { auto* dbias = Output(BIAS_OR_INPUT_GRAD); dbias->Resize(C); @@ -422,7 +422,7 @@ bool ConvTransposeGradientOp::RunOnDeviceWithOrderNHWC() { const int output_image_size = dY.dim32(1) * dY.dim32(2); // The col buffer is stored in HWC order as well col_buffer_.Resize( - vector{H, W, this->kernel_h(), this->kernel_w(), C}); + vector{H, W, this->kernel_h(), this->kernel_w(), C}); if (!no_bias_) { auto* dbias = Output(BIAS_OR_INPUT_GRAD); dbias->Resize(C); diff --git a/caffe2/operators/conv_transpose_op_mobile_test.cc b/caffe2/operators/conv_transpose_op_mobile_test.cc index da443928a974..6eb45eb5f8d1 100644 --- a/caffe2/operators/conv_transpose_op_mobile_test.cc +++ b/caffe2/operators/conv_transpose_op_mobile_test.cc @@ -10,7 +10,7 @@ namespace caffe2 { -void AddConstInput(const vector& shape, +void 
AddConstInput(const vector& shape, const float value, const string& name, Workspace* ws) { @@ -23,7 +23,7 @@ void AddConstInput(const vector& shape, tensor->size(), value, tensor->template mutable_data(), &context); } -void AddNoiseInput(const vector& shape, +void AddNoiseInput(const vector& shape, const string& name, Workspace* ws) { DeviceOption option; @@ -81,9 +81,9 @@ void compare(int N, int inputC, int H, int W, def1.add_arg()->CopyFrom(MakeArgument("adj_h", adjH)); def1.add_arg()->CopyFrom(MakeArgument("adj_w", adjW)); - AddNoiseInput(vector{N, inputC, H, W}, "X", &ws); - AddNoiseInput(vector{inputC, outputC, kernelH, kernelW}, "W", &ws); - AddNoiseInput(vector{outputC}, "B", &ws); + AddNoiseInput(vector{N, inputC, H, W}, "X", &ws); + AddNoiseInput(vector{inputC, outputC, kernelH, kernelW}, "W", &ws); + AddNoiseInput(vector{outputC}, "B", &ws); unique_ptr op1(CreateOperator(def1, &ws)); EXPECT_NE(nullptr, op1.get()); diff --git a/caffe2/operators/cross_entropy_op.cc b/caffe2/operators/cross_entropy_op.cc index 0473e7d4e435..c635c355e371 100644 --- a/caffe2/operators/cross_entropy_op.cc +++ b/caffe2/operators/cross_entropy_op.cc @@ -80,9 +80,9 @@ bool SigmoidCrossEntropyWithLogitsOp::RunOnDevice() { auto* out = Output(0); if (logits.ndim() == 0) { - out->Resize(std::vector{}); + out->Resize(std::vector{}); } else { - std::vector dims(logits.dims().begin(), logits.dims().end() - 1); + std::vector dims(logits.dims().begin(), logits.dims().end() - 1); out->Resize(dims); } auto* out_ptr = out->template mutable_data(); @@ -162,9 +162,9 @@ bool WeightedSigmoidCrossEntropyWithLogitsOp::RunOnDevice() { auto* out = Output(0); if (logits.ndim() == 0) { - out->Resize(std::vector{}); + out->Resize(std::vector{}); } else { - std::vector dims(logits.dims().begin(), logits.dims().end() - 1); + std::vector dims(logits.dims().begin(), logits.dims().end() - 1); out->Resize(dims); } auto* out_ptr = out->template mutable_data(); @@ -260,11 +260,11 @@ bool MakeTwoClassOp::RunOnDevice() { auto* Y = Output(0); auto shape = X.dims(); shape.push_back(2); - TIndex N = X.size(); + int64_t N = X.size(); Y->Resize(shape); const auto* Xdata = X.data(); auto* Ydata = Y->template mutable_data(); - for (TIndex i = 0; i < N; ++i) { + for (int64_t i = 0; i < N; ++i) { DCHECK_GE(Xdata[i], 0.0); DCHECK_LE(Xdata[i], 1.0); Ydata[i * 2] = 1.0 - Xdata[i]; @@ -284,9 +284,9 @@ bool MakeTwoClassGradientOp::RunOnDevice() { dX->Resize(shape); const float* dYdata = dY.data(); float* dXdata = dX->template mutable_data(); - TIndex N = dX->size(); + int64_t N = dX->size(); // use eigen? 
- for (TIndex i = 0; i < N; ++i) { + for (int64_t i = 0; i < N; ++i) { dXdata[i] = dYdata[i * 2 + 1] - dYdata[i * 2]; } return true; @@ -308,7 +308,7 @@ bool CrossEntropyOp::RunOnDevice() { CAFFE_ENFORCE( (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == D)); CAFFE_ENFORCE_EQ(label.dim32(0), N); - Y->Resize(vector{N}); + Y->Resize(vector{N}); const float* Xdata = X.data(); const float* labelData = label.data(); auto* Ydata = Y->template mutable_data(); diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu index df7a124d2971..b8fc2521971d 100644 --- a/caffe2/operators/cross_entropy_op.cu +++ b/caffe2/operators/cross_entropy_op.cu @@ -42,7 +42,7 @@ bool LabelCrossEntropyOp::RunOnDevice() { CAFFE_ENFORCE( (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == 1)); CAFFE_ENFORCE_EQ(label.dim32(0), N); - Y->Resize(vector(size_t(1), N)); + Y->Resize(vector(size_t(1), N)); LabelCrossEntropyKernel<<< CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, @@ -250,9 +250,9 @@ bool SigmoidCrossEntropyWithLogitsOp::RunOnDevice() { auto* out = Output(0); if (logits.ndim() == 0) { - out->Resize(std::vector{}); + out->Resize(std::vector{}); } else { - std::vector dims(logits.dims().begin(), logits.dims().end() - 1); + std::vector dims(logits.dims().begin(), logits.dims().end() - 1); out->Resize(dims); } auto* out_ptr = out->template mutable_data(); @@ -372,9 +372,9 @@ bool WeightedSigmoidCrossEntropyWithLogitsOp:: auto* out = Output(0); if (logits.ndim() == 0) { - out->Resize(std::vector{}); + out->Resize(std::vector{}); } else { - std::vector dims(logits.dims().begin(), logits.dims().end() - 1); + std::vector dims(logits.dims().begin(), logits.dims().end() - 1); out->Resize(dims); } auto* out_ptr = out->template mutable_data(); diff --git a/caffe2/operators/ctc_beam_search_decoder_op.cc b/caffe2/operators/ctc_beam_search_decoder_op.cc index e299950e9d94..c6a565df340f 100644 --- a/caffe2/operators/ctc_beam_search_decoder_op.cc +++ b/caffe2/operators/ctc_beam_search_decoder_op.cc @@ -32,7 +32,7 @@ bool CTCBeamSearchDecoderOp::RunOnDevice() { (InputSize() == 2) ? Input(SEQ_LEN).data() : nullptr; vector values_cache; - output_len->Resize(vector{batch_size}); + output_len->Resize(vector{batch_size}); int* output_len_data = output_len->mutable_data(); for (int32_t i = 0; i < batch_size; ++i) { @@ -121,7 +121,7 @@ bool CTCBeamSearchDecoderOp::RunOnDevice() { } int32_t cache_size = values_cache.size(); - values->Resize(vector{cache_size}); + values->Resize(vector{cache_size}); int* values_data = values->mutable_data(); for (int i = 0; i < values_cache.size(); ++i) { values_data[i] = values_cache.at(i); diff --git a/caffe2/operators/ctc_greedy_decoder_op.cc b/caffe2/operators/ctc_greedy_decoder_op.cc index 8a5e0932defd..d1b8621b03b3 100644 --- a/caffe2/operators/ctc_greedy_decoder_op.cc +++ b/caffe2/operators/ctc_greedy_decoder_op.cc @@ -32,7 +32,7 @@ bool CTCGreedyDecoderOp::RunOnDevice() { (InputSize() == 2) ? 
Input(SEQ_LEN).data() : nullptr; vector values_cach; - output_len->Resize(vector{batch_size}); + output_len->Resize(vector{batch_size}); int* output_len_data = output_len->template mutable_data(); for (int32_t i = 0; i < batch_size; ++i) { @@ -54,7 +54,7 @@ bool CTCGreedyDecoderOp::RunOnDevice() { } int32_t values_cach_size = values_cach.size(); - values->Resize(vector{values_cach_size}); + values->Resize(vector{values_cach_size}); int* values_data = values->mutable_data(); for (int i = 0; i < values_cach.size(); ++i) { values_data[i] = values_cach.at(i); diff --git a/caffe2/operators/dataset_ops.cc b/caffe2/operators/dataset_ops.cc index 92b7f80129ec..832942242808 100644 --- a/caffe2/operators/dataset_ops.cc +++ b/caffe2/operators/dataset_ops.cc @@ -155,7 +155,7 @@ void TreeWalker::advance() { cursor_.it.advance(lengths_, cursor_.offsets, sizes_, limits_, 1); } -std::vector TreeWalker::fieldDim(int fieldId) const { +std::vector TreeWalker::fieldDim(int fieldId) const { auto tensorDim = input(fieldId).dims(); tensorDim[0] = sizes_[lengthIdx(fieldId)]; return tensorDim; @@ -355,7 +355,7 @@ class UnPackRecordsOp : public Operator { auto numTensors = OutputSize(); // Precomputer the output sizes to avoid resizing - std::vector> outputDims(numTensors); + std::vector> outputDims(numTensors); std::vector metas(numTensors); CAFFE_ENFORCE( @@ -414,7 +414,7 @@ class UnPackRecordsOp : public Operator { private: void getShapeAndMetaFromInput( - std::vector>& outputDims, + std::vector>& outputDims, std::vector& metas) { const auto* inputs = Input(0).template data(); @@ -434,7 +434,7 @@ class UnPackRecordsOp : public Operator { } void getShapeAndMetaFromPrototypeBlobs( - std::vector>& outputDims, + std::vector>& outputDims, std::vector& metas) { const auto numTensors = fields_.size(); CAFFE_ENFORCE_EQ(numTensors, InputSize() - 1); @@ -501,7 +501,7 @@ class ReadNextBatchOp : public Operator { } } // gather data - std::vector outDim; + std::vector outDim; for (int i = 0; i < cursor->it.fields().size(); ++i) { auto lengthIdx = cursor->it.fields()[i].lengthFieldId + 1; auto size = sizes[lengthIdx]; @@ -676,7 +676,7 @@ class ReadRandomBatchOp : public Operator { auto idxvec = idxblob.template data(); auto& offsetdim = offsetsmat.dims(); // gather data - std::vector outDim; + std::vector outDim; int64_t idx; { std::lock_guard lock(cursor->mutex_); @@ -883,7 +883,7 @@ class ConcatTensorVectorOp final : public Operator { auto* tensor = Output(TENSOR); CAFFE_ENFORCE(!tensorVector->empty()); - vector outputDims(tensorVector->at(0).dims()); + vector outputDims(tensorVector->at(0).dims()); CAFFE_ENFORCE(outputDims.size() > 0); for (int i = 1; i < tensorVector->size(); i++) { // the tensor shapes are the same except for the first dimension @@ -895,7 +895,7 @@ class ConcatTensorVectorOp final : public Operator { } tensor->Resize(outputDims); - TIndex offset = 0; + int64_t offset = 0; auto* dst = (char*)tensor->raw_mutable_data(tensorVector->at(0).meta()); for (const auto& t : *tensorVector) { diff --git a/caffe2/operators/dataset_ops.h b/caffe2/operators/dataset_ops.h index 809e570ba3c0..47a5260c83c2 100644 --- a/caffe2/operators/dataset_ops.h +++ b/caffe2/operators/dataset_ops.h @@ -123,7 +123,7 @@ class TreeWalker { return prevOffsets_[lengthIdx(fieldId)]; } - std::vector fieldDim(int fieldId) const; + std::vector fieldDim(int fieldId) const; void* fieldPtr(int fieldId) const; @@ -134,12 +134,12 @@ class TreeWalker { Field(TreeWalker& walker, int fieldId) : walker_(walker), fieldId_(fieldId) {} - inline 
std::vector dim() const { + inline std::vector dim() const { return walker_.fieldDim(fieldId_); } - inline TIndex size() const { - TIndex size = 1; + inline int64_t size() const { + int64_t size = 1; for (const auto d : dim()) { size *= d; } diff --git a/caffe2/operators/deform_conv_op.cu b/caffe2/operators/deform_conv_op.cu index 29e5552612bf..63ba77eed20e 100644 --- a/caffe2/operators/deform_conv_op.cu +++ b/caffe2/operators/deform_conv_op.cu @@ -67,8 +67,8 @@ namespace caffe2 { -typedef TIndex index_t; -typedef std::vector TShape; +typedef int64_t index_t; +typedef std::vector TShape; template __device__ DType deformable_im2col_bilinear( @@ -304,8 +304,8 @@ template void DeformConvOpBase::DeformableIm2col( const DType* data_im, const DType* data_offset, - const std::vector& im_shape, - const std::vector& col_shape, + const std::vector& im_shape, + const std::vector& col_shape, DType* data_col) { CHECK_LT(2, CAFFE_CUDA_NUM_THREADS); CAFFE_ENFORCE_EQ(pad_t(), pad_b()); @@ -430,8 +430,8 @@ template void DeformConvOpBase::DeformableCol2im( const DType* data_col, const DType* data_offset, - const std::vector& im_shape, - const std::vector& col_shape, + const std::vector& im_shape, + const std::vector& col_shape, DType* grad_im) { CAFFE_ENFORCE_EQ(pad_t(), pad_b()); CAFFE_ENFORCE_EQ(pad_l(), pad_r()); @@ -577,8 +577,8 @@ void DeformConvOpBase::DeformableCol2imCoord( const DType* data_col, const DType* data_im, const DType* data_offset, - const std::vector& im_shape, - const std::vector& col_shape, + const std::vector& im_shape, + const std::vector& col_shape, DType* grad_offset) { CAFFE_ENFORCE_EQ(pad_t(), pad_b()); CAFFE_ENFORCE_EQ(pad_l(), pad_r()); diff --git a/caffe2/operators/deform_conv_op.h b/caffe2/operators/deform_conv_op.h index cfe29d4d56be..fb75ec9b0b2c 100644 --- a/caffe2/operators/deform_conv_op.h +++ b/caffe2/operators/deform_conv_op.h @@ -24,21 +24,21 @@ class DeformConvOpBase : public ConvPoolOpBase { void DeformableIm2col( const T* data_im, const T* data_offset, - const std::vector& im_shape, - const std::vector& col_shape, + const std::vector& im_shape, + const std::vector& col_shape, T* data_col); void DeformableCol2im( const T* data_col, const T* data_offset, - const std::vector& im_shape, - const std::vector& col_shape, + const std::vector& im_shape, + const std::vector& col_shape, T* grad_im); void DeformableCol2imCoord( const T* data_col, const T* data_im, const T* data_offset, - const std::vector& im_shape, - const std::vector& col_shape, + const std::vector& im_shape, + const std::vector& col_shape, T* grad_offset); protected: diff --git a/caffe2/operators/deform_conv_op_impl.h b/caffe2/operators/deform_conv_op_impl.h index 5d84d5905fd9..96d555460e0b 100644 --- a/caffe2/operators/deform_conv_op_impl.h +++ b/caffe2/operators/deform_conv_op_impl.h @@ -119,7 +119,7 @@ bool DeformConvOp::RunOnDeviceWithOrderNCHW() { // If the helper bias multiplier is not image size, reshape and fill it // with // one. - bias_multiplier_.Resize(vector(1, output_image_size)); + bias_multiplier_.Resize(vector(1, output_image_size)); math::Set( output_image_size, static_cast(1), @@ -274,9 +274,9 @@ bool DeformConvGradientOp::RunOnDeviceWithOrderNCHW() { // The col buffer is stored in CHW order as well - kernel_dim, and the // height and width. 
- vector img_shape; + vector img_shape; img_shape.assign(X.dims().begin() + 1, X.dims().end()); - vector col_buffer_shape; + vector col_buffer_shape; col_buffer_shape.push_back(C * kernel_dims_size); col_buffer_shape.insert( col_buffer_shape.end(), output_dims.begin(), output_dims.end()); @@ -301,7 +301,7 @@ bool DeformConvGradientOp::RunOnDeviceWithOrderNCHW() { dbias->Resize(M); if (bias_multiplier_.size() != output_image_size) { // If the helper bias multiplier is not M, reshape and fill it with one. - bias_multiplier_.Resize(vector(1, output_image_size)); + bias_multiplier_.Resize(vector(1, output_image_size)); math::Set( output_image_size, static_cast(1), diff --git a/caffe2/operators/distance_op.cc b/caffe2/operators/distance_op.cc index 9a38a4a77a00..6cb940c5ad58 100644 --- a/caffe2/operators/distance_op.cc +++ b/caffe2/operators/distance_op.cc @@ -237,7 +237,7 @@ vector TensorInferenceForDotProduct( const vector& in) { CAFFE_ENFORCE_GT(in.size(), 0); - vector dims(1); + vector dims(1); dims[0] = in[0].dims().size() > 0 ? in[0].dims(0) : 1; return vector{CreateTensorShape(dims, in[0].data_type())}; } diff --git a/caffe2/operators/distance_op.cu b/caffe2/operators/distance_op.cu index 82ff859cae67..bfafb1523d6e 100644 --- a/caffe2/operators/distance_op.cu +++ b/caffe2/operators/distance_op.cu @@ -49,7 +49,7 @@ bool SquaredL2DistanceOp::RunOnDevice() { } int N = X.ndim() > 0 ? X.dim32(0) : 1; int D = X.size() / N; - distance->Resize(vector(size_t(1), N)); + distance->Resize(vector(size_t(1), N)); SquaredL2DistanceKernel<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, @@ -164,7 +164,7 @@ bool L1DistanceOp::RunOnDevice() { } const int N = X.ndim() > 0 ? X.dim32(0) : 1; const int D = N > 0 ? X.size() / N : 0; - distance->Resize(vector(size_t(1), N)); + distance->Resize(vector(size_t(1), N)); L1DistanceKernel<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, diff --git a/caffe2/operators/dropout_op_cudnn.cc b/caffe2/operators/dropout_op_cudnn.cc index fa8c437463e5..a68a1263f6f4 100644 --- a/caffe2/operators/dropout_op_cudnn.cc +++ b/caffe2/operators/dropout_op_cudnn.cc @@ -55,7 +55,7 @@ class CuDNNDropoutOp final : public Operator { cudnnTensorDescriptor_t data_desc_; cudnnDropoutDescriptor_t dropout_desc_; - vector cudnn_input_dims_; + vector cudnn_input_dims_; float ratio_; bool is_test_; @@ -113,7 +113,7 @@ class CuDNNDropoutGradientOp final : public Operator { cudnnTensorDescriptor_t data_desc_; cudnnDropoutDescriptor_t dropout_desc_; - vector cudnn_input_dims_; + vector cudnn_input_dims_; Blob* scratch_blob_; diff --git a/caffe2/operators/elementwise_op_test.h b/caffe2/operators/elementwise_op_test.h index 9afb154d9bdd..bcd547e28f09 100644 --- a/caffe2/operators/elementwise_op_test.h +++ b/caffe2/operators/elementwise_op_test.h @@ -16,7 +16,7 @@ template void FillTensor( caffe2::Workspace* ws, const std::string& name, - const std::vector& shape, + const std::vector& shape, const std::vector& values) { auto* blob = ws->CreateBlob(name); auto* tensor = blob->GetMutableTensor(Context::GetDeviceType()); diff --git a/caffe2/operators/elementwise_ops_schema.cc b/caffe2/operators/elementwise_ops_schema.cc index 4cf970284675..98e2c9a9d786 100644 --- a/caffe2/operators/elementwise_ops_schema.cc +++ b/caffe2/operators/elementwise_ops_schema.cc @@ -636,7 +636,7 @@ Performs element-wise {desc} comparison **{name}** (with limited broadcast suppo } \ } \ auto output_dims = \ - std::vector(in[0].dims().begin(), in[0].dims().end()); \ + 
std::vector(in[0].dims().begin(), in[0].dims().end()); \ return vector{ \ CreateTensorShape(output_dims, TensorProto::BOOL)}; \ }) \ diff --git a/caffe2/operators/expand_squeeze_dims_op.h b/caffe2/operators/expand_squeeze_dims_op.h index 1493abebe36a..505b1ec7d690 100644 --- a/caffe2/operators/expand_squeeze_dims_op.h +++ b/caffe2/operators/expand_squeeze_dims_op.h @@ -85,7 +85,7 @@ class SqueezeOp : public Operator { } static std::vector ComputeDims( - std::vector inputDims, + std::vector inputDims, std::vector dims) { int j = 0; std::vector newDims; diff --git a/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc b/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc index dc0d727fc4cd..8e8f6411b4d2 100644 --- a/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc @@ -4,7 +4,6 @@ using caffe2::BaseContext; using caffe2::Tensor; -using caffe2::TIndex; using std::vector; namespace caffe2 { @@ -16,7 +15,7 @@ void averaged_loss_op_cpu_impl( Tensor* sum, caffe2::ops::AveragedLoss::State* state, BaseContext* context) { - sum->Resize(vector()); + sum->Resize(vector()); T* data = sum->template mutable_data(); diff --git a/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc b/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc index 62c0c790b3ee..6a03dfcd2a4d 100644 --- a/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc @@ -4,7 +4,6 @@ using caffe2::BaseContext; using caffe2::Tensor; -using caffe2::TIndex; using std::vector; namespace caffe2 { @@ -18,7 +17,7 @@ void batch_gather_op_cpu_impl( BaseContext* context) { CAFFE_ENFORCE_GE(data.ndim(), 2, "DATA should be at least 2-D"); - vector shape; + vector shape; shape.push_back(data.dim(0)); shape.insert(shape.end(), indices.dims().begin(), indices.dims().end()); shape.insert(shape.end(), data.dims().begin() + 2, data.dims().end()); diff --git a/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc b/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc index 5e74f83656bc..72677f26b1b5 100644 --- a/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc @@ -4,7 +4,6 @@ using caffe2::BaseContext; using caffe2::Tensor; -using caffe2::TIndex; using std::vector; namespace math = caffe2::math; @@ -163,7 +162,7 @@ void batch_matmul_op_cpu_impl( // Calculate output tensor shapes [B..., (M), (N)] // Batch dimensions will be broadcasted out to those of the longer tensor // A or B. Either M or N are optional if A or B, respectively are 1-D. 
- std::vector new_dims; + std::vector new_dims; if (ndims_A >= ndims_B) { new_dims.assign(dims_A.begin(), dims_A.end() - 2); } else { diff --git a/caffe2/operators/experimental/c10/cpu/cast_cpu.cc b/caffe2/operators/experimental/c10/cpu/cast_cpu.cc index ec0cce711e60..178a977b4e32 100644 --- a/caffe2/operators/experimental/c10/cpu/cast_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/cast_cpu.cc @@ -5,7 +5,6 @@ using caffe2::CPUContext; using caffe2::Tensor; using caffe2::TensorProto_DataType; -using caffe2::TIndex; namespace caffe2 { namespace { @@ -16,7 +15,7 @@ void do_cast_(const Tensor& input, Tensor* output) { const auto* data = input.template data(); auto* out = output->template mutable_data(); auto N = input.size(); - for (TIndex i = 0; i < N; ++i) { + for (int64_t i = 0; i < N; ++i) { out[i] = static_cast(data[i]); } } diff --git a/caffe2/operators/experimental/c10/cpu/concat_cpu.cc b/caffe2/operators/experimental/c10/cpu/concat_cpu.cc index 48df1dcbecd0..b118dabe463e 100644 --- a/caffe2/operators/experimental/c10/cpu/concat_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/concat_cpu.cc @@ -6,7 +6,6 @@ using caffe2::BaseContext; using caffe2::CPUContext; using caffe2::Tensor; using caffe2::TensorCPU; -using caffe2::TIndex; using std::vector; namespace caffe2 { @@ -19,7 +18,7 @@ void concat_op_cpu_impl( int axis, int add_axis, BaseContext* context) { - split->Resize(vector(1, inputs.size())); + split->Resize(vector(1, inputs.size())); int* axis_data = split->template mutable_data(); int adj_size = inputs[0]->ndim() + (add_axis ? 1 : 0); int canonical_axis = caffe2::canonical_axis_index_(axis, adj_size); @@ -36,7 +35,7 @@ void concat_op_cpu_impl( } int before = 1, after = 1; - vector output_dims(inputs[0]->dims()); + vector output_dims(inputs[0]->dims()); for (int i = 0; i < inputs[0]->ndim(); ++i) { if (i == canonical_axis && !add_axis) { continue; diff --git a/caffe2/operators/experimental/c10/cpu/filler_cpu.cc b/caffe2/operators/experimental/c10/cpu/filler_cpu.cc index a813616b6b94..848f9b2984e6 100644 --- a/caffe2/operators/experimental/c10/cpu/filler_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/filler_cpu.cc @@ -5,7 +5,6 @@ using caffe2::CPUContext; using caffe2::Tensor; using caffe2::TensorCPU; -using caffe2::TIndex; using std::vector; namespace caffe2 { @@ -17,7 +16,7 @@ void filler_init( const std::vector& extra_shape, bool input_as_shape) { if (inputs.size()) { - auto real_shape = vector{}; + auto real_shape = vector{}; if (input_as_shape) { // Shape input must be in CPU context auto& input = *inputs[0]; @@ -25,8 +24,8 @@ void filler_init( input.ndim(), 1, "When input_as_shape is true, the input must be a 1D tensor of " - "data type TIndex"); - auto* shape_data = input.template data(); + "data type int64_t"); + auto* shape_data = input.template data(); real_shape.insert( real_shape.end(), shape_data, shape_data + input.dim32(0)); } else { diff --git a/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc b/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc index 6912cb507606..64c01f2e9351 100644 --- a/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc @@ -3,7 +3,6 @@ #include "caffe2/utils/math.h" using caffe2::Tensor; -using caffe2::TIndex; namespace caffe2 { namespace { @@ -36,9 +35,9 @@ void sigmoid_cross_entropy_with_logits_op_cpu_impl( const auto outer_size = logits.size() / inner_size; if 
(logits.ndim() == 0) { - out->Resize(std::vector{}); + out->Resize(std::vector{}); } else { - std::vector dims(logits.dims().begin(), logits.dims().end() - 1); + std::vector dims(logits.dims().begin(), logits.dims().end() - 1); out->Resize(dims); } auto* out_ptr = out->mutable_data(); diff --git a/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc b/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc index dcdeab823027..7fe6e5b579c6 100644 --- a/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc @@ -4,7 +4,6 @@ #include "caffe2/utils/math.h" using caffe2::Tensor; -using caffe2::TIndex; namespace caffe2 { namespace { @@ -21,10 +20,10 @@ void sparse_lengths_sum_op_cpu_impl( CAFFE_ENFORCE_EQ(1, indicesInput.ndim(), "INDICES must be a vector"); CAFFE_ENFORCE_EQ(1, lengthsInput.ndim(), "LENGTHS must be a vector"); - const TIndex N = dataInput.dim(0); + const int64_t N = dataInput.dim(0); const int D = dataInput.size_from_dim(1); - const TIndex M = lengthsInput.dim(0); - const TIndex indices_size = indicesInput.size(); + const int64_t M = lengthsInput.dim(0); + const int64_t indices_size = indicesInput.size(); auto shape = dataInput.dims(); shape[0] = M; diff --git a/caffe2/operators/experimental/c10/schemas/fc.h b/caffe2/operators/experimental/c10/schemas/fc.h index 869adf05fd04..8fc15db02547 100644 --- a/caffe2/operators/experimental/c10/schemas/fc.h +++ b/caffe2/operators/experimental/c10/schemas/fc.h @@ -10,7 +10,7 @@ struct FullyConnected final { static constexpr const char* name = "FC"; struct Cache final { - vector Y_shape_cache_; + vector Y_shape_cache_; Tensor bias_multiplier_ = Tensor{CPU}; }; diff --git a/caffe2/operators/extend_tensor_op.cc b/caffe2/operators/extend_tensor_op.cc index 6ac1be087ae3..5f05901a8515 100644 --- a/caffe2/operators/extend_tensor_op.cc +++ b/caffe2/operators/extend_tensor_op.cc @@ -34,7 +34,7 @@ class ExtendTensorOp final : public Operator { indices.template data(), indices.template data() + indices.size())); - auto extendSize = (TIndex)maxElem - oldSize; + auto extendSize = (int64_t)maxElem - oldSize; if (extendSize > 0) { new_tensor->Extend(extendSize, growthPct_, &context_); if (!new_tensor->meta().ctor()) { diff --git a/caffe2/operators/filler_op.cc b/caffe2/operators/filler_op.cc index ab41072ca480..b1a486625b93 100644 --- a/caffe2/operators/filler_op.cc +++ b/caffe2/operators/filler_op.cc @@ -21,7 +21,7 @@ bool DiagonalFillOp::FillWithType(Tensor* output) { math::Set(output->size(), T(0), data, &context_); // then calculate step size for diagonal auto step = GetStepSize(output); - for (TIndex i = 0; i < output->size(); i += step) { + for (int64_t i = 0; i < output->size(); i += step) { math::Set(1, value, data, &context_); data += step; } diff --git a/caffe2/operators/filler_op.cu b/caffe2/operators/filler_op.cu index a754d361442e..7b6f2ce01664 100644 --- a/caffe2/operators/filler_op.cu +++ b/caffe2/operators/filler_op.cu @@ -15,7 +15,7 @@ __global__ void FillRangeKernel(const int n, float* data) { template __global__ void FillDiagonalKernel( const int num_diagonal_elements, - const TIndex step_size, + const int64_t step_size, const T value, T* data) { CUDA_1D_KERNEL_LOOP(index, num_diagonal_elements) { @@ -45,7 +45,7 @@ bool DiagonalFillOp::FillWithType(Tensor* output) { math::Set(size, T(0), data, &context_); T value = OperatorBase::GetSingleArgument("value", 0); - TIndex step_size = GetStepSize(output); + int64_t step_size = 
GetStepSize(output); int num_diagonal_elements = ceil((float)size / step_size); FillDiagonalKernel<<< diff --git a/caffe2/operators/filler_op.h b/caffe2/operators/filler_op.h index b09ae0843882..a757490e38ef 100644 --- a/caffe2/operators/filler_op.h +++ b/caffe2/operators/filler_op.h @@ -23,7 +23,7 @@ class FillerOp : public Operator { FillerOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), shape_(this->template GetRepeatedArgument("shape")), - extra_shape_(ToVectorTIndex( + extra_shape_(ToVectorint64_t( this->template GetRepeatedArgument("extra_shape"))), input_as_shape_( this->template GetSingleArgument("input_as_shape", false)) { @@ -53,7 +53,7 @@ class FillerOp : public Operator { bool RunOnDevice() override { auto* output = Operator::Output(0); if (InputSize()) { - auto shape = vector{}; + auto shape = vector{}; if (input_as_shape_) { // Shape input must be in CPU context auto& input = this->template Input(0, CPU); @@ -61,8 +61,8 @@ class FillerOp : public Operator { input.ndim(), 1, "When input_as_shape is true, the input must be a 1D tensor of " - "data type TIndex"); - auto* shape_data = input.template data(); + "data type int64_t"); + auto* shape_data = input.template data(); shape.insert(shape.end(), shape_data, shape_data + input.dim32(0)); } else { auto& input = Input(0); @@ -79,8 +79,8 @@ class FillerOp : public Operator { virtual bool Fill(Tensor* output) = 0; protected: - vector shape_; - vector extra_shape_; + vector shape_; + vector extra_shape_; bool input_as_shape_; }; @@ -367,27 +367,27 @@ class DiagonalFillOp final : public FillerOp { CAFFE_ENFORCE(output->ndim() >= 2, "Input shape must be >= 2D"); } - TIndex GetStepSize(Tensor* output) { - TIndex step; + int64_t GetStepSize(Tensor* output) { + int64_t step; if (output->ndim() == 2) { step = output->dim(1) + 1; } else { - TIndex prev_i = output->dim(0); + int64_t prev_i = output->dim(0); for (auto i : output->dims()) { if (i != prev_i) { CAFFE_THROW("All dimensions of input must be of equal length"); } } - vector cumprod(output->ndim()); + vector cumprod(output->ndim()); auto dims = output->dims(); std::partial_sum( dims.begin(), dims.end() - 1, cumprod.begin(), - std::multiplies()); + std::multiplies()); step = 1 + std::accumulate( - cumprod.begin(), cumprod.end(), static_cast(0)); + cumprod.begin(), cumprod.end(), static_cast(0)); VLOG(0) << step; } return step; diff --git a/caffe2/operators/flatten_op.cc b/caffe2/operators/flatten_op.cc index 342e5c839cbe..0b88b678212f 100644 --- a/caffe2/operators/flatten_op.cc +++ b/caffe2/operators/flatten_op.cc @@ -12,8 +12,8 @@ OPERATOR_SCHEMA(Flatten) ArgumentHelper helper(def); const int axis = helper.GetSingleArgument("axis", 1); vector out(1); - TIndex outer = 1; - TIndex inner = 1; + int64_t outer = 1; + int64_t inner = 1; std::size_t index = 0; for (auto d : in[0].dims()) { if (index < axis) { diff --git a/caffe2/operators/flexible_top_k.cc b/caffe2/operators/flexible_top_k.cc index aff0a27842ab..0f1133ea5591 100644 --- a/caffe2/operators/flexible_top_k.cc +++ b/caffe2/operators/flexible_top_k.cc @@ -9,8 +9,8 @@ namespace { template struct ValueCmp { bool operator()( - const std::pair& lhs, - const std::pair& rhs) { + const std::pair& lhs, + const std::pair& rhs) { return ( lhs.first > rhs.first || (lhs.first == rhs.first && lhs.second < rhs.second)); @@ -27,20 +27,20 @@ bool FlexibleTopKOp::RunOnDevice() { auto* indices = Output(1); const T* input_data = input.template data(); - const TIndex* k_data = k.template data(); + const int64_t* 
k_data = k.template data(); // get flatten shape of input CAFFE_ENFORCE_GT(input.ndim(), 0); - vector input_dims = input.dims(); - vector linear_shape = { + vector input_dims = input.dims(); + vector linear_shape = { size_to_dim_(input_dims.size() - 1, input_dims), input_dims.back()}; CAFFE_ENFORCE_EQ( linear_shape[0], k.size(), "first n-1 dims of input data and K does not match."); - TIndex output_size = 0; - for (TIndex i = 0; i < linear_shape[0]; ++i) { + int64_t output_size = 0; + for (int64_t i = 0; i < linear_shape[0]; ++i) { CAFFE_ENFORCE( linear_shape[1] >= k_data[i], "k should not be greater than last dim, error at index ", @@ -58,21 +58,21 @@ bool FlexibleTopKOp::RunOnDevice() { values->Resize(output_size); indices->Resize(output_size); T* values_data = values->template mutable_data(); - TIndex* indices_data = indices->template mutable_data(); + int64_t* indices_data = indices->template mutable_data(); - TIndex output_offset = 0; + int64_t output_offset = 0; // Sort preserving indices - for (TIndex i = 0; i < linear_shape[0]; ++i) { + for (int64_t i = 0; i < linear_shape[0]; ++i) { // Build a min-heap, the heap element is pair of (value, idx) // the top of the heap is the smallest value std::priority_queue< - std::pair, - std::vector>, + std::pair, + std::vector>, ValueCmp> PQ; - TIndex k_ = k_data[i]; - for (TIndex j = 0; j < linear_shape[1]; ++j) { + int64_t k_ = k_data[i]; + for (int64_t j = 0; j < linear_shape[1]; ++j) { const T value = input_data[i * linear_shape[1] + j]; if (PQ.size() < k_ || value > PQ.top().first) { PQ.push(std::make_pair(value, j)); @@ -81,7 +81,7 @@ bool FlexibleTopKOp::RunOnDevice() { PQ.pop(); } } - for (TIndex j = 0; j < k_; ++j) { + for (int64_t j = 0; j < k_; ++j) { auto& pqElem = PQ.top(); values_data[output_offset + k_ - j - 1] = pqElem.first; indices_data[output_offset + k_ - j - 1] = pqElem.second; @@ -101,24 +101,24 @@ bool FlexibleTopKGradientOp::RunOnDevice() { auto& indices = Input(3); auto* output = Output(0); - const TIndex* k_data = k.template data(); + const int64_t* k_data = k.template data(); const T* values_data = values.template data(); - const TIndex* indices_data = indices.template data(); + const int64_t* indices_data = indices.template data(); // Resize output tensors to be as orignial_input size and initialized with 0 CAFFE_ENFORCE_GT(original_input.ndim(), 0); - vector original_dims = original_input.dims(); + vector original_dims = original_input.dims(); output->Resize(original_dims); T* output_data = output->template mutable_data(); math::Set( output->size(), static_cast(0), output_data, &context_); - TIndex index_offset = 0; - for (TIndex i = 0; i < k.size(); ++i) { + int64_t index_offset = 0; + for (int64_t i = 0; i < k.size(); ++i) { // offset of output_data - TIndex output_offset = i * original_dims.back(); - for (TIndex j = 0; j < k_data[i]; ++j) { - TIndex index = indices_data[index_offset + j]; + int64_t output_offset = i * original_dims.back(); + for (int64_t j = 0; j < k_data[i]; ++j) { + int64_t index = indices_data[index_offset + j]; T value = values_data[index_offset + j]; output_data[output_offset + index] = value; } diff --git a/caffe2/operators/fully_connected_op.h b/caffe2/operators/fully_connected_op.h index d2fce7d9751a..27b8da695559 100644 --- a/caffe2/operators/fully_connected_op.h +++ b/caffe2/operators/fully_connected_op.h @@ -143,7 +143,7 @@ class FullyConnectedOp final : public Operator { size_t axis_w_{1}; // A local vector to cache the output shape so we don't need to recreate // a vector object 
every time we run Run(). - vector Y_shape_cache_; + vector Y_shape_cache_; Tensor bias_multiplier_{Context::GetDeviceType()}; ; diff --git a/caffe2/operators/fused_rowwise_8bit_conversion_ops.h b/caffe2/operators/fused_rowwise_8bit_conversion_ops.h index ca5002078129..5eace583588e 100644 --- a/caffe2/operators/fused_rowwise_8bit_conversion_ops.h +++ b/caffe2/operators/fused_rowwise_8bit_conversion_ops.h @@ -41,7 +41,7 @@ class FloatToFused8BitRowwiseQuantizedOp : public Operator { // bytes of each row for scale (4 bytes) and bias (4 bytes). // | ... int8 data ... | scale | bias | // | number_of_columns | 4B | 4B | - const std::vector output_dimensions = {input_rows, + const std::vector output_dimensions = {input_rows, input_columns + 8}; output->Resize(output_dimensions); @@ -96,7 +96,7 @@ class Fused8BitRowwiseQuantizedToFloatOp : public Operator { // The last 8 bytes per row are the scale and the bias. The rest of // input_columns is the number of values in the original row. - const std::vector output_dimensions = {input_rows, + const std::vector output_dimensions = {input_rows, input_columns - 8}; output->Resize(output_dimensions); const auto output_columns = output->dim(1); diff --git a/caffe2/operators/fused_rowwise_random_quantization_ops.cc b/caffe2/operators/fused_rowwise_random_quantization_ops.cc index 1f498e225147..ca5d8f25d3a9 100644 --- a/caffe2/operators/fused_rowwise_random_quantization_ops.cc +++ b/caffe2/operators/fused_rowwise_random_quantization_ops.cc @@ -38,8 +38,8 @@ bool FloatToFusedRandRowwiseQuantizedOp::RunOnDevice() { size_t data_per_byte = 8 / bitwidth_; // How many bytes in the output size_t segment_size = (input_columns + data_per_byte - 1) / data_per_byte; - const std::vector output_dimensions = { - input_rows, 10 + static_cast(segment_size)}; + const std::vector output_dimensions = { + input_rows, 10 + static_cast(segment_size)}; output->Resize(output_dimensions); const auto* input_data = input.template data(); @@ -92,8 +92,8 @@ bool FusedRandRowwiseQuantizedToFloatOp::RunOnDevice() { "Unsupported bitwidth"); const size_t tail = input_data[1]; const size_t output_columns = (input_columns - 10) * (8 / bitwidth) - tail; - const std::vector output_dimensions = { - input_rows, static_cast(output_columns)}; + const std::vector output_dimensions = { + input_rows, static_cast(output_columns)}; output->Resize(output_dimensions); auto* output_data = output->template mutable_data(); for (size_t row = 0; row < input_rows; ++row) { diff --git a/caffe2/operators/gather_fused_8bit_rowwise_op.h b/caffe2/operators/gather_fused_8bit_rowwise_op.h index 78d9ec93feb6..0a125b3edd8d 100644 --- a/caffe2/operators/gather_fused_8bit_rowwise_op.h +++ b/caffe2/operators/gather_fused_8bit_rowwise_op.h @@ -28,7 +28,7 @@ class GatherFused8BitRowwiseOp : public Operator { CAFFE_ENFORCE_GT(data.dim(1), 8, "DATA must have more than 8 columns"); // Subtract 8 from the #columns of data for the 4 bytes for scale and 4 // bytes for bias that we use in the fused representation (per row). 
- const std::vector shape = {indices.dim(0), data.dim(1) - 8}; + const std::vector shape = {indices.dim(0), data.dim(1) - 8}; output->Resize(shape); int block_size = shape[1]; diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h index 70d57fe184f1..7b171d0e2e09 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.h +++ b/caffe2/operators/gather_ranges_to_dense_op.h @@ -62,7 +62,7 @@ class GatherRangesToDenseOp final : public Operator { auto itemsize = data.meta().itemsize(); auto batchSize = ranges.dim(0); - vector outputDims{batchSize, 0}; + vector outputDims{batchSize, 0}; vector outputRawData; for (int i = 0; i < OutputSize(); ++i) { auto* output = Output(i); diff --git a/caffe2/operators/generate_proposals_op.cc b/caffe2/operators/generate_proposals_op.cc index 2b1039b35a84..bfd641a1e246 100644 --- a/caffe2/operators/generate_proposals_op.cc +++ b/caffe2/operators/generate_proposals_op.cc @@ -241,15 +241,15 @@ bool GenerateProposalsOp::RunOnDevice() { // bbox_deltas: (num_images, A * box_dim, H, W) CAFFE_ENFORCE_EQ( bbox_deltas.dims(), - (vector{num_images, box_dim * A, height, width})); + (vector{num_images, box_dim * A, height, width})); // im_info_tensor: (num_images, 3), format [height, width, scale; ...] - CAFFE_ENFORCE_EQ(im_info_tensor.dims(), (vector{num_images, 3})); + CAFFE_ENFORCE_EQ(im_info_tensor.dims(), (vector{num_images, 3})); CAFFE_ENFORCE( im_info_tensor.template IsType(), im_info_tensor.meta().name()); // anchors: (A, box_dim) - CAFFE_ENFORCE_EQ(anchors.dims(), (vector{A, box_dim})); + CAFFE_ENFORCE_EQ(anchors.dims(), (vector{A, box_dim})); CAFFE_ENFORCE(anchors.template IsType(), anchors.meta().name()); // Broadcast the anchors to all pixels diff --git a/caffe2/operators/generate_proposals_op_test.cc b/caffe2/operators/generate_proposals_op_test.cc index fb4c54581396..2b3a033a665d 100644 --- a/caffe2/operators/generate_proposals_op_test.cc +++ b/caffe2/operators/generate_proposals_op_test.cc @@ -11,7 +11,7 @@ namespace caffe2 { static void AddConstInput( - const vector& shape, + const vector& shape, const float value, const string& name, Workspace* ws) { @@ -26,7 +26,7 @@ static void AddConstInput( } static void AddLinSpacedInput( - const vector& shape, + const vector& shape, const float min_val, const float max_val, const string& name, @@ -44,7 +44,7 @@ static void AddLinSpacedInput( } static void AddInput( - const vector& shape, + const vector& shape, const vector& values, const string& name, Workspace* ws) { @@ -79,7 +79,7 @@ TEST(GenerateProposalsTest, TestComputeAllAnchors) { 79, -68, 8, 115, 103, -160, -40, 207, 151, -6, 32, 85, 79, -52, 8, 131, 103, -144, -40, 223, 151; - Tensor anchors_tensor(vector{anchors.rows(), anchors.cols()}, CPU); + Tensor anchors_tensor(vector{anchors.rows(), anchors.cols()}, CPU); Eigen::Map( anchors_tensor.mutable_data(), anchors.rows(), anchors.cols()) = anchors; @@ -143,7 +143,7 @@ TEST(GenerateProposalsTest, TestComputeAllAnchorsRotated) { all_anchors_gt(i, 4) = angles[i % angles.size()]; } - Tensor anchors_tensor(vector{anchors.rows(), anchors.cols()}, CPU); + Tensor anchors_tensor(vector{anchors.rows(), anchors.cols()}, CPU); Eigen::Map( anchors_tensor.mutable_data(), anchors.rows(), anchors.cols()) = anchors; @@ -171,11 +171,11 @@ TEST(GenerateProposalsTest, TestEmpty) { const int A = 4; const int H = 10; const int W = 8; - AddConstInput(vector{img_count, A, H, W}, 1., "scores", &ws); + AddConstInput(vector{img_count, A, H, W}, 1., "scores", &ws); AddLinSpacedInput( - 
vector{img_count, 4 * A, H, W}, 0, 10, "bbox_deltas", &ws); - AddConstInput(vector{img_count, 3}, 0.1, "im_info", &ws); - AddConstInput(vector{A, 4}, 1.0, "anchors", &ws); + vector{img_count, 4 * A, H, W}, 0, 10, "bbox_deltas", &ws); + AddConstInput(vector{img_count, 3}, 0.1, "im_info", &ws); + AddConstInput(vector{A, 4}, 1.0, "anchors", &ws); def.add_arg()->CopyFrom(MakeArgument("spatial_scale", 2.0f)); @@ -280,10 +280,10 @@ TEST(GenerateProposalsTest, TestRealDownSampled) { 1.50015003e-05f, 8.91025957e-06f}; - AddInput(vector{img_count, A, H, W}, scores, "scores", &ws); - AddInput(vector{img_count, 4 * A, H, W}, bbx, "bbox_deltas", &ws); - AddInput(vector{img_count, 3}, im_info, "im_info", &ws); - AddInput(vector{A, 4}, anchors, "anchors", &ws); + AddInput(vector{img_count, A, H, W}, scores, "scores", &ws); + AddInput(vector{img_count, 4 * A, H, W}, bbx, "bbox_deltas", &ws); + AddInput(vector{img_count, 3}, im_info, "im_info", &ws); + AddInput(vector{A, 4}, anchors, "anchors", &ws); def.add_arg()->CopyFrom(MakeArgument("spatial_scale", 1.0f / 16.0f)); def.add_arg()->CopyFrom(MakeArgument("pre_nms_topN", 6000)); @@ -300,7 +300,7 @@ TEST(GenerateProposalsTest, TestRealDownSampled) { Blob* rois_blob = ws.GetBlob("rois"); EXPECT_NE(nullptr, rois_blob); auto& rois = rois_blob->Get(); - EXPECT_EQ(rois.dims(), (vector{rois_gt.rows(), rois_gt.cols()})); + EXPECT_EQ(rois.dims(), (vector{rois_gt.rows(), rois_gt.cols()})); auto rois_data = Eigen::Map(rois.data(), rois.dim(0), rois.dim(1)); EXPECT_NEAR((rois_data.matrix() - rois_gt).cwiseAbs().maxCoeff(), 0, 1e-4); @@ -309,7 +309,7 @@ TEST(GenerateProposalsTest, TestRealDownSampled) { Blob* rois_probs_blob = ws.GetBlob("rois_probs"); EXPECT_NE(nullptr, rois_probs_blob); auto& rois_probs = rois_probs_blob->Get(); - EXPECT_EQ(rois_probs.dims(), (vector{TIndex(rois_probs_gt.size())})); + EXPECT_EQ(rois_probs.dims(), (vector{int64_t(rois_probs_gt.size())})); auto rois_probs_data = ConstEigenVectorArrayMap(rois_probs.data(), rois.dim(0)); EXPECT_NEAR( @@ -445,14 +445,14 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotatedAngle0) { 1.50015003e-05f, 8.91025957e-06f}; - AddInput(vector{img_count, A, H, W}, scores, "scores", &ws); + AddInput(vector{img_count, A, H, W}, scores, "scores", &ws); AddInput( - vector{img_count, 5 * A, H, W}, + vector{img_count, 5 * A, H, W}, bbx_with_angle, "bbox_deltas", &ws); - AddInput(vector{img_count, 3}, im_info, "im_info", &ws); - AddInput(vector{A, 5}, anchors, "anchors", &ws); + AddInput(vector{img_count, 3}, im_info, "im_info", &ws); + AddInput(vector{A, 5}, anchors, "anchors", &ws); def.add_arg()->CopyFrom(MakeArgument("spatial_scale", 1.0f / 16.0f)); def.add_arg()->CopyFrom(MakeArgument("pre_nms_topN", 6000)); @@ -470,7 +470,7 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotatedAngle0) { Blob* rois_blob = ws.GetBlob("rois"); EXPECT_NE(nullptr, rois_blob); auto& rois = rois_blob->Get(); - EXPECT_EQ(rois.dims(), (vector{rois_gt.rows(), rois_gt.cols()})); + EXPECT_EQ(rois.dims(), (vector{rois_gt.rows(), rois_gt.cols()})); auto rois_data = Eigen::Map(rois.data(), rois.dim(0), rois.dim(1)); EXPECT_NEAR((rois_data.matrix() - rois_gt).cwiseAbs().maxCoeff(), 0, 1e-3); @@ -479,7 +479,7 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotatedAngle0) { Blob* rois_probs_blob = ws.GetBlob("rois_probs"); EXPECT_NE(nullptr, rois_probs_blob); auto& rois_probs = rois_probs_blob->Get(); - EXPECT_EQ(rois_probs.dims(), (vector{TIndex(rois_probs_gt.size())})); + EXPECT_EQ(rois_probs.dims(), 
(vector{int64_t(rois_probs_gt.size())})); auto rois_probs_data = ConstEigenVectorArrayMap(rois_probs.data(), rois.dim(0)); EXPECT_NEAR( @@ -587,14 +587,14 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotated) { // vector anchors{-38, -16, 53, 31, -120, -120, 135, 135}; vector anchors{8, 8, 92, 48, angle, 8, 8, 256, 256, angle}; - AddInput(vector{img_count, A, H, W}, scores, "scores", &ws); + AddInput(vector{img_count, A, H, W}, scores, "scores", &ws); AddInput( - vector{img_count, 5 * A, H, W}, + vector{img_count, 5 * A, H, W}, bbx_with_angle, "bbox_deltas", &ws); - AddInput(vector{img_count, 3}, im_info, "im_info", &ws); - AddInput(vector{A, 5}, anchors, "anchors", &ws); + AddInput(vector{img_count, 3}, im_info, "im_info", &ws); + AddInput(vector{A, 5}, anchors, "anchors", &ws); def.add_arg()->CopyFrom(MakeArgument("spatial_scale", 1.0f / 16.0f)); def.add_arg()->CopyFrom(MakeArgument("pre_nms_topN", 6000)); diff --git a/caffe2/operators/glu_op.h b/caffe2/operators/glu_op.h index 6cbb9e8ff6c8..fc4999d1e987 100644 --- a/caffe2/operators/glu_op.h +++ b/caffe2/operators/glu_op.h @@ -17,7 +17,7 @@ class GluOp final : public Operator { bool RunOnDevice() { auto& X = Input(0); auto* Y = Output(0); - vector Yshape; + vector Yshape; Yshape.insert(Yshape.end(), X.dims().begin(), X.dims().end()); const int split_index = dim_ == -1 ? Yshape.size() - 1 : dim_; CAFFE_ENFORCE( diff --git a/caffe2/operators/half_float_ops.h b/caffe2/operators/half_float_ops.h index b8d5dacf6947..a7c0dbe1b471 100644 --- a/caffe2/operators/half_float_ops.h +++ b/caffe2/operators/half_float_ops.h @@ -36,7 +36,7 @@ class Float16ConstantFillOp : public Operator { bool RunOnDevice() override; private: - vector shape_; + vector shape_; }; class Float16UniformFillOp : public Operator { @@ -65,7 +65,7 @@ class Float16UniformFillOp : public Operator { bool RunOnDevice() override; private: - vector shape_; + vector shape_; float min_; float max_; }; diff --git a/caffe2/operators/hip/local_response_normalization_op_miopen.cc b/caffe2/operators/hip/local_response_normalization_op_miopen.cc index 26da9bf5b874..d5d5a9880f33 100644 --- a/caffe2/operators/hip/local_response_normalization_op_miopen.cc +++ b/caffe2/operators/hip/local_response_normalization_op_miopen.cc @@ -51,7 +51,7 @@ class MIOPEN_LRNOP final : public Operator { MIOPENWrapper miopen_wrapper_; miopenTensorDescriptor_t data_desc_; miopenLRNDescriptor_t norm_desc_; - vector miopen_input_dims_; + vector miopen_input_dims_; const miopenLRNMode_t mode_; const int size_; const float alpha_; @@ -103,7 +103,7 @@ class MIOPENLRNGradientOp final : public Operator { MIOPENWrapper miopen_wrapper_; miopenTensorDescriptor_t data_desc_; miopenLRNDescriptor_t norm_desc_; - vector miopen_input_dims_; + vector miopen_input_dims_; const miopenLRNMode_t mode_; const int size_; const float alpha_; diff --git a/caffe2/operators/hip/relu_op_miopen.cc b/caffe2/operators/hip/relu_op_miopen.cc index 5a8a147ff2a8..dcf74600504c 100644 --- a/caffe2/operators/hip/relu_op_miopen.cc +++ b/caffe2/operators/hip/relu_op_miopen.cc @@ -98,7 +98,7 @@ class MIOPENReluOp final : public Operator { MIOPENWrapper miopen_wrapper_; miopenTensorDescriptor_t data_desc_; miopenActivationDescriptor_t activ_desc_; - vector miopen_input_dims_; + vector miopen_input_dims_; const float alpha_; const float beta_; const double power_; @@ -191,7 +191,7 @@ class MIOPENReluGradientOp final : public Operator { MIOPENWrapper miopen_wrapper_; miopenTensorDescriptor_t data_desc_; miopenActivationDescriptor_t activ_desc_; - 
vector miopen_input_dims_; + vector miopen_input_dims_; const float alpha_; const float beta_; const double power_; diff --git a/caffe2/operators/hip/softmax_op_miopen.cc b/caffe2/operators/hip/softmax_op_miopen.cc index 6859dccc7f8a..1fa978e59a85 100644 --- a/caffe2/operators/hip/softmax_op_miopen.cc +++ b/caffe2/operators/hip/softmax_op_miopen.cc @@ -71,7 +71,7 @@ class MIOpenSoftmaxOp final : public Operator { protected: MIOPENWrapper miopen_wrapper_; miopenTensorDescriptor_t desc_; - vector dims_; + vector dims_; const int axis_; const float alpha_; const float beta_; @@ -135,7 +135,7 @@ class MIOpenSoftmaxGradientOp final : public Operator { const float alpha_; const float beta_; miopenTensorDescriptor_t desc_; - vector dims_; + vector dims_; }; namespace { diff --git a/caffe2/operators/hip/spatial_batch_norm_op_miopen.cc b/caffe2/operators/hip/spatial_batch_norm_op_miopen.cc index 0a8b1d9b1cbf..7b5398296234 100644 --- a/caffe2/operators/hip/spatial_batch_norm_op_miopen.cc +++ b/caffe2/operators/hip/spatial_batch_norm_op_miopen.cc @@ -58,7 +58,7 @@ class MIOpenSpatialBNOp final : public SpatialBNOp { MIOPENWrapper miopen_wrapper_; miopenTensorDescriptor_t data_desc_; miopenTensorDescriptor_t bn_param_desc_; - vector miopen_input_dims_; + vector miopen_input_dims_; float alpha_; float beta_; miopenBatchNormMode_t mode_; @@ -97,7 +97,7 @@ class MIOpenSpatialBNGradientOp final : public SpatialBNGradientOp { MIOPENWrapper miopen_wrapper_; miopenTensorDescriptor_t data_desc_; miopenTensorDescriptor_t bn_param_desc_; - vector miopen_input_dims_; + vector miopen_input_dims_; float alpha_; float beta_; miopenBatchNormMode_t mode_; diff --git a/caffe2/operators/im2col_op.h b/caffe2/operators/im2col_op.h index d5c4fec7f1c9..80c3a85a9689 100644 --- a/caffe2/operators/im2col_op.h +++ b/caffe2/operators/im2col_op.h @@ -77,7 +77,7 @@ class Im2ColOp final : public Operator { switch (order_) { case StorageOrder::NCHW: { Y->Resize( - std::vector{N, C * kernel_h_ * kernel_w_, out_h, out_w}); + std::vector{N, C * kernel_h_ * kernel_w_, out_h, out_w}); const size_t dx = X.size() / N; const size_t dy = Y->size() / N; @@ -105,7 +105,7 @@ class Im2ColOp final : public Operator { }; break; case StorageOrder::NHWC: { Y->Resize( - std::vector{N, out_h, out_w, kernel_h_ * kernel_w_ * C}); + std::vector{N, out_h, out_w, kernel_h_ * kernel_w_ * C}); const size_t dx = X.size() / N; const size_t dy = Y->size() / N; diff --git a/caffe2/operators/index_hash_ops.cc b/caffe2/operators/index_hash_ops.cc index 4f374895451c..b803f8a6c126 100644 --- a/caffe2/operators/index_hash_ops.cc +++ b/caffe2/operators/index_hash_ops.cc @@ -21,7 +21,7 @@ specified number. All input and output indices are enforced to be positive. 
.TensorInferenceFunction([](const OperatorDef& /* unused */, const vector& in) { std::vector out(1); - std::vector output_dims = GetDimsVector(in[0]); + std::vector output_dims = GetDimsVector(in[0]); out[0] = CreateTensorShape(output_dims, in[0].data_type()); return out; }); diff --git a/caffe2/operators/index_ops.cc b/caffe2/operators/index_ops.cc index b9a8b1b46e27..241b0ff97c60 100644 --- a/caffe2/operators/index_ops.cc +++ b/caffe2/operators/index_ops.cc @@ -11,12 +11,12 @@ namespace caffe2 { namespace { using IndexKeyTypes = TensorTypes; -using TIndexValue = int64_t; +using int64_tValue = int64_t; } // namespace struct IndexBase { public: - IndexBase(TIndexValue maxElements, const TypeMeta& type) + IndexBase(int64_tValue maxElements, const TypeMeta& type) : maxElements_{maxElements} , meta_(type) , frozen_{false} {} @@ -35,7 +35,7 @@ struct IndexBase { const TypeMeta& Type() const { return meta_; } - TIndexValue Size() { + int64_tValue Size() { std::lock_guard guard(dictMutex_); return nextId_; } @@ -43,17 +43,17 @@ struct IndexBase { protected: int64_t maxElements_; TypeMeta meta_; - TIndexValue nextId_{1}; // guarded by dictMutex_ + int64_tValue nextId_{1}; // guarded by dictMutex_ std::atomic frozen_{false}; std::mutex dictMutex_; }; template struct Index: IndexBase { - explicit Index(TIndexValue maxElements) + explicit Index(int64_tValue maxElements) : IndexBase(maxElements, TypeMeta::Make()) {} - void Get(const T* keys, TIndexValue* values, size_t numKeys) { + void Get(const T* keys, int64_tValue* values, size_t numKeys) { if (frozen_) { FrozenGet(keys, values, numKeys); return; @@ -104,14 +104,14 @@ struct Index: IndexBase { } private: - void FrozenGet(const T* keys, TIndexValue* values, size_t numKeys) { + void FrozenGet(const T* keys, int64_tValue* values, size_t numKeys) { for (int i = 0; i < numKeys; ++i) { auto it = dict_.find(keys[i]); values[i] = it != dict_.end() ? 
it->second : 0; } } - std::unordered_map<T, TIndexValue> dict_; + std::unordered_map<T, int64_tValue> dict_; }; // TODO(azzolini): support sizes larger than int32 @@ -131,7 +131,7 @@ class IndexCreateOp: public Operator { } private: - TIndexValue maxElements_; + int64_tValue maxElements_; }; class IndexGetOp: public Operator { @@ -152,7 +152,7 @@ class IndexGetOp: public Operator { values->ResizeLike(keys); dict->Get( keys.data(), - values->template mutable_data<TIndexValue>(), + values->template mutable_data<int64_tValue>(), keys.size()); return true; } @@ -227,8 +227,8 @@ class IndexSizeOp : public Operator { bool RunOnDevice() override { auto& base = OperatorBase::Input>(0); auto* out = Output(0); - out->Resize(std::vector<TIndex>{}); - *out->template mutable_data<TIndexValue>() = base->Size(); + out->Resize(std::vector<int64_t>{}); + *out->template mutable_data<int64_tValue>() = base->Size(); return true; } }; diff --git a/caffe2/operators/integral_image_op.cc b/caffe2/operators/integral_image_op.cc index 27356104bbd6..14baf484a232 100644 --- a/caffe2/operators/integral_image_op.cc +++ b/caffe2/operators/integral_image_op.cc @@ -19,7 +19,7 @@ bool IntegralImageOp::RunOnDevice() { auto* Y = Output(0); CAFFE_ENFORCE_EQ(X.ndim(), 4, "Only supports 4D tensors for the momement"); - vector<TIndex> out_shape(X.dims()); + vector<int64_t> out_shape(X.dims()); out_shape[2] += 1; // H + 1 output size out_shape[3] += 1; // W + 1 output size Y->Resize(out_shape); diff --git a/caffe2/operators/integral_image_op.cu b/caffe2/operators/integral_image_op.cu index d8fa0b8f4dcc..d865e7a386c3 100644 --- a/caffe2/operators/integral_image_op.cu +++ b/caffe2/operators/integral_image_op.cu @@ -124,7 +124,7 @@ bool IntegralImageOp::RunOnDevice() { // Input is (N, C, H, W) // Output is (N, C, H + 1, W + 1) - vector<TIndex> out_shape(X.dims()); + vector<int64_t> out_shape(X.dims()); out_shape[2] += 1; // H + 1 output size out_shape[3] += 1; // W + 1 output size Y->Resize(out_shape); @@ -172,7 +172,7 @@ bool IntegralImageGradientOp::RunOnDevice() { // Row pass reduces shape of dY from (N, C, H + 1, W + 1) // to (N, C, H + 1, W) // Col pass reduces shape to (N, C, H, W) - vector<TIndex> row_pass_shape(dY.dims()); + vector<int64_t> row_pass_shape(dY.dims()); row_pass_shape[3] -= 1; row_pass_buffer_.Resize(row_pass_shape); const int chans = row_pass_buffer_.dim32(1); diff --git a/caffe2/operators/is_empty_op.h b/caffe2/operators/is_empty_op.h index c5c9402f342d..ef11bc06608f 100644 --- a/caffe2/operators/is_empty_op.h +++ b/caffe2/operators/is_empty_op.h @@ -15,7 +15,7 @@ class IsEmptyOp : public Operator { bool RunOnDevice() override { auto& input = Input(0); auto* output = Output(0); - output->Resize(std::vector<TIndex>{}); + output->Resize(std::vector<int64_t>{}); *output->template mutable_data() = (input.size() == 0); return true; } diff --git a/caffe2/operators/layer_norm_op.cc b/caffe2/operators/layer_norm_op.cc index 4b995fa49d8c..a461eecdcc14 100644 --- a/caffe2/operators/layer_norm_op.cc +++ b/caffe2/operators/layer_norm_op.cc @@ -28,7 +28,7 @@ bool LayerNormOp::DoRunWithType() { const int right = input.size_from_dim(canonical_axis); output->ResizeLike(input); - std::vector<TIndex> stats_dims( + std::vector<int64_t> stats_dims( input.dims().begin(), input.dims().begin() + canonical_axis); stats_dims.push_back(1); mean->Resize(stats_dims); diff --git a/caffe2/operators/layer_norm_op.cu b/caffe2/operators/layer_norm_op.cu index 1f78b9e50e09..3de32dee87e0 100644 --- a/caffe2/operators/layer_norm_op.cu +++ b/caffe2/operators/layer_norm_op.cu @@ -92,7 +92,7 @@ bool LayerNormOp::DoRunWithType() { const int right = input.size_from_dim(canonical_axis); output->ResizeLike(input); - std::vector
stats_dims( + std::vector stats_dims( input.dims().begin(), input.dims().begin() + canonical_axis); stats_dims.push_back(1); mean->Resize(stats_dims); @@ -256,7 +256,7 @@ bool LayerNormGradientOp::DoRunWithType() { const unsigned long right = norm_inputs.size_from_dim(canonical_axis); ginput->ResizeLike(norm_inputs); - std::vector stats_dims( + std::vector stats_dims( norm_inputs.dims().begin(), norm_inputs.dims().begin() + canonical_axis); stats_dims.push_back(1); dmean_.Resize(stats_dims); diff --git a/caffe2/operators/lengths_pad_op.h b/caffe2/operators/lengths_pad_op.h index 9f65c39a262a..a5d4b04850dc 100644 --- a/caffe2/operators/lengths_pad_op.h +++ b/caffe2/operators/lengths_pad_op.h @@ -56,7 +56,7 @@ class LengthsPadOp : public Operator { math::Set( output->size(), static_cast(padding_value_), out_data, &context_); - for (TIndex i = 0; i < lengths_size; ++i) { + for (int64_t i = 0; i < lengths_size; ++i) { auto length = lengths_data[i]; CAFFE_ENFORCE_GE(length, 0); CAFFE_ENFORCE_GE( diff --git a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h index 22c7a3b7c0f6..5ce2c87e988c 100644 --- a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h +++ b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h @@ -50,7 +50,7 @@ class SparseLengthsFused8BitRowwiseOp : public Operator { CAFFE_ENFORCE_GT(data.dim(1), 8, "DATA must have more than 8 columns"); // Subtract 8 from the #columns of data for the 4 bytes for scale and 4 // bytes for bias that we use in the fused representation (per row). - const std::vector shape = {lengths.dim(0), data.dim(1) - 8}; + const std::vector shape = {lengths.dim(0), data.dim(1) - 8}; output->Resize(shape); Fused8BitRowwiseEmbeddingLookup( diff --git a/caffe2/operators/lengths_reducer_ops.h b/caffe2/operators/lengths_reducer_ops.h index 8153e088074a..c10b8eba981f 100644 --- a/caffe2/operators/lengths_reducer_ops.h +++ b/caffe2/operators/lengths_reducer_ops.h @@ -47,10 +47,10 @@ class CPUSparseLengthsReductionOp : public Operator { CAFFE_ENFORCE_EQ(1, indicesInput.ndim(), "INDICES must be a vector"); CAFFE_ENFORCE_EQ(1, lengthsInput.ndim(), "LENGTHS must be a vector"); - const TIndex N = dataInput.dim(0); + const int64_t N = dataInput.dim(0); const int D = dataInput.size_from_dim(1); - const TIndex M = lengthsInput.dim(0); - const TIndex indices_size = indicesInput.size(); + const int64_t M = lengthsInput.dim(0); + const int64_t indices_size = indicesInput.size(); auto* output = Output(0); auto shape = dataInput.dims(); diff --git a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h index 58ebe6cb58e8..c912d1b0009b 100644 --- a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h +++ b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h @@ -39,7 +39,7 @@ class SparseLengths8BitsRowwiseOp : public Operator { auto* output = Output(0); auto* scale_bias = Input(SCALE_BIAS).template data(); CAFFE_ENFORCE_EQ(1, lengthsInput.ndim(), "LENGTHS must be a vector"); - const TIndex outputSize = lengthsInput.dim(0); + const int64_t outputSize = lengthsInput.dim(0); auto& indicesInput = Input(INDICES); CAFFE_ENFORCE_EQ( @@ -54,23 +54,23 @@ class SparseLengths8BitsRowwiseOp : public Operator { "the second dim of scale_bias has to be equal to 2"); CAFFE_ENFORCE_EQ(1, indicesInput.ndim(), "INDICES must be a vector"); const IndexType* indices = indicesInput.template data(); - TIndex dataToReduceSize = indicesInput.dim(0); + int64_t dataToReduceSize = 
indicesInput.dim(0); const int* lengths = lengthsInput.template data(); - vector shape = dataInput.dims(); + vector shape = dataInput.dims(); shape[0] = outputSize; output->Resize(shape); const float* w = nullptr; if (USE_WEIGHTS) { w = Input(WEIGHTS).template data(); } - TIndex in_block_size = dataInput.size_from_dim(1); + int64_t in_block_size = dataInput.size_from_dim(1); OutDataT* out = output->template mutable_data(); const uint8_t* input_data = dataInput.template data(); // delegate work to perfkernel that branches based on architecture - const TIndex indices_size = indicesInput.size(); - const TIndex N = dataInput.dim(0); + const int64_t indices_size = indicesInput.size(); + const int64_t N = dataInput.dim(0); EmbeddingLookup( in_block_size, outputSize, @@ -107,7 +107,7 @@ class FloatToRowwiseQuantized8BitsOp : public Operator { auto* scale_bias = Output(SCALE_BIAS); auto* input_data = input.template data(); output->ResizeLike(input); - vector scale_bias_dims = {input.dim(0), 2}; + vector scale_bias_dims = {input.dim(0), 2}; scale_bias->Resize(scale_bias_dims); auto* output_data = output->template mutable_data(); float* scale_bias_data = scale_bias->template mutable_data(); diff --git a/caffe2/operators/lengths_tile_op.cc b/caffe2/operators/lengths_tile_op.cc index d5af0a91bd65..f1b843108750 100644 --- a/caffe2/operators/lengths_tile_op.cc +++ b/caffe2/operators/lengths_tile_op.cc @@ -33,7 +33,7 @@ bool LengthsTileOp::RunOnDevice() { auto src = static_cast(data.raw_data()); auto out = static_cast(output->raw_mutable_data(data.meta())); - for (TIndex i = 0; i < lengths_size; ++i) { + for (int64_t i = 0; i < lengths_size; ++i) { auto length = lengths_data[i]; CAFFE_ENFORCE_GE(length, 0); for (int32_t j = 0; j < length; ++j) { diff --git a/caffe2/operators/lengths_tile_op.cu b/caffe2/operators/lengths_tile_op.cu index aebb33c1460a..b15350a7cf96 100644 --- a/caffe2/operators/lengths_tile_op.cu +++ b/caffe2/operators/lengths_tile_op.cu @@ -50,7 +50,7 @@ bool LengthsTileOp::RunOnDevice() { rowMappingDevice_.Resize(total_length); auto* rowOffsets = rowMappingHost_.mutable_data(); int32_t outputRow = 0; - for (TIndex i = 0; i < lengths_size; i++) { + for (int64_t i = 0; i < lengths_size; i++) { auto length = lengths_data[i]; for (int32_t j = 0; j < length; j++) { rowOffsets[outputRow++] = i * numElementsPerRow; diff --git a/caffe2/operators/lengths_top_k_op.cc b/caffe2/operators/lengths_top_k_op.cc index c871d53caf95..0f8f0708270f 100644 --- a/caffe2/operators/lengths_top_k_op.cc +++ b/caffe2/operators/lengths_top_k_op.cc @@ -21,25 +21,25 @@ bool LengthsTopKOp::RunOnDevice() { int* output_topk_indices_data = output_topk_indices->template mutable_data(); - auto cmp = [](std::pair& lhs, std::pair& rhs) { + auto cmp = [](std::pair& lhs, std::pair& rhs) { return lhs.first > rhs.first || (lhs.first == rhs.first && lhs.second < rhs.second); }; // Sort preserving indices int next_index = 0; - for (TIndex i = 0; i < N; ++i) { + for (int64_t i = 0; i < N; ++i) { // Build a min-heap, the heap element is pair of (value, idx) // the top of the heap is the smallest value std::priority_queue< - std::pair, - std::vector>, + std::pair, + std::vector>, decltype(cmp)> p_queue(cmp); // Maintain the size of heap to be less or equal to k_, so the // heap will hold the k_ largest values - for (TIndex j = 0; j < input_len[i]; ++j) { + for (int64_t j = 0; j < input_len[i]; ++j) { const auto value = X_data[next_index++]; if (p_queue.size() < k_ || value > p_queue.top().first) { p_queue.push(std::make_pair(value, 
j)); @@ -50,7 +50,7 @@ bool LengthsTopKOp::RunOnDevice() { } int last_index = p_queue.size(); - for (TIndex j = 0; j < k_; ++j) { + for (int64_t j = 0; j < k_; ++j) { if (p_queue.size() > 0) { auto& pqElem = p_queue.top(); output_topk_values_data[i * k_ + last_index - j - 1] = pqElem.first; diff --git a/caffe2/operators/local_response_normalization_op.cc b/caffe2/operators/local_response_normalization_op.cc index 81499b4a5d6a..bbbb68872cbf 100644 --- a/caffe2/operators/local_response_normalization_op.cc +++ b/caffe2/operators/local_response_normalization_op.cc @@ -27,7 +27,7 @@ bool LRNOp::RunOnDeviceWithOrderNCHW() { scale_->ResizeLike(X); float* scale_data = scale_->template mutable_data(); math::Set(X.size(), bias_, scale_data, &context_); - Tensor padded_square(vector{C + size_ - 1, H, W}, CPU); + Tensor padded_square(vector{C + size_ - 1, H, W}, CPU); float* padded_square_data = padded_square.template mutable_data(); math::Set(padded_square.size(), 0., padded_square_data, &context_); @@ -91,7 +91,7 @@ bool LRNOp::RunOnDeviceWithOrderNHWC() { scale_->ResizeLike(X); float* scale_data = scale_->template mutable_data(); - Tensor padded_square(vector(1, C + size_ - 1), CPU); + Tensor padded_square(vector(1, C + size_ - 1), CPU); float* padded_square_data = padded_square.template mutable_data(); math::Set(padded_square.size(), 0., padded_square_data, &context_); @@ -146,7 +146,7 @@ bool LRNGradientOp::RunOnDeviceWithOrderNCHW() { const float* dYdata = dY.data(); float* dXdata = dX->template mutable_data(); - Tensor padded_ratio(vector{C + size_ - 1, H, W}, CPU); + Tensor padded_ratio(vector{C + size_ - 1, H, W}, CPU); float* padded_ratio_data = padded_ratio.template mutable_data(); // Compute scale(copied from LRNOp) - reusing padded_ratio math::Set(X.size(), bias_, scale_data, &context_); @@ -183,7 +183,7 @@ bool LRNGradientOp::RunOnDeviceWithOrderNCHW() { math::Set(padded_ratio.size(), 0., padded_ratio_data, &context_); - Tensor accum_ratio(vector{H, W}, CPU); + Tensor accum_ratio(vector{H, W}, CPU); float* accum_ratio_data = accum_ratio.template mutable_data(); const float cache_ratio = 2. 
* alpha_ * beta_ / size_; @@ -243,7 +243,7 @@ bool LRNGradientOp::RunOnDeviceWithOrderNHWC() { scale_ = &local_scale_tensor_; } scale_->ResizeLike(X); - Tensor padded_ratio(vector(1, C + size_ - 1), CPU); + Tensor padded_ratio(vector(1, C + size_ - 1), CPU); float* padded_ratio_data = padded_ratio.template mutable_data(); float* scale_data = scale_->template mutable_data(); // Compute scale(copied from LRNOp) - reusing padded_ratio diff --git a/caffe2/operators/local_response_normalization_op_cudnn.cc b/caffe2/operators/local_response_normalization_op_cudnn.cc index e195b8b5ae44..c1e5683a1c28 100644 --- a/caffe2/operators/local_response_normalization_op_cudnn.cc +++ b/caffe2/operators/local_response_normalization_op_cudnn.cc @@ -38,7 +38,7 @@ class CuDNNLRNOp final : public Operator { cudnnTensorDescriptor_t data_desc_; cudnnLRNDescriptor_t norm_desc_; - vector cudnn_input_dims_; + vector cudnn_input_dims_; const int size_; const float alpha_; @@ -80,7 +80,7 @@ class CuDNNLRNGradientOp final : public Operator { cudnnTensorDescriptor_t data_desc_; cudnnLRNDescriptor_t norm_desc_; - vector cudnn_input_dims_; + vector cudnn_input_dims_; const int size_; const float alpha_; diff --git a/caffe2/operators/lpnorm_op.cc b/caffe2/operators/lpnorm_op.cc index 79c35cd83a21..d4b66ed548a3 100644 --- a/caffe2/operators/lpnorm_op.cc +++ b/caffe2/operators/lpnorm_op.cc @@ -132,10 +132,10 @@ Y: "*(type: bool; default: False)* Whether we calculate norm or averaged_norm.The Lp_averaged_norm(x) is defined as Lp_averaged_norm(x) = LpNorm(x) / size(x)") .TensorInferenceFunction([](const OperatorDef& /* unused */, const vector& in) { - std::vector output_dims(1); + std::vector output_dims(1); output_dims[0] = 1; // 1 return vector{ - CreateTensorShape(vector{output_dims}, in[0].data_type())}; + CreateTensorShape(vector{output_dims}, in[0].data_type())}; }); OPERATOR_SCHEMA(LpNormGradient) diff --git a/caffe2/operators/map_ops.h b/caffe2/operators/map_ops.h index 9d207815cc08..52cf8d1a8a1f 100644 --- a/caffe2/operators/map_ops.h +++ b/caffe2/operators/map_ops.h @@ -200,7 +200,7 @@ class MapSerializer : public BlobSerializerBase { BlobSerializerBase::SerializationAcceptor acceptor) override { CAFFE_ENFORCE(blob.IsType()); const MapType& map_data = blob.template Get(); - TIndex sz = map_data.size(); + int64_t sz = map_data.size(); Tensor key_tensor(CPU); key_tensor.Resize(sz); Tensor value_tensor(CPU); diff --git a/caffe2/operators/matmul_op.h b/caffe2/operators/matmul_op.h index ee5807b0348a..cea885c0c352 100644 --- a/caffe2/operators/matmul_op.h +++ b/caffe2/operators/matmul_op.h @@ -92,7 +92,7 @@ class MatMulOp final : public Operator { protected: // A local vector to cache the output shape so we don't need to recreate // a vector object every time we run Run(). - vector Y_shape_cache_{0, 0}; + vector Y_shape_cache_{0, 0}; int axis_a_{1}; int axis_b_{1}; bool trans_a_; diff --git a/caffe2/operators/numpy_tile_op.h b/caffe2/operators/numpy_tile_op.h index 2413652e3277..88dc3cb85164 100644 --- a/caffe2/operators/numpy_tile_op.h +++ b/caffe2/operators/numpy_tile_op.h @@ -39,7 +39,7 @@ class NumpyTileOp : public Operator { // output tensor. 
Tensor *src = &buffer, *dst = output; src->CopyFrom(input); - vector output_dims(input.dims()); + vector output_dims(input.dims()); for (size_t i = 0; i < repeats.size(); ++i) { if (repeats_data[i] == 1) { continue; diff --git a/caffe2/operators/one_hot_ops.cc b/caffe2/operators/one_hot_ops.cc index 35512680327b..3f7449c51535 100644 --- a/caffe2/operators/one_hot_ops.cc +++ b/caffe2/operators/one_hot_ops.cc @@ -17,9 +17,9 @@ bool BatchOneHotOp::DoRunWithType() { CAFFE_ENFORCE_EQ(lens.size(), D); const auto* lens_data = lens.template data(); - TIndex output_dim = 0; + int64_t output_dim = 0; valsOffsets_.resize(D + 1); - for (TIndex i = 0; i < D; i++) { + for (int64_t i = 0; i < D; i++) { CAFFE_ENFORCE_GE(lens_data[i], 0); valsOffsets_[i] = output_dim; output_dim += lens_data[i]; @@ -34,10 +34,10 @@ bool BatchOneHotOp::DoRunWithType() { const auto* vals_data = vals.template data(); auto* output_data = output->template mutable_data(); - for (TIndex i = 0; i < N; ++i) { - for (TIndex j = 0; j < D; j++) { + for (int64_t i = 0; i < N; ++i) { + for (int64_t j = 0; j < D; j++) { const auto input_val = input_data[i * D + j]; - for (TIndex k = valsOffsets_[j]; k < valsOffsets_[j + 1]; ++k) { + for (int64_t k = valsOffsets_[j]; k < valsOffsets_[j + 1]; ++k) { output_data[k] = vals_data[k] == input_val; } } @@ -50,21 +50,21 @@ bool BatchOneHotOp::DoRunWithType() { vector TensorInferenceForBatchOneHot( const OperatorDef& /* def */, const vector& in) { - std::vector output_dims(2); + std::vector output_dims(2); output_dims[0] = in[0].dims(0); // N output_dims[1] = in[2].dims(0); // vals.size() return vector{ - CreateTensorShape(vector{output_dims}, in[0].data_type())}; + CreateTensorShape(vector{output_dims}, in[0].data_type())}; } vector TensorInferenceForBucketBatchOneHot( const OperatorDef& /* def */, const vector& in) { - std::vector output_dims(2); + std::vector output_dims(2); output_dims[0] = in[0].dims(0); // N output_dims[1] = in[1].dims(0) + in[2].dims(0); // vals.size() + length.size() return vector{ - CreateTensorShape(vector{output_dims}, in[0].data_type())}; + CreateTensorShape(vector{output_dims}, in[0].data_type())}; } OpSchema::Cost CostInferenceForBatchOneHot( @@ -90,11 +90,11 @@ OpSchema::Cost CostInferenceForBatchOneHot( template <> void OneHotOp::DoOneHotOp( - TIndex batch_size, - TIndex index_size, + int64_t batch_size, + int64_t index_size, const Tensor& indices, Tensor* one_hots) { - const TIndex* indices_ptr = indices.template data(); + const int64_t* indices_ptr = indices.template data(); float* one_hots_ptr = one_hots->template mutable_data(); memset(one_hots_ptr, 0, one_hots->nbytes()); for (int i = 0; i < batch_size; ++i) { @@ -122,8 +122,8 @@ bool BatchBucketOneHotOp::RunOnDevice() { boundaries.size(), "The sum of length should be equal to the length of boundaries"); - TIndex output_dim = 0; - for (TIndex i = 0; i < D; i++) { + int64_t output_dim = 0; + for (int64_t i = 0; i < D; i++) { CAFFE_ENFORCE_GT(lens_data[i], 0); // Number of buckets is number of bucket edges + 1 output_dim += (lens_data[i] + 1); @@ -137,26 +137,26 @@ bool BatchBucketOneHotOp::RunOnDevice() { math::Set(output->size(), 0.f, output_data, &context_); - TIndex pos = 0; - for (TIndex i = 0; i < N; i++) { + int64_t pos = 0; + for (int64_t i = 0; i < N; i++) { auto* boundaries_offset = boundaries_data; - TIndex output_offset = 0; + int64_t output_offset = 0; - for (TIndex j = 0; j < D; j++) { + for (int64_t j = 0; j < D; j++) { // here we assume the boundary values for each feature are sorted - TIndex 
lower_bucket_idx = std::lower_bound( + int64_t lower_bucket_idx = std::lower_bound( boundaries_offset, boundaries_offset + lens_data[j], input_data[pos]) - boundaries_offset; - TIndex upper_bucket_idx = std::upper_bound( + int64_t upper_bucket_idx = std::upper_bound( boundaries_offset, boundaries_offset + lens_data[j], input_data[pos]) - boundaries_offset; - TIndex bucket_idx = (lower_bucket_idx + upper_bucket_idx) / 2; + int64_t bucket_idx = (lower_bucket_idx + upper_bucket_idx) / 2; output_data[i * output_dim + output_offset + bucket_idx] = 1.0; boundaries_offset += lens_data[j]; output_offset += (lens_data[j] + 1); diff --git a/caffe2/operators/one_hot_ops.cu b/caffe2/operators/one_hot_ops.cu index e1b6e18daf87..1528f9418823 100644 --- a/caffe2/operators/one_hot_ops.cu +++ b/caffe2/operators/one_hot_ops.cu @@ -6,9 +6,9 @@ namespace caffe2 { __global__ void OneHotOpKernel( - const TIndex batch_size, - const TIndex index_size, - const TIndex* indices, + const int64_t batch_size, + const int64_t index_size, + const int64_t* indices, float* output) { CUDA_1D_KERNEL_LOOP(i, batch_size) { output[i * index_size + indices[i]] = 1.; @@ -17,8 +17,8 @@ __global__ void OneHotOpKernel( template <> void OneHotOp::DoOneHotOp( - TIndex batch_size, - TIndex index_size, + int64_t batch_size, + int64_t index_size, const Tensor& indices, Tensor* output) { float* output_ptr = output->template mutable_data(); @@ -28,7 +28,7 @@ void OneHotOp::DoOneHotOp( CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>( - batch_size, index_size, indices.data(), output_ptr); + batch_size, index_size, indices.data(), output_ptr); } REGISTER_CUDA_OPERATOR(OneHot, OneHotOp); diff --git a/caffe2/operators/one_hot_ops.h b/caffe2/operators/one_hot_ops.h index 5b0e9a2a6656..826a7250f457 100644 --- a/caffe2/operators/one_hot_ops.h +++ b/caffe2/operators/one_hot_ops.h @@ -21,17 +21,17 @@ class OneHotOp final : public Operator { CAFFE_ENFORCE_EQ( indices.ndim(), 1, - "indices input must be 1D tensor of data type TIndex"); + "indices input must be 1D tensor of data type int64_t"); // Index size input must be in CPU context auto& index_size_tensor = this->template Input(1, CPU); CAFFE_ENFORCE_EQ( index_size_tensor.size(), 1, - "index_size_tensor input must be scalar of data type TIndex"); + "index_size_tensor input must be scalar of data type int64_t"); auto batch_size = indices.size(); - auto index_size = *index_size_tensor.template data(); + auto index_size = *index_size_tensor.template data(); auto one_hots = Output(0); one_hots->Resize(batch_size, index_size); auto output_size = one_hots->size(); @@ -45,8 +45,8 @@ class OneHotOp final : public Operator { protected: void DoOneHotOp( - TIndex batch_size, - TIndex index_size, + int64_t batch_size, + int64_t index_size, const Tensor& indices, Tensor* output); }; @@ -72,7 +72,7 @@ class BatchOneHotOp final : public Operator { private: // allows for fast random access to a given dict and is re-used across runs - std::vector valsOffsets_; + std::vector valsOffsets_; }; template diff --git a/caffe2/operators/onnx_while_op.h b/caffe2/operators/onnx_while_op.h index fc6cc2739ebe..dbd510395246 100644 --- a/caffe2/operators/onnx_while_op.h +++ b/caffe2/operators/onnx_while_op.h @@ -117,7 +117,7 @@ class ONNXWhileOp final : public Operator { // Use this to keep track of the sizes of the scan outputs and validate // they're the same across iterations. 
- std::vector> scan_outputs_sizes; + std::vector> scan_outputs_sizes; Workspace *cur_ws = nullptr; bool cur_output_condition = false; @@ -165,8 +165,8 @@ class ONNXWhileOp final : public Operator { dims.insert(dims.begin(), itr); scan_output_target->Extend(1, 2.0f, &context_); - TIndex timestep_size = 1; - for (const TIndex t : scan_outputs_sizes[i]) { + int64_t timestep_size = 1; + for (const int64_t t : scan_outputs_sizes[i]) { timestep_size *= t; } diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index aee7fff4bc33..d1b0824f1b31 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -90,7 +90,7 @@ bool OnnxifiOp::RunOnDevice() { for (unsigned i = 0U; i < OutputSize(); ++i) { auto* output_tensor = Output(i); - std::vector tensor_dims; + std::vector tensor_dims; SetOutputShape(i, &tensor_dims); output_tensor->Resize(tensor_dims); auto& tensor_descriptor = output_desc_.at(i); diff --git a/caffe2/operators/onnxifi_op.h b/caffe2/operators/onnxifi_op.h index f354069d4082..3ebf8be05179 100644 --- a/caffe2/operators/onnxifi_op.h +++ b/caffe2/operators/onnxifi_op.h @@ -38,7 +38,7 @@ class OnnxifiOp final : public Operator { const std::string key = MakeString("output_size_hint_", output_idx); auto output_size_hint = this->template GetRepeatedArgument(key); if (!output_size_hint.empty()) { - std::vector dims; + std::vector dims; for (const auto v : output_size_hint) { dims.push_back(v); } @@ -127,7 +127,7 @@ class OnnxifiOp final : public Operator { bool RunOnDevice() override; private: - void SetOutputShape(int output_idx, std::vector* dims) { + void SetOutputShape(int output_idx, std::vector* dims) { const auto it = output_size_hints_.find(output_idx); if (it != output_size_hints_.end()) { *dims = it->second; @@ -163,7 +163,7 @@ class OnnxifiOp final : public Operator { std::vector> output_shapes_; // output shape hints - std::unordered_map> output_size_hints_; + std::unordered_map> output_size_hints_; }; } // namespace caffe2 diff --git a/caffe2/operators/operator_fallback_gpu_test.cc b/caffe2/operators/operator_fallback_gpu_test.cc index 59d765d58604..964708bc1090 100644 --- a/caffe2/operators/operator_fallback_gpu_test.cc +++ b/caffe2/operators/operator_fallback_gpu_test.cc @@ -36,7 +36,7 @@ TEST(OperatorFallbackTest, IncrementByOneOp) { "IncrementByOne", "", vector{"X"}, vector{"X"}); Workspace ws; - Tensor source_tensor(vector{2, 3}, CPU); + Tensor source_tensor(vector{2, 3}, CPU); for (int i = 0; i < 6; ++i) { source_tensor.mutable_data()[i] = i; } @@ -60,7 +60,7 @@ TEST(OperatorFallbackTest, GPUIncrementByOneOp) { vector{"X"}); op_def.mutable_device_option()->set_device_type(PROTO_CUDA); Workspace ws; - Tensor source_tensor(vector{2, 3}, CPU); + Tensor source_tensor(vector{2, 3}, CPU); for (int i = 0; i < 6; ++i) { source_tensor.mutable_data()[i] = i; } diff --git a/caffe2/operators/order_switch_ops.cc b/caffe2/operators/order_switch_ops.cc index 502295cb00b9..3e0940f91a83 100644 --- a/caffe2/operators/order_switch_ops.cc +++ b/caffe2/operators/order_switch_ops.cc @@ -10,7 +10,7 @@ bool NHWC2NCHWOp::RunOnDevice() { auto ndim = X.ndim(); CAFFE_ENFORCE_GE(ndim, 3); const int N = X.dim32(0), C = X.dim32(ndim - 1); - vector Y_dims(ndim); + vector Y_dims(ndim); Y_dims[0] = N; Y_dims[1] = C; int image_size = 1; @@ -47,7 +47,7 @@ bool NCHW2NHWCOp::RunOnDevice() { auto ndim = X.ndim(); CAFFE_ENFORCE_GE(X.ndim(), 3); const int N = X.dim32(0), C = X.dim32(1); - vector Y_dims(ndim); + vector Y_dims(ndim); Y_dims[0] = N; int image_size = 1; for 
(auto i = 1; i < ndim - 1; ++i) { diff --git a/caffe2/operators/order_switch_ops.cu b/caffe2/operators/order_switch_ops.cu index f63a7d87fa88..d58f01e51863 100644 --- a/caffe2/operators/order_switch_ops.cu +++ b/caffe2/operators/order_switch_ops.cu @@ -98,7 +98,7 @@ bool NHWC2NCHWOp::RunOnDevice() { CAFFE_ENFORCE_GE(ndim, 3); const int N = X.dim32(0); const int C = X.dim32(ndim - 1); - vector Y_dims(ndim); + vector Y_dims(ndim); Y_dims[0] = N; Y_dims[1] = C; int HxW = 1; @@ -134,7 +134,7 @@ bool NCHW2NHWCOp::RunOnDevice() { CAFFE_ENFORCE_GE(X.ndim(), 3); const int N = X.dim32(0); const int C = X.dim32(1); - vector Y_dims(ndim); + vector Y_dims(ndim); Y_dims[0] = N; int HxW = 1; for (auto i = 1; i < ndim - 1; ++i) { diff --git a/caffe2/operators/pack_rnn_sequence_op.h b/caffe2/operators/pack_rnn_sequence_op.h index 74d40f6bfd47..534139afbcc4 100644 --- a/caffe2/operators/pack_rnn_sequence_op.h +++ b/caffe2/operators/pack_rnn_sequence_op.h @@ -30,7 +30,7 @@ class PackRNNSequenceOpBase : public Operator { CAFFE_ENFORCE_GT(values.ndim(), dim_offset); // block_size is the size for each individual feature - TIndex block_size = values.size_from_dim(dim_offset); + int64_t block_size = values.size_from_dim(dim_offset); auto values_vec = values.template data(); auto& lengths = Input(LENGTHS); @@ -47,7 +47,7 @@ class PackRNNSequenceOpBase : public Operator { math::Sum(cols, lengths_vec, &length_sum, &context_); } - vector shape; + vector shape; // the output shape is rows * cols for the pack, // or length_sum for the sequence if (Forward) { diff --git a/caffe2/operators/pack_segments.cc b/caffe2/operators/pack_segments.cc index ab831445e56e..9659d1eddd48 100644 --- a/caffe2/operators/pack_segments.cc +++ b/caffe2/operators/pack_segments.cc @@ -27,7 +27,7 @@ bool PackSegmentsOp::DoRunWithType2() { // Find the length of the longest sequence. 
const T* l = lengths.template data(); T max_length = 0; - TIndex total_length = 0; + int64_t total_length = 0; for (T i = 0; i < lengths.dim(0); ++i) { max_length = std::max(max_length, l[i]); total_length += l[i]; @@ -61,7 +61,7 @@ bool PackSegmentsOp::DoRunWithType2() { bool* presence_mask_data = nullptr; if (return_presence_mask_) { // Shape of presence is batch_size x max_len - std::vector presence_shape{lengths.size(), max_length}; + std::vector presence_shape{lengths.size(), max_length}; presence_mask->Resize(presence_shape); presence_mask_data = presence_mask->template mutable_data(); } @@ -86,8 +86,8 @@ bool PackSegmentsOp::DoRunWithType2() { auto block_size = data.size_from_dim(1); auto block_bytesize = data.itemsize() * block_size; const auto* d = static_cast(data.raw_data()); - TIndex start = 0; - for (TIndex i = 0; i < lengths.dim(0); ++i) { + int64_t start = 0; + for (int64_t i = 0; i < lengths.dim(0); ++i) { context_.CopyItemsSameDevice( data.meta(), l[i] * block_size, @@ -127,7 +127,7 @@ bool UnpackSegmentsOp::DoRunWithType2() { } const T* l = lengths.template data(); - TIndex total_l = std::accumulate(l, l + lengths.dim(0), (TIndex)0); + int64_t total_l = std::accumulate(l, l + lengths.dim(0), (int64_t)0); auto shape = data.dims(); CAFFE_ENFORCE_EQ( @@ -143,8 +143,8 @@ bool UnpackSegmentsOp::DoRunWithType2() { auto block_size = data.size_from_dim(2); auto block_bytesize = data.itemsize() * block_size; const auto* d = static_cast(data.raw_data()); - TIndex start = 0; - for (TIndex i = 0; i < lengths.dim(0); ++i) { + int64_t start = 0; + for (int64_t i = 0; i < lengths.dim(0); ++i) { context_.CopyItemsSameDevice( data.meta(), l[i] * block_size, diff --git a/caffe2/operators/pack_segments.h b/caffe2/operators/pack_segments.h index 8d16bb658c17..6315c5906512 100644 --- a/caffe2/operators/pack_segments.h +++ b/caffe2/operators/pack_segments.h @@ -45,7 +45,7 @@ class PackSegmentsOp final : public Operator { INPUT_TAGS(LENGTHS, DATA); private: - TIndex max_length_; + int64_t max_length_; bool pad_minf_; float padding_; bool return_presence_mask_; @@ -80,7 +80,7 @@ class UnpackSegmentsOp final : public Operator { INPUT_TAGS(LENGTHS, DATA); private: - TIndex max_length_; + int64_t max_length_; Tensor dev_buffer_{Context::GetDeviceType()}; Tensor dev_lengths_prefix_sum_{Context::GetDeviceType()}; Tensor dev_max_length_{Context::GetDeviceType()}; diff --git a/caffe2/operators/partition_ops.h b/caffe2/operators/partition_ops.h index 89762e387160..94bd1e6150ce 100644 --- a/caffe2/operators/partition_ops.h +++ b/caffe2/operators/partition_ops.h @@ -41,7 +41,7 @@ class GatherByKeyOp : public Operator { const auto& in0Shape = Input(1).dims(); CAFFE_ENFORCE_GE(in0Shape.size(), 1); - vector outShape(keysShape); + vector outShape(keysShape); outShape.insert(outShape.end(), in0Shape.begin() + 1, in0Shape.end()); CAFFE_ENFORCE_GE(outShape.size(), 1); @@ -122,10 +122,10 @@ class PartitionOpBase : public Operator { CAFFE_ENFORCE_GT(partitions, 0, "Invalid number of partitions"); auto& main_input = Input(mainInputIndex); - TIndex size = main_input.size(); + int64_t size = main_input.size(); const Index* data = main_input.template data(); counts_.assign(partitions, 0); - for (TIndex p = 0; p < size; p++) { + for (int64_t p = 0; p < size; p++) { int shard = moduloPartition(data[p], partitions); ++counts_[shard]; } @@ -158,7 +158,7 @@ class PartitionOpBase : public Operator { block_sizes_[i] = input.size_from_dim(main_input.ndim()); metas_[i] = input.meta(); // shape = partition_size + suffix of 
input dims - vector shape( + vector shape( input.dims().begin() + main_input.ndim() - 1, input.dims().end()); for (int j = 0; j < partitions; ++j) { int out_idx = i + j * inputSize; @@ -170,9 +170,9 @@ class PartitionOpBase : public Operator { } counts_.assign(partitions, 0); - for (TIndex p = 0; p < size; p++) { + for (int64_t p = 0; p < size; p++) { int shard = moduloPartition(data[p], partitions); - TIndex idx = counts_[shard]++; + int64_t idx = counts_[shard]++; // special case first input static_cast(out_datas_[shard * inputSize + mainInputIndex])[idx] = @@ -196,8 +196,8 @@ class PartitionOpBase : public Operator { bool pack_first_input_; // use member fields to reuse memory - vector counts_; - vector block_sizes_; + vector counts_; + vector block_sizes_; vector metas_; vector raw_datas_; vector out_datas_; @@ -268,11 +268,11 @@ class LengthsPartitionOp : public PartitionOpBase { // Compute lengths after sharding auto& main_input = Input(1); - TIndex size = main_input.size(); + int64_t size = main_input.size(); const Index* data = main_input.template data(); auto& length_input = Input(0); - TIndex elements = length_input.size(); + int64_t elements = length_input.size(); const int32_t* lengths_data = length_input.template data(); out_length_.resize(partitions); for (int i = 0; i < partitions; ++i) { diff --git a/caffe2/operators/perplexity_op.cc b/caffe2/operators/perplexity_op.cc index 028a6077cc86..24b4e4fed1b9 100644 --- a/caffe2/operators/perplexity_op.cc +++ b/caffe2/operators/perplexity_op.cc @@ -10,7 +10,7 @@ bool PerplexityOp::RunOnDevice() { DCHECK_EQ(X.ndim(), 1); int N = X.dim32(0); - Y->Resize(vector()); + Y->Resize(vector()); const auto* Xdata = X.data(); float perplexity = 1.0; diff --git a/caffe2/operators/perplexity_op.cu b/caffe2/operators/perplexity_op.cu index 230bdb1601cb..83261471259d 100644 --- a/caffe2/operators/perplexity_op.cu +++ b/caffe2/operators/perplexity_op.cu @@ -25,7 +25,7 @@ bool PerplexityOp::RunOnDevice() { DCHECK_EQ(X.ndim(), 1); int N = X.dim32(0); - Y->Resize(vector()); + Y->Resize(vector()); float* Ydata = Y->template mutable_data(); const float* Xdata = X.data(); diff --git a/caffe2/operators/piecewise_linear_transform_op.cu b/caffe2/operators/piecewise_linear_transform_op.cu index 8dc2d4e02285..1d3e8503efeb 100644 --- a/caffe2/operators/piecewise_linear_transform_op.cu +++ b/caffe2/operators/piecewise_linear_transform_op.cu @@ -103,14 +103,14 @@ __global__ void PieceWiseLinearTransformBinaryKernel2( template <> void PiecewiseLinearTransformOp::setUpTensors( - TIndex& num_func_per_group, - TIndex& num_group, - TIndex M) { + int64_t& num_func_per_group, + int64_t& num_group, + int64_t M) { if (transform_param_from_arg_) { if (!gpu_copied_) { - TIndex num_bounds; - TIndex num_slopes; - TIndex num_intercepts; + int64_t num_bounds; + int64_t num_slopes; + int64_t num_intercepts; CAFFE_ENFORCE_EQ(InputSize(), 1); @@ -162,9 +162,9 @@ void PiecewiseLinearTransformOp::setUpTensors( gpu_copied_ = true; } } else { - TIndex num_bounds; - TIndex num_slopes; - TIndex num_intercepts; + int64_t num_bounds; + int64_t num_slopes; + int64_t num_intercepts; CAFFE_ENFORCE_EQ(InputSize(), 4); auto& bounds_input = Input(BOUNDS); auto& slopes_input = Input(SLOPES); @@ -196,12 +196,12 @@ bool PiecewiseLinearTransformOp::TransformGeneral() { auto& X = Input(0); auto* Y = Output(0); CAFFE_ENFORCE_EQ(X.ndim(), 2); - TIndex N = X.dim32(0); - TIndex M = X.dim32(1); + int64_t N = X.dim32(0); + int64_t M = X.dim32(1); Y->ResizeLike(X); - TIndex num_func_per_group; - TIndex 
num_group; + int64_t num_func_per_group; + int64_t num_group; setUpTensors(num_func_per_group, num_group, M); @@ -228,15 +228,15 @@ bool PiecewiseLinearTransformOp::TransformBinary() { auto& X = Input(0); auto* Y = Output(0); CAFFE_ENFORCE(X.ndim() == 1 || X.ndim() == 2); - TIndex N = X.dim32(0); - TIndex M = X.ndim() == 2 ? X.dim32(1) : 1; + int64_t N = X.dim32(0); + int64_t M = X.ndim() == 2 ? X.dim32(1) : 1; CAFFE_ENFORCE( M == 1 || M == 2, "If binary is set to true, the input must be Nx2 or Nx1 tensor"); Y->ResizeLike(X); - TIndex num_func_per_group; - TIndex num_group; + int64_t num_func_per_group; + int64_t num_group; setUpTensors(num_func_per_group, num_group, M); diff --git a/caffe2/operators/piecewise_linear_transform_op.h b/caffe2/operators/piecewise_linear_transform_op.h index 1b552b0ae766..19edaba5ed9f 100644 --- a/caffe2/operators/piecewise_linear_transform_op.h +++ b/caffe2/operators/piecewise_linear_transform_op.h @@ -32,11 +32,11 @@ class PiecewiseLinearTransformOp final : public Operator { // num_group: The number of groups of linear functions. Each group is for // transforming one column of predictions. void InferNumFunctionsPerGroup( - const TIndex num_bounds, - const TIndex num_slopes, - const TIndex num_intercepts, - TIndex* num_func_per_group, - TIndex* num_group) { + const int64_t num_bounds, + const int64_t num_slopes, + const int64_t num_intercepts, + int64_t* num_func_per_group, + int64_t* num_group) { CAFFE_ENFORCE_EQ(num_slopes, num_intercepts); // This is based on the facts: @@ -54,10 +54,10 @@ class PiecewiseLinearTransformOp final : public Operator { bool CheckBoundsSorted( const T* bounds, - const TIndex num_bounds_per_group, - const TIndex num_group) { + const int64_t num_bounds_per_group, + const int64_t num_group) { const T* start = bounds; - for (TIndex i = 0; i < num_group; i++) { + for (int64_t i = 0; i < num_group; i++) { if (!std::is_sorted(start, start + num_bounds_per_group)) { return false; } @@ -77,8 +77,8 @@ class PiecewiseLinearTransformOp final : public Operator { good_param == 0 || good_param == 3, "bounds, slopes, intercepts must be all set or all not set"); if (good_param == 3) { - TIndex num_func_per_group; - TIndex num_group; + int64_t num_func_per_group; + int64_t num_group; InferNumFunctionsPerGroup( bounds_from_arg_.size(), slopes_from_arg_.size(), @@ -94,17 +94,17 @@ class PiecewiseLinearTransformOp final : public Operator { return good_param == 3; } - void setUpTensors(TIndex& num_func_per_group, TIndex& num_group, TIndex M); + void setUpTensors(int64_t& num_func_per_group, int64_t& num_group, int64_t M); void GetTransParamData( const T** bounds, const T** slopes, const T** intercepts, - TIndex* num_func_per_group, - TIndex* num_group) { - TIndex num_bounds; - TIndex num_slopes; - TIndex num_intercepts; + int64_t* num_func_per_group, + int64_t* num_group) { + int64_t num_bounds; + int64_t num_slopes; + int64_t num_intercepts; if (transform_param_from_arg_) { CAFFE_ENFORCE_EQ(InputSize(), 1); @@ -134,8 +134,8 @@ class PiecewiseLinearTransformOp final : public Operator { auto& X = Input(0); auto* Y = Output(0); CAFFE_ENFORCE_EQ(X.ndim(), 2); - TIndex N = X.dim32(0); - TIndex M = X.dim32(1); + int64_t N = X.dim32(0); + int64_t M = X.dim32(1); Y->ResizeLike(X); const auto* Xdata = X.template data(); T* Ydata = Y->template mutable_data(); @@ -143,17 +143,17 @@ class PiecewiseLinearTransformOp final : public Operator { const T* bounds; const T* slopes; const T* intercepts; - TIndex num_func_per_group; - TIndex num_group; + int64_t 
num_func_per_group; + int64_t num_group; GetTransParamData( &bounds, &slopes, &intercepts, &num_func_per_group, &num_group); CAFFE_ENFORCE_EQ(num_group, M); - for (TIndex j = 0; j < M; ++j) { + for (int64_t j = 0; j < M; ++j) { const T* bounds_group = bounds + j * (num_func_per_group + 1); const T* slopes_group = slopes + j * num_func_per_group; const T* intercepts_group = intercepts + j * num_func_per_group; - for (TIndex i = 0; i < N; ++i) { + for (int64_t i = 0; i < N; ++i) { Ydata[i * M + j] = PiecewiseLinearTransform( Xdata[i * M + j], bounds_group, @@ -169,8 +169,8 @@ class PiecewiseLinearTransformOp final : public Operator { auto& X = Input(PREDICTIONS); auto* Y = Output(0); CAFFE_ENFORCE(X.ndim() == 1 || X.ndim() == 2); - TIndex N = X.dim32(0); - TIndex M = X.ndim() == 2 ? X.dim32(1) : 1; + int64_t N = X.dim32(0); + int64_t M = X.ndim() == 2 ? X.dim32(1) : 1; CAFFE_ENFORCE( M == 1 || M == 2, "If binary is set to true, the input must be Nx2 or Nx1 tensor"); @@ -181,19 +181,19 @@ class PiecewiseLinearTransformOp final : public Operator { const T* bounds; const T* slopes; const T* intercepts; - TIndex num_func_per_group; - TIndex num_group; + int64_t num_func_per_group; + int64_t num_group; GetTransParamData( &bounds, &slopes, &intercepts, &num_func_per_group, &num_group); CAFFE_ENFORCE_EQ(num_group, 1); if (M == 1) { - for (TIndex i = 0; i < N; ++i) { + for (int64_t i = 0; i < N; ++i) { Ydata[i] = PiecewiseLinearTransform( Xdata[i], bounds, slopes, intercepts, num_func_per_group); } } else { - for (TIndex i = 0; i < N; ++i) { + for (int64_t i = 0; i < N; ++i) { Ydata[i * M + 1] = PiecewiseLinearTransform( Xdata[i * M + 1], bounds, slopes, intercepts, num_func_per_group); Ydata[i * M] = 1.0f - Ydata[i * M + 1]; @@ -208,7 +208,7 @@ class PiecewiseLinearTransformOp final : public Operator { const T* bounds, const T* slopes, const T* intercepts, - const TIndex num_func_per_group) { + const int64_t num_func_per_group) { T y = 0; // deal with samples out of bounds // make it the same as the upper/lower bound value diff --git a/caffe2/operators/pool_op_cudnn.cu b/caffe2/operators/pool_op_cudnn.cu index eda02a220301..7552f5b29581 100644 --- a/caffe2/operators/pool_op_cudnn.cu +++ b/caffe2/operators/pool_op_cudnn.cu @@ -285,7 +285,7 @@ class CuDNNPoolOp : public ConvPoolOpBase { } protected: - vector cudnn_input_dims_; + vector cudnn_input_dims_; CuDNNWrapper cudnn_wrapper_; cudnnTensorDescriptor_t bottom_desc_; @@ -498,7 +498,7 @@ class CuDNNPoolGradientOp : public ConvPoolOpBase { } protected: - vector cudnn_input_dims_; + vector cudnn_input_dims_; CuDNNWrapper cudnn_wrapper_; cudnnTensorDescriptor_t bottom_desc_; diff --git a/caffe2/operators/reducer_functors.h b/caffe2/operators/reducer_functors.h index 6d357e1b9f99..dd9a858e9c31 100644 --- a/caffe2/operators/reducer_functors.h +++ b/caffe2/operators/reducer_functors.h @@ -27,8 +27,8 @@ template class SumRangeReducer { public: void operator()( - const TIndex block_size, - const TIndex blocks, + const int64_t block_size, + const int64_t blocks, const T* in, T* out, CPUContext* /*context*/) { @@ -42,15 +42,15 @@ template class SumRangeReducerGradient { public: void operator()( - const TIndex block_size, - const TIndex blocks, + const int64_t block_size, + const int64_t blocks, const T* segment_grad, T* data_grad, const T* /*data_in*/, // unused const T* /*data_out*/, // unused Context* context) { // do we have some op that does it smartly with minimum number of memcpy? 
- for (TIndex i = 0; i < blocks; ++i) { + for (int64_t i = 0; i < blocks; ++i) { context->template CopySameDevice( block_size, segment_grad, data_grad + block_size * i); } @@ -78,8 +78,8 @@ template class LogSumExpRangeReducer { public: void operator()( - const TIndex block_size, - const TIndex blocks, + const int64_t block_size, + const int64_t blocks, const T* in, T* out, CPUContext* /*context*/) { @@ -102,8 +102,8 @@ template class LogSumExpRangeReducerGradient { public: void operator()( - const TIndex block_size, - const TIndex blocks, + const int64_t block_size, + const int64_t blocks, const T* segment_grad, // GO T* data_grad, // GI const T* data_in, // I @@ -140,8 +140,8 @@ template class LogMeanExpRangeReducer { public: void operator()( - const TIndex block_size, - const TIndex blocks, + const int64_t block_size, + const int64_t blocks, const T* in, T* out, CPUContext* /*context*/) { @@ -164,8 +164,8 @@ template class LogMeanExpRangeReducerGradient { public: void operator()( - const TIndex block_size, - const TIndex blocks, + const int64_t block_size, + const int64_t blocks, const T* segment_grad, // GO T* data_grad, // GI const T* data_in, // I @@ -202,8 +202,8 @@ template class MeanRangeReducer { public: void operator()( - const TIndex block_size, - const TIndex blocks, + const int64_t block_size, + const int64_t blocks, const T* in, T* out, CPUContext* /*context*/) { @@ -221,8 +221,8 @@ template class MeanRangeReducerGradient { public: void operator()( - const TIndex block_size, - const TIndex blocks, + const int64_t block_size, + const int64_t blocks, const T* segment_grad, // GO T* data_grad, // GI const T* /*data_in*/, // I @@ -261,8 +261,8 @@ template class MaxRangeReducer { public: void operator()( - const TIndex block_size, - const TIndex blocks, + const int64_t block_size, + const int64_t blocks, const T* in, T* out, CPUContext* /*context*/) { @@ -280,8 +280,8 @@ template class MaxRangeReducerGradient { public: void operator()( - const TIndex block_size, - const TIndex blocks, + const int64_t block_size, + const int64_t blocks, const T* segment_grad, // GO T* data_grad, // GI const T* data_in, // I @@ -329,13 +329,13 @@ class BaseReducer { static constexpr int kInputCount = 1; struct Meta { - TIndex block_size; - vector block_shape; + int64_t block_size; + vector block_shape; bool first_dim; explicit Meta(bool first = true) : first_dim(first) {} - void computeMeta(const std::vector& dims, int skip_dims) { + void computeMeta(const std::vector& dims, int skip_dims) { first_dim ? block_shape.assign(dims.begin() + skip_dims, dims.end()) : block_shape.assign(dims.begin(), dims.end() - skip_dims); block_size = first_dim ? 
size_from_dim_(skip_dims, dims) @@ -348,13 +348,13 @@ class BaseReducer { computeMeta(dims, skip_dims); } - void appendOutputShape(vector* output_shape) { + void appendOutputShape(vector* output_shape) { output_shape->insert( output_shape->end(), block_shape.begin(), block_shape.end()); } - vector getOutputShape(const TensorShape& in, int skip_dims) { - vector dims(in.dims().begin(), in.dims().end()); + vector getOutputShape(const TensorShape& in, int skip_dims) { + vector dims(in.dims().begin(), in.dims().end()); computeMeta(dims, skip_dims); return block_shape; } @@ -389,8 +389,8 @@ class BaseReducerGradient { } struct Meta { - TIndex block_size; - vector block_shape; + int64_t block_size; + vector block_shape; bool first_dim; Meta(const Tensor& out_grad, int skip_dims, bool first_dim = true) @@ -409,7 +409,7 @@ class BaseReducerGradient { Tensor* /*input_grad*/, // optional grad to populate int /*skip_dims*/) {} - void appendGradShape(vector* output_shape) { + void appendGradShape(vector* output_shape) { output_shape->insert( output_shape->end(), block_shape.begin(), block_shape.end()); } @@ -438,7 +438,7 @@ class SumReducer : public BaseReducer { void process( const Meta& meta, const T* in, - TIndex /*offset*/, + int64_t /*offset*/, CPUContext* context) { if (meta.first_dim) { math::AxpyFixedSize( @@ -469,7 +469,7 @@ class SumReducerGradient : public BaseReducerGradient { void fillGrad( const Meta& meta, T* data_grad, - TIndex offset, + int64_t offset, Context* context, const int length) { if (FixedSize == 1) { // static if @@ -535,7 +535,7 @@ class WeightedSumReducer : public BaseReducer { } template void - process(const Meta& meta, const T* in, TIndex offset, CPUContext* context) { + process(const Meta& meta, const T* in, int64_t offset, CPUContext* context) { CAFFE_ENFORCE( meta.first_dim, "WeightedSumReducer implemented only for " @@ -596,7 +596,7 @@ class WeightedSumReducerGradient : public BaseReducerGradient { void fillGrad( const Meta& meta, T* data_grad, - TIndex offset, + int64_t offset, Context* context, const int /*length*/) { math::ScaleFixedSize( @@ -610,7 +610,7 @@ class WeightedSumReducerGradient : public BaseReducerGradient { const Meta& meta, const T* data, T* data_grad, - TIndex offset, + int64_t offset, Context* context, const int /*length*/) { math::ScaleFixedSize( @@ -667,7 +667,7 @@ class MeanReducer : public BaseReducer { void process( const Meta& meta, const T* in, - TIndex /*offset*/, + int64_t /*offset*/, CPUContext* context) { if (meta.first_dim) { math::AxpyFixedSize( @@ -716,7 +716,7 @@ class MeanReducerGradient : public BaseReducerGradient { void fillGrad( const Meta& meta, T* data_grad, - TIndex offset, + int64_t offset, Context* context, const int length) { CAFFE_ENFORCE_GT(length, 0, "Segment length must be > 0"); @@ -765,7 +765,7 @@ class MaxReducer : public BaseReducer { void process( const Meta& meta, const T* in, - TIndex /*offset*/, + int64_t /*offset*/, CPUContext* context) { CAFFE_ENFORCE( meta.first_dim, @@ -810,10 +810,10 @@ class MaxReducerGradient : public BaseReducerGradient { const T* data, T* data_grad, const T* forward_output, - TIndex /*offset*/, + int64_t /*offset*/, Context* /*context*/, const int /*length*/) { - for (TIndex i = 0; i < meta.block_size; ++i) { + for (int64_t i = 0; i < meta.block_size; ++i) { data_grad[i] = data[i] == forward_output[i] ? 
s_grad_[i] : 0; } } diff --git a/caffe2/operators/reduction_front_back_ops.h b/caffe2/operators/reduction_front_back_ops.h index 3c66fa4fe202..98fa71c1831a 100644 --- a/caffe2/operators/reduction_front_back_ops.h +++ b/caffe2/operators/reduction_front_back_ops.h @@ -32,7 +32,7 @@ class SumReduceDimsOp final : public Operator { num_reduce_dims_ >= 0 && num_reduce_dims_ <= X.dims().size(), "For N-dim input tensor, support num_reduce_dims in range [0, N]."); - vector output_shape; + vector output_shape; int start_index = FIRSTDIMS ? num_reduce_dims_ : 0; int end_index = FIRSTDIMS ? X.dims().size() : X.dims().size() - num_reduce_dims_; @@ -109,13 +109,13 @@ class SumReduceDimsGradientOp final : public Operator { // the shape of the input to the data tensor. This made the backward // computation incompatible with old models. To fix this, we check // the dimension and type of Input(1). - if (input_1.ndim() == 1 && input_1.template IsType()) { + if (input_1.ndim() == 1 && input_1.template IsType()) { // Input(1) is the shape of the input shape_.CopyFrom(input_1); // Copy first dims - vector output_shape( - shape_.template data(), - shape_.template data() + shape_.size()); + vector output_shape( + shape_.template data(), + shape_.template data() + shape_.size()); dX->Resize(output_shape); } else { // Input(1) is data tensor X @@ -183,7 +183,7 @@ class MaxReduceDimsOp final : public Operator { const int cols = FIRSTDIMS ? X.size_from_dim(num_reduce_dims_) : X.size_from_dim(X.ndim() - num_reduce_dims_); - vector output_shape; + vector output_shape; int start_index = FIRSTDIMS ? num_reduce_dims_ : 0; int end_index = FIRSTDIMS ? X.dims().size() : X.dims().size() - num_reduce_dims_; diff --git a/caffe2/operators/reduction_ops.h b/caffe2/operators/reduction_ops.h index 90ac549934c2..6c867ba6c3e8 100644 --- a/caffe2/operators/reduction_ops.h +++ b/caffe2/operators/reduction_ops.h @@ -24,7 +24,7 @@ class SumElementsOp : public Operator { bool RunOnDevice() override { auto& X = Input(0); auto* sum = Output(0); - sum->Resize(vector()); + sum->Resize(vector()); T* data = sum->template mutable_data(); @@ -58,7 +58,7 @@ class SumElementsIntOp : public Operator { bool RunOnDevice() override { auto& X = Input(0); auto* sum = Output(0); - sum->Resize(vector()); + sum->Resize(vector()); T* data = sum->template mutable_data(); math::Sum( X.size(), X.template data(), data, &context_, &scratch_); @@ -105,7 +105,7 @@ class SumSqrElementsOp : public Operator { bool average = this->template GetSingleArgument("average", false); auto& X = Input(0); auto* sum = Output(0); - sum->Resize(vector()); + sum->Resize(vector()); math::SumSqr( X.size(), X.template data(), diff --git a/caffe2/operators/replace_nan_op.cc b/caffe2/operators/replace_nan_op.cc index a0c7b271567f..0e8193a0d2ed 100644 --- a/caffe2/operators/replace_nan_op.cc +++ b/caffe2/operators/replace_nan_op.cc @@ -6,10 +6,10 @@ template <> template void ReplaceNaNOp::ReplaceNaN( const T& value, - const TIndex size, + const int64_t size, const T* X, T* Y) { - for (TIndex i = 0; i < size; i++) { + for (int64_t i = 0; i < size; i++) { if (std::isnan(X[i])) { Y[i] = value; } else { diff --git a/caffe2/operators/replace_nan_op.cu b/caffe2/operators/replace_nan_op.cu index e84fb3e62cf9..f6cdbed5b35d 100644 --- a/caffe2/operators/replace_nan_op.cu +++ b/caffe2/operators/replace_nan_op.cu @@ -6,7 +6,7 @@ namespace caffe2 { namespace { template __global__ void -replace_nan_kernel(const T value, const TIndex size, const T* X, T* Y) { +replace_nan_kernel(const T value, const 
int64_t size, const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, size) { if (isnan(X[i])) { Y[i] = value; @@ -21,7 +21,7 @@ template <> template void ReplaceNaNOp::ReplaceNaN( const T& value, - const TIndex size, + const int64_t size, const T* X, T* Y) { replace_nan_kernel<<< diff --git a/caffe2/operators/replace_nan_op.h b/caffe2/operators/replace_nan_op.h index 6256aef807a9..ee62c45a6d28 100644 --- a/caffe2/operators/replace_nan_op.h +++ b/caffe2/operators/replace_nan_op.h @@ -20,7 +20,7 @@ class ReplaceNaNOp final : public Operator { } template - void ReplaceNaN(const T& value, const TIndex size, const T* X, T* Y); + void ReplaceNaN(const T& value, const int64_t size, const T* X, T* Y); template bool DoRunWithType() { diff --git a/caffe2/operators/reshape_op_gpu_test.cc b/caffe2/operators/reshape_op_gpu_test.cc index 3786e0a4245f..3537ab69d058 100644 --- a/caffe2/operators/reshape_op_gpu_test.cc +++ b/caffe2/operators/reshape_op_gpu_test.cc @@ -12,7 +12,7 @@ CAFFE2_DECLARE_string(caffe_test_root); namespace caffe2 { static void AddConstInput( - const vector& shape, + const vector& shape, const float value, const string& name, Workspace* ws) { @@ -39,7 +39,7 @@ TEST(ReshapeOpGPUTest, testReshapeWithScalar) { def.add_output("OldShape"); def.add_arg()->CopyFrom(MakeArgument("shape", vector{1})); def.mutable_device_option()->set_device_type(PROTO_CUDA); - AddConstInput(vector(), 3.14, "X", &ws); + AddConstInput(vector(), 3.14, "X", &ws); // execute the op unique_ptr op(CreateOperator(def, &ws)); EXPECT_TRUE(op->Run()); diff --git a/caffe2/operators/reverse_packed_segs_op.h b/caffe2/operators/reverse_packed_segs_op.h index f0bdbcf48211..6c79f1755863 100644 --- a/caffe2/operators/reverse_packed_segs_op.h +++ b/caffe2/operators/reverse_packed_segs_op.h @@ -63,10 +63,10 @@ class ReversePackedSegsOp final : public Operator { context_.FinishDeviceComputation(); T* rev_data_ptr = output->template mutable_data(); - for (TIndex i = 0; i < batch_size; i++) { + for (int64_t i = 0; i < batch_size; i++) { const auto& seg_length = lengths_host[i]; CAFFE_ENFORCE_LE(seg_length, max_length); - TIndex j = 0; + int64_t j = 0; for (; j < seg_length; j++) { const T* data_block_ptr = data_ptr + (j * batch_size + i) * block_size; T* rev_data_block_ptr = diff --git a/caffe2/operators/rnn/hip/recurrent_op_miopen.h b/caffe2/operators/rnn/hip/recurrent_op_miopen.h index 13dc3abff1d5..8a861d38def5 100644 --- a/caffe2/operators/rnn/hip/recurrent_op_miopen.h +++ b/caffe2/operators/rnn/hip/recurrent_op_miopen.h @@ -56,7 +56,7 @@ class RecurrentBaseOp : public Operator { std::unique_ptr> xDesc_; std::unique_ptr> yDesc_; - std::vector cachedInputDims_; + std::vector cachedInputDims_; size_t reserveNbytes_; size_t miopenWsNbytes_; diff --git a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h index 86b7a69fe0d1..98675cea858d 100644 --- a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h +++ b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h @@ -31,7 +31,7 @@ class RecurrentNetworkBlobFetcherOp final : public Operator { std::vector blob_names_vector = {}; - for (TIndex i = 0; i < stepWorkspaces.size(); i++) { + for (int64_t i = 0; i < stepWorkspaces.size(); i++) { Workspace* currentStepWorkspace = stepWorkspaces[i].get(); std::vector blob_names = currentStepWorkspace->LocalBlobs(); diff --git a/caffe2/operators/rnn/recurrent_network_op.h b/caffe2/operators/rnn/recurrent_network_op.h index 00595198b6db..2421bc44263a 100644 --- 
a/caffe2/operators/rnn/recurrent_network_op.h +++ b/caffe2/operators/rnn/recurrent_network_op.h @@ -900,7 +900,7 @@ class RNNApplyLinkOp : public Operator { auto* external_out = Output(1); CAFFE_ENFORCE_GT(external.size(), 0); - const TIndex externalTimestepSize = external.size() / external.dim(0); + const int64_t externalTimestepSize = external.size() / external.dim(0); auto* externalData = external_out->template mutable_data() + (t + offset_) * externalTimestepSize; auto internalDims = external_out->dims(); diff --git a/caffe2/operators/rnn/recurrent_op_cudnn.h b/caffe2/operators/rnn/recurrent_op_cudnn.h index 5c70b5262029..f3947901b619 100644 --- a/caffe2/operators/rnn/recurrent_op_cudnn.h +++ b/caffe2/operators/rnn/recurrent_op_cudnn.h @@ -56,7 +56,7 @@ class RecurrentBaseOp : public Operator { std::unique_ptr> xDesc_; std::unique_ptr> yDesc_; - std::vector cachedInputDims_; + std::vector cachedInputDims_; size_t reserveNbytes_; size_t cudnnWsNbytes_; diff --git a/caffe2/operators/roi_align_op_gpu_test.cc b/caffe2/operators/roi_align_op_gpu_test.cc index eca60a6006a2..2647a97d6f0b 100644 --- a/caffe2/operators/roi_align_op_gpu_test.cc +++ b/caffe2/operators/roi_align_op_gpu_test.cc @@ -12,7 +12,7 @@ namespace { template void AddConstInput( - const vector& shape, + const vector& shape, const float value, const string& name, Context* context, @@ -27,14 +27,14 @@ void AddConstInput( template void AddInput( - const vector& shape, + const vector& shape, const vector& values, const string& name, Workspace* ws); template <> void AddInput( - const vector& shape, + const vector& shape, const vector& values, const string& name, Workspace* ws) { @@ -48,7 +48,7 @@ void AddInput( template <> void AddInput( - const vector& shape, + const vector& shape, const vector& values, const string& name, Workspace* ws) { @@ -102,10 +102,10 @@ void CreateAndRun( vector features(N * C * H * W); std::iota(features.begin(), features.end(), 0); // utils::AsEArrXt(features) /= features.size(); - AddInput(vector{N, C, H, W}, features, "X", &ws); + AddInput(vector{N, C, H, W}, features, "X", &ws); const int n_rois = test_params.n_rois; const vector& rois = test_params.rois_array; - AddInput(vector{n_rois, 5}, rois, "R", &ws); + AddInput(vector{n_rois, 5}, rois, "R", &ws); } else { const int N = 2; const int C = 3; @@ -114,7 +114,7 @@ void CreateAndRun( vector features(N * C * H * W); std::iota(features.begin(), features.end(), 0); // utils::AsEArrXt(features) /= features.size(); - AddInput(vector{N, C, H, W}, features, "X", &ws); + AddInput(vector{N, C, H, W}, features, "X", &ws); vector rois{0, 0, 0, 79, 59, 0, 0, 5.0005703f, 52.63237f, 43.69501495f, 0, 24.13628387f, 7.51243401f, 79, 46.06628418f, @@ -124,7 +124,7 @@ void CreateAndRun( 0, 23.57396317f, 29.98791885f, 79, 59, 0, 0, 41.90219116f, 79, 59, 0, 0, 23.30098343f, 79, 59}; - AddInput(vector{9, 5}, rois, "R", &ws); + AddInput(vector{9, 5}, rois, "R", &ws); } std::vector> ops; diff --git a/caffe2/operators/segment_reduction_op.h b/caffe2/operators/segment_reduction_op.h index fa0f64f1eca2..9e7ab6d60401 100644 --- a/caffe2/operators/segment_reduction_op.h +++ b/caffe2/operators/segment_reduction_op.h @@ -19,7 +19,7 @@ class BaseInputAccessor { } inline const TData* - getBlockPtr(TIndex in_block_size, TIndex idx, TIndex /* blocks */ = 1) { + getBlockPtr(int64_t in_block_size, int64_t idx, int64_t /* blocks */ = 1) { return static_cast(data_) + in_block_size * idx; } @@ -82,12 +82,12 @@ class AbstractSortedSegmentRangeOp : public Operator { return true; } - TIndex 
block_size = dataInput.size() / N; + int64_t block_size = dataInput.size() / N; // Assume the segments are sorted and there are no gaps CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps"); - for (TIndex i = 0; i < N;) { - TIndex start = i; + for (int64_t i = 0; i < N;) { + int64_t start = i; for (++i; i < N && s_ids[start] == s_ids[i]; ++i) ; @@ -135,7 +135,7 @@ class AbstractSortedSegmentRangeGradientOp : public Operator { auto* data_grads = Output(0); CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector"); - TIndex N = segment_ids.dim(0); + int64_t N = segment_ids.dim(0); const SIndex* s_ids = segment_ids.template data(); const T* s_grads = segment_grads.template data(); @@ -153,15 +153,15 @@ class AbstractSortedSegmentRangeGradientOp : public Operator { return true; } - TIndex block_size = segment_grads.size_from_dim(1); + int64_t block_size = segment_grads.size_from_dim(1); // Assume the segments are sorted and there are no gaps CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps"); // repeat the check from forward op CAFFE_ENFORCE_EQ( K - 1, s_ids[N - 1], "Indices must be sorted and not have gaps"); - for (TIndex i = 0; i < N;) { - TIndex start = i; + for (int64_t i = 0; i < N;) { + int64_t start = i; for (++i; i < N && s_ids[start] == s_ids[i]; ++i) ; @@ -292,7 +292,7 @@ class AbstractReduceFrontOrBackOp : public Operator { auto& data = Input(0); // If more complicated fixed size logic becomes necessary, it can be moved // to the reducer class - TIndex in_block_size = FirstDim + int64_t in_block_size = FirstDim ? data.size_from_dim(num_reduce_dims_) : data.size_to_dim(data.ndim() - num_reduce_dims_); return DispatchHelper::call( @@ -319,7 +319,7 @@ class AbstractReduceFrontOrBackOp : public Operator { data.meta().name(), "."); - vector shape; + vector shape; ctx.appendOutputShape(&shape); output->Resize(shape); @@ -332,7 +332,7 @@ class AbstractReduceFrontOrBackOp : public Operator { const int num_blocks = block_size > 0 ? data.size() / block_size : 0; Reducer r(ctx, out, &context_); - for (TIndex i = 0; i < num_blocks; ++i) { + for (int64_t i = 0; i < num_blocks; ++i) { r.template process( ctx, inputAccessor_.getBlockPtr(block_size, i), i, &context_); } @@ -365,7 +365,7 @@ class AbstractReduceFrontOrBackGradientOp : public Operator { bool RunOnDevice() override { // If more complicated fixed size logic becomes necessary, it can be moved // to the reducer class - TIndex grad_block_size = Input(REDUCTION_GRAD).size(); + int64_t grad_block_size = Input(REDUCTION_GRAD).size(); return DispatchHelper::call( this, grad_block_size); } @@ -391,21 +391,21 @@ class AbstractReduceFrontOrBackGradientOp : public Operator { CAFFE_ENFORCE_LE(num_reduce_dims_, source_shape.size()); - vector shape( - source_shape.template data(), - source_shape.template data() + source_shape.size()); + vector shape( + source_shape.template data(), + source_shape.template data() + source_shape.size()); data_grads->Resize(shape); - TIndex block_size = FirstDim + int64_t block_size = FirstDim ? data_grads->size_from_dim(num_reduce_dims_) : data_grads->size_from_dim(data_grads->ndim() - num_reduce_dims_); - TIndex block_num = block_size > 0 ? data_grads->size() / block_size : 0; + int64_t block_num = block_size > 0 ? 
data_grads->size() / block_size : 0; T* out = data_grads->template mutable_data(); ReducerGradient r(ctx, r_grad, &context_); - for (TIndex i = 0; i < block_num; ++i) { + for (int64_t i = 0; i < block_num; ++i) { r.template fillGrad( ctx, out + block_size * i, @@ -447,7 +447,7 @@ UnsortedSegment{op} but as if all input slices belong to a single segment. ArgumentHelper helper(def); int num_reduce_dims = helper.GetSingleArgument("num_reduce_dim", 1); typename ReducerDef::template Reducer::Meta ctx(true); - vector out_dims = ctx.getOutputShape(in[0], num_reduce_dims); + vector out_dims = ctx.getOutputShape(in[0], num_reduce_dims); return vector{ CreateTensorShape(out_dims, in[0].data_type())}; }); @@ -514,7 +514,7 @@ UnsortedSegment{op} but as if all input slices belong to a single segment. ArgumentHelper helper(def); int num_reduce_dims = helper.GetSingleArgument("num_reduce_dim", 1); typename ReducerDef::template Reducer::Meta ctx(false); - vector out_dims = ctx.getOutputShape(in[0], num_reduce_dims); + vector out_dims = ctx.getOutputShape(in[0], num_reduce_dims); return vector{ CreateTensorShape(out_dims, in[0].data_type())}; }); @@ -601,7 +601,7 @@ class AbstractSortedSegmentOp : public Operator { this, Input(INDICES)); } else { // type doesn't matter - return DoRunWithType(); + return DoRunWithType(); } } @@ -609,7 +609,7 @@ class AbstractSortedSegmentOp : public Operator { bool DoRunWithType() { // If more complicated fixed size logic becomes necessary, it can be moved // to the reducer class - TIndex in_block_size = Input(0).size_from_dim(1); + int64_t in_block_size = Input(0).size_from_dim(1); return DispatchHelper::call( this, in_block_size); } @@ -621,8 +621,8 @@ class AbstractSortedSegmentOp : public Operator { auto* output = Output(0); CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector"); - TIndex N = segment_ids.dim(0); - const TIndex M = dataInput.dim(0); + int64_t N = segment_ids.dim(0); + const int64_t M = dataInput.dim(0); const IndexType* idxs; if (SparseFused) { // static if @@ -662,7 +662,7 @@ class AbstractSortedSegmentOp : public Operator { const SIndex* s_ids = segment_ids.template data(); const SIndex K = N > 0 ? 
s_ids[N - 1] + 1 : 0; - vector shape; + vector shape; shape.push_back(K); ctx.appendOutputShape(&shape); output->Resize(shape); @@ -671,13 +671,13 @@ class AbstractSortedSegmentOp : public Operator { if (N == 0) { return true; } - TIndex in_block_size = dataInput.size_from_dim(1); - TIndex out_block_size = output->size_from_dim(1); + int64_t in_block_size = dataInput.size_from_dim(1); + int64_t out_block_size = output->size_from_dim(1); // Assume the segments are sorted and there are no gaps CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps"); - for (TIndex i = 0; i < N;) { - TIndex start = i; + for (int64_t i = 0; i < N;) { + int64_t start = i; Reducer r(ctx, out + out_block_size * s_ids[start], &context_); for (; i < N && s_ids[start] == s_ids[i]; ++i) { @@ -730,7 +730,7 @@ class AbstractSortedSegmentGradientOp : public Operator { bool RunOnDevice() override { // If more complicated fixed size logic becomes necessary, it can be moved // to the reducer class - TIndex grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1); + int64_t grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1); return DispatchHelper::call( this, grad_block_size); } @@ -742,7 +742,7 @@ class AbstractSortedSegmentGradientOp : public Operator { auto* data_grads = Output(0); CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector"); - TIndex N = segment_ids.dim(0); + int64_t N = segment_ids.dim(0); typename ReducerGradient::Meta ctx(segment_grads, 1); for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) { @@ -760,14 +760,14 @@ class AbstractSortedSegmentGradientOp : public Operator { const SIndex* s_ids = segment_ids.template data(); const T* s_grads = segment_grads.template data(); - vector shape; + vector shape; shape.push_back(N); ctx.appendGradShape(&shape); data_grads->Resize(shape); - TIndex d_block_size = data_grads->size_from_dim(1); + int64_t d_block_size = data_grads->size_from_dim(1); const SIndex K = segment_grads.dim(0); - TIndex s_block_size = segment_grads.size_from_dim(1); + int64_t s_block_size = segment_grads.size_from_dim(1); T* out = data_grads->template mutable_data(); if (N == 0) { @@ -779,9 +779,9 @@ class AbstractSortedSegmentGradientOp : public Operator { // repeat the check from forward op CAFFE_ENFORCE_EQ( K - 1, s_ids[N - 1], "Indices must be sorted and not have gaps"); - for (TIndex i = 0; i < N;) { - TIndex start = i; - TIndex end = start; + for (int64_t i = 0; i < N;) { + int64_t start = i; + int64_t end = start; if (ReducerGradient::computeLength()) { for (; end < N && s_ids[start] == s_ids[end]; ++end) { @@ -1005,7 +1005,7 @@ class AbstractUnsortedSegmentOp : public Operator { this, Input(INDICES)); } else { // type doesn't matter - return DoRunWithType(); + return DoRunWithType(); } } @@ -1013,7 +1013,7 @@ class AbstractUnsortedSegmentOp : public Operator { bool DoRunWithType() { // If more complicated fixed size logic becomes necessary, it can be moved // to the reducer class - TIndex in_block_size = Input(0).size_from_dim(1); + int64_t in_block_size = Input(0).size_from_dim(1); return DispatchHelper::call( this, in_block_size); } @@ -1025,8 +1025,8 @@ class AbstractUnsortedSegmentOp : public Operator { auto* output = Output(0); CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector"); - TIndex N = segment_ids.dim(0); - const TIndex M = data.dim(0); + int64_t N = segment_ids.dim(0); + const int64_t M = data.dim(0); const IndexType* idxs; if (SparseFused) { // static if @@ -1070,27 +1070,27 @@ class 
AbstractUnsortedSegmentOp : public Operator { K = num_segments_; } else { K = 0; - for (TIndex i = 0; i < N; ++i) { + for (int64_t i = 0; i < N; ++i) { K = std::max(K, s_ids[i] + 1); } } - vector shape; + vector shape; shape.push_back(K); ctx.appendOutputShape(&shape); output->Resize(shape); - TIndex in_block_size = data.size_from_dim(1); - TIndex out_block_size = output->size_from_dim(1); + int64_t in_block_size = data.size_from_dim(1); + int64_t out_block_size = output->size_from_dim(1); T* out = output->template mutable_data(); reducers_.clear(); reducers_.reserve(K); - for (TIndex i = 0; i < K; ++i) { + for (int64_t i = 0; i < K; ++i) { reducers_.emplace_back(ctx, out + out_block_size * i, &context_); } - for (TIndex i = 0; i < N; ++i) { + for (int64_t i = 0; i < N; ++i) { auto s_id = s_ids[i]; CAFFE_ENFORCE( 0 <= s_id && s_id < K, @@ -1114,7 +1114,7 @@ class AbstractUnsortedSegmentOp : public Operator { ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_); } - for (TIndex i = 0; i < K; ++i) { + for (int64_t i = 0; i < K; ++i) { reducers_[i].template finish(ctx, &context_); } // call reducers destructors (if there is any) @@ -1130,7 +1130,7 @@ class AbstractUnsortedSegmentOp : public Operator { static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs; private: - TIndex num_segments_; + int64_t num_segments_; // member field to reuse memory vector reducers_; InputAccessor inputAccessor_; @@ -1146,7 +1146,7 @@ class AbstractUnsortedSegmentGradientOp : public Operator { bool RunOnDevice() override { // If more complicated fixed size logic becomes necessary, it can be moved // to the reducer class - TIndex grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1); + int64_t grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1); return DispatchHelper::call( this, grad_block_size); } @@ -1158,7 +1158,7 @@ class AbstractUnsortedSegmentGradientOp : public Operator { auto* data_grads = Output(0); CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector"); - TIndex N = segment_ids.dim(0); + int64_t N = segment_ids.dim(0); typename ReducerGradient::Meta ctx(segment_grads, 1); for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) { @@ -1176,14 +1176,14 @@ class AbstractUnsortedSegmentGradientOp : public Operator { const SIndex* s_ids = segment_ids.template data(); const T* s_grads = segment_grads.template data(); - vector shape; + vector shape; shape.push_back(N); ctx.appendGradShape(&shape); data_grads->Resize(shape); - TIndex d_block_size = data_grads->size_from_dim(1); + int64_t d_block_size = data_grads->size_from_dim(1); const SIndex K = segment_grads.dim(0); - TIndex s_block_size = segment_grads.size_from_dim(1); + int64_t s_block_size = segment_grads.size_from_dim(1); T* out = data_grads->template mutable_data(); if (ReducerGradient::computeLength()) { @@ -1206,7 +1206,7 @@ class AbstractUnsortedSegmentGradientOp : public Operator { reducers_.emplace_back(ctx, s_grads + s_block_size * i, &context_); } - for (TIndex i = 0; i < N; ++i) { + for (int64_t i = 0; i < N; ++i) { auto s_id = s_ids[i]; if (ReducerGradient::computeLength()) { reducers_[s_id].template fillGrad( @@ -1399,7 +1399,7 @@ class AbstractLengthsOp : public Operator { this, Input(INDICES)); } else { // type doesn't matter - return DoRunWithType(); + return DoRunWithType(); } } @@ -1407,7 +1407,7 @@ class AbstractLengthsOp : public Operator { bool DoRunWithType() { // If more complicated fixed size logic becomes necessary, it can be moved // to the reducer class - TIndex 
in_block_size = Input(0).size_from_dim(1); + int64_t in_block_size = Input(0).size_from_dim(1); return DispatchHelper::call( this, in_block_size); } @@ -1419,10 +1419,10 @@ class AbstractLengthsOp : public Operator { auto* output = Output(0); CAFFE_ENFORCE_EQ(1, lengthsInput.ndim(), "LENGTHS must be a vector"); - const TIndex dataSize = dataInput.dim(0); + const int64_t dataSize = dataInput.dim(0); // Either first dim the data or how much we pull in indexies from it - TIndex dataToReduceSize; - const TIndex outputSize = lengthsInput.dim(0); + int64_t dataToReduceSize; + const int64_t outputSize = lengthsInput.dim(0); const IndexType* indices; if (SparseFused) { // static if @@ -1454,18 +1454,18 @@ class AbstractLengthsOp : public Operator { dataInput.meta().name(), "."); - vector shape{outputSize}; + vector shape{outputSize}; ctx.appendOutputShape(&shape); output->Resize(shape); - TIndex in_block_size = dataInput.size_from_dim(1); - TIndex out_block_size = output->size_from_dim(1); + int64_t in_block_size = dataInput.size_from_dim(1); + int64_t out_block_size = output->size_from_dim(1); TData* out = output->template mutable_data(); - TIndex dataIndex = 0; - for (TIndex rangeIndex = 0; rangeIndex < outputSize; ++rangeIndex) { + int64_t dataIndex = 0; + for (int64_t rangeIndex = 0; rangeIndex < outputSize; ++rangeIndex) { Reducer reducer(ctx, out + out_block_size * rangeIndex, &context_); - for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex]; + for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex]; ++dataIndex) { IndexType idx; if (SparseFused) { // static if @@ -1536,7 +1536,7 @@ class AbstractLengthsGradientOp : public Operator { bool RunOnDevice() override { // If more complicated fixed size logic becomes necessary, it can be moved // to the reducer class - TIndex gradBlockSize = Input(SEGMENT_GRADS).size_from_dim(1); + int64_t gradBlockSize = Input(SEGMENT_GRADS).size_from_dim(1); return DispatchHelper::call( this, gradBlockSize); } @@ -1548,12 +1548,12 @@ class AbstractLengthsGradientOp : public Operator { auto* dataGradsOutput = Output(0); CAFFE_ENFORCE(lengthsInput.ndim() == 1, "LENGTHS must be a vector"); - TIndex reducedDataSize = 0; - TIndex numSegments = lengthsInput.dim(0); + int64_t reducedDataSize = 0; + int64_t numSegments = lengthsInput.dim(0); CAFFE_ENFORCE(segmentGradsInput.ndim() > 0); CAFFE_ENFORCE(numSegments == segmentGradsInput.dim(0)); const TLengths* lengths = lengthsInput.template data(); - for (TIndex i = 0; i < numSegments; ++i) { + for (int64_t i = 0; i < numSegments; ++i) { reducedDataSize += lengths[i]; } @@ -1572,20 +1572,20 @@ class AbstractLengthsGradientOp : public Operator { const T* segmentGrads = segmentGradsInput.template data(); - vector shape; + vector shape; shape.push_back(reducedDataSize); ctx.appendGradShape(&shape); dataGradsOutput->Resize(shape); - TIndex dataGradsBlockSize = dataGradsOutput->size_from_dim(1); - TIndex segmentBlockSize = segmentGradsInput.size_from_dim(1); + int64_t dataGradsBlockSize = dataGradsOutput->size_from_dim(1); + int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1); T* dataGrads = dataGradsOutput->template mutable_data(); - TIndex dataIndex = 0; - for (TIndex rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { + int64_t dataIndex = 0; + for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { ReducerGradient reducer( ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_); - for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex]; + 
for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex]; ++dataIndex) { reducer.template fillGrad( ctx, @@ -1633,7 +1633,7 @@ class AbstractLengthsWithMainInputGradientOp : public Operator { this, Input(INDICES)); } else { // type doesn't matter - return DoRunWithType(); + return DoRunWithType(); } } @@ -1641,7 +1641,7 @@ class AbstractLengthsWithMainInputGradientOp : public Operator { bool DoRunWithType() { // If more complicated fixed size logic becomes necessary, it can be moved // to the reducer class - TIndex in_block_size = Input(SEGMENT_GRADS).size_from_dim(1); + int64_t in_block_size = Input(SEGMENT_GRADS).size_from_dim(1); return DispatchHelper:: call(this, in_block_size); } @@ -1654,7 +1654,7 @@ class AbstractLengthsWithMainInputGradientOp : public Operator { auto* dataGradsOutput = Output(0); CAFFE_ENFORCE(lengthsInput.ndim() == 1, "LENGTHS must be a vector"); - TIndex numSegments = lengthsInput.dim(0); + int64_t numSegments = lengthsInput.dim(0); CAFFE_ENFORCE(segmentGradsInput.ndim() > 0); CAFFE_ENFORCE(numSegments == segmentGradsInput.dim(0)); const TLengths* lengths = lengthsInput.template data(); @@ -1668,7 +1668,7 @@ class AbstractLengthsWithMainInputGradientOp : public Operator { } // Either first dim the data or how much we pull in indexies from it - TIndex dataToReduceSize; + int64_t dataToReduceSize; const IndexType* indices = nullptr; if (SparseFused) { // static if auto& indicesInput = Input(INDICES); @@ -1680,22 +1680,22 @@ class AbstractLengthsWithMainInputGradientOp : public Operator { const T* segmentGrads = segmentGradsInput.template data(); - vector shape; + vector shape; shape.push_back(dataToReduceSize); ctx.appendGradShape(&shape); dataGradsOutput->Resize(shape); - TIndex dataGradsBlockSize = dataGradsOutput->size_from_dim(1); - TIndex segmentBlockSize = segmentGradsInput.size_from_dim(1); + int64_t dataGradsBlockSize = dataGradsOutput->size_from_dim(1); + int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1); T* dataGrads = dataGradsOutput->template mutable_data(); const T* data = dataInput.template data(); - TIndex dataIndex = 0; - for (TIndex rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { + int64_t dataIndex = 0; + for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { ReducerGradient reducer( ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_); - for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex]; + for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex]; ++dataIndex) { IndexType data_pos; // No range checking, should've been verified in forward pass @@ -1743,7 +1743,7 @@ class AbstractLengthsWithMainInputAndForwardOutputGradientOp bool RunOnDevice() override { // If more complicated fixed size logic becomes necessary, it can be moved // to the reducer class. 
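// Illustrative aside, not part of this change: the AbstractLengths* ops in
// these hunks all walk their data with the same two-cursor pattern that the
// patch retypes from TIndex to int64_t -- an outer cursor over segments
// (rangeIndex) and an inner cursor over the rows each segment owns
// (dataIndex). A minimal standalone version of that walk, with hypothetical
// names and a plain sum in place of the Reducer:
//
//   #include <algorithm>
//   #include <cstdint>
//
//   // Sums lengths[r] consecutive rows of `data` (each `block_size` floats
//   // wide) into row r of `out`; all size arithmetic stays in int64_t.
//   void LengthsSumSketch(const float* data, const int32_t* lengths,
//                         int64_t num_segments, int64_t block_size,
//                         float* out) {
//     int64_t dataIndex = 0;  // global row cursor into `data`
//     for (int64_t rangeIndex = 0; rangeIndex < num_segments; ++rangeIndex) {
//       float* o = out + rangeIndex * block_size;
//       std::fill(o, o + block_size, 0.0f);
//       for (int64_t start = dataIndex;
//            dataIndex < start + lengths[rangeIndex]; ++dataIndex) {
//         const float* row = data + dataIndex * block_size;
//         for (int64_t k = 0; k < block_size; ++k) {
//           o[k] += row[k];
//         }
//       }
//     }
//   }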
- TIndex in_block_size = Input(SEGMENT_GRADS).size_from_dim(1); + int64_t in_block_size = Input(SEGMENT_GRADS).size_from_dim(1); return DispatchHelper::call( this, in_block_size); } @@ -1757,7 +1757,7 @@ class AbstractLengthsWithMainInputAndForwardOutputGradientOp auto* dataGradsOutput = Output(0); CAFFE_ENFORCE(lengthsInput.ndim() == 1, "LENGTHS must be a vector"); - TIndex numSegments = lengthsInput.dim(0); + int64_t numSegments = lengthsInput.dim(0); CAFFE_ENFORCE(segmentGradsInput.ndim() > 0); CAFFE_ENFORCE(numSegments == segmentGradsInput.dim(0)); const TLengths* lengths = lengthsInput.template data(); @@ -1774,26 +1774,26 @@ class AbstractLengthsWithMainInputAndForwardOutputGradientOp CAFFE_ENFORCE(numSegments == forwardOutputInput.dim(0)); const T* forwardOutput = forwardOutputInput.template data(); - TIndex dataToReduceSize = dataInput.dim(0); + int64_t dataToReduceSize = dataInput.dim(0); const T* segmentGrads = segmentGradsInput.template data(); - vector shape; + vector shape; shape.push_back(dataToReduceSize); ctx.appendGradShape(&shape); dataGradsOutput->Resize(shape); - TIndex dataGradsBlockSize = dataGradsOutput->size_from_dim(1); - TIndex segmentBlockSize = segmentGradsInput.size_from_dim(1); + int64_t dataGradsBlockSize = dataGradsOutput->size_from_dim(1); + int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1); T* dataGrads = dataGradsOutput->template mutable_data(); const T* data = dataInput.template data(); - TIndex dataIndex = 0; - for (TIndex rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { + int64_t dataIndex = 0; + for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { ReducerGradient reducer( ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_); - for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex]; + for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex]; ++dataIndex) { // No range checking, should've been verified in forward pass reducer.template fillGradWithMainInputAndForwardOutput( diff --git a/caffe2/operators/segment_reduction_op_gpu.cu b/caffe2/operators/segment_reduction_op_gpu.cu index 6eec2deba9ce..377ca8fb5a03 100644 --- a/caffe2/operators/segment_reduction_op_gpu.cu +++ b/caffe2/operators/segment_reduction_op_gpu.cu @@ -433,10 +433,10 @@ class CUDASparseLengthsSumOp : public Operator { auto* output = Output(0); CAFFE_ENFORCE_EQ(1, lengthsInput.ndim(), "LENGTHS must be a vector"); - const TIndex dataSize = dataInput.dim(0); + const int64_t dataSize = dataInput.dim(0); // Either first dim the data or how much we pull in indexies from it - TIndex dataToReduceSize; - const TIndex outputSize = lengthsInput.dim(0); + int64_t dataToReduceSize; + const int64_t outputSize = lengthsInput.dim(0); const int len_length = outputSize; auto shape = dataInput.dims(); @@ -554,10 +554,10 @@ class CUDASparseLengthsMeanOp : public Operator { auto* output = Output(0); CAFFE_ENFORCE_EQ(1, lengthsInput.ndim(), "LENGTHS must be a vector"); - const TIndex dataSize = dataInput.dim(0); + const int64_t dataSize = dataInput.dim(0); // Either first dim the data or how much we pull in indexies from it - TIndex dataToReduceSize; - const TIndex outputSize = lengthsInput.dim(0); + int64_t dataToReduceSize; + const int64_t outputSize = lengthsInput.dim(0); const int len_length = outputSize; auto shape = dataInput.dims(); @@ -676,10 +676,10 @@ class CUDASparseLengthsMaxOp : public Operator { auto* output = Output(0); CAFFE_ENFORCE_EQ(1, lengthsInput.ndim(), "LENGTHS must be a vector"); - const TIndex dataSize = 
dataInput.dim(0); + const int64_t dataSize = dataInput.dim(0); // Either first dim the data or how much we pull in indexies from it - TIndex dataToReduceSize; - const TIndex outputSize = lengthsInput.dim(0); + int64_t dataToReduceSize; + const int64_t outputSize = lengthsInput.dim(0); int len_length = outputSize; auto shape = dataInput.dims(); @@ -810,10 +810,10 @@ class CUDASparseLengthsWeightedSumOp : public Operator { CAFFE_ENFORCE_EQ(1, indicesInput.ndim(), "INDICES must be a vector"); CAFFE_ENFORCE_EQ(1, lengthsInput.ndim(), "LENGTHS must be a vector"); - const TIndex dataSize = dataInput.dim(0); + const int64_t dataSize = dataInput.dim(0); // Either first dim the data or how much we pull in indexies from it - const TIndex dataToReduceSize = indicesInput.dim(0); - const TIndex outputSize = lengthsInput.dim(0); + const int64_t dataToReduceSize = indicesInput.dim(0); + const int64_t outputSize = lengthsInput.dim(0); const int len_length = outputSize; auto shape = dataInput.dims(); @@ -954,7 +954,7 @@ class CUDAUnsortedSegmentSumOp : public Operator { } CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector"); - TIndex slize_sz = data.size_from_dim(1); + int64_t slize_sz = data.size_from_dim(1); K_tensor_.Resize(1); // Get maximum segment id so we can size the output. diff --git a/caffe2/operators/sequence_ops.cc b/caffe2/operators/sequence_ops.cc index 2b7b82095686..ec43ba3fbc8c 100644 --- a/caffe2/operators/sequence_ops.cc +++ b/caffe2/operators/sequence_ops.cc @@ -54,7 +54,7 @@ bool RemovePaddingOp::DoRunWithType() { CAFFE_ENFORCE_GE(in.ndim(), 1); const int32_t outer_size = in.dims()[0]; const auto block_size = std::accumulate( - in.dims().begin() + 1, in.dims().end(), 1, std::multiplies()); + in.dims().begin() + 1, in.dims().end(), 1, std::multiplies()); const auto pad_width = startPaddingWidth_ + endPaddingWidth_; // if no lengths is provided, assume it is a single full-span entry diff --git a/caffe2/operators/sequence_ops.cu b/caffe2/operators/sequence_ops.cu index 95ad9ece32d4..fa10b6cbfe94 100644 --- a/caffe2/operators/sequence_ops.cu +++ b/caffe2/operators/sequence_ops.cu @@ -237,7 +237,7 @@ bool RemovePaddingOp::DoRunWithType() { CAFFE_ENFORCE_GE(in.ndim(), 1); const int32_t outer_size = in.dims()[0]; const auto block_size = std::accumulate( - in.dims().begin() + 1, in.dims().end(), 1, std::multiplies()); + in.dims().begin() + 1, in.dims().end(), 1, std::multiplies()); // if no lengths is provided, assume it is a single full-span entry const int32_t* lengths_ptr = nullptr; diff --git a/caffe2/operators/sequence_ops.h b/caffe2/operators/sequence_ops.h index 9e0f8eb5b1b3..2b59e839fd31 100644 --- a/caffe2/operators/sequence_ops.h +++ b/caffe2/operators/sequence_ops.h @@ -25,11 +25,11 @@ class GatherPaddingOp final : public Operator { bool RunOnDevice() override { if (startPaddingWidth_ == 0 && endPaddingWidth_ == 0) { - Output(0)->Resize(std::vector(0)); - Output(0)->template mutable_data(); + Output(0)->Resize(std::vector(0)); + Output(0)->template mutable_data(); if (OutputSize() == 2) { - Output(1)->Resize(std::vector(0)); - Output(1)->template mutable_data(); + Output(1)->Resize(std::vector(0)); + Output(1)->template mutable_data(); } return true; } @@ -53,7 +53,7 @@ class GatherPaddingOp final : public Operator { lengths_ptr = lengths.template data(); lengths_size = lengths.size(); } - std::vector padShape(in.dims().begin() + 1, in.dims().end()); + std::vector padShape(in.dims().begin() + 1, in.dims().end()); // output will contain accumulator over paddings 
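// Illustrative aside, not part of this change: two idioms in this file (and in
// the reduction/softmax hunks above) rely on a tensor's element count being
// the product of its dims, where the empty product is 1. Resizing with an
// empty shape gives a 0-dim tensor that holds exactly one value (the "scalar
// output" idiom), and the per-row block size is the product of everything past
// the outer dim, via std::accumulate with std::multiplies<int64_t>. A
// self-contained sketch of that arithmetic (hypothetical helper name):
//
//   #include <cstdint>
//   #include <functional>
//   #include <numeric>
//   #include <vector>
//
//   int64_t NumElements(const std::vector<int64_t>& dims) {
//     return std::accumulate(dims.begin(), dims.end(), int64_t(1),
//                            std::multiplies<int64_t>());
//   }
//   // NumElements({})        == 1   -> 0-dim "scalar" output
//   // NumElements({4, 3, 2}) == 24  -> ordinary dense tensor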
Output(0)->Resize(padShape); T* padding_start_ptr = Output(0)->template mutable_data(); diff --git a/caffe2/operators/shape_op.h b/caffe2/operators/shape_op.h index 05ea7a2f7c5f..4d4c080702bf 100644 --- a/caffe2/operators/shape_op.h +++ b/caffe2/operators/shape_op.h @@ -24,22 +24,22 @@ class ShapeOp : public Operator { int numAxes = axes_.size(); if (numAxes == 0) { output->Resize(numDims); - TIndex* output_data = output->template mutable_data(); + int64_t* output_data = output->template mutable_data(); context_.CopyBytesSameDevice( - numDims * sizeof(TIndex), data.dims().data(), output_data); + numDims * sizeof(int64_t), data.dims().data(), output_data); return true; } output->Resize(numAxes); auto src = reinterpret_cast(data.dims().data()); - auto out = reinterpret_cast(output->template mutable_data()); + auto out = reinterpret_cast(output->template mutable_data()); for (int i = 0; i < numAxes; i++) { auto axis = axes_[i]; CAFFE_ENFORCE_LT(axis, numDims, "Axis out of range"); CAFFE_ENFORCE_GE(axis, 0, "Each axis should be non-negative"); context_.CopyBytesSameDevice( - sizeof(TIndex), src + axis * sizeof(TIndex), out); - out += sizeof(TIndex); + sizeof(int64_t), src + axis * sizeof(int64_t), out); + out += sizeof(int64_t); } return true; } diff --git a/caffe2/operators/slice_op.cu b/caffe2/operators/slice_op.cu index 5de302814ba2..475d8329c924 100644 --- a/caffe2/operators/slice_op.cu +++ b/caffe2/operators/slice_op.cu @@ -237,15 +237,15 @@ class SliceOp : public Operator { USE_OPERATOR_FUNCTIONS(CUDAContext); SliceOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - starts_(this->template GetRepeatedArgument("starts")), - ends_(this->template GetRepeatedArgument("ends")), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} bool RunOnDevice() override { if (InputSize() > 1) { - return DispatchHelper>::call(this, Input(1)); + return DispatchHelper>::call(this, Input(1)); } else { - return DoRunWithType(); + return DoRunWithType(); } } @@ -282,8 +282,8 @@ class SliceOp : public Operator { output, data, starts_host_, ends_host_, &context_); } private: - std::vector starts_; - std::vector ends_; + std::vector starts_; + std::vector ends_; bool statically_inited_; Tensor starts_host_{CPU}; Tensor ends_host_{CPU}; @@ -298,17 +298,17 @@ class SliceGradientOp : public Operator { USE_OPERATOR_FUNCTIONS(CUDAContext); SliceGradientOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - starts_(this->template GetRepeatedArgument("starts")), - ends_(this->template GetRepeatedArgument("ends")), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); bool RunOnDevice() override { if (InputSize() == 4) { - return DispatchHelper>::call(this, Input(1)); + return DispatchHelper>::call(this, Input(1)); } else { - return DoRunWithType(); + return DoRunWithType(); } } @@ -353,8 +353,8 @@ class SliceGradientOp : public Operator { } private: - std::vector starts_; - std::vector ends_; + std::vector starts_; + std::vector ends_; bool statically_inited_; Tensor starts_host_{CPU}; Tensor ends_host_{CPU}; diff --git a/caffe2/operators/slice_op.h b/caffe2/operators/slice_op.h index aa8d4e50f0f9..e7f8919bb81c 100644 --- a/caffe2/operators/slice_op.h +++ b/caffe2/operators/slice_op.h @@ -204,15 +204,15 @@ class SliceOp : public Operator { 
USE_OPERATOR_CONTEXT_FUNCTIONS; SliceOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - starts_(this->template GetRepeatedArgument("starts")), - ends_(this->template GetRepeatedArgument("ends")), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} bool RunOnDevice() override { if (InputSize() > 1) { - return DispatchHelper>::call(this, Input(1)); + return DispatchHelper>::call(this, Input(1)); } else { - return DoRunWithType(); + return DoRunWithType(); } } @@ -252,8 +252,8 @@ class SliceOp : public Operator { AT_DISABLE_COPY_AND_ASSIGN(SliceOp); protected: - std::vector starts_; - std::vector ends_; + std::vector starts_; + std::vector ends_; bool statically_inited_; Tensor starts_host_{CPU}; Tensor ends_host_{CPU}; @@ -265,17 +265,17 @@ class SliceGradientOp : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; SliceGradientOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - starts_(this->template GetRepeatedArgument("starts")), - ends_(this->template GetRepeatedArgument("ends")), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); bool RunOnDevice() override { if (InputSize() == 4) { - return DispatchHelper>::call(this, Input(1)); + return DispatchHelper>::call(this, Input(1)); } else { - return DoRunWithType(); + return DoRunWithType(); } } @@ -321,8 +321,8 @@ class SliceGradientOp : public Operator { private: - std::vector starts_; - std::vector ends_; + std::vector starts_; + std::vector ends_; bool statically_inited_; Tensor starts_host_{CPU}; Tensor ends_host_{CPU}; diff --git a/caffe2/operators/softmax_op_cudnn.cc b/caffe2/operators/softmax_op_cudnn.cc index 16224bef4d0e..6019024e73f3 100644 --- a/caffe2/operators/softmax_op_cudnn.cc +++ b/caffe2/operators/softmax_op_cudnn.cc @@ -71,7 +71,7 @@ class CuDNNSoftmaxOp final : public Operator { CuDNNWrapper cudnn_wrapper_; int axis_; cudnnTensorDescriptor_t desc_; - vector dims_; + vector dims_; }; @@ -137,7 +137,7 @@ class CuDNNSoftmaxGradientOp final : public Operator { CuDNNWrapper cudnn_wrapper_; int axis_; cudnnTensorDescriptor_t desc_; - vector dims_; + vector dims_; }; namespace { diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu index aa01ad278144..1945b59c7b26 100644 --- a/caffe2/operators/softmax_ops.cu +++ b/caffe2/operators/softmax_ops.cu @@ -308,7 +308,7 @@ bool SoftmaxWithLossOp::RunOnDevice() { } } - avg_loss->Resize(vector()); + avg_loss->Resize(vector()); if (losses_.size() != N) { losses_.Resize(N); } @@ -423,7 +423,7 @@ bool SpatialSoftmaxWithLossOp::RunOnDevice() { context_.cuda_stream()>>>(N, D, W, H, Xdata, Pdata); // Cross entropy - avg_loss->Resize(vector()); + avg_loss->Resize(vector()); float* avg_loss_data = avg_loss->template mutable_data(); math::Set(1, 0.0f, avg_loss_data, &context_); diff --git a/caffe2/operators/softmax_with_loss_op.cc b/caffe2/operators/softmax_with_loss_op.cc index 5390b6c40c11..732821042735 100644 --- a/caffe2/operators/softmax_with_loss_op.cc +++ b/caffe2/operators/softmax_with_loss_op.cc @@ -252,7 +252,7 @@ bool SoftmaxWithLossOp::RunOnDevice() { } } - avg_loss->Resize(vector()); + avg_loss->Resize(vector()); float* avg_loss_data = avg_loss->template mutable_data(); if (weight_sum != 0.0) { avg_loss_data[0] = loss_sum * scale_ / weight_sum; diff --git 
a/caffe2/operators/sparse_to_dense_mask_op.h b/caffe2/operators/sparse_to_dense_mask_op.h index 6b71ff4f4e7b..7bfa5eb1435d 100644 --- a/caffe2/operators/sparse_to_dense_mask_op.h +++ b/caffe2/operators/sparse_to_dense_mask_op.h @@ -93,7 +93,7 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase { static_cast(sparse_values.raw_data()); const void* default_val = default_value.raw_data(); - TIndex block_size = default_value.size(); + int64_t block_size = default_value.size(); size_t block_nbytes = default_value.nbytes(); const int cols = this->featuresCount_; @@ -105,7 +105,7 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase { if (returnPresenceMask_) { presence_mask = Output(PRESENCEMASK); } - vector shape; + vector shape; if (InputSize() == 4) { auto& lengths = Input(LENGTHS); CAFFE_ENFORCE_EQ(lengths.ndim(), 1); @@ -204,7 +204,7 @@ class SparseToDenseMaskGradientOp : public SparseToDenseMaskBase { CAFFE_ENFORCE_EQ(sparse_indices.ndim(), 1); auto& gradient_output = Input(GOUTPUT); - TIndex block_size = gradient_output.size_from_dim(1); + int64_t block_size = gradient_output.size_from_dim(1); size_t block_nbytes = gradient_output.itemsize() * block_size; const int cols = this->featuresCount_; @@ -213,7 +213,7 @@ class SparseToDenseMaskGradientOp : public SparseToDenseMaskBase { int32_t default_length = sparse_indices.dim32(0); const int32_t* lengths_vec = nullptr; auto* output = Output(GVALUES); - vector shape; + vector shape; if (InputSize() > LENGTHS) { // if the LENGTHS is set, the gradient_output has dim: // lengths * mask.size() * feature_dim diff --git a/caffe2/operators/sparse_to_dense_op.cu b/caffe2/operators/sparse_to_dense_op.cu index c62718a8ece1..74957f980131 100644 --- a/caffe2/operators/sparse_to_dense_op.cu +++ b/caffe2/operators/sparse_to_dense_op.cu @@ -7,7 +7,7 @@ namespace caffe2 { template __global__ void SparseToDenseKernel( - size_t N, TIndex block_nitems, const TInd* indices, const TData* vals, TData* dst) { + size_t N, int64_t block_nitems, const TInd* indices, const TData* vals, TData* dst) { CUDA_1D_KERNEL_LOOP(i, N) { int idx = indices[i / block_nitems]; int dst_idx = block_nitems * idx + i % block_nitems; diff --git a/caffe2/operators/spatial_softmax_with_loss_op.cc b/caffe2/operators/spatial_softmax_with_loss_op.cc index 0e1750015ef7..8a271d17b5ee 100644 --- a/caffe2/operators/spatial_softmax_with_loss_op.cc +++ b/caffe2/operators/spatial_softmax_with_loss_op.cc @@ -119,7 +119,7 @@ bool SpatialSoftmaxWithLossOp::RunOnDevice() { } // Compute the avg cross-entropy loss - avg_loss->Resize(vector()); + avg_loss->Resize(vector()); float* avg_loss_data = avg_loss->template mutable_data(); const int* label_data = T.data(); diff --git a/caffe2/operators/text_file_reader.cc b/caffe2/operators/text_file_reader.cc index 4938d3f5ca15..3888dc2a0e76 100644 --- a/caffe2/operators/text_file_reader.cc +++ b/caffe2/operators/text_file_reader.cc @@ -156,7 +156,7 @@ class TextFileReaderReadOp : public Operator { } private: - TIndex batchSize_; + int64_t batchSize_; }; CAFFE_KNOWN_TYPE(std::unique_ptr); diff --git a/caffe2/operators/tile_op.h b/caffe2/operators/tile_op.h index 3a5dcdfd5c78..f5a3109ffd61 100644 --- a/caffe2/operators/tile_op.h +++ b/caffe2/operators/tile_op.h @@ -72,7 +72,7 @@ class TileOp : public Operator { const auto axis = input.canonical_axis_index(axis_); // reshape output to be input tiled along the axis - vector output_dims(input.dims()); + vector output_dims(input.dims()); output_dims[axis_] = output_dims[axis_] * tiles_; 
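// Illustrative aside, not part of this change: the Tile hunks only switch the
// dtype of the shape vector; the shape arithmetic itself touches a single
// entry -- the tiled axis is multiplied by `tiles` on the forward pass and
// divided on the gradient pass. A standalone sketch with hypothetical names:
//
//   #include <cstdint>
//   #include <vector>
//
//   std::vector<int64_t> TiledShape(std::vector<int64_t> dims, int axis,
//                                   int64_t tiles) {
//     dims[axis] *= tiles;  // forward: `tiles` copies laid out along `axis`
//     return dims;
//   }
//   std::vector<int64_t> UntiledShape(std::vector<int64_t> dims, int axis,
//                                     int64_t tiles) {
//     dims[axis] /= tiles;  // gradient: collapse the copies back
//     return dims;
//   }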
output->Resize(output_dims); @@ -187,7 +187,7 @@ class TileGradientOp : public Operator { const auto axis = input.canonical_axis_index(axis_); // reshape output to be input "untiled" along the axis - vector output_dims(input.dims()); + vector output_dims(input.dims()); output_dims[axis_] = output_dims[axis_] / tiles_; output->Resize(output_dims); diff --git a/caffe2/operators/top_k.cc b/caffe2/operators/top_k.cc index 7ebe9966a504..83b4787103be 100644 --- a/caffe2/operators/top_k.cc +++ b/caffe2/operators/top_k.cc @@ -16,8 +16,8 @@ namespace { template struct ValueComp { bool operator()( - const std::pair& lhs, - const std::pair& rhs) const { + const std::pair& lhs, + const std::pair& rhs) const { return lhs.first > rhs.first || (lhs.first == rhs.first && lhs.second < rhs.second); } @@ -26,34 +26,34 @@ struct ValueComp { template void GetTopK( const T* input, - const TIndex n, - const TIndex k, - const TIndex src_offset, - const TIndex dst_offset, - const TIndex stride, + const int64_t n, + const int64_t k, + const int64_t src_offset, + const int64_t dst_offset, + const int64_t stride, T* values, - TIndex* indices, - TIndex* flatten_indices) { + int64_t* indices, + int64_t* flatten_indices) { const T* src_ptr = input + src_offset; - std::vector> heap_data; + std::vector> heap_data; heap_data.reserve(k); - for (TIndex i = 0; i < k && i < n; ++i) { + for (int64_t i = 0; i < k && i < n; ++i) { heap_data.emplace_back(*src_ptr, i); src_ptr += stride; } std::priority_queue< - std::pair, - std::vector>, + std::pair, + std::vector>, ValueComp> pq(ValueComp(), std::move(heap_data)); - for (TIndex i = k; i < n; ++i) { + for (int64_t i = k; i < n; ++i) { if (pq.top().first < *src_ptr) { pq.pop(); pq.emplace(*src_ptr, i); } src_ptr += stride; } - TIndex dst_pos = dst_offset + (std::min(k, n) - 1) * stride; + int64_t dst_pos = dst_offset + (std::min(k, n) - 1) * stride; while (!pq.empty()) { const auto& item = pq.top(); values[dst_pos] = item.first; @@ -69,13 +69,13 @@ void GetTopK( template void SetTopKGradient( const T* values, - const TIndex* indices, + const int64_t* indices, const int k, - const TIndex src_offset, - const TIndex dst_offset, - const TIndex stride, + const int64_t src_offset, + const int64_t dst_offset, + const int64_t stride, T* gradient) { - TIndex src_pos = src_offset; + int64_t src_pos = src_offset; for (int i = 0; i < k; ++i) { if (indices[src_pos] < 0) { continue; @@ -94,14 +94,14 @@ bool TopKOp::RunOnDevice() { auto* indices = Output(1); auto* flatten_indices = OutputSize() > 2 ? Output(2) : nullptr; - const std::vector& input_dims = input.dims(); + const std::vector& input_dims = input.dims(); if (axis_ == -1) { axis_ = input_dims.size() - 1; } CAFFE_ENFORCE_GE(axis_, 0); CAFFE_ENFORCE_LT(axis_, input_dims.size()); - std::vector output_dims = input_dims; + std::vector output_dims = input_dims; output_dims[axis_] = k_; values->Resize(output_dims); indices->Resize(output_dims); @@ -110,35 +110,35 @@ bool TopKOp::RunOnDevice() { } const T* input_data = input.template data(); T* values_data = values->template mutable_data(); - TIndex* indices_data = indices->template mutable_data(); - TIndex* flatten_indices_data = flatten_indices == nullptr + int64_t* indices_data = indices->template mutable_data(); + int64_t* flatten_indices_data = flatten_indices == nullptr ? 
nullptr - : flatten_indices->template mutable_data(); + : flatten_indices->template mutable_data(); // init values as the default value math::Set(values->size(), T(0), values_data, &context_); - math::Set( - indices->size(), TIndex(-1), indices_data, &context_); + math::Set( + indices->size(), int64_t(-1), indices_data, &context_); if (flatten_indices_data != nullptr) { - math::Set( - flatten_indices->size(), TIndex(-1), flatten_indices_data, &context_); + math::Set( + flatten_indices->size(), int64_t(-1), flatten_indices_data, &context_); } - const TIndex prev_size = std::accumulate( + const int64_t prev_size = std::accumulate( input_dims.cbegin(), input_dims.cbegin() + axis_, - TIndex(1), - std::multiplies()); - const TIndex next_size = std::accumulate( + int64_t(1), + std::multiplies()); + const int64_t next_size = std::accumulate( input_dims.cbegin() + axis_ + 1, input_dims.cend(), - TIndex(1), - std::multiplies()); - const TIndex src_offset_stride = input_dims[axis_] * next_size; - const TIndex dst_offset_stride = k_ * next_size; - TIndex src_offset = 0; - TIndex dst_offset = 0; - for (TIndex i = 0; i < prev_size; ++i) { - for (TIndex j = 0; j < next_size; ++j) { + int64_t(1), + std::multiplies()); + const int64_t src_offset_stride = input_dims[axis_] * next_size; + const int64_t dst_offset_stride = k_ * next_size; + int64_t src_offset = 0; + int64_t dst_offset = 0; + for (int64_t i = 0; i < prev_size; ++i) { + for (int64_t j = 0; j < next_size; ++j) { GetTopK( input_data, input_dims[axis_], @@ -162,34 +162,34 @@ bool TopKGradientOp::RunOnDevice() { const auto& indices = Input(1); const auto& original_input = Input(2); auto* output = Output(0); - const std::vector& values_dims = values.dims(); - const std::vector& origin_dims = original_input.dims(); + const std::vector& values_dims = values.dims(); + const std::vector& origin_dims = original_input.dims(); CAFFE_ENFORCE_EQ(values_dims.size(), origin_dims.size()); output->Resize(origin_dims); const T* values_data = values.template data(); - const TIndex* indices_data = indices.template data(); + const int64_t* indices_data = indices.template data(); T* output_data = output->template mutable_data(); if (axis_ == -1) { axis_ = values_dims.size() - 1; } const int k = values_dims[axis_]; math::Set(output->size(), T(0), output_data, &context_); - const TIndex prev_size = std::accumulate( + const int64_t prev_size = std::accumulate( values_dims.cbegin(), values_dims.cbegin() + axis_, - TIndex(1), - std::multiplies()); - const TIndex next_size = std::accumulate( + int64_t(1), + std::multiplies()); + const int64_t next_size = std::accumulate( values_dims.cbegin() + axis_ + 1, values_dims.cend(), - TIndex(1), - std::multiplies()); - const TIndex src_offset_stride = k * next_size; - const TIndex dst_offset_stride = origin_dims[axis_] * next_size; - TIndex src_offset = 0; - TIndex dst_offset = 0; - for (TIndex i = 0; i < prev_size; ++i) { - for (TIndex j = 0; j < next_size; ++j) { + int64_t(1), + std::multiplies()); + const int64_t src_offset_stride = k * next_size; + const int64_t dst_offset_stride = origin_dims[axis_] * next_size; + int64_t src_offset = 0; + int64_t dst_offset = 0; + for (int64_t i = 0; i < prev_size; ++i) { + for (int64_t j = 0; j < next_size; ++j) { SetTopKGradient( values_data, indices_data, diff --git a/caffe2/operators/top_k.cu b/caffe2/operators/top_k.cu index 6562b7fa5030..5d294236befb 100644 --- a/caffe2/operators/top_k.cu +++ b/caffe2/operators/top_k.cu @@ -23,24 +23,24 @@ namespace { template void 
RunHeapSelectionImpl( const T* input, - const TIndex outer_size, - const TIndex inner_size, + const int64_t outer_size, + const int64_t inner_size, const int k, T* values, - TIndex* indices, + int64_t* indices, CUDAContext* context) { constexpr int kBlockSize = 256; constexpr int kNumWarps = kBlockSize / kWarpSize; - constexpr int smem = kNumWarps * kHeapSize * (sizeof(T) + sizeof(TIndex)); + constexpr int smem = kNumWarps * kHeapSize * (sizeof(T) + sizeof(int64_t)); constexpr T kInitVal = kSelectMax ? std::numeric_limits::lowest() : std::numeric_limits::max(); - selectRowsViaHeap + selectRowsViaHeap <<cuda_stream()>>>( input, values, indices, kInitVal, - std::numeric_limits::max(), + std::numeric_limits::max(), outer_size, inner_size, k); @@ -49,16 +49,16 @@ void RunHeapSelectionImpl( template void RunRadixSelectionImpl( const T* input, - const TIndex outer_size, - const TIndex inner_size, + const int64_t outer_size, + const int64_t inner_size, const int k, T* values, - TIndex* indices, + int64_t* indices, CUDAContext* context) { const int block = std::min( math::roundUp(static_cast(inner_size), kWarpSize), CAFFE_CUDA_NUM_THREADS); - gatherTopK + gatherTopK <<cuda_stream()>>>( input, inner_size, k, outer_size, values, indices); // Unfortunately the output is not currently sorted, and there is no batch @@ -77,11 +77,11 @@ void RunRadixSelectionImpl( template void RunTopKOnLastDimCUDAImpl( const T* input, - const TIndex outer_size, - const TIndex inner_size, + const int64_t outer_size, + const int64_t inner_size, const int k, T* values, - TIndex* indices, + int64_t* indices, CUDAContext* context) { // If k is small, uses heap selection, otherwise uses radix selection. if (k < 32) { @@ -100,18 +100,18 @@ void RunTopKOnLastDimCUDAImpl( } __global__ void FlattenIndicesCUDAKernel( - const TIndex* src, - const TIndex size, - const TIndex stride, - const TIndex n, + const int64_t* src, + const int64_t size, + const int64_t stride, + const int64_t n, const int k, - TIndex* dst) { + int64_t* dst) { CUDA_1D_KERNEL_LOOP(i, size) { if (src[i] < 0) { continue; } - const TIndex x = i / stride / k; - const TIndex y = i % stride; + const int64_t x = i / stride / k; + const int64_t y = i % stride; #if __CUDA_ARCH__ >= 350 dst[i] = __ldg(src + i) * stride + x * n * stride + y; #else @@ -123,18 +123,18 @@ __global__ void FlattenIndicesCUDAKernel( template __global__ void SetTopKGradientCUDAKernel( const T* values, - const TIndex* indices, - const TIndex size, - const TIndex stride, - const TIndex n, + const int64_t* indices, + const int64_t size, + const int64_t stride, + const int64_t n, const int k, T* dst) { CUDA_1D_KERNEL_LOOP(i, size) { if (indices[i] < 0) { continue; } - const TIndex x = i / stride / k; - const TIndex y = i % stride; + const int64_t x = i / stride / k; + const int64_t y = i % stride; #if __CUDA_ARCH__ >= 350 dst[__ldg(indices + i) * stride + x * n * stride + y] = __ldg(values + i); #else @@ -187,7 +187,7 @@ bool TopKCudaOp::RunOnDevice() { auto* indices = Output(1); auto* flatten_indices = OutputSize() > 2 ? 
Output(2) : nullptr; - const std::vector& input_dims = input.dims(); + const std::vector& input_dims = input.dims(); if (axis_ == -1) { axis_ = input_dims.size() - 1; } @@ -195,20 +195,20 @@ bool TopKCudaOp::RunOnDevice() { CAFFE_ENFORCE_LT(axis_, input_dims.size()); const bool need_transpose = axis_ < input_dims.size() - 1; - std::vector output_dims = input_dims; + std::vector output_dims = input_dims; output_dims[axis_] = k_; - const TIndex prev_size = std::accumulate( + const int64_t prev_size = std::accumulate( input_dims.cbegin(), input_dims.cbegin() + axis_, - TIndex(1), - std::multiplies()); - const TIndex next_size = std::accumulate( + int64_t(1), + std::multiplies()); + const int64_t next_size = std::accumulate( input_dims.cbegin() + axis_ + 1, input_dims.cend(), - TIndex(1), - std::multiplies()); - const TIndex outer_size = input.size() / input_dims[axis_]; - const TIndex inner_size = input_dims[axis_]; + int64_t(1), + std::multiplies()); + const int64_t outer_size = input.size() / input_dims[axis_]; + const int64_t inner_size = input_dims[axis_]; values->Resize(output_dims); indices->Resize(output_dims); @@ -217,10 +217,10 @@ bool TopKCudaOp::RunOnDevice() { } const T* input_data = input.template data(); T* values_data = values->template mutable_data(); - TIndex* indices_data = indices->template mutable_data(); - TIndex* flatten_indices_data = flatten_indices == nullptr + int64_t* indices_data = indices->template mutable_data(); + int64_t* flatten_indices_data = flatten_indices == nullptr ? nullptr - : flatten_indices->template mutable_data(); + : flatten_indices->template mutable_data(); if (need_transpose) { const std::array dims = {static_cast(prev_size), @@ -228,9 +228,9 @@ bool TopKCudaOp::RunOnDevice() { static_cast(next_size)}; const std::array axes = {0, 2, 1}; input_transposed_buffer_.Resize( - std::vector{outer_size, inner_size}); - values_transposed_buffer_.Resize(std::vector{outer_size, k_}); - indices_transposed_buffer_.Resize(std::vector{outer_size, k_}); + std::vector{outer_size, inner_size}); + values_transposed_buffer_.Resize(std::vector{outer_size, k_}); + indices_transposed_buffer_.Resize(std::vector{outer_size, k_}); math::Transpose( 3, dims.data(), @@ -240,16 +240,16 @@ bool TopKCudaOp::RunOnDevice() { &context_); input_data = input_transposed_buffer_.template data(); values_data = values_transposed_buffer_.template mutable_data(); - indices_data = indices_transposed_buffer_.template mutable_data(); + indices_data = indices_transposed_buffer_.template mutable_data(); } // init values as the default value math::Set(values->size(), T(0), values_data, &context_); - math::Set( - indices->size(), TIndex(-1), indices_data, &context_); + math::Set( + indices->size(), int64_t(-1), indices_data, &context_); if (flatten_indices_data != nullptr) { - math::Set( - flatten_indices->size(), TIndex(-1), flatten_indices_data, &context_); + math::Set( + flatten_indices->size(), int64_t(-1), flatten_indices_data, &context_); } RunTopKOnLastDimCUDAImpl( @@ -275,8 +275,8 @@ bool TopKCudaOp::RunOnDevice() { 3, dims.data(), axes.data(), - indices_transposed_buffer_.template data(), - indices->template mutable_data(), + indices_transposed_buffer_.template data(), + indices->template mutable_data(), &context_); } @@ -287,12 +287,12 @@ bool TopKCudaOp::RunOnDevice() { CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>( - indices->template data(), + indices->template data(), indices->size(), next_size, inner_size, k_, - flatten_indices->template mutable_data()); + 
flatten_indices->template mutable_data()); } return true; } @@ -322,8 +322,8 @@ bool TopKGradientCudaOp::RunOnDevice() { const auto& indices = Input(1); const auto& original_input = Input(2); auto* output = Output(0); - const std::vector& values_dims = values.dims(); - const std::vector& origin_dims = original_input.dims(); + const std::vector& values_dims = values.dims(); + const std::vector& origin_dims = original_input.dims(); CAFFE_ENFORCE_EQ(values_dims.size(), origin_dims.size()); output->Resize(origin_dims); T* output_data = output->template mutable_data(); @@ -332,18 +332,18 @@ bool TopKGradientCudaOp::RunOnDevice() { } const int k = values_dims[axis_]; math::Set(output->size(), T(0), output_data, &context_); - const TIndex stride = std::accumulate( + const int64_t stride = std::accumulate( values_dims.cbegin() + axis_ + 1, values_dims.cend(), - TIndex(1), - std::multiplies()); + int64_t(1), + std::multiplies()); SetTopKGradientCUDAKernel<<< CAFFE_GET_BLOCKS(indices.size()), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>( values.template data(), - indices.template data(), + indices.template data(), values.size(), stride, origin_dims[axis_], diff --git a/caffe2/operators/transpose_op.h b/caffe2/operators/transpose_op.h index 2de956e83aa7..fdfed0e3254f 100644 --- a/caffe2/operators/transpose_op.h +++ b/caffe2/operators/transpose_op.h @@ -33,7 +33,7 @@ class TransposeOp final : public Operator { bool RunOnDevice() override { // Do the actual transpose, which is implemented in DoRunWithType(). - return DispatchHelper>::call( + return DispatchHelper>::call( this, Input(0)); } diff --git a/caffe2/operators/utility_ops.cu b/caffe2/operators/utility_ops.cu index 368325908359..f489ea723d59 100644 --- a/caffe2/operators/utility_ops.cu +++ b/caffe2/operators/utility_ops.cu @@ -246,14 +246,14 @@ bool SelectGradientOpBase::RunOnDevice() { template __global__ void AxpySliceKernel( const float* weight0, - const TIndex N, - const TIndex B, - const TIndex slice_size, + const int64_t N, + const int64_t B, + const int64_t slice_size, const float** alpha, const float** X, const T_INDEX* Indices, float* Y, - const TIndex M) { + const int64_t M) { // This implementation requires that the first weight is 1.0 CUDA_KERNEL_ASSERT(weight0[0] == 1.0); for (int i = blockIdx.x; i < N; i += gridDim.x) { @@ -288,17 +288,17 @@ bool ScatterWeightedSumOp::DoRunWithType() { CAFFE_ENFORCE_GT(X0.ndim(), 0, "X0 has to be at least the vector"); CAFFE_ENFORCE_EQ(weight0.size(), 1); - TIndex M = X0.size(); - TIndex N = X0.dim(0); - TIndex K = indices.size(); - TIndex block_size = M / N; + int64_t M = X0.size(); + int64_t N = X0.dim(0); + int64_t K = indices.size(); + int64_t block_size = M / N; float* data = output->template mutable_data(); // In order to have all device pointers of x_i (and weight_i similarly) // consecutively in device memory, copy pointers to a host vector and then // copy back into a device array. 
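// Illustrative aside, not part of this change: the comment above describes a
// pointer-staging step -- the kernel wants an array of device pointers it can
// index per extra input, so the op first collects the pointers in a host
// vector and then copies that vector into device memory. A hedged, minimal
// version of that step using only the plain CUDA runtime API (hypothetical
// name, error handling omitted):
//
//   #include <vector>
//   #include <cuda_runtime.h>
//
//   const float** StageDevicePointers(const std::vector<const float*>& host) {
//     const float** dev = nullptr;
//     cudaMalloc(reinterpret_cast<void**>(&dev),
//                host.size() * sizeof(const float*));
//     cudaMemcpy(dev, host.data(), host.size() * sizeof(const float*),
//                cudaMemcpyHostToDevice);
//     return dev;  // caller releases with cudaFree(dev)
//   }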
- const TIndex B = (InputSize() - 3) / 2; + const int64_t B = (InputSize() - 3) / 2; x_data_host_.Resize(B); weights_host_.Resize(B); x_data_device_.Resize(B); @@ -320,7 +320,7 @@ bool ScatterWeightedSumOp::DoRunWithType() { B, weights_host, weights_device); AxpySliceKernel<<< - std::min(K, CAFFE_MAXIMUM_NUM_BLOCKS), + std::min(K, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>( @@ -348,15 +348,15 @@ __global__ void scatter_assign_kernel( T* data, const Index* idxs, const T* slicesData, - TIndex N, - TIndex K, - TIndex block_size) { - for (TIndex i = blockIdx.x; i < K; i += gridDim.x) { + int64_t N, + int64_t K, + int64_t block_size) { + for (int64_t i = blockIdx.x; i < K; i += gridDim.x) { Index idx = idxs[i]; CUDA_KERNEL_ASSERT(0 <= idx && idx < N); const T* src = slicesData + block_size * i; T* dest = data + block_size * idx; - for (TIndex j = threadIdx.x; j < block_size; j += blockDim.x) { + for (int64_t j = threadIdx.x; j < block_size; j += blockDim.x) { dest[j] = src[j]; } } @@ -370,11 +370,11 @@ void ScatterAssignOp::DoScatterAssign( T* data, const Index* idxs, const T* slicesData, - TIndex N, - TIndex K, - TIndex block_size) { + int64_t N, + int64_t K, + int64_t block_size) { scatter_assign_kernel<<< - std::min(K, static_cast(CAFFE_MAXIMUM_NUM_BLOCKS)), + std::min(K, static_cast(CAFFE_MAXIMUM_NUM_BLOCKS)), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(data, idxs, slicesData, N, K, block_size); diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index 58b3f9c96ee9..dc170e4f6789 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -496,7 +496,7 @@ class ScatterWeightedSumOp : public Operator { private: template bool DoRunWithType() { - TIndex block_size = Input(0).size_from_dim(1); + int64_t block_size = Input(0).size_from_dim(1); return DispatchHelper, Index>::call(this, block_size); } @@ -512,10 +512,10 @@ class ScatterWeightedSumOp : public Operator { CAFFE_ENFORCE_GT(X0.size(), 0); CAFFE_ENFORCE_GT(X0.ndim(), 0, "X0 has to be at least the vector"); CAFFE_ENFORCE_EQ(weight0.size(), 1); - TIndex M = X0.size(); - TIndex N = X0.dim(0); - TIndex K = indices.size(); - TIndex block_size = M / N; + int64_t M = X0.size(); + int64_t N = X0.dim(0); + int64_t K = indices.size(); + int64_t block_size = M / N; T* data = output->template mutable_data(); const Index* idxs = indices.template data(); T w0 = *weight0.template data(); @@ -664,10 +664,10 @@ class ScatterAssignOp : public Operator { CAFFE_ENFORCE_EQ(&input, output, "In place operation is required"); CAFFE_ENFORCE_GT(input.ndim(), 0, "X0 has to be at least the vector"); - TIndex M = input.size(); - TIndex N = input.dim(0); - TIndex K = indices.size(); - TIndex block_size = M / N; + int64_t M = input.size(); + int64_t N = input.dim(0); + int64_t K = indices.size(); + int64_t block_size = M / N; CAFFE_ENFORCE_EQ(slices.size(), block_size * K); // TODO(dzhulgakov): it can be made to work with arbitrary data type by // using raw_mutable_data @@ -682,9 +682,9 @@ class ScatterAssignOp : public Operator { T* data, const Index* idxs, const T* slicesData, - TIndex N, - TIndex K, - TIndex block_size) { + int64_t N, + int64_t K, + int64_t block_size) { for (int i = 0; i < K; ++i) { Index idx = idxs[i]; // double-checking the indices, but it's fine as it's DCHECK only @@ -936,7 +936,7 @@ class HasElementsOp : public Operator { bool RunOnDevice() override { auto& input = Input(0); auto* output = Output(0); - output->Resize(std::vector{}); + 
output->Resize(std::vector{}); *output->template mutable_data() = input.size() > 0; return true; } @@ -953,7 +953,7 @@ class SizeOp : public Operator { auto& input = Input(0); auto* output = Output(0); - output->Resize(vector()); + output->Resize(vector()); auto* output_data = output->template mutable_data(); auto size = input.size(); @@ -1099,7 +1099,7 @@ class LengthsGatherOp : public Operator { const auto* lengths_data = lengths.template data(); const auto* indices_data = indices.template data(); - TIndex total_length = 0; + int64_t total_length = 0; for (size_t i = 0; i < indices.size(); ++i) { auto idx = indices_data[i]; CAFFE_ENFORCE_LT(idx, lengths.size()); @@ -1110,7 +1110,7 @@ class LengthsGatherOp : public Operator { output->Resize(shape); offsets_.clear(); - TIndex running_offset = 0; + int64_t running_offset = 0; offsets_.reserve(lengths.size()); for (size_t i = 0; i < lengths.size(); ++i) { offsets_.push_back(running_offset); @@ -1139,7 +1139,7 @@ class LengthsGatherOp : public Operator { return true; } - std::vector offsets_; + std::vector offsets_; INPUT_TAGS(ITEMS, LENGTHS, INDICES); }; diff --git a/caffe2/operators/utility_ops_gpu_test.cc b/caffe2/operators/utility_ops_gpu_test.cc index eb70a09aefb4..f500afaf9ed2 100644 --- a/caffe2/operators/utility_ops_gpu_test.cc +++ b/caffe2/operators/utility_ops_gpu_test.cc @@ -11,7 +11,7 @@ CAFFE2_DECLARE_string(caffe_test_root); namespace caffe2 { static void AddConstInput( - const vector& shape, + const vector& shape, const float value, const string& name, Workspace* ws) { @@ -38,7 +38,7 @@ TEST(UtilityOpGPUTest, testReshapeWithScalar) { def.add_output("OldShape"); def.add_arg()->CopyFrom(MakeArgument("shape", vector{1})); def.mutable_device_option()->set_device_type(PROTO_CUDA); - AddConstInput(vector(), 3.14, "X", &ws); + AddConstInput(vector(), 3.14, "X", &ws); // execute the op unique_ptr op(CreateOperator(def, &ws)); EXPECT_TRUE(op->Run()); diff --git a/caffe2/operators/utility_ops_test.cc b/caffe2/operators/utility_ops_test.cc index 7b4bcb3144f3..379dd52655c4 100644 --- a/caffe2/operators/utility_ops_test.cc +++ b/caffe2/operators/utility_ops_test.cc @@ -9,7 +9,7 @@ CAFFE2_DECLARE_string(caffe_test_root); namespace caffe2 { static void AddConstInput( - const vector& shape, + const vector& shape, const float value, const string& name, Workspace* ws) { @@ -32,7 +32,7 @@ TEST(UtilityOpTest, testReshapeWithScalar) { def.add_output("XNew"); def.add_output("OldShape"); def.add_arg()->CopyFrom(MakeArgument("shape", vector{1})); - AddConstInput(vector(), 3.14, "X", &ws); + AddConstInput(vector(), 3.14, "X", &ws); // execute the op unique_ptr op(CreateOperator(def, &ws)); EXPECT_TRUE(op->Run()); diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index 2f48ece4fd87..ce79df56ecb7 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -174,7 +174,7 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOp( // Feed into workspace as CPU Tensors auto* blob = ws->CreateBlob(t.name()); auto* cpu_tensor = blob->GetMutableTensor(CPU); - std::vector dims; + std::vector dims; for(const auto& d : t.dims()) { dims.push_back(d); } diff --git a/caffe2/perfkernels/embedding_lookup.cc b/caffe2/perfkernels/embedding_lookup.cc index 2b3b2fad3937..55dbeee5b9d4 100644 --- a/caffe2/perfkernels/embedding_lookup.cc +++ b/caffe2/perfkernels/embedding_lookup.cc @@ -16,10 +16,10 @@ template < typename OutType, bool IS_WEIGHT_POSITIONAL = false> static void EmbeddingLookupGenericSlow( - const TIndex 
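The LengthsGather hunk above precomputes where each gathered segment starts by running a prefix sum over the lengths, with the accumulator now an int64_t. A small free-standing sketch of that computation follows; the signature is assumed for illustration and does not match the operator's member layout.

#include <cstdint>
#include <vector>

// offsets[i] is the element offset at which segment i begins in the items
// tensor; the final running_offset equals the total number of items.
std::vector<int64_t> SegmentOffsets(const std::vector<int>& lengths) {
  std::vector<int64_t> offsets;
  offsets.reserve(lengths.size());
  int64_t running_offset = 0;
  for (size_t i = 0; i < lengths.size(); ++i) {
    offsets.push_back(running_offset);
    running_offset += lengths[i];
  }
  return offsets;
}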
block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const InType* input, const IndexType* indices, const int* lengths, @@ -27,13 +27,13 @@ static void EmbeddingLookupGenericSlow( const float* scale_bias, // optional scale & bias params for uint8 input bool normalize_by_lengths, OutType* out) { - TIndex current = 0; + int64_t current = 0; for (int m = 0; m < output_size; ++m) { memset(out, 0, sizeof(OutType) * block_size); EigenVectorArrayMap out_vector(out, block_size); for (int i = 0; i < lengths[m]; ++i) { CAFFE_ENFORCE_LT(current, index_size); - TIndex idx = indices[current]; + int64_t idx = indices[current]; CAFFE_ENFORCE( 0 <= idx && idx < data_size, "Index ", @@ -86,10 +86,10 @@ static void EmbeddingLookupGenericSlow( IndexTypeName, IndexType, InTypeName, InType, OutTypeName, OutType, IS_WEIGHT_POSITIONAL) \ void \ EmbeddingLookup_##IndexTypeName##_##InTypeName##_##OutTypeName##_##IS_WEIGHT_POSITIONAL##__base( \ - const TIndex block_size, \ - const TIndex output_size, \ - const TIndex index_size, \ - const TIndex data_size, \ + const int64_t block_size, \ + const int64_t output_size, \ + const int64_t index_size, \ + const int64_t data_size, \ const InType* input, \ const IndexType* indices, \ const int* lengths, \ @@ -116,10 +116,10 @@ static void EmbeddingLookupGenericSlow( } \ template <> \ void EmbeddingLookup( \ - const TIndex block_size, \ - const TIndex output_size, \ - const TIndex index_size, \ - const TIndex data_size, \ + const int64_t block_size, \ + const int64_t output_size, \ + const int64_t index_size, \ + const int64_t data_size, \ const InType* input, \ const IndexType* indices, \ const int* lengths, \ diff --git a/caffe2/perfkernels/embedding_lookup.h b/caffe2/perfkernels/embedding_lookup.h index c4c6ccfe5f36..d147708970b3 100644 --- a/caffe2/perfkernels/embedding_lookup.h +++ b/caffe2/perfkernels/embedding_lookup.h @@ -36,10 +36,10 @@ template < typename OutType, bool IS_WEIGHT_POSITIONAL = false> void EmbeddingLookup( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const InType* input, const IndexType* indices, const int* lengths, diff --git a/caffe2/perfkernels/embedding_lookup_avx2.cc b/caffe2/perfkernels/embedding_lookup_avx2.cc index b01a0a65daca..cd5cb7305eba 100644 --- a/caffe2/perfkernels/embedding_lookup_avx2.cc +++ b/caffe2/perfkernels/embedding_lookup_avx2.cc @@ -13,10 +13,10 @@ namespace caffe2 { template static void EmbeddingLookup_int32_t_float_float__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const float* input, const int32_t* indices, const int* lengths, @@ -318,7 +318,7 @@ static void EmbeddingLookup_int32_t_float_float__avx2_fma( int32_t dataInd = 0; for (int32_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; - TIndex j = 0; + int64_t j = 0; for (; j + 8 <= block_size; j += 8) { _mm256_storeu_ps(op + j, _mm256_setzero_ps()); } @@ -376,10 +376,10 @@ static void EmbeddingLookup_int32_t_float_float__avx2_fma( } } void EmbeddingLookup_int32_t_float_float_false__avx2_fma( - const TIndex 
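EmbeddingLookupGenericSlow, shown above, is the reference path the AVX2 kernels are checked against: it walks `lengths` and sums block_size-wide rows of `input` into each output row. A compact sketch of that reduction with the sizes as int64_t; the weight, scale_bias, and uint8 handling are dropped for brevity, and the early returns stand in for the CAFFE_ENFORCE checks.

#include <cstdint>
#include <cstring>

void EmbeddingLookupSlowSketch(int64_t block_size, int64_t output_size,
                               int64_t index_size, int64_t data_size,
                               const float* input, const int64_t* indices,
                               const int* lengths, bool normalize_by_lengths,
                               float* out) {
  int64_t current = 0;
  for (int64_t m = 0; m < output_size; ++m) {
    std::memset(out, 0, sizeof(float) * block_size);
    for (int i = 0; i < lengths[m]; ++i) {
      if (current >= index_size) return;
      const int64_t idx = indices[current++];
      if (idx < 0 || idx >= data_size) return;
      const float* row = input + block_size * idx;
      for (int64_t j = 0; j < block_size; ++j) {
        out[j] += row[j];
      }
    }
    if (normalize_by_lengths && lengths[m] > 0) {
      for (int64_t j = 0; j < block_size; ++j) {
        out[j] /= lengths[m];
      }
    }
    out += block_size;
  }
}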
block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const float* input, const int32_t* indices, const int* lengths, @@ -401,10 +401,10 @@ void EmbeddingLookup_int32_t_float_float_false__avx2_fma( out); } void EmbeddingLookup_int32_t_float_float_true__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const float* input, const int32_t* indices, const int* lengths, @@ -428,10 +428,10 @@ void EmbeddingLookup_int32_t_float_float_true__avx2_fma( template static void EmbeddingLookup_int64_t_float_float__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const float* input, const int64_t* indices, const int* lengths, @@ -733,7 +733,7 @@ static void EmbeddingLookup_int64_t_float_float__avx2_fma( int64_t dataInd = 0; for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; - TIndex j = 0; + int64_t j = 0; for (; j + 8 <= block_size; j += 8) { _mm256_storeu_ps(op + j, _mm256_setzero_ps()); } @@ -791,10 +791,10 @@ static void EmbeddingLookup_int64_t_float_float__avx2_fma( } } void EmbeddingLookup_int64_t_float_float_false__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const float* input, const int64_t* indices, const int* lengths, @@ -816,10 +816,10 @@ void EmbeddingLookup_int64_t_float_float_false__avx2_fma( out); } void EmbeddingLookup_int64_t_float_float_true__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const float* input, const int64_t* indices, const int* lengths, @@ -843,10 +843,10 @@ void EmbeddingLookup_int64_t_float_float_true__avx2_fma( template static void EmbeddingLookup_int32_t_half_float__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const at::Half* input, const int32_t* indices, const int* lengths, @@ -1268,7 +1268,7 @@ static void EmbeddingLookup_int32_t_half_float__avx2_fma( int32_t dataInd = 0; for (int32_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; - TIndex j = 0; + int64_t j = 0; for (; j + 8 <= block_size; j += 8) { _mm256_storeu_ps(op + j, _mm256_setzero_ps()); } @@ -1332,10 +1332,10 @@ static void EmbeddingLookup_int32_t_half_float__avx2_fma( } } void EmbeddingLookup_int32_t_half_float_false__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const at::Half* input, const int32_t* indices, const int* lengths, @@ -1357,10 +1357,10 @@ void 
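Each AVX2 kernel above zeroes its per-row accumulator eight floats at a time, with the loop counter j widened to int64_t so it matches block_size. A minimal sketch of that pattern, with a scalar tail added for block sizes that are not a multiple of eight; the tail is handled elsewhere in the generated kernels and appears here only to keep the sketch complete.

#include <immintrin.h>
#include <cstdint>

void ZeroRow(float* op, int64_t block_size) {
  int64_t j = 0;
  for (; j + 8 <= block_size; j += 8) {
    _mm256_storeu_ps(op + j, _mm256_setzero_ps());
  }
  for (; j < block_size; ++j) {
    op[j] = 0.0f;
  }
}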
EmbeddingLookup_int32_t_half_float_false__avx2_fma( out); } void EmbeddingLookup_int32_t_half_float_true__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const at::Half* input, const int32_t* indices, const int* lengths, @@ -1384,10 +1384,10 @@ void EmbeddingLookup_int32_t_half_float_true__avx2_fma( template static void EmbeddingLookup_int64_t_half_float__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const at::Half* input, const int64_t* indices, const int* lengths, @@ -1809,7 +1809,7 @@ static void EmbeddingLookup_int64_t_half_float__avx2_fma( int64_t dataInd = 0; for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; - TIndex j = 0; + int64_t j = 0; for (; j + 8 <= block_size; j += 8) { _mm256_storeu_ps(op + j, _mm256_setzero_ps()); } @@ -1873,10 +1873,10 @@ static void EmbeddingLookup_int64_t_half_float__avx2_fma( } } void EmbeddingLookup_int64_t_half_float_false__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const at::Half* input, const int64_t* indices, const int* lengths, @@ -1898,10 +1898,10 @@ void EmbeddingLookup_int64_t_half_float_false__avx2_fma( out); } void EmbeddingLookup_int64_t_half_float_true__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const at::Half* input, const int64_t* indices, const int* lengths, @@ -1925,10 +1925,10 @@ void EmbeddingLookup_int64_t_half_float_true__avx2_fma( template static void EmbeddingLookup_int32_t_uint8_t_float__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const uint8_t* input, const int32_t* indices, const int* lengths, @@ -2366,7 +2366,7 @@ static void EmbeddingLookup_int32_t_uint8_t_float__avx2_fma( int32_t dataInd = 0; for (int32_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; - TIndex j = 0; + int64_t j = 0; for (; j + 8 <= block_size; j += 8) { _mm256_storeu_ps(op + j, _mm256_setzero_ps()); } @@ -2432,10 +2432,10 @@ static void EmbeddingLookup_int32_t_uint8_t_float__avx2_fma( } } void EmbeddingLookup_int32_t_uint8_t_float_false__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const uint8_t* input, const int32_t* indices, const int* lengths, @@ -2457,10 +2457,10 @@ void EmbeddingLookup_int32_t_uint8_t_float_false__avx2_fma( out); } void EmbeddingLookup_int32_t_uint8_t_float_true__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t 
output_size, + const int64_t index_size, + const int64_t data_size, const uint8_t* input, const int32_t* indices, const int* lengths, @@ -2484,10 +2484,10 @@ void EmbeddingLookup_int32_t_uint8_t_float_true__avx2_fma( template static void EmbeddingLookup_int64_t_uint8_t_float__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const uint8_t* input, const int64_t* indices, const int* lengths, @@ -2925,7 +2925,7 @@ static void EmbeddingLookup_int64_t_uint8_t_float__avx2_fma( int64_t dataInd = 0; for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; - TIndex j = 0; + int64_t j = 0; for (; j + 8 <= block_size; j += 8) { _mm256_storeu_ps(op + j, _mm256_setzero_ps()); } @@ -2991,10 +2991,10 @@ static void EmbeddingLookup_int64_t_uint8_t_float__avx2_fma( } } void EmbeddingLookup_int64_t_uint8_t_float_false__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const uint8_t* input, const int64_t* indices, const int* lengths, @@ -3016,10 +3016,10 @@ void EmbeddingLookup_int64_t_uint8_t_float_false__avx2_fma( out); } void EmbeddingLookup_int64_t_uint8_t_float_true__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const uint8_t* input, const int64_t* indices, const int* lengths, diff --git a/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc b/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc index b4e5c922f441..5eeb4ef3e760 100644 --- a/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc +++ b/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc @@ -13,10 +13,10 @@ namespace caffe2 { template static void Fused8BitRowwiseEmbeddingLookup_int32_t_float_float__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const float* input, const int32_t* indices, const int* lengths, @@ -316,7 +316,7 @@ static void Fused8BitRowwiseEmbeddingLookup_int32_t_float_float__avx2_fma( int32_t dataInd = 0; for (int32_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; - TIndex j = 0; + int64_t j = 0; for (; j + 8 <= block_size; j += 8) { _mm256_storeu_ps(op + j, _mm256_setzero_ps()); } @@ -374,10 +374,10 @@ static void Fused8BitRowwiseEmbeddingLookup_int32_t_float_float__avx2_fma( } } void Fused8BitRowwiseEmbeddingLookup_int32_t_float_float_false__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const float* input, const int32_t* indices, const int* lengths, @@ -397,10 +397,10 @@ void Fused8BitRowwiseEmbeddingLookup_int32_t_float_float_false__avx2_fma( out); } void Fused8BitRowwiseEmbeddingLookup_int32_t_float_float_true__avx2_fma( - const TIndex block_size, - const TIndex output_size, - 
const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const float* input, const int32_t* indices, const int* lengths, @@ -422,10 +422,10 @@ void Fused8BitRowwiseEmbeddingLookup_int32_t_float_float_true__avx2_fma( template static void Fused8BitRowwiseEmbeddingLookup_int64_t_float_float__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const float* input, const int64_t* indices, const int* lengths, @@ -725,7 +725,7 @@ static void Fused8BitRowwiseEmbeddingLookup_int64_t_float_float__avx2_fma( int64_t dataInd = 0; for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; - TIndex j = 0; + int64_t j = 0; for (; j + 8 <= block_size; j += 8) { _mm256_storeu_ps(op + j, _mm256_setzero_ps()); } @@ -783,10 +783,10 @@ static void Fused8BitRowwiseEmbeddingLookup_int64_t_float_float__avx2_fma( } } void Fused8BitRowwiseEmbeddingLookup_int64_t_float_float_false__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const float* input, const int64_t* indices, const int* lengths, @@ -806,10 +806,10 @@ void Fused8BitRowwiseEmbeddingLookup_int64_t_float_float_false__avx2_fma( out); } void Fused8BitRowwiseEmbeddingLookup_int64_t_float_float_true__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const float* input, const int64_t* indices, const int* lengths, @@ -831,10 +831,10 @@ void Fused8BitRowwiseEmbeddingLookup_int64_t_float_float_true__avx2_fma( template static void Fused8BitRowwiseEmbeddingLookup_int32_t_half_float__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const at::Half* input, const int32_t* indices, const int* lengths, @@ -1254,7 +1254,7 @@ static void Fused8BitRowwiseEmbeddingLookup_int32_t_half_float__avx2_fma( int32_t dataInd = 0; for (int32_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; - TIndex j = 0; + int64_t j = 0; for (; j + 8 <= block_size; j += 8) { _mm256_storeu_ps(op + j, _mm256_setzero_ps()); } @@ -1318,10 +1318,10 @@ static void Fused8BitRowwiseEmbeddingLookup_int32_t_half_float__avx2_fma( } } void Fused8BitRowwiseEmbeddingLookup_int32_t_half_float_false__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const at::Half* input, const int32_t* indices, const int* lengths, @@ -1341,10 +1341,10 @@ void Fused8BitRowwiseEmbeddingLookup_int32_t_half_float_false__avx2_fma( out); } void Fused8BitRowwiseEmbeddingLookup_int32_t_half_float_true__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + 
const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const at::Half* input, const int32_t* indices, const int* lengths, @@ -1366,10 +1366,10 @@ void Fused8BitRowwiseEmbeddingLookup_int32_t_half_float_true__avx2_fma( template static void Fused8BitRowwiseEmbeddingLookup_int64_t_half_float__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const at::Half* input, const int64_t* indices, const int* lengths, @@ -1789,7 +1789,7 @@ static void Fused8BitRowwiseEmbeddingLookup_int64_t_half_float__avx2_fma( int64_t dataInd = 0; for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; - TIndex j = 0; + int64_t j = 0; for (; j + 8 <= block_size; j += 8) { _mm256_storeu_ps(op + j, _mm256_setzero_ps()); } @@ -1853,10 +1853,10 @@ static void Fused8BitRowwiseEmbeddingLookup_int64_t_half_float__avx2_fma( } } void Fused8BitRowwiseEmbeddingLookup_int64_t_half_float_false__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const at::Half* input, const int64_t* indices, const int* lengths, @@ -1876,10 +1876,10 @@ void Fused8BitRowwiseEmbeddingLookup_int64_t_half_float_false__avx2_fma( out); } void Fused8BitRowwiseEmbeddingLookup_int64_t_half_float_true__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const at::Half* input, const int64_t* indices, const int* lengths, @@ -1901,10 +1901,10 @@ void Fused8BitRowwiseEmbeddingLookup_int64_t_half_float_true__avx2_fma( template static void Fused8BitRowwiseEmbeddingLookup_int32_t_uint8_t_float__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const uint8_t* input, const int32_t* indices, const int* lengths, @@ -2348,7 +2348,7 @@ static void Fused8BitRowwiseEmbeddingLookup_int32_t_uint8_t_float__avx2_fma( int32_t dataInd = 0; for (int32_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; - TIndex j = 0; + int64_t j = 0; for (; j + 8 <= block_size; j += 8) { _mm256_storeu_ps(op + j, _mm256_setzero_ps()); } @@ -2415,10 +2415,10 @@ static void Fused8BitRowwiseEmbeddingLookup_int32_t_uint8_t_float__avx2_fma( } } void Fused8BitRowwiseEmbeddingLookup_int32_t_uint8_t_float_false__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const uint8_t* input, const int32_t* indices, const int* lengths, @@ -2438,10 +2438,10 @@ void Fused8BitRowwiseEmbeddingLookup_int32_t_uint8_t_float_false__avx2_fma( out); } void Fused8BitRowwiseEmbeddingLookup_int32_t_uint8_t_float_true__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t 
index_size, + const int64_t data_size, const uint8_t* input, const int32_t* indices, const int* lengths, @@ -2463,10 +2463,10 @@ void Fused8BitRowwiseEmbeddingLookup_int32_t_uint8_t_float_true__avx2_fma( template static void Fused8BitRowwiseEmbeddingLookup_int64_t_uint8_t_float__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const uint8_t* input, const int64_t* indices, const int* lengths, @@ -2910,7 +2910,7 @@ static void Fused8BitRowwiseEmbeddingLookup_int64_t_uint8_t_float__avx2_fma( int64_t dataInd = 0; for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; - TIndex j = 0; + int64_t j = 0; for (; j + 8 <= block_size; j += 8) { _mm256_storeu_ps(op + j, _mm256_setzero_ps()); } @@ -2977,10 +2977,10 @@ static void Fused8BitRowwiseEmbeddingLookup_int64_t_uint8_t_float__avx2_fma( } } void Fused8BitRowwiseEmbeddingLookup_int64_t_uint8_t_float_false__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const uint8_t* input, const int64_t* indices, const int* lengths, @@ -3000,10 +3000,10 @@ void Fused8BitRowwiseEmbeddingLookup_int64_t_uint8_t_float_false__avx2_fma( out); } void Fused8BitRowwiseEmbeddingLookup_int64_t_uint8_t_float_true__avx2_fma( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const uint8_t* input, const int64_t* indices, const int* lengths, diff --git a/caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.cc b/caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.cc index 34777eeab4f4..68c8c8709814 100644 --- a/caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.cc +++ b/caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.cc @@ -16,10 +16,10 @@ template < typename OutType, bool IS_WEIGHT_POSITIONAL = false> static void Fused8BitRowwiseEmbeddingLookupGenericSlow( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const InType* input, const IndexType* indices, const int* lengths, @@ -29,14 +29,14 @@ static void Fused8BitRowwiseEmbeddingLookupGenericSlow( // block_size is the number of elements and fused_block_size is the size of // an entire row, including scale and bias. 
const auto scale_bias_offset = 8 / sizeof(InType); - const TIndex fused_block_size = block_size + scale_bias_offset; - TIndex current = 0; + const int64_t fused_block_size = block_size + scale_bias_offset; + int64_t current = 0; for (int m = 0; m < output_size; ++m) { memset(out, 0, sizeof(OutType) * block_size); EigenVectorArrayMap out_vector(out, block_size); for (int i = 0; i < lengths[m]; ++i) { CAFFE_ENFORCE_LT(current, index_size); - TIndex idx = indices[current]; + int64_t idx = indices[current]; CAFFE_ENFORCE( 0 <= idx && idx < data_size, "Index ", @@ -89,10 +89,10 @@ static void Fused8BitRowwiseEmbeddingLookupGenericSlow( IndexType, InType, OutType) \ void \ Fused8BitRowwiseEmbeddingLookup_##IndexType##_##InType##_##OutType##_false__base( \ - const TIndex block_size, \ - const TIndex output_size, \ - const TIndex index_size, \ - const TIndex data_size, \ + const int64_t block_size, \ + const int64_t output_size, \ + const int64_t index_size, \ + const int64_t data_size, \ const InType* input, \ const IndexType* indices, \ const int* lengths, \ @@ -117,10 +117,10 @@ static void Fused8BitRowwiseEmbeddingLookupGenericSlow( } \ template <> \ void Fused8BitRowwiseEmbeddingLookup( \ - const TIndex block_size, \ - const TIndex output_size, \ - const TIndex index_size, \ - const TIndex data_size, \ + const int64_t block_size, \ + const int64_t output_size, \ + const int64_t index_size, \ + const int64_t data_size, \ const InType* input, \ const IndexType* indices, \ const int* lengths, \ diff --git a/caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.h b/caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.h index 9605fbb39c57..85363c6ddb63 100644 --- a/caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.h +++ b/caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.h @@ -42,10 +42,10 @@ template < typename OutType, bool IS_WEIGHT_POSITIONAL = false> void Fused8BitRowwiseEmbeddingLookup( - const TIndex block_size, - const TIndex output_size, - const TIndex index_size, - const TIndex data_size, + const int64_t block_size, + const int64_t output_size, + const int64_t index_size, + const int64_t data_size, const InType* input, const IndexType* indices, const int* lengths, diff --git a/caffe2/perfkernels/hp_emblookup_codegen.py b/caffe2/perfkernels/hp_emblookup_codegen.py index c69ddf5f14de..2db1cee9c9cd 100644 --- a/caffe2/perfkernels/hp_emblookup_codegen.py +++ b/caffe2/perfkernels/hp_emblookup_codegen.py @@ -166,7 +166,7 @@ def generic(IndexType, InType, OutType, use_weights, isa, fused): code.append(OutType + " *op = &out[rangeIndex * block_size];") # initialize to 0 - code.append("TIndex j = 0;") + code.append("int64_t j = 0;") code.append("for(; j + 8 <= block_size; j += 8) {") code.append("_mm256_storeu_ps(op + j, _mm256_setzero_ps());") code.append("}") @@ -312,10 +312,10 @@ for o in options: code.append(fn + "(") args = [] - args.append("const TIndex block_size,") - args.append("const TIndex output_size,") - args.append("const TIndex index_size,") - args.append("const TIndex data_size,") + args.append("const int64_t block_size,") + args.append("const int64_t output_size,") + args.append("const int64_t index_size,") + args.append("const int64_t data_size,") args.append("const " + InType + "* input,") args.append("const " + IndexType + "* indices,") args.append("const int* lengths,") diff --git a/caffe2/predictor/predictor_test.cc b/caffe2/predictor/predictor_test.cc index 326265fc66d0..ae4f73e9da0a 100644 --- a/caffe2/predictor/predictor_test.cc +++ 
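In the fused 8-bit rowwise variant above, every stored row carries its own quantization parameters, so the row stride becomes fused_block_size = block_size + 8 / sizeof(InType). A rough sketch of accumulating one such uint8 row follows; the layout is assumed here to be the quantized bytes followed by a float scale and a float bias, and the helper name is illustrative.

#include <cstdint>
#include <cstring>

// Dequantizes and accumulates one fused row into `out`. The 8 trailing bytes
// of the row are taken to hold the per-row scale and bias.
void AccumulateFusedRow(const uint8_t* row, int64_t block_size, float* out) {
  float scale;
  float bias;
  std::memcpy(&scale, row + block_size, sizeof(float));
  std::memcpy(&bias, row + block_size + sizeof(float), sizeof(float));
  for (int64_t j = 0; j < block_size; ++j) {
    out[j] += scale * static_cast<float>(row[j]) + bias;
  }
}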
b/caffe2/predictor/predictor_test.cc @@ -132,7 +132,7 @@ const char* metaSpec = R"DOC( )DOC"; std::unique_ptr randomTensor( - const std::vector& dims, + const std::vector& dims, CPUContext* ctx) { auto blob = make_unique(); auto* t = blob->GetMutableTensor(CPU); diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 54d858cb8a98..81197047102f 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -392,7 +392,7 @@ void addObjectMethods(py::module& m) { }) .def( "_reshape", - [](DLPackWrapper* t, std::vector dims) { + [](DLPackWrapper* t, std::vector dims) { auto* tensor = t->tensor; tensor->Resize(dims); }); @@ -430,7 +430,7 @@ void addObjectMethods(py::module& m) { "Copy data from this tensor into a new numpy array.") .def( "init", - [](Tensor* t, std::vector dims, int caffe_type) { + [](Tensor* t, std::vector dims, int caffe_type) { const auto& meta = DataTypeToTypeMeta((TensorProto::DataType)caffe_type); CAFFE_ENFORCE( @@ -443,7 +443,7 @@ void addObjectMethods(py::module& m) { "Fail if the given data type cannot be accessed from python.") .def_property_readonly( "_shape", [](const TensorCPU& t) { return t.dims(); }) - .def("_reshape", [](TensorCPU* t, std::vector dims) { + .def("_reshape", [](TensorCPU* t, std::vector dims) { t->Resize(dims); }); @@ -1361,7 +1361,7 @@ void addGlobalMethods(py::module& m) { m.def( "infer_shapes_and_types_from_map", [](const std::vector& net_protos, - const std::map> blob_dimensions) { + const std::map> blob_dimensions) { // Parse protobuffers to NetDefs std::vector> nets; std::vector nets_ptr; @@ -1381,7 +1381,7 @@ void addGlobalMethods(py::module& m) { m.def( "infer_shapes_and_types_from_map", [](const std::vector& net_protos, - const std::map> blob_dimensions, + const std::map> blob_dimensions, const std::map int_blob_types) { // Parse protobuffers to NetDefs std::vector> nets; diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index 1271d67f48e7..59f39dd31303 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -178,7 +178,7 @@ class TensorFeeder : public BlobFeederBase { // numpy requires long int as its dims. 
int ndim = PyArray_NDIM(array); npy_intp* npy_dims = PyArray_DIMS(array); - std::vector dims; + std::vector dims; for (int i = 0; i < ndim; ++i) { dims.push_back(npy_dims[i]); } diff --git a/caffe2/python/pybind_state_dlpack.h b/caffe2/python/pybind_state_dlpack.h index e0122fdcc998..679152c78813 100644 --- a/caffe2/python/pybind_state_dlpack.h +++ b/caffe2/python/pybind_state_dlpack.h @@ -90,7 +90,7 @@ class DLPackWrapper { device_option.cuda_gpu_id(), "Expected same device id for DLPack and C2 tensors"); - std::vector dims; + std::vector dims; dims.reserve(dlTensor->ndim); for (int idx = 0; idx < dlTensor->ndim; ++idx) { dims.push_back(dlTensor->shape[idx]); diff --git a/caffe2/python/pybind_state_gpu.cc b/caffe2/python/pybind_state_gpu.cc index 8c547cf8eccc..3893be96ff98 100644 --- a/caffe2/python/pybind_state_gpu.cc +++ b/caffe2/python/pybind_state_gpu.cc @@ -146,7 +146,7 @@ void addCUDAObjectMethods(py::module& m) { [](const DLPackWrapper& t) { return t.tensor->dims(); }) .def( "_reshape", - [](DLPackWrapper* t, std::vector dims) { + [](DLPackWrapper* t, std::vector dims) { t->tensor->Resize(dims); }); } diff --git a/caffe2/python/pybind_state_hip.cc b/caffe2/python/pybind_state_hip.cc index a5d443cb9a7a..36e9f6a75d64 100644 --- a/caffe2/python/pybind_state_hip.cc +++ b/caffe2/python/pybind_state_hip.cc @@ -74,7 +74,7 @@ void addHIPObjectMethods(py::module& m) { [](const DLPackWrapper& t) { return t.tensor->dims(); }) .def( "_reshape", - [](DLPackWrapper* t, std::vector dims) { + [](DLPackWrapper* t, std::vector dims) { t->tensor->Resize(dims); }); } diff --git a/caffe2/python/pybind_state_mkl.cc b/caffe2/python/pybind_state_mkl.cc index dd192c325f7d..1fa5d806377d 100644 --- a/caffe2/python/pybind_state_mkl.cc +++ b/caffe2/python/pybind_state_mkl.cc @@ -70,7 +70,7 @@ class MKLMemoryFeeder : public BlobFeederBase { // numpy requires long int as its dims. 
int ndim = PyArray_NDIM(array); npy_intp* npy_dims = PyArray_DIMS(array); - std::vector dims; + std::vector dims; for (int i = 0; i < ndim; ++i) { dims.push_back(npy_dims[i]); } diff --git a/caffe2/queue/rebatching_queue.cc b/caffe2/queue/rebatching_queue.cc index cfb43a99f491..c768f1cfd0a1 100644 --- a/caffe2/queue/rebatching_queue.cc +++ b/caffe2/queue/rebatching_queue.cc @@ -17,7 +17,7 @@ void concat( const auto numRows = inputs.size(); // Precompute the output sizes to avoid resizing - std::vector> outputDims(numTensors); + std::vector> outputDims(numTensors); for (int i = 0; i < numTensors; ++i) { SmartTensorPrinter::PrintTensor(inputZero.at(i)); diff --git a/caffe2/sgd/ftrl_op.cc b/caffe2/sgd/ftrl_op.cc index 263406114670..a8ef4879d68e 100644 --- a/caffe2/sgd/ftrl_op.cc +++ b/caffe2/sgd/ftrl_op.cc @@ -88,10 +88,10 @@ void SparseFtrlOp::DoRun() { auto& grad = Input(GRAD); CAFFE_ENFORCE_EQ(&Input(VAR), var, "In place operation is required"); CAFFE_ENFORCE_EQ(&Input(N_Z), n_z, "In place operation is required"); - TIndex M = var->size(); - TIndex N = var->dim(0); - TIndex block_size = M / N; - TIndex K = indices.size(); + int64_t M = var->size(); + int64_t N = var->dim(0); + int64_t block_size = M / N; + int64_t K = indices.size(); DCHECK_EQ(M * 2, n_z->size()); DCHECK_EQ(grad.size(), K * block_size); T* w = var->template mutable_data(); @@ -101,7 +101,7 @@ void SparseFtrlOp::DoRun() { // TODO(cxj): use OMP when it is reliable // #pragma omp parallel for - for (TIndex i = 0; i < K; ++i) { + for (int64_t i = 0; i < K; ++i) { SIndex idx = idxs[i]; DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx << ", range 0 to " << N; @@ -116,7 +116,7 @@ void SparseFtrlOp::DoRun() { nz[idx * 2 + 1], params_); } else { - TIndex x = block_size * idx; + int64_t x = block_size * idx; ftrl_update( block_size, w + x, diff --git a/caffe2/sgd/lars_op.h b/caffe2/sgd/lars_op.h index 7b65598cdffe..a54c67ddc2fc 100644 --- a/caffe2/sgd/lars_op.h +++ b/caffe2/sgd/lars_op.h @@ -29,7 +29,7 @@ class LarsOp final : public Operator { auto& trust = Input(3); auto& lr_max = Input(4); auto* lr_rescaled = Output(0); - lr_rescaled->Resize(vector{1}); + lr_rescaled->Resize(vector{1}); X_norm_tensor_.Resize(1); T* X_norm_ = X_norm_tensor_.template mutable_data(); @@ -60,7 +60,7 @@ class LarsOp final : public Operator { private: // Compute the l2 norm of X_data and dX_data void ComputeNorms( - TIndex N, + int64_t N, const T* X_data, const T* dX_data, T* X_norm, diff --git a/caffe2/sgd/learning_rate_op.h b/caffe2/sgd/learning_rate_op.h index 4e798921e13e..5e4357d6b9ee 100644 --- a/caffe2/sgd/learning_rate_op.h +++ b/caffe2/sgd/learning_rate_op.h @@ -31,7 +31,7 @@ class LearningRateOp final : public Operator { T learning_rate = cur_base_lr_ * (*functor_)(iter); // Write to output. 
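The SparseFtrl hunk above is a driver loop: it walks K indices and hands one block_size-wide row of the parameter, plus the paired n/z state laid out twice as wide, to the dense update routine. A skeletal sketch of that shape, with a generic callable standing in for ftrl_update and a bounds check standing in for the DCHECK.

#include <cstdint>

// Sizes mirror the operator: M = var->size(), N = rows, block_size = M / N,
// K = number of indices. `update_row` receives the row of w and its n/z state.
template <typename T, typename SIndex, typename UpdateRow>
void SparseRowwiseUpdate(T* w, T* nz, const SIndex* idxs, int64_t N,
                         int64_t block_size, int64_t K, UpdateRow update_row) {
  for (int64_t i = 0; i < K; ++i) {
    const SIndex idx = idxs[i];
    if (idx < 0 || idx >= N) {
      continue;  // sketch only; the op checks with DCHECK
    }
    const int64_t x = block_size * idx;
    update_row(block_size, w + x, nz + x * 2);
  }
}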
auto* output = Output(0); - output->Resize(vector()); + output->Resize(vector()); context_.template CopyFromCPU( 1, &learning_rate, Output(0)->template mutable_data()); return true; diff --git a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc index 476930ce4f90..4ac3524d49d8 100644 --- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc +++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc @@ -13,7 +13,7 @@ namespace caffe2 { namespace { void AddNoiseInput( - const vector& shape, + const vector& shape, const string& name, Workspace* ws) { DeviceOption option; @@ -78,10 +78,10 @@ void compare( depthwiseOpDef.add_arg()->CopyFrom(MakeArgument("pad_r", padR)); depthwiseOpDef.add_arg()->CopyFrom(MakeArgument("group", group)); - AddNoiseInput(vector{N, inputC, H, W}, "X", &ws); + AddNoiseInput(vector{N, inputC, H, W}, "X", &ws); AddNoiseInput( - vector{outputC, inputC / group, kernelH, kernelW}, "W", &ws); - AddNoiseInput(vector{outputC}, "B", &ws); + vector{outputC, inputC / group, kernelH, kernelW}, "W", &ws); + AddNoiseInput(vector{outputC}, "B", &ws); unique_ptr depthwiseOp(CreateOperator(depthwiseOpDef, &ws)); EXPECT_NE(nullptr, depthwiseOp.get()); diff --git a/caffe2/share/contrib/nnpack/nnpack_test.cc b/caffe2/share/contrib/nnpack/nnpack_test.cc index ddc451264abc..2f892118982d 100644 --- a/caffe2/share/contrib/nnpack/nnpack_test.cc +++ b/caffe2/share/contrib/nnpack/nnpack_test.cc @@ -13,7 +13,7 @@ namespace caffe2 { namespace { void AddNoiseInput( - const vector& shape, + const vector& shape, const string& name, Workspace* ws) { DeviceOption option; @@ -91,10 +91,10 @@ void compare( nnpackOpDef.add_arg()->CopyFrom(MakeArgument("pad_r", padR)); nnpackOpDef.add_arg()->CopyFrom(MakeArgument("group", group)); - AddNoiseInput(vector{N, inputC, H, W}, "X", &ws); + AddNoiseInput(vector{N, inputC, H, W}, "X", &ws); AddNoiseInput( - vector{outputC, inputC / group, kernelH, kernelW}, "W", &ws); - AddNoiseInput(vector{outputC}, "B", &ws); + vector{outputC, inputC / group, kernelH, kernelW}, "W", &ws); + AddNoiseInput(vector{outputC}, "B", &ws); unique_ptr nnpackOp(CreateOperator(nnpackOpDef, &ws)); EXPECT_NE(nullptr, nnpackOp.get()); diff --git a/caffe2/share/contrib/zstd/quant_decomp_zstd_op.cc b/caffe2/share/contrib/zstd/quant_decomp_zstd_op.cc index 9bc5200750f3..d0dd70c48952 100644 --- a/caffe2/share/contrib/zstd/quant_decomp_zstd_op.cc +++ b/caffe2/share/contrib/zstd/quant_decomp_zstd_op.cc @@ -65,7 +65,7 @@ TensorProtos GetTensorsProto(const TensorCPU& compressed) { // Decompress tensor stored in compressed format // It is compressed using mutils.compress_data_list() void Decompress(const TensorProto& compressed, TensorCPU* outDecomp) { - vector shape(compressed.dims().begin(), compressed.dims().end()); + vector shape(compressed.dims().begin(), compressed.dims().end()); // shape stores the dimensions of data before compression, // see _compress_data_single() in mutils.py outDecomp->Resize(shape); diff --git a/caffe2/utils/filler.h b/caffe2/utils/filler.h index 7016f09a3bab..9739ca26e580 100644 --- a/caffe2/utils/filler.h +++ b/caffe2/utils/filler.h @@ -90,19 +90,19 @@ class TensorFiller { return Min(0).Max(max_segment).Dist(FD_SYNTHETIC); } - TensorFiller& Shape(const std::vector& shape) { + TensorFiller& Shape(const std::vector& shape) { shape_ = shape; return *this; } template - TensorFiller(const std::vector& shape, Type fixed_sum) + TensorFiller(const std::vector& shape, Type fixed_sum) : 
shape_(shape), dist_(FD_FIXEDSUM), fixed_sum_((double)fixed_sum) {} - TensorFiller(const std::vector& shape) + TensorFiller(const std::vector& shape) : shape_(shape), dist_(FD_UNIFORM), fixed_sum_(0) {} - TensorFiller() : TensorFiller(std::vector()) {} + TensorFiller() : TensorFiller(std::vector()) {} std::string DebugString() const { std::stringstream stream; @@ -123,7 +123,7 @@ class TensorFiller { } private: - std::vector shape_; + std::vector shape_; // TODO: type is unknown until a user starts to fill data; // cast everything to double for now. double min_ = 0.0; diff --git a/caffe2/utils/hip/math_hip.cc b/caffe2/utils/hip/math_hip.cc index 89fbe0193e3e..0a285480388d 100644 --- a/caffe2/utils/hip/math_hip.cc +++ b/caffe2/utils/hip/math_hip.cc @@ -714,8 +714,8 @@ DEFINE_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) cub::DeviceReduce::func( \ nullptr, memRequired, src, dst, N, context->hip_stream()); \ auto buffer_size = \ - static_cast((memRequired + sizeof(T) - 1) / sizeof(T)); \ - scratch_ptr->Resize(std::vector{buffer_size}); \ + static_cast((memRequired + sizeof(T) - 1) / sizeof(T)); \ + scratch_ptr->Resize(std::vector{buffer_size}); \ cub::DeviceReduce::func( \ static_cast(scratch_ptr->mutable_data()), \ memRequired, \ @@ -1485,13 +1485,13 @@ void SumGenericIter( cub::DeviceReduce::Sum( nullptr, memRequired, it, dest, N, context->hip_stream()); auto buffer_size = - static_cast((memRequired + sizeof(T) - 1) / sizeof(T)); + static_cast((memRequired + sizeof(T) - 1) / sizeof(T)); if (!dest) { // allocate one more T at the end of scratch for dest - scratch_ptr->Resize(std::vector{buffer_size + 1}); + scratch_ptr->Resize(std::vector{buffer_size + 1}); dest = scratch_ptr->template mutable_data() + buffer_size; } else { - scratch_ptr->Resize(std::vector{buffer_size}); + scratch_ptr->Resize(std::vector{buffer_size}); } cub::DeviceReduce::Sum( static_cast(scratch_ptr->template mutable_data()), @@ -3473,7 +3473,7 @@ void TransposeHIPImpl( CAFFE2_SPECIALIZED_HIP_TRANSPOSE(float) CAFFE2_SPECIALIZED_HIP_TRANSPOSE(double) CAFFE2_SPECIALIZED_HIP_TRANSPOSE(int) -CAFFE2_SPECIALIZED_HIP_TRANSPOSE(TIndex) +CAFFE2_SPECIALIZED_HIP_TRANSPOSE(int64_t) #undef CAFFE2_SPECIALIZED_HIP_TRANSPOSE namespace { diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index c54226af68ed..18e20e4fa414 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -2747,7 +2747,7 @@ CAFFE2_SPECIALIZED_COPY_MATRIX(double) #endif // CAFFE2_USE_MKL CAFFE2_SPECIALIZED_COPY_MATRIX(int) -CAFFE2_SPECIALIZED_COPY_MATRIX(TIndex) +CAFFE2_SPECIALIZED_COPY_MATRIX(int64_t) #ifdef CAFFE2_UNIQUE_LONG_TYPEMETA CAFFE2_SPECIALIZED_COPY_MATRIX(long) #endif @@ -3522,7 +3522,7 @@ CAFFE2_SPECIALIZED_TRANSPOSE_2D(double) #endif // CAFFE2_USE_MKL CAFFE2_SPECIALIZED_TRANSPOSE_2D(int) -CAFFE2_SPECIALIZED_TRANSPOSE_2D(TIndex) +CAFFE2_SPECIALIZED_TRANSPOSE_2D(int64_t) #ifdef CAFFE2_UNIQUE_LONG_TYPEMETA CAFFE2_SPECIALIZED_TRANSPOSE_2D(long) #endif @@ -3645,7 +3645,7 @@ void TransposeCPUImpl( CAFFE2_SPECIALIZED_TRANSPOSE(float) CAFFE2_SPECIALIZED_TRANSPOSE(double) CAFFE2_SPECIALIZED_TRANSPOSE(int) -CAFFE2_SPECIALIZED_TRANSPOSE(TIndex) +CAFFE2_SPECIALIZED_TRANSPOSE(int64_t) #ifdef CAFFE2_UNIQUE_LONG_TYPEMETA CAFFE2_SPECIALIZED_TRANSPOSE(long) #endif diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index 33d798417b1e..f9b113980737 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -648,8 +648,8 @@ DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) 
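The SumGenericIter and reduction macros above size their scratch tensor by rounding the byte count reported by cub up to whole elements of T, now kept as an int64_t, and reserve one extra element when the reduction result itself has to live in the scratch. A small sketch of that arithmetic; the helper is illustrative, not part of the math library.

#include <cstddef>
#include <cstdint>

// Elements of T needed to hold memRequired bytes, plus one slot for the
// result when no separate destination buffer is provided.
template <typename T>
int64_t ScratchElements(size_t memRequired, bool needs_dest_slot) {
  int64_t buffer_size =
      static_cast<int64_t>((memRequired + sizeof(T) - 1) / sizeof(T));
  return needs_dest_slot ? buffer_size + 1 : buffer_size;
}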
cub::DeviceReduce::func( \ nullptr, memRequired, src, dst, N, context->cuda_stream()); \ auto buffer_size = \ - static_cast((memRequired + sizeof(T) - 1) / sizeof(T)); \ - scratch_ptr->Resize(std::vector{buffer_size}); \ + static_cast((memRequired + sizeof(T) - 1) / sizeof(T)); \ + scratch_ptr->Resize(std::vector{buffer_size}); \ cub::DeviceReduce::func( \ static_cast(scratch_ptr->mutable_data()), \ memRequired, \ @@ -1770,13 +1770,13 @@ CAFFE2_CUDA_EXPORT void SumGenericIter( cub::DeviceReduce::Sum( nullptr, memRequired, it, dest, N, context->cuda_stream()); auto buffer_size = - static_cast((memRequired + sizeof(T) - 1) / sizeof(T)); + static_cast((memRequired + sizeof(T) - 1) / sizeof(T)); if (!dest) { // allocate one more T at the end of scratch for dest - scratch_ptr->Resize(std::vector{buffer_size + 1}); + scratch_ptr->Resize(std::vector{buffer_size + 1}); dest = scratch_ptr->template mutable_data() + buffer_size; } else { - scratch_ptr->Resize(std::vector{buffer_size}); + scratch_ptr->Resize(std::vector{buffer_size}); } cub::DeviceReduce::Sum( static_cast(scratch_ptr->template mutable_data()), @@ -3078,7 +3078,7 @@ CAFFE2_CUDA_EXPORT void CopyMatrix( CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX(float) CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX(double) CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX(int) -CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX(TIndex) +CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX(int64_t) #undef CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX template <> @@ -3905,7 +3905,7 @@ CAFFE2_CUDA_EXPORT void TransposeCUDAImpl( CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(float) CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(double) CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(int) -CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(TIndex) +CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(int64_t) #undef CAFFE2_SPECIALIZED_CUDA_TRANSPOSE namespace { diff --git a/caffe2/utils/math_gpu_test.cc b/caffe2/utils/math_gpu_test.cc index b9f09706f986..9be1c3db6c1d 100644 --- a/caffe2/utils/math_gpu_test.cc +++ b/caffe2/utils/math_gpu_test.cc @@ -261,9 +261,9 @@ class GemmBatchedGPUTest X_ = X_blob->GetMutableTensor(CUDA); W_ = W_blob->GetMutableTensor(CUDA); Y_ = Y_blob->GetMutableTensor(CUDA); - X_->Resize(std::vector{3, 5, 10}); - W_->Resize(std::vector{3, 6, 10}); - Y_->Resize(std::vector{3, 5, 6}); + X_->Resize(std::vector{3, 5, 10}); + W_->Resize(std::vector{3, 6, 10}); + Y_->Resize(std::vector{3, 5, 6}); math::Set( X_->size(), 1.0f, X_->mutable_data(), cuda_context_.get()); math::Set( diff --git a/caffe2/utils/math_test.cc b/caffe2/utils/math_test.cc index 7b210fd57337..241d19dec424 100644 --- a/caffe2/utils/math_test.cc +++ b/caffe2/utils/math_test.cc @@ -171,9 +171,9 @@ class GemmBatchedTest protected: void SetUp() override { cpu_context_ = make_unique(option_); - X_.Resize(std::vector{3, 5, 10}); - W_.Resize(std::vector{3, 6, 10}); - Y_.Resize(std::vector{3, 5, 6}); + X_.Resize(std::vector{3, 5, 10}); + W_.Resize(std::vector{3, 6, 10}); + Y_.Resize(std::vector{3, 5, 6}); math::Set( X_.size(), 1, X_.mutable_data(), cpu_context_.get()); math::Set( diff --git a/caffe2/utils/smart_tensor_printer_test.cc b/caffe2/utils/smart_tensor_printer_test.cc index e207f7c7b052..651b6ad78adc 100644 --- a/caffe2/utils/smart_tensor_printer_test.cc +++ b/caffe2/utils/smart_tensor_printer_test.cc @@ -31,7 +31,7 @@ void printTensorAndCheck(const std::vector& values) { CPUContext cpuContext; Tensor tensor( - std::vector{static_cast(values.size())}, + std::vector{static_cast(values.size())}, values, &cpuContext); diff --git a/caffe2/video/video_input_op.h b/caffe2/video/video_input_op.h index 
3034e1bd4adb..e58ece491e22 100644 --- a/caffe2/video/video_input_op.h +++ b/caffe2/video/video_input_op.h @@ -462,8 +462,8 @@ VideoInputOp::VideoInputOp( CAFFE_ENFORCE_GT( operator_def.input_size(), 0, "Need to have a DBReader blob input"); - vector data_shape(5); - vector label_shape(2); + vector data_shape(5); + vector label_shape(2); // for RGB data data_shape[0] = batch_size_ * clip_per_video_ * multi_crop_count_; @@ -486,11 +486,11 @@ VideoInputOp::VideoInputOp( prefetched_label_.Resize(label_shape); } else { prefetched_label_.Resize( - vector(1, batch_size_ * clip_per_video_ * multi_crop_count_)); + vector(1, batch_size_ * clip_per_video_ * multi_crop_count_)); } prefetched_video_id_.Resize( - vector(1, batch_size_ * clip_per_video_ * multi_crop_count_)); + vector(1, batch_size_ * clip_per_video_ * multi_crop_count_)); } template diff --git a/modules/detectron/sample_as_op.cu b/modules/detectron/sample_as_op.cu index 43ebaa27405c..910d1ba3eb5e 100644 --- a/modules/detectron/sample_as_op.cu +++ b/modules/detectron/sample_as_op.cu @@ -58,7 +58,7 @@ bool SampleAsOp::RunOnDevice() { assert(count > 0); // resize Y - vector out_shape(X.dims()); + vector out_shape(X.dims()); out_shape[0] = count; Y->Resize(out_shape); diff --git a/modules/detectron/select_smooth_l1_loss_op.cu b/modules/detectron/select_smooth_l1_loss_op.cu index 98dc0bc4fa87..259f89297f10 100644 --- a/modules/detectron/select_smooth_l1_loss_op.cu +++ b/modules/detectron/select_smooth_l1_loss_op.cu @@ -99,7 +99,7 @@ bool SelectSmoothL1LossOp::RunOnDevice() { auto& S = Input(3); auto* avg_loss = Output(0); - avg_loss->Resize(vector()); + avg_loss->Resize(vector()); if (Y.size() == 0){ math::Set( 1, static_cast(0), avg_loss->mutable_data(), &context_); diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.cu b/modules/detectron/sigmoid_cross_entropy_loss_op.cu index eb3bd9718191..a8b639058ce1 100644 --- a/modules/detectron/sigmoid_cross_entropy_loss_op.cu +++ b/modules/detectron/sigmoid_cross_entropy_loss_op.cu @@ -79,10 +79,10 @@ bool SigmoidCrossEntropyLossOp::RunOnDevice() { " vs. 
", T.size(), ")"); - avg_loss->Resize(vector()); + avg_loss->Resize(vector()); counts_.ResizeLike(X); losses_.ResizeLike(X); - normalizer_.Resize(vector()); + normalizer_.Resize(vector()); SigmoidCrossEntropyLossKernel<<< CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS, @@ -124,7 +124,7 @@ bool SigmoidCrossEntropyLossGradientOp::RunOnDevice() { dX->ResizeLike(X); counts_.ResizeLike(X); - normalizer_.Resize(vector()); + normalizer_.Resize(vector()); SigmoidCrossEntropyLossGradientKernel<<< CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS, diff --git a/modules/detectron/sigmoid_focal_loss_op.cu b/modules/detectron/sigmoid_focal_loss_op.cu index 0b7b4011ba07..2630cf37b10c 100644 --- a/modules/detectron/sigmoid_focal_loss_op.cu +++ b/modules/detectron/sigmoid_focal_loss_op.cu @@ -125,7 +125,7 @@ bool SigmoidFocalLossOp::RunOnDevice() { int H = X.dim32(2); int W = X.dim32(3); - avg_loss->Resize(vector()); + avg_loss->Resize(vector()); losses_.ResizeLike(X); float* avg_loss_data = avg_loss->mutable_data(); diff --git a/modules/detectron/smooth_l1_loss_op.cu b/modules/detectron/smooth_l1_loss_op.cu index 7ded2802dd3a..30aadc5f4534 100644 --- a/modules/detectron/smooth_l1_loss_op.cu +++ b/modules/detectron/smooth_l1_loss_op.cu @@ -78,7 +78,7 @@ bool SmoothL1LossOp::RunOnDevice() { CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size()); CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size()); - avg_loss->Resize(vector()); + avg_loss->Resize(vector()); buff_.ResizeLike(Y); // Difference diff --git a/modules/detectron/softmax_focal_loss_op.cu b/modules/detectron/softmax_focal_loss_op.cu index 8b6d1dd178f6..72b24aeb9f14 100644 --- a/modules/detectron/softmax_focal_loss_op.cu +++ b/modules/detectron/softmax_focal_loss_op.cu @@ -158,7 +158,7 @@ bool SoftmaxFocalLossOp::RunOnDevice() { losses_.Resize(N * A * H * W); P->Resize(N * D * H * W); - avg_loss->Resize(vector()); + avg_loss->Resize(vector()); math::Set( avg_loss->size(), 0.f, avg_loss->mutable_data(), &context_); math::Set( diff --git a/modules/detectron/upsample_nearest_op.cu b/modules/detectron/upsample_nearest_op.cu index 2afff9719fa8..870f0508dd9a 100644 --- a/modules/detectron/upsample_nearest_op.cu +++ b/modules/detectron/upsample_nearest_op.cu @@ -125,7 +125,7 @@ bool UpsampleNearestOp::RunOnDevice() { auto& X = Input(0); auto* Y = Output(0); - vector out_shape; + vector out_shape; for (int i = 0; i < X.ndim(); ++i) { out_shape.push_back(X.dim32(i)); }