[TGIF][Inplace][Perf] Copy tensor to device with pinned memory & move copy weight sleep to getRecord (#106849)

Summary:
There are two changes in this diff that help optimize perf during inplace update:
1. Read data into pinned memory, so the host-to-device copy avoids an extra staging copy (see the sketch after this list).
2. Move the copy-weight sleep from between copies of whole tensors to between copies of individual chunks, so the throttling happens at a finer granularity.
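
For reference, a minimal sketch of how a caller can pair the new `buf` argument of `PyTorchStreamReader::getRecord` with pinned memory; the CUDA calls, stream handling, and the `readRecordToDevice` helper are illustrative assumptions, not code from this diff:

```
#include <cuda_runtime.h>

#include <caffe2/serialize/inline_container.h>

// Stage each chunk through a pinned host buffer so the H2D copy can DMA
// directly instead of going through a driver-side staging copy.
void readRecordToDevice(
    caffe2::serialize::PyTorchStreamReader& reader,
    const std::string& name,
    void* device_dst,
    size_t n,
    size_t chunk_size,
    cudaStream_t stream) {
  void* pinned_buf = nullptr;
  cudaHostAlloc(&pinned_buf, chunk_size, cudaHostAllocDefault);
  reader.getRecord(
      name,
      device_dst,
      n,
      chunk_size,
      pinned_buf,
      [stream](void* dst, const void* src, size_t size) {
        cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream);
        // The buffer is reused for the next chunk, so wait for the copy.
        cudaStreamSynchronize(stream);
      });
  cudaFreeHost(pinned_buf);
}
```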

Test Plan:
**Local Test**
```
./ai_infra/inference_platform/test_platform/script/run_sigrid_4card.sh --port 7451 --local_model_dir /home/lujia/script --cuda_devices 6 --bind_node 3 --model_id 962549778_514 --gflag_config_path sigrid/predictor/predictor_x_gflags_mrs_prospector_gpu_torchscript_fusedsolution_1card_opt_fm -- --enable_thrift_warmup=false --tgif_replicate_merge_by_tempfile=false --enable_inplace_snapshot_transition --model_version_config_path sigrid/predictor/models_version/lujia_test --inplace_update_max_retries 0 --submod_to_device="merge|cuda0"
```

**Load test on job tsp_eag/smart/inference_platform_sp__sigrid_predictor_gpu_adhoc_realtimetest_m962549778_latest.s3**

Before:
(p99 latency)
{F1066957232}

(SR error rate)
{F1066957650}

After:
(p99 latency)
{F1066957141}

(SR error rate)
{F1066957376}

Differential Revision: D48182533

Pull Request resolved: https://github.com/pytorch/pytorch/pull/106849
Approved by: https://github.com/842974287, https://github.com/kit1980
Author: Lujia Zhang
Date: 2023-08-13 07:37:46 +00:00
Committed by: PyTorch MergeBot
Parent: ddd2f682b9
Commit: b897c57d47

3 changed files with 16 additions and 6 deletions

caffe2/serialize/inline_container.cc

@@ -9,7 +9,6 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <c10/core/Allocator.h>
 #include <c10/core/CPUAllocator.h>
 #include <c10/core/Backend.h>
@@ -346,6 +345,7 @@ size_t PyTorchStreamReader::getRecord(
     void* dst,
     size_t n,
     size_t chunk_size,
+    void* buf,
     const std::function<void(void*, const void*, size_t)>& memcpy_func) {
   std::lock_guard<std::mutex> guard(reader_lock_);
   if ((!load_debug_symbol_) && c10::string_view(name).ends_with(kDebugPklSuffix)) {
@@ -368,17 +368,17 @@ size_t PyTorchStreamReader::getRecord(
       iter != nullptr,
       "Failed to create zip reader iter: ",
       mz_zip_get_error_string(mz_zip_get_last_error(ar_.get())));
-  std::vector<uint8_t> buf(chunk_size);
   for (size_t offset = 0; offset < stat.m_uncomp_size; offset += chunk_size) {
     size_t want_size =
         std::min(chunk_size, (size_t)stat.m_uncomp_size - offset);
     size_t read_size =
-        mz_zip_reader_extract_iter_read(iter, buf.data(), want_size);
+        mz_zip_reader_extract_iter_read(iter, buf, want_size);
     TORCH_CHECK(
         read_size == want_size,
         "Failed to advance zip reader iter: ",
         mz_zip_get_error_string(mz_zip_get_last_error(ar_.get())));
-    memcpy_func((char*)dst + offset, buf.data(), read_size);
+    memcpy_func((char*)dst + offset, buf, read_size);
   }
   valid("reading file ", name.c_str());
   mz_zip_reader_extract_iter_free(iter);

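The chunked read loop above is also the natural place for the per-chunk sleep described in change 2. A self-contained sketch of that loop shape, where `copyInChunks`, `read_fn`, and the sleep parameter are hypothetical stand-ins rather than the actual TGIF implementation:

```
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <functional>
#include <thread>

// Drain a record chunk_size bytes at a time through a staging buffer,
// optionally sleeping between chunks so a large weight copy yields to
// serving traffic instead of pausing only once per tensor.
void copyInChunks(
    void* dst,
    size_t total_size,
    size_t chunk_size,
    void* buf,
    const std::function<size_t(void*, size_t)>& read_fn,
    const std::function<void(void*, const void*, size_t)>& memcpy_func,
    std::chrono::milliseconds sleep_between_chunks) {
  for (size_t offset = 0; offset < total_size; offset += chunk_size) {
    size_t want = std::min(chunk_size, total_size - offset);
    read_fn(buf, want); // stands in for mz_zip_reader_extract_iter_read
    memcpy_func(static_cast<char*>(dst) + offset, buf, want);
    // Yield between chunks, but not after the final one.
    if (sleep_between_chunks.count() > 0 && offset + chunk_size < total_size) {
      std::this_thread::sleep_for(sleep_between_chunks);
    }
  }
}
```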
caffe2/serialize/inline_container.h

@@ -110,6 +110,7 @@ class TORCH_API PyTorchStreamReader final {
       void* dst,
       size_t n,
       size_t chunk_size,
+      void* buf,
       const std::function<void(void*, const void*, size_t)>& memcpy_func);
   size_t getRecordOffset(const std::string& name);
   bool hasRecord(const std::string& name);

caffe2/serialize/inline_container_test.cc

@@ -23,6 +23,8 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) {
   });
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers)
   std::array<char, 127> data1;
+  // Inplace memory buffer
+  std::vector<uint8_t> buf(data1.size());
   for (auto i : c10::irange(data1.size())) {
     data1[i] = data1.size() - i;
@@ -74,7 +76,7 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) {
   ASSERT_EQ(memcmp(dst.data(), data1.data(), size), 0);
   // chunked getRecord() test
   ret = reader.getRecord(
-      "key1", dst.data(), size, 3, [](void* dst, const void* src, size_t n) {
+      "key1", dst.data(), size, 3, buf.data(), [](void* dst, const void* src, size_t n) {
         memcpy(dst, src, n);
       });
   ASSERT_EQ(ret, size);
@@ -94,7 +96,7 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) {
   ASSERT_EQ(memcmp(dst.data(), data2.data(), size), 0);
   // chunked getRecord() test
   ret = reader.getRecord(
-      "key2", dst.data(), size, 3, [](void* dst, const void* src, size_t n) {
+      "key2", dst.data(), size, 3, buf.data(), [](void* dst, const void* src, size_t n) {
         memcpy(dst, src, n);
       });
   ASSERT_EQ(ret, size);
@@ -113,6 +115,9 @@ TEST(PytorchStreamWriterAndReader, GetNonexistentRecordThrows) {
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers)
   std::array<char, 127> data1;
+  // Inplace memory buffer
+  std::vector<uint8_t> buf;
   for (auto i : c10::irange(data1.size())) {
     data1[i] = data1.size() - i;
   }
@@ -154,6 +159,7 @@ TEST(PytorchStreamWriterAndReader, GetNonexistentRecordThrows) {
       dst.data(),
       data1.size(),
       3,
+      buf.data(),
       [](void* dst, const void* src, size_t n) { memcpy(dst, src, n); }),
       c10::Error);
@@ -171,6 +177,8 @@ TEST(PytorchStreamWriterAndReader, SkipDebugRecords) {
   });
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers)
   std::array<char, 127> data1;
+  // Inplace memory buffer
+  std::vector<uint8_t> buf(data1.size());
   for (auto i : c10::irange(data1.size())) {
     data1[i] = data1.size() - i;
@@ -218,6 +226,7 @@ TEST(PytorchStreamWriterAndReader, SkipDebugRecords) {
       dst.data(),
       data1.size(),
       3,
+      buf.data(),
       [](void* dst, const void* src, size_t n) { memcpy(dst, src, n); });
   EXPECT_EQ(ret, 0);
   // clean up