[TGIF][Inplace][Perf] Copy tensor to device with pinned memory & move copy weight sleep to getRecord (#106849)

Summary:
There are two changes in this diff that help optimize perf during inplace update:
1. Read data into pinned memory, so the host-to-device copy avoids an extra staging copy (see the sketch after this list).
2. Move the copy-weight sleep from between copies of whole tensors to between copies of individual chunks, so the throttling happens at a finer granularity.
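
For reference, a minimal sketch of how a caller can pair the new `buf` argument of `PyTorchStreamReader::getRecord` with pinned memory; the CUDA calls, stream handling, and the `readRecordToDevice` helper are illustrative assumptions, not code from this diff:

```
#include <cuda_runtime.h>

#include <caffe2/serialize/inline_container.h>

// Stage each chunk through a pinned host buffer so the H2D copy can DMA
// directly instead of going through a driver-side staging copy.
void readRecordToDevice(
    caffe2::serialize::PyTorchStreamReader& reader,
    const std::string& name,
    void* device_dst,
    size_t n,
    size_t chunk_size,
    cudaStream_t stream) {
  void* pinned_buf = nullptr;
  cudaHostAlloc(&pinned_buf, chunk_size, cudaHostAllocDefault);
  reader.getRecord(
      name,
      device_dst,
      n,
      chunk_size,
      pinned_buf,
      [stream](void* dst, const void* src, size_t size) {
        cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream);
        // The buffer is reused for the next chunk, so wait for the copy.
        cudaStreamSynchronize(stream);
      });
  cudaFreeHost(pinned_buf);
}
```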

Test Plan:
**Local Test**
```
./ai_infra/inference_platform/test_platform/script/run_sigrid_4card.sh --port 7451 --local_model_dir /home/lujia/script --cuda_devices 6 --bind_node 3 --model_id 962549778_514 --gflag_config_path sigrid/predictor/predictor_x_gflags_mrs_prospector_gpu_torchscript_fusedsolution_1card_opt_fm -- --enable_thrift_warmup=false --tgif_replicate_merge_by_tempfile=false --enable_inplace_snapshot_transition --model_version_config_path sigrid/predictor/models_version/lujia_test --inplace_update_max_retries 0 --submod_to_device="merge|cuda0"
```

**Load test on job tsp_eag/smart/inference_platform_sp__sigrid_predictor_gpu_adhoc_realtimetest_m962549778_latest.s3**

Before:
(p99 latency)
{F1066957232}

(SR error rate)
{F1066957650}

After:
(p99 latency)
{F1066957141}

(SR error rate)
{F1066957376}

Differential Revision: D48182533

Pull Request resolved: https://github.com/pytorch/pytorch/pull/106849
Approved by: https://github.com/842974287, https://github.com/kit1980
Author: Lujia Zhang
Date: 2023-08-13 07:37:46 +00:00
Committed by: PyTorch MergeBot
Parent: ddd2f682b9
Commit: b897c57d47

3 changed files with 16 additions and 6 deletions

caffe2/serialize/inline_container.cc

@@ -9,7 +9,6 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <c10/core/Allocator.h>
 #include <c10/core/CPUAllocator.h>
 #include <c10/core/Backend.h>
@@ -346,6 +345,7 @@ size_t PyTorchStreamReader::getRecord(
     void* dst,
     size_t n,
     size_t chunk_size,
+    void* buf,
     const std::function<void(void*, const void*, size_t)>& memcpy_func) {
   std::lock_guard<std::mutex> guard(reader_lock_);
   if ((!load_debug_symbol_) && c10::string_view(name).ends_with(kDebugPklSuffix)) {
@@ -368,17 +368,17 @@ size_t PyTorchStreamReader::getRecord(
       iter != nullptr,
       "Failed to create zip reader iter: ",
       mz_zip_get_error_string(mz_zip_get_last_error(ar_.get())));
-  std::vector<uint8_t> buf(chunk_size);
   for (size_t offset = 0; offset < stat.m_uncomp_size; offset += chunk_size) {
     size_t want_size =
         std::min(chunk_size, (size_t)stat.m_uncomp_size - offset);
     size_t read_size =
-        mz_zip_reader_extract_iter_read(iter, buf.data(), want_size);
+        mz_zip_reader_extract_iter_read(iter, buf, want_size);
     TORCH_CHECK(
         read_size == want_size,
         "Failed to advance zip reader iter: ",
         mz_zip_get_error_string(mz_zip_get_last_error(ar_.get())));
-    memcpy_func((char*)dst + offset, buf.data(), read_size);
+    memcpy_func((char*)dst + offset, buf, read_size);
   }
   valid("reading file ", name.c_str());
   mz_zip_reader_extract_iter_free(iter);

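The chunked read loop above is also the natural place for the per-chunk sleep described in change 2. A self-contained sketch of that loop shape, where `copyInChunks`, `read_fn`, and the sleep parameter are hypothetical stand-ins rather than the actual TGIF implementation:

```
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <functional>
#include <thread>

// Drain a record chunk_size bytes at a time through a staging buffer,
// optionally sleeping between chunks so a large weight copy yields to
// serving traffic instead of pausing only once per tensor.
void copyInChunks(
    void* dst,
    size_t total_size,
    size_t chunk_size,
    void* buf,
    const std::function<size_t(void*, size_t)>& read_fn,
    const std::function<void(void*, const void*, size_t)>& memcpy_func,
    std::chrono::milliseconds sleep_between_chunks) {
  for (size_t offset = 0; offset < total_size; offset += chunk_size) {
    size_t want = std::min(chunk_size, total_size - offset);
    read_fn(buf, want); // stands in for mz_zip_reader_extract_iter_read
    memcpy_func(static_cast<char*>(dst) + offset, buf, want);
    // Yield between chunks, but not after the final one.
    if (sleep_between_chunks.count() > 0 && offset + chunk_size < total_size) {
      std::this_thread::sleep_for(sleep_between_chunks);
    }
  }
}
```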
caffe2/serialize/inline_container.h

@@ -110,6 +110,7 @@ class TORCH_API PyTorchStreamReader final {
       void* dst,
       size_t n,
       size_t chunk_size,
+      void* buf,
       const std::function<void(void*, const void*, size_t)>& memcpy_func);
   size_t getRecordOffset(const std::string& name);
   bool hasRecord(const std::string& name);

caffe2/serialize/inline_container_test.cc

@@ -23,6 +23,8 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) {
   });
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers)
   std::array<char, 127> data1;
+  // Inplace memory buffer
+  std::vector<uint8_t> buf(data1.size());
   for (auto i : c10::irange(data1.size())) {
     data1[i] = data1.size() - i;
@@ -74,7 +76,7 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) {
   ASSERT_EQ(memcmp(dst.data(), data1.data(), size), 0);
   // chunked getRecord() test
   ret = reader.getRecord(
-      "key1", dst.data(), size, 3, [](void* dst, const void* src, size_t n) {
+      "key1", dst.data(), size, 3, buf.data(), [](void* dst, const void* src, size_t n) {
         memcpy(dst, src, n);
       });
   ASSERT_EQ(ret, size);
@@ -94,7 +96,7 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) {
   ASSERT_EQ(memcmp(dst.data(), data2.data(), size), 0);
   // chunked getRecord() test
   ret = reader.getRecord(
-      "key2", dst.data(), size, 3, [](void* dst, const void* src, size_t n) {
+      "key2", dst.data(), size, 3, buf.data(), [](void* dst, const void* src, size_t n) {
         memcpy(dst, src, n);
       });
   ASSERT_EQ(ret, size);
@@ -113,6 +115,9 @@ TEST(PytorchStreamWriterAndReader, GetNonexistentRecordThrows) {
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers)
   std::array<char, 127> data1;
+  // Inplace memory buffer
+  std::vector<uint8_t> buf;
   for (auto i : c10::irange(data1.size())) {
     data1[i] = data1.size() - i;
   }
@@ -154,6 +159,7 @@ TEST(PytorchStreamWriterAndReader, GetNonexistentRecordThrows) {
       dst.data(),
       data1.size(),
       3,
+      buf.data(),
       [](void* dst, const void* src, size_t n) { memcpy(dst, src, n); }),
       c10::Error);
@@ -171,6 +177,8 @@ TEST(PytorchStreamWriterAndReader, SkipDebugRecords) {
   });
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers)
   std::array<char, 127> data1;
+  // Inplace memory buffer
+  std::vector<uint8_t> buf(data1.size());
   for (auto i : c10::irange(data1.size())) {
     data1[i] = data1.size() - i;
@@ -218,6 +226,7 @@ TEST(PytorchStreamWriterAndReader, SkipDebugRecords) {
       dst.data(),
       data1.size(),
       3,
+      buf.data(),
       [](void* dst, const void* src, size_t n) { memcpy(dst, src, n); });
   EXPECT_EQ(ret, 0);
   // clean up