[TGIF][Inplace][Perf] Copy tensor to device with pinned memory & move copy weight sleep to getRecord (#106849)
Summary: Two changes in this diff help optimize perf during inplace update:
1. Read data into a pinned-memory buffer.
2. Move the copy-weight sleep from between copying whole tensors to between copying chunks.

Test Plan:

**Local test**

```
./ai_infra/inference_platform/test_platform/script/run_sigrid_4card.sh --port 7451 --local_model_dir /home/lujia/script --cuda_devices 6 --bind_node 3 --model_id 962549778_514 --gflag_config_path sigrid/predictor/predictor_x_gflags_mrs_prospector_gpu_torchscript_fusedsolution_1card_opt_fm -- --enable_thrift_warmup=false --tgif_replicate_merge_by_tempfile=false --enable_inplace_snapshot_transition --model_version_config_path sigrid/predictor/models_version/lujia_test --inplace_update_max_retries 0 --submod_to_device="merge|cuda0"
```

**Load test on job tsp_eag/smart/inference_platform_sp__sigrid_predictor_gpu_adhoc_realtimetest_m962549778_latest.s3**

Before: p99 latency {F1066957232}, SR error rate {F1066957650}
After: p99 latency {F1066957141}, SR error rate {F1066957376}

Differential Revision: D48182533

Pull Request resolved: https://github.com/pytorch/pytorch/pull/106849
Approved by: https://github.com/842974287, https://github.com/kit1980
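A minimal caller-side sketch of how the two changes combine: the caller hands `getRecord` a pinned staging buffer and a `memcpy_func` that copies each chunk to the device and then sleeps, so the throttle lands between chunk copies. The helper name and sleep interval below are illustrative assumptions (and error checking is omitted); the `getRecord` argument order matches the updated tests in this diff.

```cpp
#include <chrono>
#include <cstddef>
#include <string>
#include <thread>

#include <cuda_runtime.h>

#include "caffe2/serialize/inline_container.h"

// Stage each chunk of a serialized tensor through a pinned (page-locked)
// host buffer, then copy it to the device and briefly sleep, so the
// copy-weight sleep happens between chunk copies rather than between tensors.
size_t copyRecordToDevice(
    caffe2::serialize::PyTorchStreamReader& reader,
    const std::string& name,
    void* device_dst,
    size_t n,
    size_t chunk_size) {
  void* pinned = nullptr;
  // Pinned memory lets cudaMemcpy DMA directly from the staging buffer.
  cudaMallocHost(&pinned, chunk_size);
  size_t read = reader.getRecord(
      name,
      device_dst,
      n,
      chunk_size,
      pinned, // getRecord extracts each chunk into this buffer
      [](void* dst, const void* src, size_t len) {
        cudaMemcpy(dst, src, len, cudaMemcpyHostToDevice);
        // Throttle between chunks; 1 ms is an illustrative value.
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
      });
  cudaFreeHost(pinned);
  return read;
}
```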
Committed by: PyTorch MergeBot
Parent: ddd2f682b9
Commit: b897c57d47
caffe2/serialize/inline_container.cc:

```diff
@@ -9,7 +9,6 @@
 #include <sys/stat.h>
 #include <sys/types.h>
-
 
 #include <c10/core/Allocator.h>
 #include <c10/core/CPUAllocator.h>
 #include <c10/core/Backend.h>
@@ -346,6 +345,7 @@ size_t PyTorchStreamReader::getRecord(
     void* dst,
     size_t n,
     size_t chunk_size,
+    void* buf,
     const std::function<void(void*, const void*, size_t)>& memcpy_func) {
   std::lock_guard<std::mutex> guard(reader_lock_);
   if ((!load_debug_symbol_) && c10::string_view(name).ends_with(kDebugPklSuffix)) {
@@ -368,17 +368,17 @@ size_t PyTorchStreamReader::getRecord(
       iter != nullptr,
       "Failed to create zip reader iter: ",
       mz_zip_get_error_string(mz_zip_get_last_error(ar_.get())));
-  std::vector<uint8_t> buf(chunk_size);
+
   for (size_t offset = 0; offset < stat.m_uncomp_size; offset += chunk_size) {
     size_t want_size =
         std::min(chunk_size, (size_t)stat.m_uncomp_size - offset);
     size_t read_size =
-        mz_zip_reader_extract_iter_read(iter, buf.data(), want_size);
+        mz_zip_reader_extract_iter_read(iter, buf, want_size);
     TORCH_CHECK(
         read_size == want_size,
         "Failed to advance zip reader iter: ",
         mz_zip_get_error_string(mz_zip_get_last_error(ar_.get())));
-    memcpy_func((char*)dst + offset, buf.data(), read_size);
+    memcpy_func((char*)dst + offset, buf, read_size);
   }
   valid("reading file ", name.c_str());
   mz_zip_reader_extract_iter_free(iter);
```
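Moving buffer ownership out of `getRecord` is what enables the pinned-memory read: the old local `std::vector<uint8_t>` was pageable host memory, so a CUDA copy from it would be staged through a driver-internal bounce buffer, whereas a caller-supplied page-locked buffer can be DMA'd to the device directly and reused across records.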
caffe2/serialize/inline_container.h:

```diff
@@ -110,6 +110,7 @@ class TORCH_API PyTorchStreamReader final {
       void* dst,
       size_t n,
       size_t chunk_size,
+      void* buf,
       const std::function<void(void*, const void*, size_t)>& memcpy_func);
   size_t getRecordOffset(const std::string& name);
   bool hasRecord(const std::string& name);
```
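Callers of the chunked overload now supply the staging buffer themselves; for plain host-to-host copies a `std::vector<uint8_t> buf(chunk_size)` passed as `buf.data()` suffices, which is exactly what the updated tests below do.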
caffe2/serialize/inline_container_test.cc:

```diff
@@ -23,6 +23,8 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) {
   });
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers)
   std::array<char, 127> data1;
+  // Inplace memory buffer
+  std::vector<uint8_t> buf(data1.size());
 
   for (auto i : c10::irange(data1.size())) {
     data1[i] = data1.size() - i;
@@ -74,7 +76,7 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) {
   ASSERT_EQ(memcmp(dst.data(), data1.data(), size), 0);
   // chunked getRecord() test
   ret = reader.getRecord(
-      "key1", dst.data(), size, 3, [](void* dst, const void* src, size_t n) {
+      "key1", dst.data(), size, 3, buf.data(), [](void* dst, const void* src, size_t n) {
        memcpy(dst, src, n);
      });
   ASSERT_EQ(ret, size);
@@ -94,7 +96,7 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) {
   ASSERT_EQ(memcmp(dst.data(), data2.data(), size), 0);
   // chunked getRecord() test
   ret = reader.getRecord(
-      "key2", dst.data(), size, 3, [](void* dst, const void* src, size_t n) {
+      "key2", dst.data(), size, 3, buf.data(), [](void* dst, const void* src, size_t n) {
        memcpy(dst, src, n);
      });
   ASSERT_EQ(ret, size);
@@ -113,6 +115,9 @@ TEST(PytorchStreamWriterAndReader, GetNonexistentRecordThrows) {
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers)
   std::array<char, 127> data1;
 
+  // Inplace memory buffer
+  std::vector<uint8_t> buf;
+
   for (auto i : c10::irange(data1.size())) {
     data1[i] = data1.size() - i;
   }
@@ -154,6 +159,7 @@ TEST(PytorchStreamWriterAndReader, GetNonexistentRecordThrows) {
       dst.data(),
       data1.size(),
       3,
+      buf.data(),
       [](void* dst, const void* src, size_t n) { memcpy(dst, src, n); }),
   c10::Error);
 
@@ -171,6 +177,8 @@ TEST(PytorchStreamWriterAndReader, SkipDebugRecords) {
   });
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers)
   std::array<char, 127> data1;
+  // Inplace memory buffer
+  std::vector<uint8_t> buf(data1.size());
 
   for (auto i : c10::irange(data1.size())) {
     data1[i] = data1.size() - i;
@@ -218,6 +226,7 @@ TEST(PytorchStreamWriterAndReader, SkipDebugRecords) {
       dst.data(),
       data1.size(),
       3,
+      buf.data(),
       [](void* dst, const void* src, size_t n) { memcpy(dst, src, n); });
   EXPECT_EQ(ret, 0);
   // clean up
```