Files
pytorch/torch/csrc/jit/frontend/source_range.h
Han Qi 61d6c43864 Make debug_pkl smaller by only emitting unique traces. (#73368)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/73368

The debug_pkl file inside of PyTorch's .pt file consists of a list of SourceRanges. Each SourceRange points to a Source, which is a stack trace, a filename, and start and end numbers. Those are emitted in the debug_pkl file as strings.
Since many SourceRanges share the same Source, the string for the trace can be deduplicated.
The newer format saves a set of unique traces in a tuple; each SourceRange then saves the offset of its trace w.r.t. its position in that tuple (i.e. manually applying dictionary compression).
The above helps with smaller file size. On loading, if we copied each trace into a Source as a string, the runtime memory would still blow up.
To mitigate this, we use SourceView directly instead of Source, which takes a reference to the string inside of the Deserializer and makes that into a string_view. This is safe because the Deserializer is held by the Unpickler via shared_ptr, and the Unpickler is also held via shared_ptr by another Source object. That Source object will be alive during the model construction.

Test Plan:
unit test

Took original file (312271638_930.predictor.disagg.local); loaded with `torch.jit.load` save again with `torch.jit.save`. Unzip both, look at contents:
```
[qihan@devvm5585.vll0 ~]$ du archive -h
4.0K    archive/xl_model_weights
3.7M    archive/extra
8.0K    archive/code/__torch__/caffe2/torch/fb/model_transform/splitting
8.0K    archive/code/__torch__/caffe2/torch/fb/model_transform
8.0K    archive/code/__torch__/caffe2/torch/fb
8.0K    archive/code/__torch__/caffe2/torch
8.0K    archive/code/__torch__/caffe2
20M     archive/code/__torch__/torch/fx/graph_module
20M     archive/code/__torch__/torch/fx
8.0K    archive/code/__torch__/torch/classes
20M     archive/code/__torch__/torch
20M     archive/code/__torch__
20M     archive/code
2.7M    archive/constants
35M     archive
[qihan@devvm5585.vll0 ~]$ du resaved -h
4.0K    resaved/extra
8.0K    resaved/code/__torch__/caffe2/torch/fb/model_transform/splitting
8.0K    resaved/code/__torch__/caffe2/torch/fb/model_transform
8.0K    resaved/code/__torch__/caffe2/torch/fb
8.0K    resaved/code/__torch__/caffe2/torch
8.0K    resaved/code/__torch__/caffe2
1.3M    resaved/code/__torch__/torch/fx/graph_module
1.3M    resaved/code/__torch__/torch/fx
8.0K    resaved/code/__torch__/torch/classes
1.4M    resaved/code/__torch__/torch
1.4M    resaved/code/__torch__
1.4M    resaved/code
2.7M    resaved/constants
13M     resaved
[qihan@devvm5585.vll0 ~]$
```

Reviewed By: gmagogsfm

Differential Revision: D34455360

fbshipit-source-id: 8cc716f9bba7183746b1b4ecc33a2de34ac503b9
(cherry picked from commit f1a04730fc9ac8fdab6c8e4c44cb5529e42090e4)
2022-03-02 08:37:08 +00:00

391 lines
11 KiB
C++

#pragma once
#include <c10/util/Exception.h>
#include <c10/util/Optional.h>
#include <algorithm>
#include <iostream>
#include <iterator>
#include <memory>
#include <numeric>
#include <unordered_map>
namespace torch {
namespace jit {
// Forward declarations. Source stores a std::shared_ptr<SourceRangeUnpickler>
// and declares findSourceRangeThatGenerated() in terms of SourceRange, which
// is only defined further down in this header.
class SourceRangeUnpickler;
struct SourceRange;
// A string-like class backed by a vector of string_views.
// The string it represents is logically the concatenation of those
// string_views, in order. The advantage is that the text does not need to
// live in contiguous memory.
struct TORCH_API StringCordView {
  StringCordView();
  // `inputs` are the pieces making up the logical string. `ownerships` are
  // shared strings held alongside them; presumably they keep the storage
  // behind (some of) the pieces alive -- TODO confirm in the out-of-line
  // definition.
  StringCordView(
      std::vector<c10::string_view> inputs,
      std::vector<std::shared_ptr<std::string>> ownerships);

  // Total length, read from the last entry of accumulated_sizes_.
  // NOTE(review): assumes both constructors leave accumulated_sizes_
  // non-empty -- confirm in the out-of-line definitions.
  size_t size() const {
    return accumulated_sizes_.back();
  }

  // Defined out of line. Callers in this header compare the result of find()
  // against std::string::npos to detect "no match".
  size_t find(const std::string& tok, size_t start) const;
  StringCordView substr(size_t start, size_t size) const;

  // Character at logical position `index`. Dereferencing the iterator asserts
  // that it points inside a piece.
  char at(size_t index) const {
    return *iter_for_pos(index);
  }
  char operator[](size_t index) const {
    return at(index);
  }

  // Materializes the whole cord into one contiguous std::string.
  std::string str() const {
    // Sum the piece sizes up front so the result is allocated exactly once.
    size_t total = 0;
    for (const auto& part : pieces_) {
      total += part.size();
    }
    std::string joined;
    joined.reserve(total);
    for (const auto& part : pieces_) {
      if (!part.empty()) {
        joined.append(part.data(), part.size());
      }
    }
    return joined;
  }

  // NOTE(review): these comparisons are declared non-const, so they cannot be
  // called on a const StringCordView. Adding `const` here would also require
  // updating the out-of-line definitions.
  bool operator==(const std::string& rhs);
  bool operator==(const StringCordView& rhs);

  // Returns the index-th underlying piece (no bounds check).
  c10::string_view piece(size_t index) const {
    return pieces_[index];
  }

  // Forward iterator over the characters of the cord. A position is the pair
  // (line_, pos_): index of the piece, and offset inside that piece.
  struct Iterator {
    Iterator(
        const StringCordView* str,
        size_t start_line,
        size_t start_pos,
        size_t size)
        : line_(start_line), pos_(start_pos), str_(str), size_(size) {}
    explicit Iterator(const StringCordView* str)
        : Iterator(str, 0, 0, str->size()) {}

    Iterator(const Iterator&) = default;
    Iterator(Iterator&&) = default;
    Iterator& operator=(const Iterator&) = default;
    Iterator& operator=(Iterator&&) = default;

    // Pre-increment: step to the next character, moving on to the next piece
    // when the current one is exhausted. Incrementing an exhausted iterator
    // is a no-op, so pieces_ is never indexed past its end.
    Iterator& operator++() {
      if (!has_next()) {
        return *this;
      }
      if ((pos_ + 1) < str_->pieces_[line_].size()) {
        ++pos_;
      } else {
        ++line_;
        pos_ = 0;
      }
      return *this;
    }

    // Post-increment: returns the position held before the step.
    Iterator operator++(int) {
      Iterator prev(*this);
      ++(*this);
      return prev;
    }

    // Any two exhausted iterators compare equal; otherwise both must refer to
    // the same cord and the same (piece, offset) position.
    bool operator==(const Iterator& rhs) const {
      if (!has_next() && !rhs.has_next()) {
        return true;
      }
      return (str_ == rhs.str_) && (line_ == rhs.line_) && (pos_ == rhs.pos_);
    }
    bool operator!=(const Iterator& rhs) const {
      return !((*this) == rhs);
    }

    // True while the iterator was built with a non-zero size and still points
    // at an existing piece.
    bool has_next() const {
      return size_ > 0 && (line_ < str_->pieces_.size());
    }

    char operator*() const {
      TORCH_INTERNAL_ASSERT(line_ < str_->pieces_.size());
      TORCH_INTERNAL_ASSERT(pos_ < str_->pieces_[line_].size());
      return str_->pieces_[line_].at(pos_);
    }

   private:
    size_t line_; // index into str_->pieces_
    size_t pos_; // offset inside pieces_[line_]
    const StringCordView* str_;
    // Only ever compared against zero in this header; it is not decremented
    // while iterating.
    size_t size_;
    friend struct StringCordView;
  };

  Iterator begin() const {
    return Iterator(this, 0, 0, size());
  }
  // The end iterator sits one past the last piece and has size 0, so
  // has_next() is false for it.
  Iterator end() const {
    return Iterator(this, pieces_.size(), 0, 0);
  }

 private:
  // Defined out of line; used by at() to locate a logical position.
  Iterator iter_for_pos(size_t pos) const;
  std::vector<c10::string_view> pieces_;
  std::vector<size_t> accumulated_sizes_;
  std::vector<std::shared_ptr<std::string>> owned_strings_;
};
// Source represents a code segment. It keeps track of:
// - text_view : the view into text of the code segment
// - filename (optional) : if present, represents the name of the file from
// which the code segment originated.
// - starting_line_no : represents the line in the original file where the
// code segment started.
struct TORCH_API Source {
// Whether or not Source should copy the string passed in the constructor.
enum CopiesString { COPIES_STRING, DONT_COPY };
explicit Source(
c10::string_view text_view,
c10::optional<std::string> filename = c10::nullopt,
size_t starting_line_no = 0,
std::shared_ptr<SourceRangeUnpickler> gen_ranges = nullptr,
CopiesString copies_str = COPIES_STRING)
: filename_(std::move(filename)),
starting_line_no_(starting_line_no),
gen_ranges_(std::move(gen_ranges)) {
if (copies_str == COPIES_STRING) {
std::shared_ptr<std::string> allocated_str =
std::make_shared<std::string>(text_view.data(), text_view.size());
text_view_ = StringCordView({*allocated_str}, {allocated_str});
} else {
text_view_ = StringCordView({text_view}, {});
}
calc_line_start_offsets();
}
explicit Source(
StringCordView str,
c10::optional<std::string> filename = c10::nullopt,
size_t starting_line_no = 0,
std::shared_ptr<SourceRangeUnpickler> gen_ranges = nullptr)
: text_view_(str),
filename_(std::move(filename)),
starting_line_no_(starting_line_no),
gen_ranges_(std::move(gen_ranges)) {
calc_line_start_offsets();
}
// Given a line number (within source_), return the byte offset of the
// beginning of that line.
size_t offset_for_line(size_t line) const {
return line_starting_offsets_.at(line);
}
// Returns number of lines present.
size_t num_lines() const {
return line_starting_offsets_.size();
}
// Calculate the line (within the code segment) on which `offset` resides.
size_t lineno_for_offset(size_t offset) const {
auto iter = std::upper_bound(
line_starting_offsets_.begin(), line_starting_offsets_.end(), offset);
return iter - line_starting_offsets_.begin() - 1;
}
// Calculate the line (within the original source file, if present) on which
// `lineno` resides.
size_t lineno_to_source_lineno(size_t lineno) const {
if (filename_) {
return lineno + starting_line_no_;
} else {
return lineno;
}
}
StringCordView get_line(size_t lineno) const {
auto start = offset_for_line(lineno);
auto size = (lineno + 1) < num_lines() ? offset_for_line(lineno + 1) - start
: text_view_.size() - start;
return text_view_.substr(start, size);
}
// Note: this makes a copy
StringCordView text_str() const {
return text_view_;
}
char char_at(size_t index) const {
return text_view_.at(index);
}
size_t size() const {
return text_view_.size();
}
c10::optional<std::string>& filename() {
return filename_;
}
size_t starting_line_no() const {
return starting_line_no_;
}
c10::optional<SourceRange> findSourceRangeThatGenerated(
const SourceRange& range);
~Source() = default;
private:
void calc_line_start_offsets() {
line_starting_offsets_.clear();
line_starting_offsets_.push_back(0);
size_t pos = 0;
while ((pos = text_view_.find("\n", pos)) != std::string::npos) {
line_starting_offsets_.push_back(++pos);
}
}
StringCordView text_view_;
c10::optional<std::string> filename_;
// If filename_ is not present, starting_line_no_ is don't care
size_t starting_line_no_;
// Starting offsets for lines into the source. e.g. line 0 starts at
// line_starting_offsets_[0], etc.
std::vector<size_t> line_starting_offsets_;
std::shared_ptr<SourceRangeUnpickler> gen_ranges_;
};
// A SourceRange is a reference to subset of a Source, specified by `start` and
// `end` byte offsets into the source text.
struct TORCH_API SourceRange {
  SourceRange(std::shared_ptr<Source> source_view_, size_t start_, size_t end_)
      : source_view_(std::move(source_view_)), start_(start_), end_(end_) {}
  // An empty range that refers to no Source at all.
  SourceRange() : source_view_(nullptr), start_(0), end_(0) {}

  // The text covered by this range.
  // NOTE(review): dereferences source_view_ without a null check, so it must
  // not be called on a default-constructed range.
  const StringCordView text() const {
    return source_view_->text_str().substr(start(), end() - start());
  }

  // Length of the range in bytes.
  size_t size() const {
    return end() - start();
  }

  static const size_t CONTEXT = 3;

  // Defined out of line.
  void highlight(std::ostream& out) const;

  // Customizable version of 'highlight' method.
  void print_with_context(
      std::ostream& out,
      size_t context,
      bool highlight,
      const std::string& funcname) const;

  const std::shared_ptr<Source>& source() const {
    return source_view_;
  }
  size_t start() const {
    return start_;
  }
  size_t end() const {
    return end_;
  }

  // Whatever highlight() writes, captured into a string.
  std::string str() const {
    std::stringstream ss;
    highlight(ss);
    return ss.str();
  }

  // Returns (filename, line in the original file, column) for the start of
  // the range, or nullopt when there is no Source or the Source carries no
  // filename.
  c10::optional<std::tuple<std::string, size_t, size_t>> file_line_col() const {
    if (!source_view_ || !source_view_->filename()) {
      return c10::nullopt;
    }
    const size_t lineno = source_view_->lineno_for_offset(start_);
    // lineno_for_offset picks the last line whose start is <= start_, so this
    // subtraction cannot wrap and needs no narrowing to int.
    const size_t col_offset = start_ - source_view_->offset_for_line(lineno);
    // Let make_tuple deduce (and decay) its element types; the filename is
    // known to be present at this point.
    return std::make_tuple(
        *source_view_->filename(),
        source_view_->lineno_to_source_lineno(lineno),
        col_offset);
  }

  // Two ranges are equal when they have the same offsets and point at the
  // very same Source object (the shared_ptrs are compared, not the text).
  bool operator==(const SourceRange& rhs) const {
    return start() == rhs.start() && end() == rhs.end() &&
        source() == rhs.source();
  }

  bool operator!=(const SourceRange& rhs) const {
    return !(*this == rhs);
  }

  // Forwards to Source::findSourceRangeThatGenerated; nullopt when this range
  // has no Source.
  c10::optional<SourceRange> findSourceRangeThatGenerated() const {
    if (!source_view_) {
      return c10::nullopt;
    }
    return source_view_->findSourceRangeThatGenerated(*this);
  }

 protected:
  std::shared_ptr<Source> source_view_;

 private:
  size_t start_;
  size_t end_;
};
// OwnedSourceRange is just like a SourceRange except that it owns a `Source`
// instead of `Source`. Thus OwnedSourceRange owns a copy of source text.
struct OwnedSourceRange : public SourceRange {
explicit OwnedSourceRange(const SourceRange& source_range)
: SourceRange(source_range) {
const auto& source = source_range.source();
if (source) {
source_view_ = std::make_shared<Source>(
source->text_str().str(),
source->filename(),
source->starting_line_no());
}
}
};
struct TORCH_API SourceRangeHasher {
public:
size_t operator()(const torch::jit::SourceRange& key) const;
};
// Element type of the `entries` vector accepted by format_stack_trace():
// a filename string paired with a SourceRange.
struct StackEntry {
std::string filename;
SourceRange range;
};
// Presumably writes a formatted rendering of `entries` to `out`; the function
// is defined out of line, so the exact output format is not visible here.
C10_EXPORT void format_stack_trace(
std::ostream& out,
const std::vector<StackEntry>& entries);
// Stream insertion for SourceRange: forwards to range.highlight(out) and
// returns the same stream so that insertions can be chained.
inline std::ostream& operator<<(std::ostream& out, const SourceRange& range) {
range.highlight(out);
return out;
}
// Associates a byte offset with a SourceRange; together the two describe one
// specific segment of the output stream.
struct TaggedRange {
  size_t bytes;
  SourceRange range;

  // `range` is taken by value and moved into the member.
  TaggedRange(size_t bytes, SourceRange range)
      : bytes(bytes), range(std::move(range)) {}
};
// A sequence of TaggedRange entries.
using SourceRangeRecords = std::vector<TaggedRange>;
// Hash map from SourceRange (hashed with SourceRangeHasher) to an int64_t.
// NOTE(review): what the int64_t tag stands for is not visible in this header.
using SourceRangeTagMap =
std::unordered_map<SourceRange, int64_t, SourceRangeHasher>;
} // namespace jit
} // namespace torch
namespace std {
// iterator_traits specialization for StringCordView::Iterator. The iterator
// class declares no member typedefs of its own, so its traits are supplied
// here for code that queries std::iterator_traits.
// NOTE(review): `reference` is declared as char& while Iterator::operator*
// returns char by value -- confirm that no user of these traits relies on a
// real reference.
template <>
struct iterator_traits<torch::jit::StringCordView::Iterator> {
using value_type = char;
using difference_type = ptrdiff_t;
using pointer = char*;
using reference = char&;
using iterator_category = std::forward_iterator_tag;
};
} // namespace std