mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-11 22:34:53 +08:00
This PR is the first step towards refactors the build for nvfuser in order to have the coegen being a standalone library.
Contents inside this PR:
1. nvfuser code base has been moved to `./nvfuser`, from `./torch/csrc/jit/codegen/cuda/`, except for registration code for integration (interface.h/interface.cpp)
2. splits the build system so nvfuser is generating its own `.so` files. Currently there are:
- `libnvfuser_codegen.so`, which contains the integration, codegen and runtime system of nvfuser
- `nvfuser.so`, which is nvfuser's python API via pybind. Python frontend is now exposed via `nvfuser._C.XXX` instead of `torch._C._nvfuser`
3. nvfuser cpp tests is currently being compiled into `nvfuser_tests`
4. cmake is refactored so that:
- nvfuser now has its own `CMakeLists.txt`, which is under `torch/csrc/jit/codegen/cuda/`.
- nvfuser backend code is not compiled inside `libtorch_cuda_xxx` any more
- nvfuser is added as a subdirectory under `./CMakeLists.txt` at the very end after torch is built.
- since nvfuser has dependency on torch, the registration of nvfuser at runtime is done via dlopen (`at::DynamicLibrary`). This avoids circular dependency in cmake, which will be a nightmare to handle. For details, look at `torch/csrc/jit/codegen/cuda/interface.cpp::LoadingNvfuserLibrary`
Future work that's scoped in following PR:
- Currently since nvfuser codegen has dependency on torch, we need to refactor that out so we can move nvfuser into a submodule and not rely on dlopen to load the library. @malfet
- Since we moved nvfuser into a cmake build, we effectively disabled bazel build for nvfuser. This could impact internal workload at Meta, so we need to put support back. cc'ing @vors
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89621
Approved by: https://github.com/davidberard98
344 lines
10 KiB
C++
344 lines
10 KiB
C++
#pragma once
|
|
#include <dynamic_type.h>
|
|
#include <executor_kernel_arg.h>
|
|
#include <executor_launch_params.h>
|
|
#include <fusion.h>
|
|
#include <ir_all_nodes.h>
|
|
#include <lower2device.h>
|
|
|
|
#include <c10/core/DeviceType.h>
|
|
|
|
namespace torch {
|
|
namespace jit {
|
|
namespace fuser {
|
|
namespace cuda {
|
|
|
|
//! This is the common space for expression evaluators in
|
|
//! fusion IR and kernel IR context. Much of the evaluator
|
|
//! optimizations and runtimes could share the same code
|
|
//! path and they could be collected here.
|
|
|
|
class ExpressionEvaluator;
|
|
|
|
namespace kir {
|
|
|
|
class ExpressionEvaluator;
|
|
|
|
} // namespace kir
|
|
|
|
//! IR Contexts to be passed to generic evaluator optimizations
|
|
//! and runtimes. Defines the essential interface for the
|
|
//! generic logic to get necessary type and function info
|
|
//! from the IR nodes. Generic optimizations will assume
|
|
//! the same list of static definitions are provided
|
|
//! in each of the contexts, just FusionIR and KernelIR
|
|
//! currently.
|
|
|
|
//! Context for using generic logic on FusionIR
|
|
class FusionIRContext {
|
|
public:
|
|
using TV_TYPE = TensorView;
|
|
using EVALUATOR_TYPE = ExpressionEvaluator;
|
|
|
|
static BinaryOpType getOpType(BinaryOp* bop) {
|
|
return bop->getBinaryOpType();
|
|
}
|
|
|
|
static UnaryOpType getOpType(UnaryOp* uop) {
|
|
return uop->getUnaryOpType();
|
|
}
|
|
};
|
|
|
|
//! Context for using generic logic on KernelIR
|
|
class KernelIRContext {
|
|
public:
|
|
using EVALUATOR_TYPE = kir::ExpressionEvaluator;
|
|
|
|
static BinaryOpType getOpType(BinaryOp* bop) {
|
|
return bop->getBinaryOpType();
|
|
}
|
|
|
|
static UnaryOpType getOpType(UnaryOp* uop) {
|
|
return uop->getUnaryOpType();
|
|
}
|
|
};
|
|
|
|
template <typename IRContext>
|
|
class PrecomputedValuesBase;
|
|
|
|
//! NaiveValueMachine:
|
|
//! This is an un-optimized runtime for evaluating a
|
|
//! set of values in one run. The runtime contains
|
|
//! a vector of instructions inferred from IR at compile-time
|
|
//! and it currently must be associated with an instance of
|
|
//! PrecomputedValuesBase that will provide the workspace
|
|
//! containing the concrete values for the values.
|
|
template <typename IRContext>
|
|
class NaiveValueMachine {
|
|
//! The generic types of instructions supported for this
|
|
//! machine, currently only binary and unary.
|
|
enum class InstructionType { UNARY_OP, BINARY_OP };
|
|
|
|
public:
|
|
//! Constructor lowers all the expr IR nodes stored in precomputed_values
|
|
//! and stores them in the private state.
|
|
NaiveValueMachine(PrecomputedValuesBase<IRContext>& precomputed_values);
|
|
|
|
//! Runs all the instructions and write results to the associated
|
|
//! precomputed_values.
|
|
void run();
|
|
|
|
private:
|
|
//! Convert an unary IR expr to an instruction
|
|
void makeUnaryOp(UnaryOp* uop);
|
|
|
|
//! Convert an binary IR expr to an instruction
|
|
void makeBinaryOp(BinaryOp* bop);
|
|
|
|
//! Create an empty instruction with all default values
|
|
//! and place it at the end of the instruction buffer.
|
|
int makeInstructionEntry();
|
|
|
|
//! Run a single instruction at the given index of
|
|
//! the instruction buffer. Decodes and dispatches
|
|
//! to the corresponding instruction handle functions.
|
|
void runInstruction(int index);
|
|
|
|
//! Runs a unary operation at given index of instruction buffer
|
|
void runUnaryOp(int index);
|
|
|
|
//! Runs a binary operation at given index of instruction buffer
|
|
void runBinaryOp(int index);
|
|
|
|
private:
|
|
friend PrecomputedValuesBase<IRContext>;
|
|
|
|
//! Reference to the PrecomputedValues workspace associated with
|
|
//! this runtime. All the instructions will read and write the
|
|
//! values in this workspace.
|
|
PrecomputedValuesBase<IRContext>& precomputed_values_;
|
|
|
|
//! Instruction buffer. All states are in separate vectors and
|
|
//! the entry of each vector at the same index correspond to
|
|
//! the same instruction.
|
|
|
|
//! Total number of instructions
|
|
int num_of_instructions_ = 0;
|
|
|
|
//! Machine instruction type for each instruction i.e.
|
|
//! unary or binary
|
|
std::vector<InstructionType> inst_type_;
|
|
|
|
//! Unary operator type if applicable, contains a default
|
|
//! value at each index corresponding to a binary op.
|
|
std::vector<UnaryOpType> uop_type_;
|
|
|
|
//! Data type for unary op of type UnaryOpType::Cast, contains a default
|
|
//! value at each index corresponding other ops.
|
|
std::vector<DataType> data_type_;
|
|
|
|
//! Unary operator type if applicable, contains a default
|
|
//! value at each index corresponding to a unary op.
|
|
std::vector<BinaryOpType> bop_type_;
|
|
|
|
//! Indexes of operands and destination of each instruction.
|
|
//! The indexes corresponds to positions in the workspace
|
|
//! where concrete values are hosted.
|
|
|
|
//! Operand 0 of each instruction.
|
|
std::vector<int> src0_;
|
|
|
|
//! Operand 1 of each instruction, a default value at
|
|
//! each index corresponding to a unary op.
|
|
std::vector<int> src1_;
|
|
|
|
//! Destination of each instruction.
|
|
std::vector<int> dest_;
|
|
};
|
|
|
|
//! PrecomputedValuesBase:
|
|
//! A class to support optimized evaluation of values
|
|
//! at runtime.
|
|
//! At compile time all necessary values are collected
|
|
//! from given IR nodes and a runtime and a workspace containing
|
|
//! the concrete values is created and pre-allocated.
|
|
//! At runtime the value vm is used to evaluate all the
|
|
//! values and store them in the workspace ahead of time.
|
|
template <typename IRContext>
|
|
class PrecomputedValuesBase {
|
|
using VALUE_MACHINE = NaiveValueMachine<IRContext>;
|
|
|
|
public:
|
|
explicit PrecomputedValuesBase() = default;
|
|
|
|
//! Returns if the workspace contains evaluated results.
|
|
bool ready() {
|
|
return has_valid_values_;
|
|
}
|
|
|
|
//! Runs the internal value machine that will compute
|
|
//! the values allocated in the workspace.
|
|
void evaluate();
|
|
|
|
//! Returns value for the given IR node if it's stored
|
|
//! in the workspace and has been evaluated.
|
|
c10::optional<IntOrDouble> getMaybeValueFor(const Val* val);
|
|
|
|
//! Debugging helper, prints all the currently known values
|
|
void print() const;
|
|
|
|
protected:
|
|
//! Initialize the workspace before first use.
|
|
//! Assume the given value list IR nodes have
|
|
//! been topologically sorted.
|
|
void initializeValueList(
|
|
typename IRContext::EVALUATOR_TYPE& evaluator,
|
|
const std::vector<Val*>& sorted_value_list);
|
|
|
|
//! Bind concrete value to the given index
|
|
//! if the index is valid.
|
|
void bindValue(int index, IntOrDouble value) {
|
|
if (index < 0 || is_constant_[index]) {
|
|
return;
|
|
}
|
|
defined_[index] = true;
|
|
values_[index] = value;
|
|
binding_log_.emplace_back(index, value);
|
|
}
|
|
|
|
//! Invalidate all computed values in the workspace.
|
|
void invalidate();
|
|
|
|
//! Interface for subclasses to access symbols_
|
|
void loadSymbols(std::vector<Val*> symbols) {
|
|
symbols_ = std::move(symbols);
|
|
}
|
|
|
|
//! Interface for subclasses to access symbols_
|
|
std::vector<Val*>& symbols() {
|
|
return symbols_;
|
|
}
|
|
|
|
//! Initialize the value runtime that will
|
|
//! infer instructions from the workspace.
|
|
void initializeIntegerMachine() {
|
|
value_machine_ = std::make_unique<VALUE_MACHINE>(*this);
|
|
}
|
|
|
|
bool hasValidValues() {
|
|
return has_valid_values_;
|
|
}
|
|
|
|
private:
|
|
//! Post evaluation check, throws if any computed value
|
|
//! is inconsistent with its bound value
|
|
void validate();
|
|
|
|
//! Returns true if workspace has a computed or constant
|
|
//! value for given index.
|
|
bool hasValue(int index) {
|
|
TORCH_INTERNAL_ASSERT(index > 0);
|
|
return defined_[index] || is_constant_[index];
|
|
}
|
|
|
|
private:
|
|
friend VALUE_MACHINE;
|
|
|
|
//! Marks if an evaluation has finished
|
|
bool has_valid_values_ = false;
|
|
|
|
//! The size of workspace
|
|
int num_of_values_ = -1;
|
|
|
|
//! Marks if a value has been bound or
|
|
//! computed at each index.
|
|
std::vector<bool> defined_;
|
|
|
|
//! Marks if a value is compile-time constant
|
|
//! at each index.
|
|
std::vector<bool> is_constant_;
|
|
|
|
//! Stores the concrete values at each index.
|
|
std::vector<IntOrDouble> values_;
|
|
|
|
//! Stores the IR nodes corresponding to each index.
|
|
std::vector<Val*> symbols_;
|
|
|
|
//! An internal log to keep track of all the bindings
|
|
//! used in each evaluation cycle. To be used for
|
|
//! consistency check.
|
|
std::vector<std::pair<int, IntOrDouble>> binding_log_;
|
|
|
|
//! Integer runtime for realizing the values computations.
|
|
std::unique_ptr<VALUE_MACHINE> value_machine_;
|
|
};
|
|
|
|
//! PrecomputedValues workspace in Fusion IR context,
|
|
//! defines the set of values to be collected in each
|
|
//! fusion graph and the input value binding given each
|
|
//! fusion runtime input.
|
|
class FusionPrecomputedValues : public PrecomputedValuesBase<FusionIRContext> {
|
|
using precomputedValuesBaseType = PrecomputedValuesBase<FusionIRContext>;
|
|
|
|
public:
|
|
FusionPrecomputedValues(Fusion* fusion);
|
|
|
|
//! Bind concrete values from fusion runtime inputs
|
|
void bindFusionInputs(const KernelArgumentHolder& args);
|
|
|
|
private:
|
|
void bindTensorMetaData(
|
|
TensorView* tv,
|
|
const TensorArgAbstract* tensor_arg_abstract);
|
|
|
|
private:
|
|
Fusion* fusion_ = nullptr;
|
|
};
|
|
//! PrecomputedValues workspace in Fusion IR context,
|
|
//! defines the set of values to be collected in each
|
|
//! kernel IR sequence and the input value binding given each
|
|
//! fusion runtime input and launch constraints.
|
|
class KernelPrecomputedValues : public PrecomputedValuesBase<KernelIRContext> {
|
|
using precomputedValuesBaseType = PrecomputedValuesBase<KernelIRContext>;
|
|
|
|
public:
|
|
using ParallelExtentMap =
|
|
std::unordered_map<ParallelType, std::vector<const Val*>, TypeHash>;
|
|
|
|
KernelPrecomputedValues(kir::Kernel* kernel);
|
|
|
|
//! Bind concrete values from fusion runtime inputs
|
|
void bindKernelInputs(kir::Kernel* kernel, const KernelArgumentHolder& args);
|
|
|
|
//! Bind concrete values from launch constraints
|
|
void bindParallelExtents(
|
|
const ParallelExtentMap& parallel_extents,
|
|
const LaunchParams& launch_constraint);
|
|
|
|
//! Bind the NamedScalars corresponding to the
|
|
//! concrete parallel dimension sizes after the
|
|
//! actual value has been resolved.
|
|
void bindConcreteParallelTypeValue(ParallelType pt, int64_t value);
|
|
|
|
private:
|
|
void bindTensorMetaData(
|
|
TensorView* tv,
|
|
const TensorArgAbstract* tensor_arg_abstract);
|
|
|
|
//! Iterate through all the named scalars corresponding
|
|
//! to thread sizes and pre-group them by their parallel
|
|
//! types.
|
|
void initializeNamedScalars();
|
|
|
|
private:
|
|
//! Contains all the named scalars correspond
|
|
//! to thread size of each parallel type.
|
|
std::unordered_map<ParallelType, std::unique_ptr<std::vector<int>>, TypeHash>
|
|
thread_dim_value_indices_;
|
|
};
|
|
|
|
} // namespace cuda
|
|
} // namespace fuser
|
|
} // namespace jit
|
|
} // namespace torch
|