Files
pytorch/third_party/nvfuser/csrc/evaluator_common.h
jjsjann123 c11b301bcd [NVFUSER] refactor nvfuser build (#89621)
This PR is the first step towards refactoring the nvfuser build so that the codegen becomes a standalone library.

Contents inside this PR:
1. nvfuser code base has been moved to `./nvfuser`, from `./torch/csrc/jit/codegen/cuda/`, except for registration code for integration (interface.h/interface.cpp)
2. splits the build system so nvfuser is generating its own `.so` files. Currently there are:
    - `libnvfuser_codegen.so`, which contains the integration, codegen and runtime system of nvfuser
    - `nvfuser.so`, which is nvfuser's python API via pybind. Python frontend is now exposed via `nvfuser._C.XXX` instead of `torch._C._nvfuser`
3. nvfuser cpp tests are currently being compiled into `nvfuser_tests`
4. cmake is refactored so that:
    - nvfuser now has its own `CMakeLists.txt`, which is under `torch/csrc/jit/codegen/cuda/`.
    - nvfuser backend code is not compiled inside `libtorch_cuda_xxx` any more
    - nvfuser is added as a subdirectory under `./CMakeLists.txt` at the very end after torch is built.
    - since nvfuser has dependency on torch, the registration of nvfuser at runtime is done via dlopen (`at::DynamicLibrary`). This avoids circular dependency in cmake, which will be a nightmare to handle. For details, look at `torch/csrc/jit/codegen/cuda/interface.cpp::LoadingNvfuserLibrary`

Future work that's scoped in following PR:
- Currently since nvfuser codegen has dependency on torch, we need to refactor that out so we can move nvfuser into a submodule and not rely on dlopen to load the library. @malfet
- Since we moved nvfuser into a cmake build, we effectively disabled bazel build for nvfuser. This could impact internal workload at Meta, so we need to put support back. cc'ing @vors

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89621
Approved by: https://github.com/davidberard98
2023-01-26 02:50:44 +00:00

344 lines
10 KiB
C++

#pragma once
#include <dynamic_type.h>
#include <executor_kernel_arg.h>
#include <executor_launch_params.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <lower2device.h>
#include <c10/core/DeviceType.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
//! This is the common space for expression evaluators in
//! fusion IR and kernel IR context. Much of the evaluator
//! optimizations and runtimes could share the same code
//! path and they could be collected here.
class ExpressionEvaluator;
namespace kir {
class ExpressionEvaluator;
} // namespace kir
//! IR Contexts to be passed to generic evaluator optimizations
//! and runtimes. Defines the essential interface for the
//! generic logic to get necessary type and function info
//! from the IR nodes. Generic optimizations will assume
//! the same list of static definitions are provided
//! in each of the contexts, just FusionIR and KernelIR
//! currently.
//! Context for using generic logic on FusionIR
class FusionIRContext {
public:
using TV_TYPE = TensorView;
using EVALUATOR_TYPE = ExpressionEvaluator;
static BinaryOpType getOpType(BinaryOp* bop) {
return bop->getBinaryOpType();
}
static UnaryOpType getOpType(UnaryOp* uop) {
return uop->getUnaryOpType();
}
};
//! Context for using generic logic on KernelIR
class KernelIRContext {
public:
using EVALUATOR_TYPE = kir::ExpressionEvaluator;
static BinaryOpType getOpType(BinaryOp* bop) {
return bop->getBinaryOpType();
}
static UnaryOpType getOpType(UnaryOp* uop) {
return uop->getUnaryOpType();
}
};
template <typename IRContext>
class PrecomputedValuesBase;
//! NaiveValueMachine:
//!  An un-optimized runtime that evaluates a whole set of values
//!  in a single run. It holds a list of instructions derived from
//!  the IR at compile time and must be associated with an instance
//!  of PrecomputedValuesBase, which provides the workspace holding
//!  the concrete value of every tracked IR value.
template <typename IRContext>
class NaiveValueMachine {
  //! Kinds of instructions this machine understands; currently
  //! only unary and binary operations.
  enum class InstructionType { UNARY_OP, BINARY_OP };

  //! The workspace class is granted access to the private state.
  friend PrecomputedValuesBase<IRContext>;

 public:
  //! Lowers every expr IR node stored in `precomputed_values` into
  //! instructions and keeps them in the private instruction buffer.
  NaiveValueMachine(PrecomputedValuesBase<IRContext>& precomputed_values);

  //! Executes all instructions, writing the results into the
  //! associated precomputed_values workspace.
  void run();

 private:
  //! Lowers a unary IR expr into an instruction.
  void makeUnaryOp(UnaryOp* uop);

  //! Lowers a binary IR expr into an instruction.
  void makeBinaryOp(BinaryOp* bop);

  //! Appends an instruction with all-default fields to the end of
  //! the instruction buffer.
  int makeInstructionEntry();

  //! Executes the instruction stored at `index` of the instruction
  //! buffer: decodes it and dispatches to the matching handler.
  void runInstruction(int index);

  //! Executes the unary operation stored at `index`.
  void runUnaryOp(int index);

  //! Executes the binary operation stored at `index`.
  void runBinaryOp(int index);

  //! Workspace associated with this runtime. Every instruction
  //! reads its operands from and writes its result to it.
  PrecomputedValuesBase<IRContext>& precomputed_values_;

  // Instruction buffer, stored as parallel vectors: the entries
  // found at the same index of each vector together describe one
  // instruction.

  //! Total number of instructions.
  int num_of_instructions_ = 0;

  //! Machine instruction kind (unary or binary) per instruction.
  std::vector<InstructionType> inst_type_;

  //! Unary operator type where applicable; holds a default value
  //! at every index that corresponds to a binary op.
  std::vector<UnaryOpType> uop_type_;

  //! Data type for unary ops of type UnaryOpType::Cast; holds a
  //! default value at every index that corresponds to another op.
  std::vector<DataType> data_type_;

  //! Binary operator type where applicable; holds a default value
  //! at every index that corresponds to a unary op.
  std::vector<BinaryOpType> bop_type_;

  // Operand and destination indices of each instruction. An index
  // is a position in the workspace where the concrete value lives.

  //! Operand 0 of each instruction.
  std::vector<int> src0_;

  //! Operand 1 of each instruction; holds a default value at every
  //! index that corresponds to a unary op.
  std::vector<int> src1_;

  //! Destination of each instruction.
  std::vector<int> dest_;
};
//! PrecomputedValuesBase:
//!  A class to support optimized evaluation of values at runtime.
//!  At compile time all necessary values are collected from the
//!  given IR nodes, and a value machine plus a pre-allocated
//!  workspace holding the concrete values are created.
//!  At runtime the value machine evaluates all the values and
//!  stores them in the workspace ahead of time.
template <typename IRContext>
class PrecomputedValuesBase {
  using VALUE_MACHINE = NaiveValueMachine<IRContext>;

 public:
  explicit PrecomputedValuesBase() = default;

  //! Returns true if the workspace contains evaluated results.
  bool ready() const {
    return has_valid_values_;
  }

  //! Runs the internal value machine that will compute
  //! the values allocated in the workspace.
  void evaluate();

  //! Returns the value for the given IR node if it is stored
  //! in the workspace and has been evaluated.
  c10::optional<IntOrDouble> getMaybeValueFor(const Val* val);

  //! Debugging helper, prints all the currently known values.
  void print() const;

 protected:
  //! Initializes the workspace before first use.
  //! Assumes the given list of IR nodes has been
  //! topologically sorted.
  void initializeValueList(
      typename IRContext::EVALUATOR_TYPE& evaluator,
      const std::vector<Val*>& sorted_value_list);

  //! Binds a concrete value to the workspace slot at `index`.
  //! The call is a no-op when `index` is negative or when the
  //! slot holds a compile-time constant. Every successful binding
  //! is also appended to the binding log. No upper-bound check is
  //! performed on `index`.
  void bindValue(int index, IntOrDouble value) {
    if (index < 0 || is_constant_[index]) {
      return;
    }
    defined_[index] = true;
    values_[index] = value;
    binding_log_.emplace_back(index, value);
  }

  //! Invalidates all computed values in the workspace.
  void invalidate();

  //! Interface for subclasses to replace symbols_ with the
  //! given list of IR nodes.
  void loadSymbols(std::vector<Val*> symbols) {
    symbols_ = std::move(symbols);
  }

  //! Interface for subclasses to access symbols_.
  std::vector<Val*>& symbols() {
    return symbols_;
  }

  //! Creates the value runtime, which will infer its
  //! instructions from this workspace.
  void initializeIntegerMachine() {
    value_machine_ = std::make_unique<VALUE_MACHINE>(*this);
  }

  //! Returns true if an evaluation has finished and the
  //! workspace values are valid.
  bool hasValidValues() const {
    return has_valid_values_;
  }

 private:
  //! Post evaluation check, throws if any computed value
  //! is inconsistent with its bound value.
  void validate();

  //! Returns true if the workspace has a computed or constant
  //! value at the given index.
  bool hasValue(int index) const {
    // Slot 0 is a valid workspace position; only negative indices
    // are invalid, which matches the check done in bindValue().
    TORCH_INTERNAL_ASSERT(index >= 0);
    return defined_[index] || is_constant_[index];
  }

 private:
  friend VALUE_MACHINE;

  //! Marks if an evaluation has finished.
  bool has_valid_values_ = false;

  //! The size of the workspace.
  int num_of_values_ = -1;

  //! Marks if a value has been bound or
  //! computed at each index.
  std::vector<bool> defined_;

  //! Marks if a value is a compile-time constant
  //! at each index.
  std::vector<bool> is_constant_;

  //! Stores the concrete values at each index.
  std::vector<IntOrDouble> values_;

  //! Stores the IR nodes corresponding to each index.
  std::vector<Val*> symbols_;

  //! An internal log to keep track of all the bindings
  //! used in each evaluation cycle. To be used for
  //! consistency checks.
  std::vector<std::pair<int, IntOrDouble>> binding_log_;

  //! Value runtime that realizes the value computations.
  std::unique_ptr<VALUE_MACHINE> value_machine_;
};
//! PrecomputedValues workspace for the Fusion IR context.
//! Defines which values get collected from a fusion graph and how
//! input values are bound for a given set of fusion runtime inputs.
class FusionPrecomputedValues : public PrecomputedValuesBase<FusionIRContext> {
  using precomputedValuesBaseType = PrecomputedValuesBase<FusionIRContext>;

 public:
  //! Builds the workspace for `fusion`.
  FusionPrecomputedValues(Fusion* fusion);

  //! Binds concrete values taken from the fusion runtime inputs.
  void bindFusionInputs(const KernelArgumentHolder& args);

 private:
  //! Helper taking a tensor view and its runtime tensor argument.
  //! NOTE(review): presumably binds the tensor's metadata values;
  //! the definition is not visible in this header.
  void bindTensorMetaData(
      TensorView* tv,
      const TensorArgAbstract* tensor_arg_abstract);

  //! Non-owning pointer to a fusion; null by default.
  Fusion* fusion_ = nullptr;
};
//! PrecomputedValues workspace for the Kernel IR context.
//! Defines which values get collected from a kernel IR sequence and
//! how input values are bound, given the fusion runtime inputs and
//! the launch constraints.
class KernelPrecomputedValues : public PrecomputedValuesBase<KernelIRContext> {
  using precomputedValuesBaseType = PrecomputedValuesBase<KernelIRContext>;

 public:
  //! Maps each parallel type to a list of extent values.
  using ParallelExtentMap =
      std::unordered_map<ParallelType, std::vector<const Val*>, TypeHash>;

  //! Builds the workspace for `kernel`.
  KernelPrecomputedValues(kir::Kernel* kernel);

  //! Binds concrete values taken from the fusion runtime inputs.
  void bindKernelInputs(kir::Kernel* kernel, const KernelArgumentHolder& args);

  //! Binds concrete values taken from the launch constraints.
  void bindParallelExtents(
      const ParallelExtentMap& parallel_extents,
      const LaunchParams& launch_constraint);

  //! Binds the NamedScalars that correspond to the concrete
  //! parallel dimension sizes, once the actual value has been
  //! resolved.
  void bindConcreteParallelTypeValue(ParallelType pt, int64_t value);

 private:
  //! Helper taking a tensor view and its runtime tensor argument.
  //! NOTE(review): presumably binds the tensor's metadata values;
  //! the definition is not visible in this header.
  void bindTensorMetaData(
      TensorView* tv,
      const TensorArgAbstract* tensor_arg_abstract);

  //! Walks all the named scalars that correspond to thread sizes
  //! and pre-groups them by parallel type.
  void initializeNamedScalars();

  //! For each parallel type, the named scalars that correspond
  //! to its thread size, stored as a list of integer indices.
  std::unordered_map<ParallelType, std::unique_ptr<std::vector<int>>, TypeHash>
      thread_dim_value_indices_;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch