mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
This PR adds support in train_decision for learning a ranking heuristic. The main idea is that the user provides the number of choices the heuristic should return, and the learned decision tree is pruned so that it always returns exactly that number of choices. Pull Request resolved: https://github.com/pytorch/pytorch/pull/131705. Approved by: https://github.com/eellison
180 lines
5.8 KiB
Python
180 lines
5.8 KiB
Python
# mypy: ignore-errors
|
|
|
|
import argparse
|
|
import json
|
|
import warnings
|
|
|
|
import pandas as pd # type: ignore[import-untyped]
|
|
|
|
from torch._inductor.autoheuristic.autoheuristic_utils import (
|
|
CHOICE_COL,
|
|
get_metadata_str_from_log,
|
|
)
|
|
|
|
|
|
# TODO (AlnisM): Fix these warnings
# Silence two known pandas warnings (matched by message) until the code that
# triggers them is fixed.
warnings.filterwarnings(
    "ignore",
    message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
)
warnings.filterwarnings(
    "ignore",
    message="DataFrameGroupBy.apply operated on the grouping columns.",
)
|
|
|
|
|
|
class AHTrain:
    """
    Base class for AutoHeuristic training.

    The class owns the command-line parser, log parsing and the handling of
    categorical features. ``generate_heuristic`` calls ``self.main`` and
    ``add_real_datasets`` calls ``self.get_df``; neither is defined in this
    class, so subclasses have to provide them.
    """

    def __init__(self) -> None:
        self.parser = argparse.ArgumentParser()
        self.add_base_arguments()
        # Filled in by generate_heuristic() once the command line is parsed.
        self.args = None

    def add_base_arguments(self):
        """Register the command-line arguments shared by all trainers."""
        self.parser.add_argument(
            "dataset",
            type=str,
            help="Path to text file containing data collected with AutoHeuristic.",
        )
        self.parser.add_argument(
            "--nrows",
            type=int,
            default=None,
            help="Only read first n rows of the dataset.",
        )
        self.parser.add_argument(
            "--heuristic-name",
            type=str,
            default="learned_heuristic",
            help="Name of the heuristic to be generated.",
        )
        self.parser.add_argument(
            "--data",
            nargs=2,
            action="append",
            metavar=("TYPE", "PATH"),
            help="Specify name of datasets and file paths to be evaluated.",
        )
        self.parser.add_argument(
            "--save-dot",
            action="store_true",
            help="Export heuristic to graphviz dot.",
        )
        self.parser.add_argument(
            "--ranking",
            type=int,
            default=None,
            help="""
                Makes AutoHeuristic learn a heuristic that ranks choices instead of predicting a single choice.
                The argument is the number of choices the heuristic will provide.
            """,
        )

    def parse_args(self):
        """Parse the process command line with the configured parser."""
        return self.parser.parse_args()

    def parse_log(self, log_path, nrows=None):
        """
        Load an AutoHeuristic log and describe its columns.

        Args:
            log_path: Path of the log file to read.
            nrows: If not None, keep only the first ``nrows`` rows.

        Returns:
            Tuple ``(df, metadata, features, categorical_features, choices)``
            where ``features`` lists the numerical features followed by the
            categorical ones.
        """
        (df, metadata) = self.deserialize_data(log_path)
        numerical_features = metadata["numerical_features"]
        categorical_features = metadata["categorical_features"]
        # Choices are collected from the complete log, i.e. before the row
        # limit and filter_df() are applied.
        choices = df[CHOICE_COL].unique().tolist()
        features = numerical_features + categorical_features
        if nrows is not None:
            df = df.head(nrows)
        df = self.filter_df(df)
        return (df, metadata, features, categorical_features, choices)

    def generate_heuristic(self):
        """
        Parse the command line and hand the arguments to ``self.main``.

        ``main`` is not defined in this class; it is called with the dataset
        path, the ``--data`` pairs, the row limit, the heuristic name, the
        ``--save-dot`` flag and a bool telling whether ``--ranking`` was given.
        """
        self.args = self.parse_args()
        self.main(
            self.args.dataset,
            self.args.data,
            self.args.nrows,
            self.args.heuristic_name,
            self.args.save_dot,
            self.args.ranking is not None,
        )

    def filter_df(self, df):
        """Hook for dropping rows from ``df``; the base version keeps all rows."""
        return df

    def add_new_features(self, results):
        """
        Hook for deriving extra features.

        Returns:
            Tuple of the (possibly extended) frame and a list; the base
            version returns ``results`` unchanged with an empty list.
        """
        return (results, [])

    def add_real_datasets(self, datasets, other_datasets, cat_feature2cats):
        """
        Load every ``(name, path)`` pair in ``other_datasets`` into ``datasets``.

        ``datasets`` is updated in place; nothing is returned. The frames are
        loaded through ``self.get_df`` (not defined in this class) with
        ``apply_filters=False``.
        """
        if other_datasets:
            for name, path in other_datasets:
                # Only the data frame of the returned 5-tuple is needed here.
                (df_other, _, _, _, _) = self.get_df(
                    path, cat_feature2cats=cat_feature2cats, apply_filters=False
                )
                datasets[name] = df_other

    def handle_categorical_features(
        self, cat_feature2cats, categorical_features, results
    ):
        """
        One-hot encode the categorical columns of ``results``.

        Args:
            cat_feature2cats: Mapping from categorical feature to its
                categories, or a falsy value to start from an empty mapping.
                Features missing from the mapping get the unique values found
                in ``results`` and are added to it.
            categorical_features: Names of the columns to encode.
            results: Data frame to encode; its categorical columns are
                converted in place before the encoded copy is created.

        Returns:
            Tuple ``(results, cat_feature2cats, dummy_col_2_col_val)`` where
            ``dummy_col_2_col_val`` maps each one-hot column name
            ``f"{col}_{val}"`` back to ``(col, val)``.
        """
        # Doing this here because if we create another df for testing purposes
        # and that other df does not contain all categories for a categorical feature,
        # pd.dummies will not create columns for the missing categories
        if not cat_feature2cats:
            cat_feature2cats = {}

        for cat_feature in categorical_features:
            if cat_feature in cat_feature2cats:
                categories = cat_feature2cats[cat_feature]
            else:
                categories = results[cat_feature].unique()
                cat_feature2cats[cat_feature] = categories
            results[cat_feature] = pd.Categorical(
                results[cat_feature], categories=categories
            )

        # Walk the declared categories rather than the observed values:
        # get_dummies emits a column for every category, including ones that
        # do not occur in this frame, and the mapping has to cover all of them.
        dummy_col_2_col_val = {
            f"{col}_{val}": (col, val)
            for col in categorical_features
            for val in results[col].cat.categories
        }

        # one-hot encode categorical features
        results = pd.get_dummies(results, columns=categorical_features)
        return (results, cat_feature2cats, dummy_col_2_col_val)

    def gen_precondition(self, opt_name, shared_memory, device_capa):
        """
        Return the source of a ``check_precondition`` method.

        The generated method compares ``metadata.name`` with
        ``self.get_name()``, ``metadata.shared_memory`` with ``shared_memory``
        and ``str(metadata.device_capa)`` with ``device_capa``. ``opt_name``
        is accepted but not used.
        """
        # The text is emitted as a method of a generated class, so every line
        # carries its indentation explicitly instead of relying on the layout
        # of a multi-line literal.
        template_lines = [
            "    def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:",
            "        return (",
            "            metadata.name == self.get_name()",
            f"            and metadata.shared_memory == {shared_memory}",
            f'            and str(metadata.device_capa) == "{device_capa}"',
            "        )",
        ]
        return "\n".join(template_lines)

    def codegen_boilerplate(
        self, heuristic_name, opt_name, threshold, shared_memory, device_capa, dt
    ):
        """Hook without a base implementation; returns None here."""

    def gen_predict_fn_def(self):
        """Hook without a base implementation; returns None here."""

    def write_heuristic_to_file(self, lines, heuristic_name):
        """
        Write ``lines`` joined by newlines (plus a trailing newline) to
        ``_{heuristic_name}.py`` in the AutoHeuristic artifacts directory.

        The target path is relative, so it is resolved against the current
        working directory.
        """
        output_file = (
            f"../../../torch/_inductor/autoheuristic/artifacts/_{heuristic_name}.py"
        )
        with open(output_file, "w") as f:
            f.write("\n".join(lines) + "\n")

    def deserialize_data(self, log_path):
        """
        Read the metadata and the data rows of an AutoHeuristic log.

        Returns:
            Tuple ``(df, metadata)``: the parsed CSV rows and the decoded
            metadata.
        """
        json_string = get_metadata_str_from_log(log_path)
        metadata = self.deserialize_metadata(json_string)

        # The first line is skipped when reading the CSV part (presumably it
        # holds the metadata -- confirm against get_metadata_str_from_log);
        # malformed rows are dropped rather than raising.
        df = pd.read_csv(log_path, skiprows=1, on_bad_lines="skip")
        return (df, metadata)

    def deserialize_metadata(self, json_string):
        """Decode the metadata JSON string."""
        return json.loads(json_string)
|
|
|
|
|
|
# Script entry point: build the trainer and run it on the command-line arguments.
if __name__ == "__main__":
    train = AHTrain()
    train.generate_heuristic()
|