Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
Linearizable Label: Class Weights, Allow Missing Label, and Average by Batch Size (#29707)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/29707

In D17885977, the linearizable label (a multi-class classification) was implemented in MTML. In this diff, we add several items for the linearizable label:

- Assigning different weights to each class through ```model_def.tasks[i].class_weights```.
  - This option is a dictionary whose keys are class indices and whose values are the weights for those classes.
  - For example, if a linearizable-label task has 4 classes and its ```class_weights = {"0": 1, "1": 0.1, "2": 0.1, "3": 0.01}```, then in the loss function of this task we assign weight 1 to the first class, weight 0.1 to the second and third classes, and weight 0.01 to the fourth class. The index/order of the classes follows the logic of the linearizable label.
  - Note that when you assign different weights to different classes, you need to correct the calibration by setting an appropriate ```model_def.tasks[i].calibration.linearizable_class_weight```. The class weights in calibration should be the reciprocals of the class weights in the loss function, so for the example above ```calibration.linearizable_class_weight = {"0": 1, "1": 10, "2": 10, "3": 100}```. A sketch of the averaging and calibration math follows this summary.
  - Example FBLearner job: f150763093
- Supporting ```model_def.allow_missing_label_with_zero_weight``` for the linearizable label, which ignores examples whose first label is missing by assigning them zero weight in the loss function.
  - Set ```allow_missing_label_with_zero_weight = true``` to enable it.
  - Example FBLearner job: f150763093
- Updating the caffe2 operator ```SoftmaxWithLoss``` to support averaging the loss by batch size.
  - Set ```model_def.tasks[i].loss.softmaxLoss.average_by_batch_size = true``` to enable it.
  - Previously, the loss was averaged by the sum of example weights in the batch; this remains the default behavior (when ```average_by_batch_size = null``` or ```average_by_batch_size = false```).
  - Without this feature, the calibration is incorrect when applying non-equal-weight training among the classes of a linearizable task.
  - Example FBLearner job with ```average_by_batch_size = true```, which results in a correct calibration: f150763093
  - Example FBLearner job with ```average_by_batch_size = null```, which results in an incorrect calibration: f150762990

Test Plan:
buck test caffe2/caffe2/fb/dper/layer_models/tests:mtml_test_2 -- test_linearizable_label_task_with_class_weights
buck test caffe2/caffe2/fb/dper/layer_models/tests:mtml_test_2 -- test_linearizable_label_task_with_zero_weight
buck test caffe2/caffe2/fb/dper/layer_models/tests:mtml_test_2 -- test_linearizable_label_task_average_by_batch_size

All tests passed.

Full canary: https://fburl.com/fblearner/troznfgh

Reviewed By: chenshouyuan

Differential Revision: D18461163

fbshipit-source-id: aaf3df031406ae94f74e2e365b57e47409ef0bfe
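To make the interplay of class weights, loss averaging, and calibration concrete, here is a minimal NumPy sketch. The toy data, variable names, and the softmax-cross-entropy formulation are illustrative assumptions; the actual behavior is defined by the DPER config fields quoted above and by the caffe2 ```SoftmaxWithLoss``` operator changed in this diff.

```python
import numpy as np

# Hypothetical toy batch for illustration: 4 examples, 4 classes.
logits = np.array([[2.0, 0.5, 0.1, 0.0],
                   [0.2, 1.5, 0.3, 0.1],
                   [0.1, 0.2, 1.8, 0.4],
                   [0.3, 0.1, 0.2, 1.2]])
labels = np.array([0, 1, 2, 3])
N = len(labels)

# Per-class loss weights, as in class_weights = {"0": 1, "1": 0.1, "2": 0.1, "3": 0.01}.
class_weights = np.array([1.0, 0.1, 0.1, 0.01])
example_weights = class_weights[labels]   # each example weighted by its label's class weight

# Softmax cross-entropy per example.
probs = np.exp(logits - logits.max(axis=1, keepdims=True))
probs /= probs.sum(axis=1, keepdims=True)
xent = -np.log(probs[np.arange(N), labels])
loss_sum = (example_weights * xent).sum()

avg_by_weight_sum = loss_sum / example_weights.sum()  # default: divide by sum of example weights
avg_by_batch_size = loss_sum / N                      # average_by_batch_size = true: divide by N

# Calibration correction: reciprocals of the training class weights,
# i.e. linearizable_class_weight = {"0": 1, "1": 10, "2": 10, "3": 100}.
calibration_class_weights = 1.0 / class_weights
```

When every class weight is 1, `example_weights.sum()` equals `N`, so the two averages coincide; the distinction only matters once non-equal class weights are introduced, which is exactly the case where the default weight-sum averaging breaks calibration.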
Committed by: Facebook Github Bot
Parent: b8dca04f73
Commit: ed788ec780
caffe2/operators/softmax_with_loss_op.cc

@@ -256,7 +256,11 @@ bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
     float* avg_loss_data = avg_loss->template mutable_data<float>();
     if (weight_sum != 0.0) {
-      avg_loss_data[0] = loss_sum * scale_ / weight_sum;
+      if (average_by_batch_size_) {
+        avg_loss_data[0] = loss_sum * scale_ / N;
+      } else {
+        avg_loss_data[0] = loss_sum * scale_ / weight_sum;
+      }
     } else {
       avg_loss_data[0] = 0.0;
     }
@@ -278,6 +282,7 @@ bool SoftmaxWithLossGradientOp<float, CPUContext>::RunOnDevice() {
   N = X.size_to_dim(canonical_axis); // batch size
   D = X.size_from_dim(canonical_axis);
   auto* dX = Output(0, X.sizes(), at::dtype<float>());
+  float avg_denominator;

   if (label_prob_mode_) {
     CAFFE_ENFORCE_GE(T.dim(), 2);
@@ -349,9 +354,14 @@ bool SoftmaxWithLossGradientOp<float, CPUContext>::RunOnDevice() {

     // Scale by d_avg_loss / N
     if (total_weight > 0) {
+      if (average_by_batch_size_) {
+        avg_denominator = N;
+      } else {
+        avg_denominator = total_weight;
+      }
       math::Scale<float, float, CPUContext>(
           dX->numel(),
-          scale_ / total_weight * d_avg_loss.data<float>()[0],
+          scale_ / avg_denominator * d_avg_loss.data<float>()[0],
           dX->data<float>(),
           dX_data,
           &context_);
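The forward and backward changes above have to pick the same denominator: if the forward pass computes `avg_loss = scale * loss_sum / denom`, the gradient with respect to the logits must also be scaled by `scale / denom`. The following NumPy sketch checks that relationship with a finite difference; the toy values and helper names are assumptions, not part of the operator.

```python
import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def avg_loss(logits, labels, weights, scale, average_by_batch_size):
    # avg_loss = scale * loss_sum / denom, mirroring the forward op above.
    p = softmax(logits)
    loss_sum = (weights * -np.log(p[np.arange(len(labels)), labels])).sum()
    denom = len(labels) if average_by_batch_size else weights.sum()
    return scale * loss_sum / denom

rng = np.random.default_rng(0)
logits = rng.normal(size=(3, 4))
labels = np.array([0, 2, 3])
weights = np.array([1.0, 0.1, 0.01])

for avg_by_n in (False, True):
    # Analytic gradient of the weighted softmax cross-entropy sum is
    # w_i * (softmax_i - onehot_i); dividing by the chosen denominator
    # corresponds to the scale_ / avg_denominator factor in math::Scale.
    p = softmax(logits)
    onehot = np.eye(4)[labels]
    denom = len(labels) if avg_by_n else weights.sum()
    analytic = weights[:, None] * (p - onehot) / denom

    # Finite-difference check of one logit entry.
    eps = 1e-5
    bumped = logits.copy()
    bumped[1, 2] += eps
    numeric = (avg_loss(bumped, labels, weights, 1.0, avg_by_n)
               - avg_loss(logits, labels, weights, 1.0, avg_by_n)) / eps
    assert np.isclose(numeric, analytic[1, 2], atol=1e-4)
```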
caffe2/operators/softmax_with_loss_op.h

@@ -17,6 +17,8 @@ class SoftmaxWithLossOp final : public Operator<Context> {
         scale_(this->template GetSingleArgument<float>("scale", 1.)),
         label_prob_mode_(
             this->template GetSingleArgument<int>("label_prob", 0)),
+        average_by_batch_size_(
+            this->template GetSingleArgument<int>("average_by_batch_size", 0)),
         order_(StringToStorageOrder(
             this->template GetSingleArgument<string>("order", "NCHW"))),
         axis_(this->template GetSingleArgument<int>("axis", 1)) {
@@ -31,6 +33,7 @@ class SoftmaxWithLossOp final : public Operator<Context> {
  protected:
   float scale_;
   int label_prob_mode_;
+  int average_by_batch_size_;
   StorageOrder order_;
   int axis_;

@@ -52,6 +55,8 @@ class SoftmaxWithLossGradientOp final : public Operator<Context> {
         scale_(this->template GetSingleArgument<float>("scale", 1.)),
         label_prob_mode_(
             this->template GetSingleArgument<int>("label_prob", 0)),
+        average_by_batch_size_(
+            this->template GetSingleArgument<int>("average_by_batch_size", 0)),
         order_(StringToStorageOrder(
             this->template GetSingleArgument<string>("order", "NCHW"))),
         only_loss_(this->template GetSingleArgument<bool>("only_loss", false)),
@@ -67,6 +72,7 @@ class SoftmaxWithLossGradientOp final : public Operator<Context> {
  protected:
   float scale_;
   int label_prob_mode_;
+  int average_by_batch_size_;
   // not used?
   Tensor sum_multiplier_{Context::GetDeviceType()};
   Tensor weights_; // unignored weights
caffe2/python/layers/batch_softmax_loss.py

@@ -19,6 +19,7 @@ class BatchSoftmaxLoss(ModelLayer):
         label_smoothing_matrix=None,
         label_prob=False,
         scale=1.0,
+        average_by_batch_size=False,
         **kwargs
     ):
         super(BatchSoftmaxLoss, self).__init__(
@@ -33,6 +34,7 @@ class BatchSoftmaxLoss(ModelLayer):
         )
         self.label_prob = label_prob
         self.scale = scale
+        self.average_by_batch_size = average_by_batch_size

         # label smoothing matrix: a K * K matrix where K is the label
         # cardinality; (i, j) element is the value of for label i
@@ -122,4 +124,5 @@ class BatchSoftmaxLoss(ModelLayer):
             self.output_schema.field_blobs(),
             label_prob=self.label_prob,
             scale=self.scale,
+            average_by_batch_size=self.average_by_batch_size,
         )
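The layer change above ultimately forwards ```average_by_batch_size``` into the ```SoftmaxWithLoss``` operator call. For reference, the new argument can also be exercised directly on the operator from Python; a minimal sketch, assuming a local caffe2 build and hypothetical blob names and toy values:

```python
import numpy as np
from caffe2.python import core, workspace

# Toy inputs: 4 examples, 3 classes, non-equal example weights.
workspace.FeedBlob("X", np.random.randn(4, 3).astype(np.float32))            # logits
workspace.FeedBlob("label", np.array([0, 1, 2, 1], dtype=np.int32))          # class ids
workspace.FeedBlob("weights", np.array([1.0, 0.1, 0.1, 0.01], dtype=np.float32))

# average_by_batch_size=1 divides the summed loss by N (here 4) instead of
# by the sum of example weights (here 1.21).
op = core.CreateOperator(
    "SoftmaxWithLoss",
    ["X", "label", "weights"],
    ["softmax", "avg_loss"],
    average_by_batch_size=1,
)
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("avg_loss"))
```

Since the argument defaults to 0 in both operator constructors, models that do not set it keep the existing weight-sum averaging behavior.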