mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[BE] [1/3] Rewrite super()
calls in caffe2 and benchmarks (#94587)
Rewrite Python built-in class `super()` calls. Only non-semantic changes should be applied. - #94587 - #94588 - #94592 Also, methods with only a `super()` call are removed: ```diff class MyModule(nn.Module): - def __init__(self): - super().__init__() - def forward(self, ...): ... ``` Some cases that change the semantics should be kept unchanged. E.g.:f152a79be9/caffe2/python/net_printer.py (L184-L190)
f152a79be9/test/test_jit_fuser_te.py (L2628-L2635)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/94587 Approved by: https://github.com/ezyang
This commit is contained in:
committed by
PyTorch MergeBot
parent
aa6f0ace2f
commit
8d45f555d7
@ -57,7 +57,7 @@ WINDOWS_LIBTORCH_CONFIG_VARIANTS = [
|
||||
|
||||
class TopLevelNode(ConfigNode):
|
||||
def __init__(self, node_name, config_tree_data, smoke):
|
||||
super(TopLevelNode, self).__init__(None, node_name)
|
||||
super().__init__(None, node_name)
|
||||
|
||||
self.config_tree_data = config_tree_data
|
||||
self.props["smoke"] = smoke
|
||||
@ -68,7 +68,7 @@ class TopLevelNode(ConfigNode):
|
||||
|
||||
class OSConfigNode(ConfigNode):
|
||||
def __init__(self, parent, os_name, gpu_versions, py_tree):
|
||||
super(OSConfigNode, self).__init__(parent, os_name)
|
||||
super().__init__(parent, os_name)
|
||||
|
||||
self.py_tree = py_tree
|
||||
self.props["os_name"] = os_name
|
||||
@ -80,7 +80,7 @@ class OSConfigNode(ConfigNode):
|
||||
|
||||
class PackageFormatConfigNode(ConfigNode):
|
||||
def __init__(self, parent, package_format, python_versions):
|
||||
super(PackageFormatConfigNode, self).__init__(parent, package_format)
|
||||
super().__init__(parent, package_format)
|
||||
|
||||
self.props["python_versions"] = python_versions
|
||||
self.props["package_format"] = package_format
|
||||
@ -97,7 +97,7 @@ class PackageFormatConfigNode(ConfigNode):
|
||||
|
||||
class LinuxGccConfigNode(ConfigNode):
|
||||
def __init__(self, parent, gcc_config_variant):
|
||||
super(LinuxGccConfigNode, self).__init__(parent, "GCC_CONFIG_VARIANT=" + str(gcc_config_variant))
|
||||
super().__init__(parent, "GCC_CONFIG_VARIANT=" + str(gcc_config_variant))
|
||||
|
||||
self.props["gcc_config_variant"] = gcc_config_variant
|
||||
|
||||
@ -122,7 +122,7 @@ class LinuxGccConfigNode(ConfigNode):
|
||||
|
||||
class WindowsLibtorchConfigNode(ConfigNode):
|
||||
def __init__(self, parent, libtorch_config_variant):
|
||||
super(WindowsLibtorchConfigNode, self).__init__(parent, "LIBTORCH_CONFIG_VARIANT=" + str(libtorch_config_variant))
|
||||
super().__init__(parent, "LIBTORCH_CONFIG_VARIANT=" + str(libtorch_config_variant))
|
||||
|
||||
self.props["libtorch_config_variant"] = libtorch_config_variant
|
||||
|
||||
@ -132,7 +132,7 @@ class WindowsLibtorchConfigNode(ConfigNode):
|
||||
|
||||
class ArchConfigNode(ConfigNode):
|
||||
def __init__(self, parent, gpu):
|
||||
super(ArchConfigNode, self).__init__(parent, get_processor_arch_name(gpu))
|
||||
super().__init__(parent, get_processor_arch_name(gpu))
|
||||
|
||||
self.props["gpu"] = gpu
|
||||
|
||||
@ -142,7 +142,7 @@ class ArchConfigNode(ConfigNode):
|
||||
|
||||
class PyVersionConfigNode(ConfigNode):
|
||||
def __init__(self, parent, pyver):
|
||||
super(PyVersionConfigNode, self).__init__(parent, pyver)
|
||||
super().__init__(parent, pyver)
|
||||
|
||||
self.props["pyver"] = pyver
|
||||
|
||||
@ -158,7 +158,7 @@ class PyVersionConfigNode(ConfigNode):
|
||||
|
||||
class LinkingVariantConfigNode(ConfigNode):
|
||||
def __init__(self, parent, linking_variant):
|
||||
super(LinkingVariantConfigNode, self).__init__(parent, linking_variant)
|
||||
super().__init__(parent, linking_variant)
|
||||
|
||||
def get_children(self):
|
||||
return [DependencyInclusionConfigNode(self, v) for v in DEPS_INCLUSION_DIMENSIONS]
|
||||
@ -166,6 +166,6 @@ class LinkingVariantConfigNode(ConfigNode):
|
||||
|
||||
class DependencyInclusionConfigNode(ConfigNode):
|
||||
def __init__(self, parent, deps_variant):
|
||||
super(DependencyInclusionConfigNode, self).__init__(parent, deps_variant)
|
||||
super().__init__(parent, deps_variant)
|
||||
|
||||
self.props["libtorch_variant"] = "-".join([self.parent.get_label(), self.get_label()])
|
||||
|
@ -12,7 +12,7 @@ def get_major_pyver(dotted_version):
|
||||
|
||||
class TreeConfigNode(ConfigNode):
|
||||
def __init__(self, parent, node_name, subtree):
|
||||
super(TreeConfigNode, self).__init__(parent, self.modify_label(node_name))
|
||||
super().__init__(parent, self.modify_label(node_name))
|
||||
self.subtree = subtree
|
||||
self.init2(node_name)
|
||||
|
||||
@ -28,7 +28,7 @@ class TreeConfigNode(ConfigNode):
|
||||
|
||||
class TopLevelNode(TreeConfigNode):
|
||||
def __init__(self, node_name, subtree):
|
||||
super(TopLevelNode, self).__init__(None, node_name, subtree)
|
||||
super().__init__(None, node_name, subtree)
|
||||
|
||||
# noinspection PyMethodMayBeStatic
|
||||
def child_constructor(self):
|
||||
|
@ -15,9 +15,6 @@ def scriptAndSave(module, fileName):
|
||||
print('=' * 80)
|
||||
|
||||
class Test(torch.jit.ScriptModule):
|
||||
def __init__(self):
|
||||
super(Test, self).__init__()
|
||||
|
||||
@torch.jit.script_method
|
||||
def forward(self, input):
|
||||
return None
|
||||
|
@ -173,7 +173,7 @@ class Benchmark:
|
||||
|
||||
class TorchvisionBenchmark(Benchmark):
|
||||
def __init__(self, device, distributed_backend, bucket_size, model):
|
||||
super(TorchvisionBenchmark, self).__init__(
|
||||
super().__init__(
|
||||
device,
|
||||
distributed_backend,
|
||||
bucket_size,
|
||||
|
@ -43,7 +43,7 @@ class EmbeddingLayer(nn.Embedding):
|
||||
|
||||
class PositionalEncodingLayer(nn.Module):
|
||||
def __init__(self, d_model, dropout=0.1, max_len=5000):
|
||||
super(PositionalEncodingLayer, self).__init__()
|
||||
super().__init__()
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
|
||||
pe = torch.zeros(max_len, d_model)
|
||||
@ -99,7 +99,7 @@ class TransformerLMSequential(nn.Sequential):
|
||||
layers.append(TransformerDecoderLayer(ninp, nhead, nhid, dropout))
|
||||
|
||||
layers.append(LinearLayer(ninp, ntokens, initrange))
|
||||
super(TransformerLMSequential, self).__init__(*layers)
|
||||
super().__init__(*layers)
|
||||
|
||||
|
||||
def make_model(args, device, ntokens):
|
||||
|
@ -22,7 +22,7 @@ class Policy(nn.Module):
|
||||
nlayers (int): Number of layers in the model
|
||||
out_features (int): Number of features the model outputs
|
||||
"""
|
||||
super(Policy, self).__init__()
|
||||
super().__init__()
|
||||
|
||||
self.model = nn.Sequential(
|
||||
nn.Flatten(1, -1),
|
||||
|
@ -38,7 +38,7 @@ def cleanup():
|
||||
|
||||
class CustomLinear(torch.nn.Module):
|
||||
def __init__(self, a, b):
|
||||
super(CustomLinear, self).__init__()
|
||||
super().__init__()
|
||||
self.weight = nn.Parameter(torch.randn(a, b))
|
||||
|
||||
def forward(self, x):
|
||||
@ -47,7 +47,7 @@ class CustomLinear(torch.nn.Module):
|
||||
|
||||
class MyModule(torch.nn.Module):
|
||||
def __init__(self, a, b):
|
||||
super(MyModule, self).__init__()
|
||||
super().__init__()
|
||||
self.net = nn.Sequential(
|
||||
nn.Linear(a, b),
|
||||
nn.ReLU(),
|
||||
@ -59,7 +59,7 @@ class MyModule(torch.nn.Module):
|
||||
|
||||
class ToyModel(nn.Module):
|
||||
def __init__(self):
|
||||
super(ToyModel, self).__init__()
|
||||
super().__init__()
|
||||
self.net = nn.Sequential(
|
||||
*[nn.Linear(10, 10000), nn.ReLU()]
|
||||
+ [nn.Linear(10000, 10000), nn.ReLU()]
|
||||
|
@ -364,7 +364,7 @@ EXTRA_MODELS = {
|
||||
|
||||
class HuggingfaceRunner(BenchmarkRunner):
|
||||
def __init__(self):
|
||||
super(HuggingfaceRunner, self).__init__()
|
||||
super().__init__()
|
||||
self.suite_name = "huggingface"
|
||||
|
||||
def load_model(
|
||||
|
@ -169,7 +169,7 @@ def refresh_model_names():
|
||||
|
||||
class TimmRunnner(BenchmarkRunner):
|
||||
def __init__(self):
|
||||
super(TimmRunnner, self).__init__()
|
||||
super().__init__()
|
||||
self.suite_name = "timm_models"
|
||||
|
||||
def load_model(
|
||||
|
@ -195,7 +195,7 @@ MAX_BATCH_SIZE_FOR_ACCURACY_CHECK = {
|
||||
|
||||
class TorchBenchmarkRunner(BenchmarkRunner):
|
||||
def __init__(self):
|
||||
super(TorchBenchmarkRunner, self).__init__()
|
||||
super().__init__()
|
||||
self.suite_name = "torchbench"
|
||||
self.optimizer = None
|
||||
|
||||
|
@ -92,7 +92,7 @@ def reverse(lst: List[Tensor]) -> List[Tensor]:
|
||||
|
||||
class LSTMCell(jit.ScriptModule):
|
||||
def __init__(self, input_size, hidden_size):
|
||||
super(LSTMCell, self).__init__()
|
||||
super().__init__()
|
||||
self.input_size = input_size
|
||||
self.hidden_size = hidden_size
|
||||
self.weight_ih = Parameter(torch.randn(4 * hidden_size, input_size))
|
||||
@ -120,7 +120,7 @@ class LSTMCell(jit.ScriptModule):
|
||||
|
||||
class LayerNorm(jit.ScriptModule):
|
||||
def __init__(self, normalized_shape):
|
||||
super(LayerNorm, self).__init__()
|
||||
super().__init__()
|
||||
if isinstance(normalized_shape, numbers.Integral):
|
||||
normalized_shape = (normalized_shape,)
|
||||
normalized_shape = torch.Size(normalized_shape)
|
||||
@ -146,7 +146,7 @@ class LayerNorm(jit.ScriptModule):
|
||||
|
||||
class LayerNormLSTMCell(jit.ScriptModule):
|
||||
def __init__(self, input_size, hidden_size, decompose_layernorm=False):
|
||||
super(LayerNormLSTMCell, self).__init__()
|
||||
super().__init__()
|
||||
self.input_size = input_size
|
||||
self.hidden_size = hidden_size
|
||||
self.weight_ih = Parameter(torch.randn(4 * hidden_size, input_size))
|
||||
@ -183,7 +183,7 @@ class LayerNormLSTMCell(jit.ScriptModule):
|
||||
|
||||
class LSTMLayer(jit.ScriptModule):
|
||||
def __init__(self, cell, *cell_args):
|
||||
super(LSTMLayer, self).__init__()
|
||||
super().__init__()
|
||||
self.cell = cell(*cell_args)
|
||||
|
||||
@jit.script_method
|
||||
@ -198,7 +198,7 @@ class LSTMLayer(jit.ScriptModule):
|
||||
|
||||
class ReverseLSTMLayer(jit.ScriptModule):
|
||||
def __init__(self, cell, *cell_args):
|
||||
super(ReverseLSTMLayer, self).__init__()
|
||||
super().__init__()
|
||||
self.cell = cell(*cell_args)
|
||||
|
||||
@jit.script_method
|
||||
@ -215,7 +215,7 @@ class BidirLSTMLayer(jit.ScriptModule):
|
||||
__constants__ = ['directions']
|
||||
|
||||
def __init__(self, cell, *cell_args):
|
||||
super(BidirLSTMLayer, self).__init__()
|
||||
super().__init__()
|
||||
self.directions = nn.ModuleList([
|
||||
LSTMLayer(cell, *cell_args),
|
||||
ReverseLSTMLayer(cell, *cell_args),
|
||||
@ -247,7 +247,7 @@ class StackedLSTM(jit.ScriptModule):
|
||||
__constants__ = ['layers'] # Necessary for iterating through self.layers
|
||||
|
||||
def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
|
||||
super(StackedLSTM, self).__init__()
|
||||
super().__init__()
|
||||
self.layers = init_stacked_lstm(num_layers, layer, first_layer_args,
|
||||
other_layer_args)
|
||||
|
||||
@ -274,7 +274,7 @@ class StackedLSTM2(jit.ScriptModule):
|
||||
__constants__ = ['layers'] # Necessary for iterating through self.layers
|
||||
|
||||
def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
|
||||
super(StackedLSTM2, self).__init__()
|
||||
super().__init__()
|
||||
self.layers = init_stacked_lstm(num_layers, layer, first_layer_args,
|
||||
other_layer_args)
|
||||
|
||||
@ -299,7 +299,7 @@ class StackedLSTMWithDropout(jit.ScriptModule):
|
||||
__constants__ = ['layers', 'num_layers']
|
||||
|
||||
def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
|
||||
super(StackedLSTMWithDropout, self).__init__()
|
||||
super().__init__()
|
||||
self.layers = init_stacked_lstm(num_layers, layer, first_layer_args,
|
||||
other_layer_args)
|
||||
# Introduces a Dropout layer on the outputs of each LSTM layer except
|
||||
|
@ -9,7 +9,7 @@ def add_tensors_loop(x, y):
|
||||
|
||||
class SimpleAddModule(torch.nn.Module):
|
||||
def __init__(self, add_op):
|
||||
super(SimpleAddModule, self).__init__()
|
||||
super().__init__()
|
||||
self.add_op = add_op
|
||||
|
||||
def forward(self, x, y):
|
||||
|
@ -27,7 +27,7 @@ class Wav2Letter(nn.Module):
|
||||
def __init__(self, num_classes: int = 40,
|
||||
input_type: str = "waveform",
|
||||
num_features: int = 1) -> None:
|
||||
super(Wav2Letter, self).__init__()
|
||||
super().__init__()
|
||||
|
||||
acoustic_num_features = 250 if input_type == "waveform" else num_features
|
||||
acoustic_model = nn.Sequential(
|
||||
@ -85,7 +85,7 @@ class SequenceWise(nn.Module):
|
||||
Allows handling of variable sequence lengths and minibatch sizes.
|
||||
:param module: Module to apply input to.
|
||||
"""
|
||||
super(SequenceWise, self).__init__()
|
||||
super().__init__()
|
||||
self.module = module
|
||||
|
||||
def forward(self, x):
|
||||
@ -110,7 +110,7 @@ class MaskConv(nn.Module):
|
||||
Input needs to be in the shape of (BxCxDxT)
|
||||
:param seq_module: The sequential module containing the conv stack.
|
||||
"""
|
||||
super(MaskConv, self).__init__()
|
||||
super().__init__()
|
||||
self.seq_module = seq_module
|
||||
|
||||
def forward(self, x, lengths):
|
||||
@ -142,7 +142,7 @@ class InferenceBatchSoftmax(nn.Module):
|
||||
|
||||
class BatchRNN(nn.Module):
|
||||
def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=False, batch_norm=True):
|
||||
super(BatchRNN, self).__init__()
|
||||
super().__init__()
|
||||
self.input_size = input_size
|
||||
self.hidden_size = hidden_size
|
||||
self.bidirectional = bidirectional
|
||||
@ -170,7 +170,7 @@ class Lookahead(nn.Module):
|
||||
# input shape - sequence, batch, feature - TxNxH
|
||||
# output shape - same as input
|
||||
def __init__(self, n_features, context):
|
||||
super(Lookahead, self).__init__()
|
||||
super().__init__()
|
||||
assert context > 0
|
||||
self.context = context
|
||||
self.n_features = n_features
|
||||
@ -193,7 +193,7 @@ class Lookahead(nn.Module):
|
||||
class DeepSpeech(nn.Module):
|
||||
def __init__(self, rnn_type, labels, rnn_hidden_size, nb_layers, audio_conf,
|
||||
bidirectional, context=20):
|
||||
super(DeepSpeech, self).__init__()
|
||||
super().__init__()
|
||||
|
||||
self.hidden_size = rnn_hidden_size
|
||||
self.hidden_layers = nb_layers
|
||||
@ -298,7 +298,7 @@ class PositionalEncoding(nn.Module):
|
||||
"""
|
||||
|
||||
def __init__(self, d_model, dropout=0.1, max_len=5000):
|
||||
super(PositionalEncoding, self).__init__()
|
||||
super().__init__()
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
|
||||
pe = torch.zeros(max_len, d_model)
|
||||
@ -327,7 +327,7 @@ class TransformerModel(nn.Module):
|
||||
"""Container module with an encoder, a recurrent or transformer module, and a decoder."""
|
||||
|
||||
def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
|
||||
super(TransformerModel, self).__init__()
|
||||
super().__init__()
|
||||
try:
|
||||
from torch.nn import TransformerEncoder, TransformerEncoderLayer
|
||||
except Exception as e:
|
||||
@ -392,7 +392,7 @@ class MultiheadAttentionContainer(torch.nn.Module):
|
||||
>>> print(attn_output.shape)
|
||||
>>> torch.Size([21, 64, 10])
|
||||
"""
|
||||
super(MultiheadAttentionContainer, self).__init__()
|
||||
super().__init__()
|
||||
self.nhead = nhead
|
||||
self.in_proj_container = in_proj_container
|
||||
self.attention_layer = attention_layer
|
||||
@ -456,7 +456,7 @@ class ScaledDotProduct(torch.nn.Module):
|
||||
>>> print(attn_output.shape, attn_weights.shape)
|
||||
torch.Size([256, 21, 3]) torch.Size([256, 21, 21])
|
||||
"""
|
||||
super(ScaledDotProduct, self).__init__()
|
||||
super().__init__()
|
||||
self.dropout = dropout
|
||||
|
||||
def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
|
||||
@ -532,7 +532,7 @@ class InProjContainer(torch.nn.Module):
|
||||
value_proj: a proj layer for value.
|
||||
"""
|
||||
|
||||
super(InProjContainer, self).__init__()
|
||||
super().__init__()
|
||||
self.query_proj = query_proj
|
||||
self.key_proj = key_proj
|
||||
self.value_proj = value_proj
|
||||
|
@ -29,7 +29,7 @@ class BasicBlock(nn.Module):
|
||||
|
||||
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
|
||||
base_width=64, dilation=1, norm_layer=None):
|
||||
super(BasicBlock, self).__init__()
|
||||
super().__init__()
|
||||
if norm_layer is None:
|
||||
norm_layer = nn.BatchNorm2d
|
||||
if groups != 1 or base_width != 64:
|
||||
@ -74,7 +74,7 @@ class Bottleneck(nn.Module):
|
||||
|
||||
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
|
||||
base_width=64, dilation=1, norm_layer=None):
|
||||
super(Bottleneck, self).__init__()
|
||||
super().__init__()
|
||||
if norm_layer is None:
|
||||
norm_layer = nn.BatchNorm2d
|
||||
width = int(planes * (base_width / 64.)) * groups
|
||||
@ -116,7 +116,7 @@ class ResNet(nn.Module):
|
||||
def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
|
||||
groups=1, width_per_group=64, replace_stride_with_dilation=None,
|
||||
norm_layer=None):
|
||||
super(ResNet, self).__init__()
|
||||
super().__init__()
|
||||
if norm_layer is None:
|
||||
norm_layer = nn.BatchNorm2d
|
||||
self._norm_layer = norm_layer
|
||||
@ -281,7 +281,7 @@ class IntermediateLayerGetter(nn.ModuleDict):
|
||||
if not return_layers:
|
||||
break
|
||||
|
||||
super(IntermediateLayerGetter, self).__init__(layers)
|
||||
super().__init__(layers)
|
||||
self.return_layers = orig_return_layers
|
||||
|
||||
def forward(self, x):
|
||||
@ -297,7 +297,7 @@ class _SimpleSegmentationModel(nn.Module):
|
||||
__constants__ = ['aux_classifier']
|
||||
|
||||
def __init__(self, backbone, classifier, aux_classifier=None):
|
||||
super(_SimpleSegmentationModel, self).__init__()
|
||||
super().__init__()
|
||||
self.backbone = backbone
|
||||
self.classifier = classifier
|
||||
self.aux_classifier = aux_classifier
|
||||
@ -346,7 +346,7 @@ class FCNHead(nn.Sequential):
|
||||
nn.Conv2d(inter_channels, channels, 1)
|
||||
]
|
||||
|
||||
super(FCNHead, self).__init__(*layers)
|
||||
super().__init__(*layers)
|
||||
|
||||
def _segm_resnet(name, backbone_name, num_classes, aux, pretrained_backbone=True):
|
||||
# backbone = resnet.__dict__[backbone_name](
|
||||
|
@ -18,7 +18,7 @@ class TorchBenchmarkBase(torch.nn.Module):
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super(TorchBenchmarkBase, self).__init__()
|
||||
super().__init__()
|
||||
self.user_given_name = None
|
||||
self._pass_count = 0
|
||||
self._num_inputs_require_grads = 0
|
||||
|
@ -46,7 +46,7 @@ class _QFunctionalBinaryArithmeticBenchmarkBase(op_bench.TorchBenchmarkBase):
|
||||
|
||||
class QFunctionalBenchmark(_QFunctionalBinaryArithmeticBenchmarkBase):
|
||||
def init(self, N, dtype, contig, op_func):
|
||||
super(QFunctionalBenchmark, self).setup(N, dtype, contig)
|
||||
super().setup(N, dtype, contig)
|
||||
self.inputs = {
|
||||
"q_input_a": self.q_input_a,
|
||||
"q_input_b": self.q_input_a,
|
||||
@ -66,7 +66,7 @@ op_bench.generate_pt_tests_from_op_list(qarithmetic_binary_ops,
|
||||
|
||||
class QFunctionalScalarBenchmark(_QFunctionalBinaryArithmeticBenchmarkBase):
|
||||
def init(self, N, dtype, contig, op_func):
|
||||
super(QFunctionalScalarBenchmark, self).setup(N, dtype, contig)
|
||||
super().setup(N, dtype, contig)
|
||||
self.inputs = {
|
||||
"q_input": self.q_input_a,
|
||||
"scalar_input": 42
|
||||
|
@ -41,7 +41,7 @@ class QConv1dBenchmark(op_bench.TorchBenchmarkBase):
|
||||
class QConv2dBenchmark(op_bench.TorchBenchmarkBase):
|
||||
# def init(self, N, IC, OC, H, W, G, kernel, stride, pad):
|
||||
def init(self, IC, OC, kernel, stride, N, H, W, G, pad, device):
|
||||
# super(QConv2dBenchmark, self).init(N, IC, OC, (H, W), G, (kernel, kernel), stride, pad)
|
||||
# super().init(N, IC, OC, (H, W), G, (kernel, kernel), stride, pad)
|
||||
|
||||
self.scale = 1.0 / 255
|
||||
self.zero_point = 0
|
||||
|
@ -32,7 +32,7 @@ class _QLinearBenchmarkBase(op_bench.TorchBenchmarkBase):
|
||||
|
||||
class QLinearBenchmark(_QLinearBenchmarkBase):
|
||||
def init(self, N, IN, OUT, device):
|
||||
super(QLinearBenchmark, self).init(N, IN, OUT, nnq.Linear(IN, OUT))
|
||||
super().init(N, IN, OUT, nnq.Linear(IN, OUT))
|
||||
self.inputs = {
|
||||
"input": self.qX
|
||||
}
|
||||
@ -41,7 +41,7 @@ class QLinearBenchmark(_QLinearBenchmarkBase):
|
||||
|
||||
class QDynamicLinearBenchmark(_QLinearBenchmarkBase):
|
||||
def init(self, N, IN, OUT, device):
|
||||
super(QDynamicLinearBenchmark, self).init(N, IN, OUT, nnqd.Linear(IN, OUT))
|
||||
super().init(N, IN, OUT, nnqd.Linear(IN, OUT))
|
||||
self.inputs = {
|
||||
"input": self.X
|
||||
}
|
||||
|
@ -101,22 +101,20 @@ class QMaxPool2dBenchmark(_QPool2dBenchmarkBase):
|
||||
self.pool_op = torch.nn.MaxPool2d(kernel_size=k, stride=s, padding=p,
|
||||
dilation=(1, 1), ceil_mode=False,
|
||||
return_indices=False)
|
||||
super(QMaxPool2dBenchmark, self).setup(N, C, H, W, dtype, contig)
|
||||
super().setup(N, C, H, W, dtype, contig)
|
||||
|
||||
|
||||
class QAvgPool2dBenchmark(_QPool2dBenchmarkBase):
|
||||
def init(self, N, C, H, W, k, s, p, contig, dtype):
|
||||
self.pool_op = torch.nn.AvgPool2d(kernel_size=k, stride=s, padding=p,
|
||||
ceil_mode=False)
|
||||
super(QAvgPool2dBenchmark, self).setup(N, C, H, W, dtype, contig)
|
||||
super().setup(N, C, H, W, dtype, contig)
|
||||
|
||||
|
||||
class QAdaptiveAvgPool2dBenchmark(_QPool2dBenchmarkBase):
|
||||
def init(self, N, C, input_size, output_size, contig, dtype):
|
||||
self.pool_op = torch.nn.AdaptiveAvgPool2d(output_size=output_size)
|
||||
super(QAdaptiveAvgPool2dBenchmark, self).setup(N, C, *input_size,
|
||||
dtype=dtype,
|
||||
contig=contig)
|
||||
super().setup(N, C, *input_size, dtype=dtype, contig=contig)
|
||||
|
||||
|
||||
op_bench.generate_pt_test(qadaptive_avgpool2d_short_configs + qadaptive_avgpool2d_long_configs,
|
||||
|
@ -69,7 +69,7 @@ class BroadcastMulBench(benchmark.Benchmark):
|
||||
|
||||
class BroadcastRowBench(BroadcastMulBench):
|
||||
def __init__(self, mode, device, dtype, M, N, K):
|
||||
super(BroadcastRowBench, self).__init__(mode, device, dtype, "row", M, N, K)
|
||||
super().__init__(mode, device, dtype, "row", M, N, K)
|
||||
|
||||
@staticmethod
|
||||
def module():
|
||||
@ -78,7 +78,7 @@ class BroadcastRowBench(BroadcastMulBench):
|
||||
|
||||
class BroadcastMidBench(BroadcastMulBench):
|
||||
def __init__(self, mode, device, dtype, M, N, K):
|
||||
super(BroadcastMidBench, self).__init__(mode, device, dtype, "mid", M, N, K)
|
||||
super().__init__(mode, device, dtype, "mid", M, N, K)
|
||||
|
||||
@staticmethod
|
||||
def module():
|
||||
@ -87,7 +87,7 @@ class BroadcastMidBench(BroadcastMulBench):
|
||||
|
||||
class BroadcastColBench(BroadcastMulBench):
|
||||
def __init__(self, mode, device, dtype, M, N, K):
|
||||
super(BroadcastColBench, self).__init__(mode, device, dtype, "col", M, N, K)
|
||||
super().__init__(mode, device, dtype, "col", M, N, K)
|
||||
|
||||
@staticmethod
|
||||
def module():
|
||||
|
@ -80,7 +80,7 @@ class ReduceBench(benchmark.Benchmark):
|
||||
|
||||
class ReduceRowBench(ReduceBench):
|
||||
def __init__(self, mode, device, dtype, M, N, K, skip_input_transform):
|
||||
super(ReduceRowBench, self).__init__(mode, device, dtype, "row", M, N, K, skip_input_transform)
|
||||
super().__init__(mode, device, dtype, "row", M, N, K, skip_input_transform)
|
||||
|
||||
@staticmethod
|
||||
def module():
|
||||
@ -89,7 +89,7 @@ class ReduceRowBench(ReduceBench):
|
||||
|
||||
class ReduceMidBench(ReduceBench):
|
||||
def __init__(self, mode, device, dtype, M, N, K, skip_input_transform):
|
||||
super(ReduceMidBench, self).__init__(mode, device, dtype, "mid", M, N, K, skip_input_transform)
|
||||
super().__init__(mode, device, dtype, "mid", M, N, K, skip_input_transform)
|
||||
|
||||
@staticmethod
|
||||
def module():
|
||||
@ -98,7 +98,7 @@ class ReduceMidBench(ReduceBench):
|
||||
|
||||
class ReduceColBench(ReduceBench):
|
||||
def __init__(self, mode, device, dtype, M, N, K, skip_input_transform):
|
||||
super(ReduceColBench, self).__init__(mode, device, dtype, "col", M, N, K, skip_input_transform)
|
||||
super().__init__(mode, device, dtype, "col", M, N, K, skip_input_transform)
|
||||
|
||||
@staticmethod
|
||||
def module():
|
||||
@ -107,7 +107,7 @@ class ReduceColBench(ReduceBench):
|
||||
|
||||
class ReduceFullBench(ReduceBench):
|
||||
def __init__(self, mode, device, dtype, M, skip_input_transform):
|
||||
super(ReduceFullBench, self).__init__(mode, device, dtype, "full", M, 1, 1, skip_input_transform)
|
||||
super().__init__(mode, device, dtype, "full", M, 1, 1, skip_input_transform)
|
||||
|
||||
def config(self):
|
||||
return [self.M * self.N * self.K, self._skip_input_transform_str()]
|
||||
@ -178,7 +178,7 @@ class Reduce2DBench(benchmark.Benchmark):
|
||||
|
||||
class Reduce2DInnerBench(Reduce2DBench):
|
||||
def __init__(self, mode, device, dtype, dim0, dim1):
|
||||
super(Reduce2DInnerBench, self).__init__(mode, device, dtype, 1, dim0, dim1)
|
||||
super().__init__(mode, device, dtype, 1, dim0, dim1)
|
||||
|
||||
@staticmethod
|
||||
def default_configs():
|
||||
@ -186,7 +186,7 @@ class Reduce2DInnerBench(Reduce2DBench):
|
||||
return [parent_config[1:]]
|
||||
|
||||
def config(self):
|
||||
parent_config = super(Reduce2DInnerBench, self).config()
|
||||
parent_config = super().config()
|
||||
return parent_config[1:]
|
||||
|
||||
@staticmethod
|
||||
@ -195,7 +195,7 @@ class Reduce2DInnerBench(Reduce2DBench):
|
||||
|
||||
class Reduce2DOuterBench(Reduce2DBench):
|
||||
def __init__(self, mode, device, dtype, dim0, dim1):
|
||||
super(Reduce2DOuterBench, self).__init__(mode, device, dtype, 0, dim0, dim1)
|
||||
super().__init__(mode, device, dtype, 0, dim0, dim1)
|
||||
|
||||
@staticmethod
|
||||
def default_configs():
|
||||
@ -203,7 +203,7 @@ class Reduce2DOuterBench(Reduce2DBench):
|
||||
return [parent_config[1:]]
|
||||
|
||||
def config(self):
|
||||
parent_config = super(Reduce2DOuterBench, self).config()
|
||||
parent_config = super().config()
|
||||
return parent_config[1:]
|
||||
|
||||
@staticmethod
|
||||
@ -249,7 +249,7 @@ class DynamicReduce2DInnerBench(DynamicReduce2DBench):
|
||||
return [parent_config[1:]]
|
||||
|
||||
def config(self):
|
||||
parent_config = super(DynamicReduce2DInnerBench, self).config()
|
||||
parent_config = super().config()
|
||||
return parent_config[1:]
|
||||
|
||||
@staticmethod
|
||||
@ -267,7 +267,7 @@ class DynamicReduce2DOuterBench(DynamicReduce2DBench):
|
||||
return [parent_config[1:]]
|
||||
|
||||
def config(self):
|
||||
parent_config = super(DynamicReduce2DInnerBench, self).config()
|
||||
parent_config = super().config()
|
||||
return parent_config[1:]
|
||||
|
||||
@staticmethod
|
||||
|
@ -21,7 +21,7 @@ class TestFileStoreHandlerOp(TestCase):
|
||||
testCounter = 0
|
||||
|
||||
def setUp(self):
|
||||
super(TestFileStoreHandlerOp, self).setUp()
|
||||
super().setUp()
|
||||
self.tmpdir = tempfile.mkdtemp()
|
||||
|
||||
# Use counter to tell test cases apart
|
||||
@ -29,7 +29,7 @@ class TestFileStoreHandlerOp(TestCase):
|
||||
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.tmpdir)
|
||||
super(TestFileStoreHandlerOp, self).tearDown()
|
||||
super().tearDown()
|
||||
|
||||
def create_store_handler(self):
|
||||
# Use new path for every test so they are isolated
|
||||
|
@ -17,12 +17,9 @@ dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:store_ops")
|
||||
|
||||
class TestRedisStoreHandlerOp(TestCase):
|
||||
def setUp(self):
|
||||
super(TestRedisStoreHandlerOp, self).setUp()
|
||||
super().setUp()
|
||||
self.uuid = str(uuid.uuid4()) + "/"
|
||||
|
||||
def tearDown(self):
|
||||
super(TestRedisStoreHandlerOp, self).tearDown()
|
||||
|
||||
def create_store_handler(self):
|
||||
store_handler = "store_handler"
|
||||
workspace.RunOperatorOnce(
|
||||
|
@ -71,7 +71,7 @@ class CachedReader(DBFileReader):
|
||||
assert original_reader is not None, "original_reader can't be None"
|
||||
self.original_reader = original_reader
|
||||
|
||||
super(CachedReader, self).__init__(
|
||||
super().__init__(
|
||||
db_path,
|
||||
db_type,
|
||||
name,
|
||||
|
@ -96,13 +96,13 @@ class Job(context.Managed):
|
||||
self.exit_group = session_class.compile(self.exit_group)
|
||||
|
||||
def __enter__(self):
|
||||
super(Job, self).__enter__()
|
||||
super().__enter__()
|
||||
self.epoch_group.__enter__()
|
||||
return self
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.epoch_group.__exit__()
|
||||
super(Job, self).__exit__(*args)
|
||||
super().__exit__(*args)
|
||||
|
||||
def add_stop_condition(self, output):
|
||||
if isinstance(output, core.BlobReference):
|
||||
|
@ -36,7 +36,7 @@ class CNNModelHelper(ModelHelper):
|
||||
}
|
||||
if ws_nbytes_limit:
|
||||
cnn_arg_scope['ws_nbytes_limit'] = ws_nbytes_limit
|
||||
super(CNNModelHelper, self).__init__(
|
||||
super().__init__(
|
||||
skip_sparse_optim=skip_sparse_optim,
|
||||
name="CNN" if name is None else name,
|
||||
init_params=init_params,
|
||||
|
@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class TestControl(test_util.TestCase):
|
||||
def setUp(self):
|
||||
super(TestControl, self).setUp()
|
||||
super().setUp()
|
||||
self.N_ = 10
|
||||
|
||||
self.init_net_ = core.Net("init-net")
|
||||
|
@ -424,7 +424,7 @@ class ReaderWithLimit(ReaderWithLimitBase):
|
||||
produces a data_finished blob as a side effect to indicate
|
||||
whether the input stream is exhausted.
|
||||
"""
|
||||
super(ReaderWithLimit, self).__init__(reader)
|
||||
super().__init__(reader)
|
||||
self.counter = None
|
||||
self.num_iter = num_iter
|
||||
if self.num_iter is not None:
|
||||
@ -466,7 +466,7 @@ class ReaderWithTimeLimit(ReaderWithLimitBase):
|
||||
produces a data_finished blob as a side effect to indicate
|
||||
whether the input stream is exhausted.
|
||||
"""
|
||||
super(ReaderWithTimeLimit, self).__init__(reader)
|
||||
super().__init__(reader)
|
||||
|
||||
self.timer = None
|
||||
self.duration = duration
|
||||
@ -528,7 +528,7 @@ class CompositeReader(Reader):
|
||||
readers: list[Reader] Reader instances, must have schema
|
||||
"""
|
||||
assert len(names) == len(readers)
|
||||
super(CompositeReader, self).__init__(schema=Struct(*[
|
||||
super().__init__(schema=Struct(*[
|
||||
(name, reader.schema()) for name, reader in zip(names, readers)
|
||||
]))
|
||||
self._names = names
|
||||
@ -584,7 +584,7 @@ class CompositeReaderBuilder(ReaderBuilder):
|
||||
reader_builders: list[ReaderBuilder] ReaderBuilder instances;
|
||||
must have schema
|
||||
"""
|
||||
super(CompositeReaderBuilder, self).__init__()
|
||||
super().__init__()
|
||||
self._names = names
|
||||
self._reader_builders = reader_builders
|
||||
self._schema = Struct(*[
|
||||
|
@ -66,7 +66,7 @@ class DBFileReader(Reader):
|
||||
|
||||
# Before self._init_reader_schema(...),
|
||||
# self.db_path and self.db_type are required to be set.
|
||||
super(DBFileReader, self).__init__(self._init_reader_schema(field_names))
|
||||
super().__init__(self._init_reader_schema(field_names))
|
||||
self.ds = Dataset(self._schema, self.name + '_dataset')
|
||||
self.ds_reader = None
|
||||
|
||||
|
@ -19,7 +19,7 @@ class GRUCell(rnn_cell.RNNCell):
|
||||
linear_before_reset=False,
|
||||
**kwargs
|
||||
):
|
||||
super(GRUCell, self).__init__(**kwargs)
|
||||
super().__init__(**kwargs)
|
||||
self.input_size = input_size
|
||||
self.hidden_size = hidden_size
|
||||
self.forget_bias = float(forget_bias)
|
||||
|
@ -47,7 +47,7 @@ class LayerModelHelper(model_helper.ModelHelper):
|
||||
This attribute access will be consistent with MTML model.
|
||||
'''
|
||||
|
||||
super(LayerModelHelper, self).__init__(name=name)
|
||||
super().__init__(name=name)
|
||||
self._layer_names = set()
|
||||
self._layers = []
|
||||
self._param_to_shape = {}
|
||||
|
@ -32,7 +32,7 @@ class OpSpec(namedtuple("OpSpec", "type input output arg")):
|
||||
class LayersTestCase(test_util.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
super(LayersTestCase, self).setUp()
|
||||
super().setUp()
|
||||
self.setup_example()
|
||||
|
||||
def setup_example(self):
|
||||
|
@ -27,7 +27,7 @@ class AdaptiveWeight(ModelLayer):
|
||||
reg_lambda=0.1,
|
||||
**kwargs
|
||||
):
|
||||
super(AdaptiveWeight, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
self.output_schema = schema.Scalar(
|
||||
np.float32, self.get_next_blob_reference("adaptive_weight")
|
||||
)
|
||||
|
@ -14,7 +14,7 @@ class AddBias(ModelLayer):
|
||||
|
||||
def __init__(self, model, input_record, bias_init=None,
|
||||
bias_optim=None, name='add_bias'):
|
||||
super(AddBias, self).__init__(model, name, input_record)
|
||||
super().__init__(model, name, input_record)
|
||||
assert isinstance(input_record, schema.Scalar), "Incorrect input type"
|
||||
assert len(input_record.field_type().shape) > 0, (
|
||||
"AddBias expects limited dimensions of the input tensor")
|
||||
|
@ -49,8 +49,7 @@ class ArcCosineFeatureMap(ModelLayer):
|
||||
name='arc_cosine_feature_map',
|
||||
**kwargs):
|
||||
|
||||
super(ArcCosineFeatureMap, self).__init__(model, name, input_record,
|
||||
**kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
assert isinstance(input_record, schema.Scalar), "Incorrect input type"
|
||||
self.params = []
|
||||
self.model = model
|
||||
|
@ -18,7 +18,7 @@ import numpy as np
|
||||
class BatchHuberLoss(ModelLayer):
|
||||
|
||||
def __init__(self, model, input_record, name='batch_huber_loss', delta=1.0, **kwargs):
|
||||
super(BatchHuberLoss, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
assert delta > 0
|
||||
|
||||
|
@ -35,7 +35,7 @@ class BatchLRLoss(ModelLayer):
|
||||
task_gamma_lb=0.1,
|
||||
**kwargs
|
||||
):
|
||||
super(BatchLRLoss, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
self.average_loss = average_loss
|
||||
|
||||
|
@ -18,7 +18,7 @@ import numpy as np
|
||||
class BatchMSELoss(ModelLayer):
|
||||
|
||||
def __init__(self, model, input_record, name='batch_mse_loss', **kwargs):
|
||||
super(BatchMSELoss, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
assert schema.is_schema_subset(
|
||||
schema.Struct(
|
||||
|
@ -22,8 +22,7 @@ class BatchNormalization(ModelLayer):
|
||||
scale_init_value=1.0,
|
||||
**kwargs
|
||||
):
|
||||
super(BatchNormalization, self).__init__(
|
||||
model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
assert isinstance(input_record, schema.Scalar), "Incorrect input type"
|
||||
|
||||
|
@ -19,8 +19,7 @@ class BatchSigmoidCrossEntropyLoss(ModelLayer):
|
||||
name='batch_sigmoid_cross_entropy_loss',
|
||||
**kwargs
|
||||
):
|
||||
super(BatchSigmoidCrossEntropyLoss, self).__init__(
|
||||
model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
assert schema.is_schema_subset(
|
||||
schema.Struct(
|
||||
|
@ -22,8 +22,7 @@ class BatchSoftmaxLoss(ModelLayer):
|
||||
average_by_batch_size=False,
|
||||
**kwargs
|
||||
):
|
||||
super(BatchSoftmaxLoss, self).__init__(
|
||||
model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
assert schema.is_schema_subset(
|
||||
schema.Struct(
|
||||
|
@ -23,7 +23,7 @@ class BlobWeightedSum(ModelLayer):
|
||||
name='blob_weighted_sum',
|
||||
**kwargs
|
||||
):
|
||||
super(BlobWeightedSum, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
self.blobs = self.input_record.field_blobs()
|
||||
|
||||
|
@ -19,7 +19,7 @@ import numpy as np
|
||||
class BPRLoss(ModelLayer):
|
||||
|
||||
def __init__(self, model, input_record, name='bpr_loss', **kwargs):
|
||||
super(BPRLoss, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
assert schema.is_schema_subset(
|
||||
schema.Struct(
|
||||
('pos_prediction', schema.Scalar()),
|
||||
|
@ -22,7 +22,7 @@ logger = logging.getLogger(__name__)
|
||||
class BucketWeighted(ModelLayer):
|
||||
def __init__(self, model, input_record, max_score=0, bucket_boundaries=None,
|
||||
hash_buckets=True, weight_optim=None, name="bucket_weighted"):
|
||||
super(BucketWeighted, self).__init__(model, name, input_record)
|
||||
super().__init__(model, name, input_record)
|
||||
|
||||
assert isinstance(input_record, schema.List), "Incorrect input type"
|
||||
self.bucket_boundaries = bucket_boundaries
|
||||
|
@ -23,7 +23,7 @@ class MapToRange(ModelLayer):
|
||||
name='map_to_range',
|
||||
**kwargs
|
||||
):
|
||||
super(MapToRange, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
assert max_index > 0
|
||||
assert isinstance(input_record, schema.Scalar)
|
||||
|
@ -65,7 +65,7 @@ class Concat(ModelLayer):
|
||||
|
||||
def __init__(self, model, input_record, axis=1, add_axis=0,
|
||||
name='concat', **kwargs):
|
||||
super(Concat, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
self.axis = axis
|
||||
self.add_axis = add_axis
|
||||
assert not (axis == 0 and add_axis == 1), \
|
||||
|
@ -31,7 +31,7 @@ class Conv(ModelLayer):
|
||||
kernel_optim=None, bias_optim=None,
|
||||
name='conv', **kwargs):
|
||||
|
||||
super(Conv, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
assert isinstance(input_record, schema.Scalar), "Incorrect input type"
|
||||
# input num_channels (C) is needed
|
||||
input_dims = input_record.field_type().shape
|
||||
|
@ -19,7 +19,7 @@ class Dropout(ModelLayer):
|
||||
dropout_for_eval=False,
|
||||
**kwargs):
|
||||
|
||||
super(Dropout, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
assert isinstance(input_record, schema.Scalar), "Incorrect input type"
|
||||
assert (ratio >= 0 and ratio < 1.0), \
|
||||
"Expected 0 <= ratio < 1, but got ratio of %s" % ratio
|
||||
|
@ -29,7 +29,7 @@ class FC(SamplingTrainableMixin, ModelLayer):
|
||||
max_fc_size=None, axis=1, transposed=False,
|
||||
uniform_weight_init_scale_numerator=1.0,
|
||||
**kwargs):
|
||||
super(FC, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
assert isinstance(input_record, schema.Scalar), (
|
||||
"Incorrect input type {}".format(input_record))
|
||||
assert len(input_record.field_types()[0].shape) > 0, (
|
||||
|
@ -37,7 +37,7 @@ class FCWithBootstrap(SamplingTrainableMixin, ModelLayer):
|
||||
axis=1,
|
||||
**kwargs
|
||||
):
|
||||
super(FCWithBootstrap, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
assert isinstance(
|
||||
input_record, schema.Scalar
|
||||
), "Incorrect input type {}".format(input_record)
|
||||
|
@ -25,7 +25,7 @@ class FCWithoutBias(SamplingTrainableMixin, ModelLayer):
|
||||
uniform_weight_init_scale_numerator=1.0,
|
||||
**kwargs
|
||||
):
|
||||
super(FCWithoutBias, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
assert isinstance(input_record, schema.Scalar), "Incorrect input type"
|
||||
assert len(input_record.field_types()[0].shape) > 0, (
|
||||
"FCWithoutBias expects limited dimensions of the input tensor"
|
||||
|
@ -26,7 +26,7 @@ class FeatureSparseToDense(ModelLayer):
|
||||
Default_dense_value can only be 0.0 or float("NaN"). Any input that isn't
|
||||
None will be NaN.
|
||||
"""
|
||||
super(FeatureSparseToDense, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
if default_dense_value is None:
|
||||
default_dense_value = 0.0
|
||||
default_dense_value = float(default_dense_value)
|
||||
|
@ -25,7 +25,7 @@ class Functional(ModelLayer):
|
||||
# allow coercion
|
||||
input_record = schema.as_record(input_record)
|
||||
|
||||
super(Functional, self).__init__(model, name, input_record, tags=tags, **kwargs)
|
||||
super().__init__(model, name, input_record, tags=tags, **kwargs)
|
||||
self._function = function
|
||||
self._kwargs = kwargs
|
||||
return_struct = (
|
||||
|
@ -30,7 +30,7 @@ class GatherRecord(ModelLayer):
|
||||
"""
|
||||
|
||||
def __init__(self, model, input_record, name='gather_record', **kwargs):
|
||||
super(GatherRecord, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
assert 'indices' in input_record
|
||||
assert 'record' in input_record
|
||||
|
@ -29,7 +29,7 @@ class LabelSmooth(ModelLayer):
|
||||
def __init__(
|
||||
self, model, label, smooth_matrix, name='label_smooth', **kwargs
|
||||
):
|
||||
super(LabelSmooth, self).__init__(model, name, label, **kwargs)
|
||||
super().__init__(model, name, label, **kwargs)
|
||||
self.label = label
|
||||
# shape as a list
|
||||
smooth_matrix = np.array(smooth_matrix).astype(np.float32).flatten()
|
||||
|
@ -15,8 +15,7 @@ class LastNWindowCollector(ModelLayer):
|
||||
|
||||
def __init__(self, model, input_record, num_to_collect,
|
||||
name='last_n_window_collector', **kwargs):
|
||||
super(LastNWindowCollector, self).__init__(
|
||||
model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
assert num_to_collect > 0
|
||||
self.num_to_collect = num_to_collect
|
||||
assert isinstance(input_record, schema.Scalar), \
|
||||
|
@ -23,8 +23,7 @@ class LayerNormalization(ModelLayer):
|
||||
scale_init_value=1.0,
|
||||
**kwargs
|
||||
):
|
||||
super(LayerNormalization, self).__init__(
|
||||
model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
assert isinstance(input_record, schema.Scalar), (
|
||||
"Incorrect input type: {}".format(input_record))
|
||||
|
@ -19,7 +19,7 @@ class MarginRankLoss(ModelLayer):
|
||||
|
||||
def __init__(self, model, input_record, name='margin_rank_loss',
|
||||
margin=0.1, average_loss=False, **kwargs):
|
||||
super(MarginRankLoss, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
assert margin >= 0, ('For hinge loss, margin should be no less than 0')
|
||||
self._margin = margin
|
||||
self._average_loss = average_loss
|
||||
|
@ -25,7 +25,7 @@ class MergeIdLists(ModelLayer):
|
||||
the merged ID_LIST feature
|
||||
"""
|
||||
def __init__(self, model, input_record, name='merged'):
|
||||
super(MergeIdLists, self).__init__(model, name, input_record)
|
||||
super().__init__(model, name, input_record)
|
||||
assert all(schema.equal_schemas(x, IdList) for x in input_record), \
|
||||
"Inputs to MergeIdLists should all be IdLists."
|
||||
|
||||
|
@ -15,7 +15,7 @@ class PairwiseSimilarity(ModelLayer):
|
||||
|
||||
def __init__(self, model, input_record, output_dim, pairwise_similarity_func='dot',
|
||||
name='pairwise_similarity', **kwargs):
|
||||
super(PairwiseSimilarity, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
assert isinstance(input_record, schema.Struct), (
|
||||
"Incorrect input type. Expected Struct, but received: {0}".
|
||||
format(input_record))
|
||||
|
@ -22,7 +22,7 @@ logger = logging.getLogger(__name__)
|
||||
class PositionWeighted(ModelLayer):
|
||||
def __init__(self, model, input_record, weight_optim=None,
|
||||
name="position_weights"):
|
||||
super(PositionWeighted, self).__init__(model, name, input_record)
|
||||
super().__init__(model, name, input_record)
|
||||
|
||||
assert isinstance(input_record, schema.List), "Incorrect input type"
|
||||
length_metadata = input_record.lengths.metadata
|
||||
|
@ -38,8 +38,7 @@ class RandomFourierFeatures(ModelLayer):
|
||||
name='random_fourier_features',
|
||||
**kwargs):
|
||||
|
||||
super(RandomFourierFeatures, self).__init__(model, name, input_record,
|
||||
**kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
assert isinstance(input_record, schema.Scalar), "Incorrect input type"
|
||||
|
||||
input_dims = input_record.field_type().shape[0]
|
||||
|
@ -19,8 +19,7 @@ class ReservoirSampling(ModelLayer):
|
||||
|
||||
def __init__(self, model, input_record, num_to_collect,
|
||||
name='reservoir_sampling', **kwargs):
|
||||
super(ReservoirSampling, self).__init__(
|
||||
model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
assert num_to_collect > 0
|
||||
self.num_to_collect = num_to_collect
|
||||
|
||||
|
@ -21,9 +21,7 @@ class SamplingTrain(ModelLayer):
|
||||
name='sampling_train',
|
||||
**kwargs
|
||||
):
|
||||
super(SamplingTrain, self).__init__(
|
||||
model, name, input_record, **kwargs
|
||||
)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
layer_class = get_layer_class(prediction_layer)
|
||||
assert issubclass(layer_class, SamplingTrainableMixin)
|
||||
|
@ -11,7 +11,7 @@ import abc
|
||||
class SamplingTrainableMixin(metaclass=abc.ABCMeta):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(SamplingTrainableMixin, self).__init__(*args, **kwargs)
|
||||
super().__init__(*args, **kwargs)
|
||||
self._train_param_blobs = None
|
||||
self._train_param_blobs_frozen = False
|
||||
|
||||
|
@ -32,8 +32,7 @@ class SelectRecordByContext(ModelLayer):
|
||||
default_output_record_field=None,
|
||||
**kwargs
|
||||
):
|
||||
super(SelectRecordByContext, self).__init__(model, name, input_record,
|
||||
**kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
assert isinstance(input_record, schema.Struct)
|
||||
assert len(input_record) > 1
|
||||
|
@ -84,7 +84,7 @@ class SemiRandomFeatures(ArcCosineFeatureMap):
|
||||
self.input_record_full = input_record
|
||||
self.input_record_random = input_record
|
||||
|
||||
super(SemiRandomFeatures, self).__init__(
|
||||
super().__init__(
|
||||
model,
|
||||
self.input_record_full,
|
||||
output_dims,
|
||||
|
@ -42,7 +42,7 @@ class SparseDropoutWithReplacement(ModelLayer):
|
||||
name='sparse_dropout',
|
||||
**kwargs):
|
||||
|
||||
super(SparseDropoutWithReplacement, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
assert schema.equal_schemas(input_record, IdList), "Incorrect input type"
|
||||
|
||||
self.dropout_prob_train = float(dropout_prob_train)
|
||||
|
@ -22,7 +22,7 @@ class SparseFeatureHash(ModelLayer):
|
||||
|
||||
def __init__(self, model, input_record, seed=0, modulo=None,
|
||||
use_hashing=True, use_divide_mod=False, divisor=None, name='sparse_feature_hash', **kwargs):
|
||||
super(SparseFeatureHash, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
assert use_hashing + use_divide_mod < 2, "use_hashing and use_divide_mod cannot be set true at the same time."
|
||||
|
||||
|
@ -41,7 +41,7 @@ class SparseItemwiseDropoutWithReplacement(ModelLayer):
|
||||
name='sparse_itemwise_dropout',
|
||||
**kwargs):
|
||||
|
||||
super(SparseItemwiseDropoutWithReplacement, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
assert schema.equal_schemas(input_record, IdList), "Incorrect input type"
|
||||
|
||||
self.dropout_prob_train = float(dropout_prob_train)
|
||||
|
@ -133,7 +133,7 @@ class SparseLookup(ModelLayer):
|
||||
name='sparse_lookup', regularizer=None, use_external_weights=False,
|
||||
uniform_weight_init_scale_numerator=1.0, **kwargs):
|
||||
|
||||
super(SparseLookup, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
self.sparse_key = get_key(self.input_record)()
|
||||
logger.info("Setup the sparse lookup layer for " + self.sparse_key)
|
||||
|
@ -15,7 +15,7 @@ class Split(ModelLayer):
|
||||
|
||||
def __init__(self, model, input_record, num_splits=1, axis=1,
|
||||
name='split', split=None, **kwargs):
|
||||
super(Split, self).__init__(model, name, input_record, **kwargs)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
self.axis = axis
|
||||
# Assume that first dimension is batch, so actual axis in shape is
|
||||
# axis - 1
|
||||
|
@ -27,9 +27,7 @@ class UniformSampling(ModelLayer):
|
||||
name='uniform_sampling',
|
||||
**kwargs
|
||||
):
|
||||
super(UniformSampling, self).__init__(
|
||||
model, name, input_record, **kwargs
|
||||
)
|
||||
super().__init__(model, name, input_record, **kwargs)
|
||||
|
||||
assert num_elements > num_samples > 0
|
||||
assert isinstance(input_record, schema.Scalar)
|
||||
|
@ -20,11 +20,7 @@ class Seq2SeqModelHelper(ModelHelper):
|
||||
if kwargs.get('ws_nbytes_limit', None):
|
||||
arg_scope['ws_nbytes_limit'] = kwargs.pop('ws_nbytes_limit')
|
||||
|
||||
super(Seq2SeqModelHelper, self).__init__(
|
||||
init_params=init_params,
|
||||
arg_scope=arg_scope,
|
||||
**kwargs
|
||||
)
|
||||
super().__init__(init_params=init_params, arg_scope=arg_scope, **kwargs)
|
||||
self.non_trainable_params = []
|
||||
|
||||
def AddParam(self, name, init=None, init_value=None, trainable=True):
|
||||
|
@ -137,7 +137,7 @@ class NetBuilder(context.Managed):
|
||||
return self._children
|
||||
|
||||
def __exit__(self, etype, *args):
|
||||
super(NetBuilder, self).__exit__(etype, *args)
|
||||
super().__exit__(etype, *args)
|
||||
|
||||
if self._use_control_ops and len(self._children) > 0:
|
||||
_children = self._children
|
||||
|
@ -21,7 +21,7 @@ class Normalizer:
|
||||
|
||||
class BatchNormalizer(Normalizer):
|
||||
def __init__(self, momentum, scale_init_value=1.0):
|
||||
super(BatchNormalizer, self).__init__()
|
||||
super().__init__()
|
||||
self._momentum = float(momentum)
|
||||
self._scale_init_value = float(scale_init_value)
|
||||
|
||||
@ -33,7 +33,7 @@ class BatchNormalizer(Normalizer):
|
||||
|
||||
class LayerNormalizer(Normalizer):
|
||||
def __init__(self, epsilon, use_layer_norm_op=True, scale_init_value=1.0):
|
||||
super(LayerNormalizer, self).__init__()
|
||||
super().__init__()
|
||||
self._epsilon = float(epsilon)
|
||||
self._use_layer_norm_op = use_layer_norm_op
|
||||
self._scale_init_value = float(scale_init_value)
|
||||
|
@ -12,7 +12,7 @@ from onnx.backend.base import BackendRep, namedtupledict
|
||||
# mainly to handle the different input and output types for convenience of Python
|
||||
class Caffe2CppRep(BackendRep):
|
||||
def __init__(self, cpp_rep):
|
||||
super(Caffe2CppRep, self).__init__()
|
||||
super().__init__()
|
||||
self.__core = cpp_rep
|
||||
self.__external_outputs = cpp_rep.external_outputs()
|
||||
self.__external_inputs = cpp_rep.external_inputs()
|
||||
|
@ -11,7 +11,7 @@ from onnx.backend.base import BackendRep, namedtupledict
|
||||
|
||||
class Caffe2Rep(BackendRep):
|
||||
def __init__(self, init_net, predict_net, workspace, uninitialized):
|
||||
super(Caffe2Rep, self).__init__()
|
||||
super().__init__()
|
||||
self.init_net = init_net
|
||||
self.predict_net = predict_net
|
||||
self.workspace = workspace
|
||||
@ -28,7 +28,7 @@ class Caffe2Rep(BackendRep):
|
||||
return ''
|
||||
|
||||
def run(self, inputs, **kwargs):
|
||||
super(Caffe2Rep, self).run(inputs, **kwargs)
|
||||
super().run(inputs, **kwargs)
|
||||
with core.DeviceScope(self.predict_net.device_option):
|
||||
if isinstance(inputs, dict):
|
||||
with core.NameScope(self._name_scope):
|
||||
|
@ -39,7 +39,7 @@ def c10_op_ref(maps, rois):
|
||||
|
||||
class TestHeatmapMaxKeypointOp(hu.HypothesisTestCase):
|
||||
def setUp(self):
|
||||
super(TestHeatmapMaxKeypointOp, self).setUp()
|
||||
super().setUp()
|
||||
np.random.seed(0)
|
||||
|
||||
# initial coordinates and interpolate HEATMAP_SIZE from it
|
||||
|
@ -31,7 +31,7 @@ class MiniDBEntry(NamedTuple):
|
||||
class TestLoadSaveBase(test_util.TestCase):
|
||||
|
||||
def __init__(self, methodName, db_type='minidb'):
|
||||
super(TestLoadSaveBase, self).__init__(methodName)
|
||||
super().__init__(methodName)
|
||||
self._db_type = db_type
|
||||
|
||||
@settings(deadline=None)
|
||||
|
@ -18,7 +18,7 @@ from hypothesis import given, settings
|
||||
class TestRNNExecutor(test_util.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
super(TestRNNExecutor, self).setUp()
|
||||
super().setUp()
|
||||
self.batch_size = 8
|
||||
self.input_dim = 20
|
||||
self.hidden_dim = 30
|
||||
|
@ -302,7 +302,7 @@ class SgdOptimizer(Optimizer):
|
||||
lars=None,
|
||||
**kwargs
|
||||
):
|
||||
super(SgdOptimizer, self).__init__()
|
||||
super().__init__()
|
||||
self.base_learning_rate = base_learning_rate
|
||||
self.policy = policy
|
||||
self.momentum = momentum
|
||||
@ -418,7 +418,7 @@ class MultiPrecisionSgdOptimizer(SgdOptimizer):
|
||||
sparse_dedup_aggregator=None,
|
||||
**kwargs
|
||||
):
|
||||
super(MultiPrecisionSgdOptimizer, self).__init__(
|
||||
super().__init__(
|
||||
base_learning_rate=base_learning_rate,
|
||||
policy=policy,
|
||||
momentum=momentum,
|
||||
@ -489,7 +489,7 @@ class FP16SgdOptimizer(SgdOptimizer):
|
||||
sparse_dedup_aggregator=None,
|
||||
**kwargs
|
||||
):
|
||||
super(FP16SgdOptimizer, self).__init__(
|
||||
super().__init__(
|
||||
base_learning_rate=base_learning_rate,
|
||||
policy=policy,
|
||||
momentum=momentum,
|
||||
@ -635,7 +635,7 @@ class AdagradOptimizer(Optimizer):
|
||||
use_dedicated_lr_iteration_counter=False,
|
||||
**kwargs
|
||||
):
|
||||
super(AdagradOptimizer, self).__init__()
|
||||
super().__init__()
|
||||
self.alpha = alpha
|
||||
self.epsilon = epsilon
|
||||
self.decay = decay
|
||||
@ -1207,7 +1207,7 @@ class WngradOptimizer(Optimizer):
|
||||
output_effective_lr_and_update=False,
|
||||
**kwargs
|
||||
):
|
||||
super(WngradOptimizer, self).__init__()
|
||||
super().__init__()
|
||||
self.alpha = alpha
|
||||
self.epsilon = epsilon
|
||||
self.policy = policy
|
||||
@ -1319,7 +1319,7 @@ class StormOptimizer(Optimizer):
|
||||
include 'mean' and 'sum'.
|
||||
lars: lars offset.
|
||||
"""
|
||||
super(StormOptimizer, self).__init__()
|
||||
super().__init__()
|
||||
self.lr = lr
|
||||
self.momentum = momentum
|
||||
self.beta = beta
|
||||
@ -1420,7 +1420,7 @@ class AdadeltaOptimizer(Optimizer):
|
||||
include "mean" and "sum".
|
||||
engine: the engine used, options include "", "CUDNN", etc.
|
||||
"""
|
||||
super(AdadeltaOptimizer, self).__init__()
|
||||
super().__init__()
|
||||
self.alpha = alpha
|
||||
self.epsilon = epsilon
|
||||
self.decay = decay
|
||||
@ -1488,7 +1488,7 @@ class FtrlOptimizer(Optimizer):
|
||||
sparse_dedup_aggregator=None,
|
||||
engine="",
|
||||
):
|
||||
super(FtrlOptimizer, self).__init__()
|
||||
super().__init__()
|
||||
self.alpha = alpha
|
||||
self.beta = beta
|
||||
self.lambda1 = lambda1
|
||||
@ -1546,7 +1546,7 @@ class GFtrlOptimizer(Optimizer):
|
||||
sparse_dedup_aggregator=None,
|
||||
engine="",
|
||||
):
|
||||
super(GFtrlOptimizer, self).__init__()
|
||||
super().__init__()
|
||||
self.alpha = alpha
|
||||
self.beta = beta
|
||||
self.lambda1 = lambda1
|
||||
@ -1598,7 +1598,7 @@ class AdamOptimizer(Optimizer):
|
||||
use_smart_decay=False, # See https://fburl.com/2jdiwrhy for context.
|
||||
**kwargs
|
||||
):
|
||||
super(AdamOptimizer, self).__init__()
|
||||
super().__init__()
|
||||
self.alpha = alpha
|
||||
self.beta1 = beta1
|
||||
self.beta2 = beta2
|
||||
@ -1761,7 +1761,7 @@ class DecayAdagradOptimizer(Optimizer):
|
||||
engine="",
|
||||
**kwargs
|
||||
):
|
||||
super(DecayAdagradOptimizer, self).__init__()
|
||||
super().__init__()
|
||||
self.alpha = alpha
|
||||
self.beta1 = beta1
|
||||
self.beta2 = beta2
|
||||
@ -1885,7 +1885,7 @@ class YellowFinOptimizer(Optimizer):
|
||||
sparse_dedup_aggregator=None,
|
||||
**kwargs
|
||||
):
|
||||
super(YellowFinOptimizer, self).__init__()
|
||||
super().__init__()
|
||||
self.alpha = alpha
|
||||
self.mu = mu
|
||||
self.beta = beta
|
||||
@ -1973,7 +1973,7 @@ class RmsPropOptimizer(Optimizer):
|
||||
engine="",
|
||||
**kwargs
|
||||
):
|
||||
super(RmsPropOptimizer, self).__init__()
|
||||
super().__init__()
|
||||
self.alpha = alpha
|
||||
self.decay = decay
|
||||
self.momentum = momentum
|
||||
|
@ -79,7 +79,7 @@ class TestMultiPrecisionSgd(

@unittest.skipIf(not workspace.has_gpu_support, "No GPU support")
def testGPUDense(self):
super(TestMultiPrecisionSgd, self).testGPUDense(core.DataType.FLOAT16)
super().testGPUDense(core.DataType.FLOAT16)


class TestFtrl(OptimizerTestBase, TestCase):

@ -17,7 +17,7 @@ from caffe2.python.schema import (
class _QueueReader(Reader):
def __init__(self, blobs_queue, schema, name=None):
"""Don't call this directly. Instead, use dataset.reader()"""
super(_QueueReader, self).__init__(schema)
super().__init__(schema)
self.blobs_queue = blobs_queue
self.name = name

@ -89,7 +89,7 @@ class Regularizer:

class L1Norm(Regularizer):
def __init__(self, reg_lambda):
super(L1Norm, self).__init__()
super().__init__()
assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"

self.reg_lambda = reg_lambda
@ -109,7 +109,7 @@ class LpNorm(Regularizer):
we will calculate Lp norm with the formula:
pow( sum_i { pow(theda_i, p) } , 1/p)
"""
super(LpNorm, self).__init__()
super().__init__()
assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
assert p_value > 0, "p_value factor should be greater than 0"
self.p_value = p_value
@ -158,7 +158,7 @@ class L0ApproxNorm(Regularizer):
budget, no penalization will be applied. Optional parameter, if
0, then no budget is used
"""
super(L0ApproxNorm, self).__init__()
super().__init__()
assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
assert alpha > 0, "alpha factor must be a positive value greater than 0"
assert budget >= 0, "budget factor must be greater than or equal to 0"
@ -204,7 +204,7 @@ class L1NormTrimmed(Regularizer):
The Trimmed Lasso: Sparsity and Robustness. https://arxiv.org/abs/1708.04527
"""
def __init__(self, reg_lambda, k):
super(L1NormTrimmed, self).__init__()
super().__init__()
assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
assert isinstance(k, int), "k should be an interger as expected #. after selection"
assert k >= 1, "k should be larger than 1"
@ -225,7 +225,7 @@ class L1NormTrimmed(Regularizer):

class L2Norm(Regularizer):
def __init__(self, reg_lambda):
super(L2Norm, self).__init__()
super().__init__()
assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"

self.reg_lambda = reg_lambda
@ -239,7 +239,7 @@ class L2Norm(Regularizer):

class ElasticNet(Regularizer):
def __init__(self, l1, l2):
super(ElasticNet, self).__init__()
super().__init__()
self.l1 = l1
self.l2 = l2

@ -257,7 +257,7 @@ class ElasticNet(Regularizer):

class ElasticNetL1NormTrimmed(Regularizer):
def __init__(self, l1, l2, k):
super(ElasticNetL1NormTrimmed, self).__init__()
super().__init__()
self.l1 = l1
self.l2 = l2
self.k = k
@ -282,7 +282,7 @@ class ElasticNetL1NormTrimmed(Regularizer):

class MaxNorm(Regularizer):
def __init__(self, norm=1.0, dtype=None):
super(MaxNorm, self).__init__()
super().__init__()
self.norm = norm
self.dtype = dtype

@ -309,7 +309,7 @@ class MaxNorm(Regularizer):

class ConstantNorm(Regularizer):
def __init__(self, norm=1.0):
super(ConstantNorm, self).__init__()
super().__init__()
self.norm = norm

def _run_after_optimizer(self, net, param_init_net, param, grad):
@ -329,7 +329,7 @@ class ConstantNorm(Regularizer):

class SparseLpNorm(Regularizer):
def __init__(self, p, reg_lambda):
super(SparseLpNorm, self).__init__()
super().__init__()
assert p in (1.0, 2.0), "Sparse Lp regularization only implemented for p = 1.0 and p = 2.0."
assert reg_lambda > 0, "factor ahead of regularization should be greater than 0."
self.p = p
@ -349,12 +349,12 @@ class SparseLpNorm(Regularizer):

class SparseL1Norm(SparseLpNorm):
def __init__(self, reg_lambda):
super(SparseL1Norm, self).__init__(p=1.0, reg_lambda=reg_lambda)
super().__init__(p=1.0, reg_lambda=reg_lambda)


class SparseL2Norm(SparseLpNorm):
def __init__(self, reg_lambda):
super(SparseL2Norm, self).__init__(p=2.0, reg_lambda=reg_lambda)
super().__init__(p=2.0, reg_lambda=reg_lambda)


class LogBarrier(Regularizer):
@ -369,7 +369,7 @@ class LogBarrier(Regularizer):
similar to the learning rate. It is specified by a learning rate policy and
corresponding options
"""
super(LogBarrier, self).__init__()
super().__init__()
assert reg_lambda > 0, "factor ahead of regularization should be 0 or positive"
self.reg_lambda = reg_lambda
self.discount_policy = discount_policy
@ -412,7 +412,7 @@ class BoundedGradientProjection(Regularizer):
def __init__(
self, lb=None, ub=None, left_open=False, right_open=False, epsilon=None
):
super(BoundedGradientProjection, self).__init__()
super().__init__()
lb = float(lb) if lb is not None else None
ub = float(ub) if ub is not None else None
epsilon = float(epsilon) if epsilon is not None else self.kEpsilon
@ -481,7 +481,7 @@ class GroupL1Norm(Regularizer):
of the gradient operator of Sqrt has taken into stability into
consideration, this term won't be necessary.
"""
super(GroupL1Norm, self).__init__()
super().__init__()
assert (
(reg_lambda) >= 0
), "regularization weight should be 0 or positive"

@ -302,7 +302,7 @@ class BasicRNNCell(RNNCell):
activation=None,
**kwargs
):
super(BasicRNNCell, self).__init__(**kwargs)
super().__init__(**kwargs)
self.drop_states = drop_states
self.input_size = input_size
self.hidden_size = hidden_size
@ -403,7 +403,7 @@ class LSTMCell(RNNCell):
initializer=None,
**kwargs
):
super(LSTMCell, self).__init__(initializer=initializer, **kwargs)
super().__init__(initializer=initializer, **kwargs)
self.initializer = initializer or LSTMInitializer(
hidden_size=hidden_size)

@ -507,9 +507,7 @@ class LayerNormLSTMCell(RNNCell):
initializer=None,
**kwargs
):
super(LayerNormLSTMCell, self).__init__(
initializer=initializer, **kwargs
)
super().__init__(initializer=initializer, **kwargs)
self.initializer = initializer or LSTMInitializer(
hidden_size=hidden_size
)
@ -828,7 +826,7 @@ class DropoutCell(RNNCell):
assert 'is_test' in kwargs, "Argument 'is_test' is required"
self.is_test = kwargs.pop('is_test')
self.use_cudnn = use_cudnn
super(DropoutCell, self).__init__(**kwargs)
super().__init__(**kwargs)

self.prepare_input = internal_cell.prepare_input
self.get_output_state_index = internal_cell.get_output_state_index
@ -932,7 +930,7 @@ class MultiRNNCell(RNNCell):

forward_only: used to construct inference-only network.
'''
super(MultiRNNCell, self).__init__(**kwargs)
super().__init__(**kwargs)
self.cells = cells

if residual_output_layers is None:
@ -1117,7 +1115,7 @@ class AttentionCell(RNNCell):
attention_memory_optimization,
**kwargs
):
super(AttentionCell, self).__init__(**kwargs)
super().__init__(**kwargs)
self.encoder_output_dim = encoder_output_dim
self.encoder_outputs = encoder_outputs
self.encoder_lengths = encoder_lengths
@ -1414,7 +1412,7 @@ class LSTMWithAttentionCell(AttentionCell):
forward_only=False,
drop_states=False,
)
super(LSTMWithAttentionCell, self).__init__(
super().__init__(
encoder_output_dim=encoder_output_dim,
encoder_outputs=encoder_outputs,
encoder_lengths=encoder_lengths,
@ -1453,7 +1451,7 @@ class MILSTMWithAttentionCell(AttentionCell):
forward_only=False,
drop_states=False,
)
super(MILSTMWithAttentionCell, self).__init__(
super().__init__(
encoder_output_dim=encoder_output_dim,
encoder_outputs=encoder_outputs,
decoder_cell=decoder_cell,

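The RNN cell hunks all forward `**kwargs` (and sometimes an `initializer`) up the chain; a minimal sketch, with hypothetical class names rather than the caffe2 ones, of why the zero-argument form forwards keyword arguments identically:

```python
class RNNCellBase:
    def __init__(self, name=None, forward_only=False, **kwargs):
        self.name = name
        self.forward_only = forward_only


class MyAttentionLikeCell(RNNCellBase):
    def __init__(self, encoder_output_dim, **kwargs):
        # super(MyAttentionLikeCell, self).__init__(**kwargs) and the
        # zero-argument call below bind to the same RNNCellBase.__init__.
        super().__init__(**kwargs)
        self.encoder_output_dim = encoder_output_dim


cell = MyAttentionLikeCell(encoder_output_dim=16, name="decoder", forward_only=True)
assert cell.name == "decoder" and cell.forward_only
```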
@ -218,7 +218,7 @@ class List(Field):
self._items = _normalize_field(values)
self.lengths._set_parent(self, 0)
self._items._set_parent(self, 1)
super(List, self).__init__([self.lengths, self._items])
super().__init__([self.lengths, self._items])

def field_names(self):
value_fields = self._items.field_names()
@ -295,7 +295,7 @@ class ListWithEvicted(List):
self._evicted_values = _normalize_field(evicted_values)
else:
self._evicted_values = Scalar(np.int64, evicted_values)
super(ListWithEvicted, self).__init__(values, lengths_blob=lengths_blob)
super().__init__(values, lengths_blob=lengths_blob)

def field_names(self):
value_fields = self._items.field_names()
@ -418,7 +418,7 @@ class Struct(Field):
self.fields[name] = self.fields[name] + field
for id, (_, field) in enumerate(self.fields.items()):
field._set_parent(self, id)
super(Struct, self).__init__(self.fields.values())
super().__init__(self.fields.values())
self._frozen = True

def _struct_from_nested_name(self, nested_name, field):
@ -544,7 +544,7 @@ class Struct(Field):
if item.startswith('__'):
raise AttributeError(item)
try:
return super(Struct, self).__getattribute__("fields")[item]
return super().__getattribute__("fields")[item]
except KeyError as e:
raise AttributeError(item) from e

@ -555,7 +555,7 @@ class Struct(Field):
# post initialization.
if getattr(self, '_frozen', None) and not key.startswith('_'):
raise TypeError('Struct.__setattr__() is disabled after __init__()')
super(Struct, self).__setattr__(key, value)
super().__setattr__(key, value)

def __add__(self, other):
"""
@ -725,7 +725,7 @@ class Scalar(Field):
def __init__(self, dtype=None, blob=None, metadata=None):
self._metadata = None
self.set(dtype, blob, metadata, unsafe=True)
super(Scalar, self).__init__([])
super().__init__([])

def field_names(self):
return ['']

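The `Struct` hunks rewrite `super()` inside `__getattribute__` and `__setattr__`; a small sketch (a hypothetical class, not the caffe2 code) showing that the zero-argument form still targets `object`'s implementations there, because CPython fills in the implicit `__class__` cell for any method defined in the class body:

```python
class Record:
    def __init__(self, fields):
        # Bypass our own __setattr__ while populating internal state.
        super().__setattr__("fields", dict(fields))

    def __getattribute__(self, item):
        fields = super().__getattribute__("fields")
        if item in fields:
            return fields[item]
        return super().__getattribute__(item)

    def __setattr__(self, key, value):
        raise TypeError("Record is read-only after __init__()")


r = Record({"a": 1})
assert r.a == 1
```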
@ -232,7 +232,7 @@ class SerializedTestCase(hu.HypothesisTestCase):
outputs_to_check=None,
ensure_outputs_are_inferred=False,
):
outs = super(SerializedTestCase, self).assertReferenceChecks(
outs = super().assertReferenceChecks(
device_option,
op,
inputs,

@ -535,7 +535,7 @@ class Task(context.Managed):
self._num_instances = num_instances

def __enter__(self):
super(Task, self).__enter__()
super().__enter__()

# temporarily remove from _tasks_to_add to ensure correct order
if self.group is not None:
@ -548,7 +548,7 @@ class Task(context.Managed):
return self

def __exit__(self, type, value, traceback):
super(Task, self).__exit__(type, value, traceback)
super().__exit__(type, value, traceback)

self._net_builder.__exit__(type, value, traceback)
if type is None:

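The `Task` hunks extend `__enter__` and `__exit__` inherited from `context.Managed`; a generic sketch (hypothetical classes, not the caffe2 implementation) of chaining context-manager hooks through the zero-argument `super()`:

```python
class Managed:
    def __enter__(self):
        print("base enter")
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        print("base exit")


class Task(Managed):
    def __enter__(self):
        super().__enter__()  # same resolution as super(Task, self).__enter__()
        print("task enter")
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        super().__exit__(exc_type, exc_value, traceback)
        print("task exit")


with Task():
    pass
```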
@ -768,7 +768,7 @@ class TestTransform(htu.HypothesisTestCase):

class MyModule(torch.jit.ScriptModule):
def __init__(self):
super(MyModule, self).__init__()
super().__init__()
self.mult = torch.nn.Parameter(torch.tensor([[1, 2, 3, 4, 5.0]]))

@torch.jit.script_method

@ -134,7 +134,7 @@ Here is a simple, end-to-end example of saving and reloading PowerSGD state and

class SimpleModel(nn.Module):
def __init__(self):
super(SimpleModel, self).__init__()
super().__init__()
self.fc1 = nn.Linear(24,24)
self.relu = nn.ReLU()
self.fc2 = nn.Linear(24,12)

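The hunk shows only the constructor of the documentation's `SimpleModel`; a hypothetical forward pass consistent with those three layers (an assumption for illustration, not copied from the docs) would be:

```python
import torch
import torch.nn as nn

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(24, 24)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(24, 12)

    def forward(self, x):
        # Chain the layers shown in the hunk above; this forward is assumed.
        return self.fc2(self.relu(self.fc1(x)))

out = SimpleModel()(torch.randn(4, 24))  # -> shape (4, 12)
```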
@ -326,14 +326,12 @@ code:
# GPU Hardware Info:
# NVIDIA A100-SXM4-40GB : 8

from torch.nn import *

class Repro(torch.nn.Module):
def __init__(self):
super().__init__()

def forward(self, add):
_foobar = torch.ops.aten._foobar.default(add); add = None
return (_foobar,)
@ -407,14 +405,12 @@ the following code in ``{torch._dynamo.config.base_dir}/repro.py``.
from math import inf
from torch._dynamo.debug_utils import run_fwd_maybe_bwd

from torch.nn import *

class Repro(torch.nn.Module):
def __init__(self):
super().__init__()

def forward(self, add):
relu = torch.relu(add); add = None
return (relu,)

@ -161,7 +161,7 @@ Example (using a traced module):

class MyScriptModule(torch.nn.Module):
def __init__(self):
super(MyScriptModule, self).__init__()
super().__init__()
self.means = torch.nn.Parameter(torch.tensor([103.939, 116.779, 123.68])
.resize_(1, 3, 1, 1))
self.resnet = torch.jit.trace(torchvision.models.resnet18(),
@ -593,7 +593,7 @@ Q: How do I store attributes on a :class:`ScriptModule`?

class Model(torch.nn.Module):
def __init__(self):
super(Model, self).__init__()
super().__init__()
self.x = 2

def forward(self):
@ -672,7 +672,7 @@ The new usage looks like this:

class Model(nn.Module):
def __init__(self):
super(Model, self).__init__()
super().__init__()
self.conv1 = nn.Conv2d(1, 20, 5)
self.conv2 = nn.Conv2d(20, 20, 5)

@ -779,7 +779,7 @@ Old API:

class MyModule(torch.jit.ScriptModule):
def __init__(self):
super(MyModule, self).__init__()
super().__init__()
self.my_dict = torch.jit.Attribute({}, Dict[str, int])
self.my_int = torch.jit.Attribute(20, int)

@ -795,7 +795,7 @@ New API:
my_dict: Dict[str, int]

def __init__(self):
super(MyModule, self).__init__()
super().__init__()
# This type cannot be inferred and must be specified
self.my_dict = {}

@ -820,7 +820,7 @@ Old API:
__constants__ = ['my_constant']

def __init__(self):
super(MyModule, self).__init__()
super().__init__()
self.my_constant = 2

def forward(self):
@ -838,7 +838,7 @@ New API:
my_constant: Final[int]

def __init__(self):
super(MyModule, self).__init__()
super().__init__()
self.my_constant = 2

def forward(self):

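Both the attribute and constant examples touched above are meant to be compiled with `torch.jit.script`; a small end-to-end sketch in the new-API style (the module body here is a minimal assumption, not copied from the docs):

```python
import torch
from typing import Dict, Final

class MyModule(torch.nn.Module):
    my_dict: Dict[str, int]
    my_constant: Final[int]

    def __init__(self):
        super().__init__()
        self.my_dict = {}      # element types come from the class annotation
        self.my_constant = 2   # treated as a compile-time constant

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.my_constant

scripted = torch.jit.script(MyModule())
print(scripted(torch.ones(2)))  # tensor([3., 3.])
```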
@ -205,7 +205,7 @@ Example (type annotations for Python 3):

class EmptyDataStructures(torch.nn.Module):
def __init__(self):
super(EmptyDataStructures, self).__init__()
super().__init__()

def forward(self, x: torch.Tensor) -> Tuple[List[Tuple[int, float]], Dict[str, int]]:
# This annotates the list to be a `List[Tuple[int, float]]`
@ -249,7 +249,7 @@ Example (refining types on parameters and locals):
z: Optional[int]

def __init__(self, z):
super(M, self).__init__()
super().__init__()
# If `z` is None, its type cannot be inferred, so it must
# be specified (above)
self.z = z
@ -567,7 +567,7 @@ calling its ``forward`` method (e.g. ``self.resnet.forward(input)``).

class MyModule(nn.Module):
def __init__(self):
super(MyModule, self).__init__()
super().__init__()
means = torch.tensor([103.939, 116.779, 123.68])
self.means = torch.nn.Parameter(means.resize_(1, 3, 1, 1))
resnet = torchvision.models.resnet18()
@ -703,7 +703,7 @@ loop at compile time, with each member of the constant module list.

class SubModule(torch.nn.Module):
def __init__(self):
super(SubModule, self).__init__()
super().__init__()
self.weight = nn.Parameter(torch.randn(2))

def forward(self, input):
@ -713,7 +713,7 @@ loop at compile time, with each member of the constant module list.
__constants__ = ['mods']

def __init__(self):
super(MyModule, self).__init__()
super().__init__()
self.mods = torch.nn.ModuleList([SubModule() for i in range(10)])

def forward(self, v):
@ -853,7 +853,7 @@ value should be treated as a constant.
a : torch.jit.Final[int]

def __init__(self):
super(Foo, self).__init__()
super().__init__()
self.a = 1 + 4

def forward(self, input):
@ -906,7 +906,7 @@ Example:
some_dict: Dict[str, int]

def __init__(self, a_dict):
super(Foo, self).__init__()
super().__init__()
self.words = []
self.some_dict = a_dict

@ -1437,16 +1437,15 @@ For loops on lists: for loops over a ``nn.ModuleList`` will unroll the body of t

class SubModule(torch.nn.Module):
def __init__(self):
super(SubModule, self).__init__()
super().__init__()
self.weight = nn.Parameter(torch.randn(2))

def forward(self, input):
return self.weight + input

class MyModule(torch.nn.Module):

def __init__(self):
super(MyModule, self).init()
super().__init__()
self.mods = torch.nn.ModuleList([SubModule() for i in range(10)])

def forward(self, v):

@ -415,7 +415,7 @@ This is how a ``Linear`` module can be implemented::

class Linear(nn.Module):
def __init__(self, input_features, output_features, bias=True):
super(Linear, self).__init__()
super().__init__()
self.input_features = input_features
self.output_features = output_features

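The hunk cuts off after the feature sizes are stored; a minimal, self-contained completion (an assumption for illustration — the actual documentation continues with a custom autograd `Function`) might register the parameters and a plain affine forward:

```python
import torch
import torch.nn as nn

class Linear(nn.Module):
    def __init__(self, input_features, output_features, bias=True):
        super().__init__()
        self.input_features = input_features
        self.output_features = output_features
        # Parameters registered the usual nn.Module way.
        self.weight = nn.Parameter(torch.randn(output_features, input_features) * 0.01)
        if bias:
            self.bias = nn.Parameter(torch.zeros(output_features))
        else:
            self.register_parameter("bias", None)

    def forward(self, input):
        out = input @ self.weight.t()
        return out if self.bias is None else out + self.bias

y = Linear(5, 3)(torch.randn(2, 5))  # -> shape (2, 3)
```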
@ -153,7 +153,7 @@ can use this pattern:
# A module with two linear layers
>>> class MyModule(torch.nn.Module):
def __init__(self):
super(MyModule, self).__init__()
super().__init__()
self.l0 = torch.nn.Linear(4, 2)
self.l1 = torch.nn.Linear(2, 1)

@ -218,7 +218,7 @@ this:
# A module with control flow
>>> class ControlFlowModule(torch.nn.Module):
def __init__(self):
super(ControlFlowModule, self).__init__()
super().__init__()
self.l0 = torch.nn.Linear(4, 2)
self.l1 = torch.nn.Linear(2, 1)

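The two FX hunks define a plain two-layer module and a module with control flow; a short usage sketch of how such a module is typically traced (`torch.fx.symbolic_trace` is the real API, while the `forward` below is an assumed completion of the two-layer module from the hunk):

```python
import torch
from torch.fx import symbolic_trace

class MyModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.l0 = torch.nn.Linear(4, 2)
        self.l1 = torch.nn.Linear(2, 1)

    def forward(self, x):
        # Straight-line data flow only; data-dependent control flow (as in
        # ControlFlowModule above) is what symbolic tracing cannot capture.
        return self.l1(self.l0(x))

traced = symbolic_trace(MyModule())
print(traced.graph)  # the recorded ops
print(traced.code)   # generated Python for the traced forward
```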