Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
[BE] Enable ruff's UP rules and autoformat optim/ (#105426)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/105426 Approved by: https://github.com/malfet, https://github.com/albanD, https://github.com/aaronenyeshi, https://github.com/janeyx99
Commit: 3721fa5612
Parent: be03a56955
Committed by: PyTorch MergeBot
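The diff below is almost entirely mechanical rewrites produced by pyupgrade-style fixes: `str.format()` calls become f-strings, plus a few related cleanups. As a rough, hedged sketch of how such a pass is typically configured and run — the exact configuration and command used for this PR are not shown in this excerpt and are assumptions:

    # assumed ruff configuration (e.g. in pyproject.toml)
    [tool.ruff]
    select = ["UP"]    # enable pyupgrade ("UP") rules, e.g. UP032: prefer f-strings over str.format()

    # assumed invocation to autoformat a subtree
    ruff check --select UP --fix torch/optim/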
@@ -83,7 +83,7 @@ def test_biject_to(constraint_fn, args, is_cuda):
t = biject_to(constraint)
except NotImplementedError:
pytest.skip('`biject_to` not implemented.')
- assert t.bijective, "biject_to({}) is not bijective".format(constraint)
+ assert t.bijective, f"biject_to({constraint}) is not bijective"
if constraint_fn is constraints.corr_cholesky:
# (D * (D-1)) / 2 (where D = 4) = 6 (size of last dim)
x = torch.randn(6, 6, dtype=torch.double)
@@ -93,12 +93,12 @@ def test_biject_to(constraint_fn, args, is_cuda):
x = x.cuda()
y = t(x)
assert constraint.check(y).all(), '\n'.join([
- "Failed to biject_to({})".format(constraint),
- "x = {}".format(x),
- "biject_to(...)(x) = {}".format(y),
+ f"Failed to biject_to({constraint})",
+ f"x = {x}",
+ f"biject_to(...)(x) = {y}",
])
x2 = t.inv(y)
- assert torch.allclose(x, x2), "Error in biject_to({}) inverse".format(constraint)
+ assert torch.allclose(x, x2), f"Error in biject_to({constraint}) inverse"

j = t.log_abs_det_jacobian(x, y)
assert j.shape == x.shape[:x.dim() - t.domain.event_dim]
@@ -119,10 +119,10 @@ def test_transform_to(constraint_fn, args, is_cuda):
if is_cuda:
x = x.cuda()
y = t(x)
- assert constraint.check(y).all(), "Failed to transform_to({})".format(constraint)
+ assert constraint.check(y).all(), f"Failed to transform_to({constraint})"
x2 = t.inv(y)
y2 = t(x2)
- assert torch.allclose(y, y2), "Error in transform_to({}) pseudoinverse".format(constraint)
+ assert torch.allclose(y, y2), f"Error in transform_to({constraint}) pseudoinverse"


if __name__ == "__main__":
@@ -862,7 +862,7 @@ class TestDistributions(DistributionsTestCase):
bins = samples.reshape((num_bins, samples_per_bin)).mean(axis=1)
stddev = samples_per_bin ** -0.5
threshold = stddev * scipy.special.erfinv(1 - 2 * failure_rate / num_bins)
- message = '{}.sample() is biased:\n{}'.format(message, bins)
+ message = f'{message}.sample() is biased:\n{bins}'
for bias in bins:
self.assertLess(-threshold, bias, message)
self.assertLess(bias, threshold, message)
@@ -971,7 +971,7 @@ class TestDistributions(DistributionsTestCase):
if isinstance(Dist, type) and issubclass(Dist, Distribution) \
and Dist is not Distribution and Dist is not ExponentialFamily:
self.assertIn(Dist, distributions_with_examples,
- "Please add {} to the EXAMPLES list in test_distributions.py".format(Dist.__name__))
+ f"Please add {Dist.__name__} to the EXAMPLES list in test_distributions.py")

def test_support_attributes(self):
for Dist, params in EXAMPLES:
@@ -1120,7 +1120,7 @@ class TestDistributions(DistributionsTestCase):
for prob in [0.01, 0.18, 0.8]:
self._check_sampler_discrete(Geometric(prob),
scipy.stats.geom(p=prob, loc=-1),
- 'Geometric(prob={})'.format(prob))
+ f'Geometric(prob={prob})')

def test_binomial(self):
p = torch.arange(0.05, 1, 0.1).requires_grad_()
@@ -1136,7 +1136,7 @@ class TestDistributions(DistributionsTestCase):
for count in [2, 10, 100, 500]:
self._check_sampler_discrete(Binomial(total_count=count, probs=prob),
scipy.stats.binom(count, prob),
- 'Binomial(total_count={}, probs={})'.format(count, prob))
+ f'Binomial(total_count={count}, probs={prob})')

@unittest.skipIf(not TEST_NUMPY, "NumPy not found")
def test_binomial_log_prob_and_entropy(self):
@@ -1431,7 +1431,7 @@ class TestDistributions(DistributionsTestCase):
for rate in [0.1, 1.0, 5.0]:
self._check_sampler_discrete(Poisson(rate),
scipy.stats.poisson(rate),
- 'Poisson(lambda={})'.format(rate),
+ f'Poisson(lambda={rate})',
failure_rate=1e-3)

@unittest.skipIf(not TEST_CUDA, "CUDA not found")
@@ -1441,7 +1441,7 @@ class TestDistributions(DistributionsTestCase):
for rate in [0.12, 0.9, 4.0]:
self._check_sampler_discrete(Poisson(torch.tensor([rate]).cuda()),
scipy.stats.poisson(rate),
- 'Poisson(lambda={}, cuda)'.format(rate),
+ f'Poisson(lambda={rate}, cuda)',
failure_rate=1e-3)

def test_relaxed_bernoulli(self):
@@ -1476,7 +1476,7 @@ class TestDistributions(DistributionsTestCase):
for probs, temp in product([0.1, 0.2, 0.8], [0.1, 1.0, 10.0]):
self._check_sampler_discrete(Rounded(RelaxedBernoulli(temp, probs)),
scipy.stats.bernoulli(probs),
- 'Rounded(RelaxedBernoulli(temp={}, probs={}))'.format(temp, probs),
+ f'Rounded(RelaxedBernoulli(temp={temp}, probs={probs}))',
failure_rate=1e-3)

for probs in [0.001, 0.2, 0.999]:
@@ -1534,7 +1534,7 @@ class TestDistributions(DistributionsTestCase):
for probs, temp in product([torch.tensor([0.1, 0.9]), torch.tensor([0.2, 0.2, 0.6])], [0.1, 1.0, 10.0]):
self._check_sampler_discrete(ArgMax(RelaxedOneHotCategorical(temp, probs)),
ScipyCategorical(scipy.stats.multinomial(1, probs)),
- 'Rounded(RelaxedOneHotCategorical(temp={}, probs={}))'.format(temp, probs),
+ f'Rounded(RelaxedOneHotCategorical(temp={temp}, probs={probs}))',
failure_rate=1e-3)

for probs in [torch.tensor([0.1, 0.9]), torch.tensor([0.2, 0.2, 0.6])]:
@@ -1588,7 +1588,7 @@ class TestDistributions(DistributionsTestCase):
for concentration in [0.03, 0.3, 1.0, 10.0, 100.0]:
self._check_sampler_sampler(VonMises(loc, concentration),
scipy.stats.vonmises(loc=loc, kappa=concentration),
- "VonMises(loc={}, concentration={})".format(loc, concentration),
+ f"VonMises(loc={loc}, concentration={concentration})",
num_samples=int(1e5), circular=True)

def test_vonmises_logprob(self):
@@ -1694,7 +1694,7 @@ class TestDistributions(DistributionsTestCase):
for std in [0.1, 1.0, 10.0]:
self._check_sampler_sampler(HalfNormal(std),
scipy.stats.halfnorm(scale=std),
- 'HalfNormal(scale={})'.format(std))
+ f'HalfNormal(scale={std})')

def test_lognormal(self):
mean = torch.randn(5, 5, requires_grad=True)
@@ -1746,7 +1746,7 @@ class TestDistributions(DistributionsTestCase):
for mean, std in product([-1.0, 0.0, 1.0], [0.1, 1.0, 10.0]):
self._check_sampler_sampler(LogNormal(mean, std),
scipy.stats.lognorm(scale=math.exp(mean), s=std),
- 'LogNormal(loc={}, scale={})'.format(mean, std))
+ f'LogNormal(loc={mean}, scale={std})')

def test_logisticnormal(self):
set_rng_seed(1) # see Note [Randomized statistical tests]
@@ -1814,7 +1814,7 @@ class TestDistributions(DistributionsTestCase):
std_th = torch.tensor(np.sqrt(np.diag(cov)))
self._check_sampler_sampler(
LogisticNormal(mean_th, std_th), ref_dist,
- 'LogisticNormal(loc={}, scale={})'.format(mean_th, std_th),
+ f'LogisticNormal(loc={mean_th}, scale={std_th})',
multivariate=True)

def test_mixture_same_family_shape(self):
@@ -1958,7 +1958,7 @@ class TestDistributions(DistributionsTestCase):
for loc, scale in product([-1.0, 0.0, 1.0], [0.1, 1.0, 10.0]):
self._check_sampler_sampler(Normal(loc, scale),
scipy.stats.norm(loc=loc, scale=scale),
- 'Normal(mean={}, std={})'.format(loc, scale))
+ f'Normal(mean={loc}, std={scale})')

def test_lowrank_multivariate_normal_shape(self):
mean = torch.randn(5, 3, requires_grad=True)
@@ -2191,15 +2191,15 @@ class TestDistributions(DistributionsTestCase):

self._check_sampler_sampler(MultivariateNormal(mean, cov),
scipy.stats.multivariate_normal(mean.detach().numpy(), cov.detach().numpy()),
- 'MultivariateNormal(loc={}, cov={})'.format(mean, cov),
+ f'MultivariateNormal(loc={mean}, cov={cov})',
multivariate=True)
self._check_sampler_sampler(MultivariateNormal(mean, precision_matrix=prec),
scipy.stats.multivariate_normal(mean.detach().numpy(), cov.detach().numpy()),
- 'MultivariateNormal(loc={}, atol={})'.format(mean, prec),
+ f'MultivariateNormal(loc={mean}, atol={prec})',
multivariate=True)
self._check_sampler_sampler(MultivariateNormal(mean, scale_tril=scale_tril),
scipy.stats.multivariate_normal(mean.detach().numpy(), cov.detach().numpy()),
- 'MultivariateNormal(loc={}, scale_tril={})'.format(mean, scale_tril),
+ f'MultivariateNormal(loc={mean}, scale_tril={scale_tril})',
multivariate=True)

def test_multivariate_normal_properties(self):
@@ -2352,15 +2352,15 @@ class TestDistributions(DistributionsTestCase):

self._check_sampler_sampler(Wishart(df, cov),
ref_dist,
- 'Wishart(df={}, covariance_matrix={})'.format(df, cov),
+ f'Wishart(df={df}, covariance_matrix={cov})',
multivariate=True)
self._check_sampler_sampler(Wishart(df, precision_matrix=prec),
ref_dist,
- 'Wishart(df={}, precision_matrix={})'.format(df, prec),
+ f'Wishart(df={df}, precision_matrix={prec})',
multivariate=True)
self._check_sampler_sampler(Wishart(df, scale_tril=scale_tril),
ref_dist,
- 'Wishart(df={}, scale_tril={})'.format(df, scale_tril),
+ f'Wishart(df={df}, scale_tril={scale_tril})',
multivariate=True)

def test_wishart_properties(self):
@@ -2431,7 +2431,7 @@ class TestDistributions(DistributionsTestCase):
for rate in [1e-5, 1.0, 10.]:
self._check_sampler_sampler(Exponential(rate),
scipy.stats.expon(scale=1. / rate),
- 'Exponential(rate={})'.format(rate))
+ f'Exponential(rate={rate})')

def test_laplace(self):
loc = torch.randn(5, 5, requires_grad=True)
@@ -2482,7 +2482,7 @@ class TestDistributions(DistributionsTestCase):
for loc, scale in product([-1.0, 0.0, 1.0], [0.1, 1.0, 10.0]):
self._check_sampler_sampler(Laplace(loc, scale),
scipy.stats.laplace(loc=loc, scale=scale),
- 'Laplace(loc={}, scale={})'.format(loc, scale))
+ f'Laplace(loc={loc}, scale={scale})')

@unittest.skipIf(not TEST_NUMPY, "NumPy not found")
def test_gamma_shape(self):
@@ -2533,7 +2533,7 @@ class TestDistributions(DistributionsTestCase):
for alpha, beta in product([0.1, 1.0, 5.0], [0.1, 1.0, 10.0]):
self._check_sampler_sampler(Gamma(alpha, beta),
scipy.stats.gamma(alpha, scale=1.0 / beta),
- 'Gamma(concentration={}, rate={})'.format(alpha, beta))
+ f'Gamma(concentration={alpha}, rate={beta})')

@unittest.skipIf(not TEST_CUDA, "CUDA not found")
@unittest.skipIf(not TEST_NUMPY, "Numpy not found")
@@ -2543,7 +2543,7 @@ class TestDistributions(DistributionsTestCase):
a, b = torch.tensor([alpha]).cuda(), torch.tensor([beta]).cuda()
self._check_sampler_sampler(Gamma(a, b),
scipy.stats.gamma(alpha, scale=1.0 / beta),
- 'Gamma(alpha={}, beta={})'.format(alpha, beta),
+ f'Gamma(alpha={alpha}, beta={beta})',
failure_rate=1e-4)

@unittest.skipIf(not TEST_NUMPY, "NumPy not found")
@@ -2575,7 +2575,7 @@ class TestDistributions(DistributionsTestCase):
for scale, alpha in product([0.1, 1.0, 5.0], [0.1, 1.0, 10.0]):
self._check_sampler_sampler(Pareto(scale, alpha),
scipy.stats.pareto(alpha, scale=scale),
- 'Pareto(scale={}, alpha={})'.format(scale, alpha))
+ f'Pareto(scale={scale}, alpha={alpha})')

@unittest.skipIf(not TEST_NUMPY, "NumPy not found")
def test_gumbel(self):
@@ -2616,7 +2616,7 @@ class TestDistributions(DistributionsTestCase):
for loc, scale in product([-5.0, -1.0, -0.1, 0.1, 1.0, 5.0], [0.1, 1.0, 10.0]):
self._check_sampler_sampler(Gumbel(loc, scale),
scipy.stats.gumbel_r(loc=loc, scale=scale),
- 'Gumbel(loc={}, scale={})'.format(loc, scale))
+ f'Gumbel(loc={loc}, scale={scale})')

def test_kumaraswamy_shape(self):
concentration1 = torch.randn(2, 3).abs().requires_grad_()
@@ -2646,13 +2646,13 @@ class TestDistributions(DistributionsTestCase):
error = (expected - actual).abs()
max_error = max(error[error == error])
self.assertLess(max_error, 0.01,
- "Kumaraswamy example {}/{}, incorrect .mean".format(i + 1, len(cases)))
+ f"Kumaraswamy example {i + 1}/{len(cases)}, incorrect .mean")
expected = samples.var(0)
actual = m.variance
error = (expected - actual).abs()
max_error = max(error[error == error])
self.assertLess(max_error, 0.01,
- "Kumaraswamy example {}/{}, incorrect .variance".format(i + 1, len(cases)))
+ f"Kumaraswamy example {i + 1}/{len(cases)}, incorrect .variance")

@unittest.skipIf(not TEST_NUMPY, "NumPy not found")
def test_fishersnedecor(self):
@@ -2683,7 +2683,7 @@ class TestDistributions(DistributionsTestCase):
for df1, df2 in product([0.1, 0.5, 1.0, 5.0, 10.0], [0.1, 0.5, 1.0, 5.0, 10.0]):
self._check_sampler_sampler(FisherSnedecor(df1, df2),
scipy.stats.f(df1, df2),
- 'FisherSnedecor(loc={}, scale={})'.format(df1, df2))
+ f'FisherSnedecor(loc={df1}, scale={df2})')

@unittest.skipIf(not TEST_NUMPY, "NumPy not found")
def test_chi2_shape(self):
@@ -2710,7 +2710,7 @@ class TestDistributions(DistributionsTestCase):
for df in [0.1, 1.0, 5.0]:
self._check_sampler_sampler(Chi2(df),
scipy.stats.chi2(df),
- 'Chi2(df={})'.format(df))
+ f'Chi2(df={df})')

@unittest.skipIf(not TEST_NUMPY, "Numpy not found")
def test_studentT(self):
@@ -2740,7 +2740,7 @@ class TestDistributions(DistributionsTestCase):
for df, loc, scale in product([0.1, 1.0, 5.0, 10.0], [-1.0, 0.0, 1.0], [0.1, 1.0, 10.0]):
self._check_sampler_sampler(StudentT(df=df, loc=loc, scale=scale),
scipy.stats.t(df=df, loc=loc, scale=scale),
- 'StudentT(df={}, loc={}, scale={})'.format(df, loc, scale))
+ f'StudentT(df={df}, loc={loc}, scale={scale})')

@unittest.skipIf(not TEST_NUMPY, "Numpy not found")
def test_studentT_log_prob(self):
@@ -2793,7 +2793,7 @@ class TestDistributions(DistributionsTestCase):
alpha = torch.exp(torch.randn(3))
self._check_sampler_sampler(Dirichlet(alpha),
scipy.stats.dirichlet(alpha.numpy()),
- 'Dirichlet(alpha={})'.format(list(alpha)),
+ f'Dirichlet(alpha={list(alpha)})',
multivariate=True)

def test_dirichlet_mode(self):
@@ -2837,11 +2837,11 @@ class TestDistributions(DistributionsTestCase):
for con1, con0 in product([0.1, 1.0, 10.0], [0.1, 1.0, 10.0]):
self._check_sampler_sampler(Beta(con1, con0),
scipy.stats.beta(con1, con0),
- 'Beta(alpha={}, beta={})'.format(con1, con0))
+ f'Beta(alpha={con1}, beta={con0})')
# Check that small alphas do not cause NANs.
for Tensor in [torch.FloatTensor, torch.DoubleTensor]:
x = Beta(Tensor([1e-6]), Tensor([1e-6])).sample()[0]
- self.assertTrue(np.isfinite(x) and x > 0, 'Invalid Beta.sample(): {}'.format(x))
+ self.assertTrue(np.isfinite(x) and x > 0, f'Invalid Beta.sample(): {x}')

def test_beta_underflow(self):
# For low values of (alpha, beta), the gamma samples can underflow
@@ -2997,10 +2997,10 @@ class TestDistributions(DistributionsTestCase):
continue
rel_error = torch.abs(actual - samples) / (1e-10 + torch.abs(samples))
self.assertLess(rel_error.max(), 1e-4, msg='\n'.join([
- '{} example {}/{}, icdf(cdf(x)) != x'.format(Dist.__name__, i + 1, len(params)),
- 'x = {}'.format(samples),
- 'cdf(x) = {}'.format(cdf),
- 'icdf(cdf(x)) = {}'.format(actual),
+ f'{Dist.__name__} example {i + 1}/{len(params)}, icdf(cdf(x)) != x',
+ f'x = {samples}',
+ f'cdf(x) = {cdf}',
+ f'icdf(cdf(x)) = {actual}',
]))

@unittest.skipIf(not TEST_NUMPY, "NumPy not found")
@@ -3029,11 +3029,11 @@ class TestDistributions(DistributionsTestCase):
continue
cdfs_derivative = grad(cdfs.sum(), [samples])[0] # this should not be wrapped in torch.abs()
self.assertEqual(cdfs_derivative, pdfs, msg='\n'.join([
- '{} example {}/{}, d(cdf)/dx != pdf(x)'.format(Dist.__name__, i + 1, len(params)),
- 'x = {}'.format(samples),
- 'cdf = {}'.format(cdfs),
- 'pdf = {}'.format(pdfs),
- 'grad(cdf) = {}'.format(cdfs_derivative),
+ f'{Dist.__name__} example {i + 1}/{len(params)}, d(cdf)/dx != pdf(x)',
+ f'x = {samples}',
+ f'cdf = {cdfs}',
+ f'pdf = {pdfs}',
+ f'grad(cdf) = {cdfs_derivative}',
]))

def test_valid_parameter_broadcasting(self):
@@ -3144,13 +3144,13 @@ class TestDistributions(DistributionsTestCase):
for dist, expected_size in valid_examples:
actual_size = dist.sample().size()
self.assertEqual(actual_size, expected_size,
- msg='{} actual size: {} != expected size: {}'.format(dist, actual_size, expected_size))
+ msg=f'{dist} actual size: {actual_size} != expected size: {expected_size}')

sample_shape = torch.Size((2,))
expected_size = sample_shape + expected_size
actual_size = dist.sample(sample_shape).size()
self.assertEqual(actual_size, expected_size,
- msg='{} actual size: {} != expected size: {}'.format(dist, actual_size, expected_size))
+ msg=f'{dist} actual size: {actual_size} != expected size: {expected_size}')

def test_invalid_parameter_broadcasting(self):
# invalid broadcasting cases; should throw error
@@ -3303,13 +3303,13 @@ class TestRsample(DistributionsTestCase):
expected_grad = -cdf_alpha / cdf_x
rel_error = np.abs(actual_grad - expected_grad) / (expected_grad + 1e-30)
self.assertLess(np.max(rel_error), 0.0005, '\n'.join([
- 'Bad gradient dx/alpha for x ~ Gamma({}, 1)'.format(alpha),
- 'x {}'.format(x),
- 'expected {}'.format(expected_grad),
- 'actual {}'.format(actual_grad),
- 'rel error {}'.format(rel_error),
- 'max error {}'.format(rel_error.max()),
- 'at alpha={}, x={}'.format(alpha, x[rel_error.argmax()]),
+ f'Bad gradient dx/alpha for x ~ Gamma({alpha}, 1)',
+ f'x {x}',
+ f'expected {expected_grad}',
+ f'actual {actual_grad}',
+ f'rel error {rel_error}',
+ f'max error {rel_error.max()}',
+ f'at alpha={alpha}, x={x[rel_error.argmax()]}',
]))

@unittest.skipIf(not TEST_NUMPY, "NumPy not found")
@@ -3331,12 +3331,12 @@ class TestRsample(DistributionsTestCase):
expected_grad = -cdf_df / cdf_x
rel_error = np.abs(actual_grad - expected_grad) / (expected_grad + 1e-30)
self.assertLess(np.max(rel_error), 0.001, '\n'.join([
- 'Bad gradient dx/ddf for x ~ Chi2({})'.format(df),
- 'x {}'.format(x),
- 'expected {}'.format(expected_grad),
- 'actual {}'.format(actual_grad),
- 'rel error {}'.format(rel_error),
- 'max error {}'.format(rel_error.max()),
+ f'Bad gradient dx/ddf for x ~ Chi2({df})',
+ f'x {x}',
+ f'expected {expected_grad}',
+ f'actual {actual_grad}',
+ f'rel error {rel_error}',
+ f'max error {rel_error.max()}',
]))

@unittest.skipIf(not TEST_NUMPY, "NumPy not found")
@@ -3361,13 +3361,13 @@ class TestRsample(DistributionsTestCase):
expected_grad = -cdf_alpha / cdf_x
rel_error = np.abs(actual_grad - expected_grad) / (expected_grad + 1e-30)
self.assertLess(np.max(rel_error), 0.001, '\n'.join([
- 'Bad gradient dx[0]/dalpha[0] for Dirichlet([{}, {}, {}])'.format(a0, a1, a2),
- 'x {}'.format(x),
- 'expected {}'.format(expected_grad),
- 'actual {}'.format(actual_grad),
- 'rel error {}'.format(rel_error),
- 'max error {}'.format(rel_error.max()),
- 'at x={}'.format(x[rel_error.argmax()]),
+ f'Bad gradient dx[0]/dalpha[0] for Dirichlet([{a0}, {a1}, {a2}])',
+ f'x {x}',
+ f'expected {expected_grad}',
+ f'actual {actual_grad}',
+ f'rel error {rel_error}',
+ f'max error {rel_error.max()}',
+ f'at x={x[rel_error.argmax()]}',
]))

@unittest.skipIf(not TEST_NUMPY, "NumPy not found")
@@ -3391,13 +3391,13 @@ class TestRsample(DistributionsTestCase):
expected_grad = -cdf_alpha / cdf_x
rel_error = np.abs(actual_grad - expected_grad) / (expected_grad + 1e-30)
self.assertLess(np.max(rel_error), 0.005, '\n'.join([
- 'Bad gradient dx/dcon1 for x ~ Beta({}, {})'.format(con1, con0),
- 'x {}'.format(x),
- 'expected {}'.format(expected_grad),
- 'actual {}'.format(actual_grad),
- 'rel error {}'.format(rel_error),
- 'max error {}'.format(rel_error.max()),
- 'at x = {}'.format(x[rel_error.argmax()]),
+ f'Bad gradient dx/dcon1 for x ~ Beta({con1}, {con0})',
+ f'x {x}',
+ f'expected {expected_grad}',
+ f'actual {actual_grad}',
+ f'rel error {rel_error}',
+ f'max error {rel_error.max()}',
+ f'at x = {x[rel_error.argmax()]}',
]))

@unittest.skipIf(not TEST_NUMPY, "NumPy not found")
@@ -3421,13 +3421,13 @@ class TestRsample(DistributionsTestCase):
expected_grad = -cdf_beta / cdf_x
rel_error = np.abs(actual_grad - expected_grad) / (expected_grad + 1e-30)
self.assertLess(np.max(rel_error), 0.005, '\n'.join([
- 'Bad gradient dx/dcon0 for x ~ Beta({}, {})'.format(con1, con0),
- 'x {}'.format(x),
- 'expected {}'.format(expected_grad),
- 'actual {}'.format(actual_grad),
- 'rel error {}'.format(rel_error),
- 'max error {}'.format(rel_error.max()),
- 'at x = {!r}'.format(x[rel_error.argmax()]),
+ f'Bad gradient dx/dcon0 for x ~ Beta({con1}, {con0})',
+ f'x {x}',
+ f'expected {expected_grad}',
+ f'actual {actual_grad}',
+ f'rel error {rel_error}',
+ f'max error {rel_error.max()}',
+ f'at x = {x[rel_error.argmax()]!r}',
]))

def test_dirichlet_multivariate(self):
@@ -3485,8 +3485,8 @@ class TestRsample(DistributionsTestCase):
# expression in terms of log_prob rather than the less numerically stable log_prob.exp().
error = dlogp_da + (dlogp_dx * v).sum(-1) + div_v
self.assertLess(torch.abs(error).max(), 0.005, '\n'.join([
- 'Dirichlet([{}, {}, {}]) gradient violates continuity equation:'.format(a1, a2, a3),
- 'error = {}'.format(error),
+ f'Dirichlet([{a1}, {a2}, {a3}]) gradient violates continuity equation:',
+ f'error = {error}',
]))
@@ -4147,9 +4147,9 @@ class TestKL(DistributionsTestCase):
if error[error == error].max() < self.precision:
break
self.assertLess(error[error == error].max(), self.precision, '\n'.join([
- 'Incorrect KL({}, {}).'.format(type(p).__name__, type(q).__name__),
- 'Expected ({} Monte Carlo samples): {}'.format(denominator, expected),
- 'Actual (analytic): {}'.format(actual),
+ f'Incorrect KL({type(p).__name__}, {type(q).__name__}).',
+ f'Expected ({denominator} Monte Carlo samples): {expected}',
+ f'Actual (analytic): {actual}',
]))

# Multivariate normal has a separate Monte Carlo based test due to the requirement of random generation of
@@ -4174,9 +4174,9 @@ class TestKL(DistributionsTestCase):
if error[error == error].max() < self.precision:
break
self.assertLess(error[error == error].max(), self.precision, '\n'.join([
- 'Incorrect KL(MultivariateNormal, MultivariateNormal) instance {}/{}'.format(i + 1, n),
- 'Expected ({} Monte Carlo sample): {}'.format(denominator, expected),
- 'Actual (analytic): {}'.format(actual),
+ f'Incorrect KL(MultivariateNormal, MultivariateNormal) instance {i + 1}/{n}',
+ f'Expected ({denominator} Monte Carlo sample): {expected}',
+ f'Actual (analytic): {actual}',
]))

def test_kl_multivariate_normal_batched(self):
@@ -4223,23 +4223,23 @@ class TestKL(DistributionsTestCase):

error_lowrank_lowrank = torch.abs(actual_lowrank_lowrank - expected).max()
self.assertLess(error_lowrank_lowrank, self.precision, '\n'.join([
- 'Incorrect KL(LowRankMultivariateNormal, LowRankMultivariateNormal) instance {}/{}'.format(i + 1, n),
- 'Expected (from KL MultivariateNormal): {}'.format(expected),
- 'Actual (analytic): {}'.format(actual_lowrank_lowrank),
+ f'Incorrect KL(LowRankMultivariateNormal, LowRankMultivariateNormal) instance {i + 1}/{n}',
+ f'Expected (from KL MultivariateNormal): {expected}',
+ f'Actual (analytic): {actual_lowrank_lowrank}',
]))

error_lowrank_full = torch.abs(actual_lowrank_full - expected).max()
self.assertLess(error_lowrank_full, self.precision, '\n'.join([
- 'Incorrect KL(LowRankMultivariateNormal, MultivariateNormal) instance {}/{}'.format(i + 1, n),
- 'Expected (from KL MultivariateNormal): {}'.format(expected),
- 'Actual (analytic): {}'.format(actual_lowrank_full),
+ f'Incorrect KL(LowRankMultivariateNormal, MultivariateNormal) instance {i + 1}/{n}',
+ f'Expected (from KL MultivariateNormal): {expected}',
+ f'Actual (analytic): {actual_lowrank_full}',
]))

error_full_lowrank = torch.abs(actual_full_lowrank - expected).max()
self.assertLess(error_full_lowrank, self.precision, '\n'.join([
- 'Incorrect KL(MultivariateNormal, LowRankMultivariateNormal) instance {}/{}'.format(i + 1, n),
- 'Expected (from KL MultivariateNormal): {}'.format(expected),
- 'Actual (analytic): {}'.format(actual_full_lowrank),
+ f'Incorrect KL(MultivariateNormal, LowRankMultivariateNormal) instance {i + 1}/{n}',
+ f'Expected (from KL MultivariateNormal): {expected}',
+ f'Actual (analytic): {actual_full_lowrank}',
]))

def test_kl_lowrank_multivariate_normal_batched(self):
@@ -4261,16 +4261,16 @@ class TestKL(DistributionsTestCase):
actual = kl_divergence(p, q)
expected = _kl_expfamily_expfamily(p, q)
self.assertEqual(actual, expected, msg='\n'.join([
- 'Incorrect KL({}, {}).'.format(type(p).__name__, type(q).__name__),
- 'Expected (using Bregman Divergence) {}'.format(expected),
- 'Actual (analytic) {}'.format(actual),
- 'max error = {}'.format(torch.abs(actual - expected).max())
+ f'Incorrect KL({type(p).__name__}, {type(q).__name__}).',
+ f'Expected (using Bregman Divergence) {expected}',
+ f'Actual (analytic) {actual}',
+ f'max error = {torch.abs(actual - expected).max()}'
]))

def test_kl_infinite(self):
for p, q in self.infinite_examples:
self.assertTrue((kl_divergence(p, q) == inf).all(),
- 'Incorrect KL({}, {})'.format(type(p).__name__, type(q).__name__))
+ f'Incorrect KL({type(p).__name__}, {type(q).__name__})')

def test_kl_edgecases(self):
self.assertEqual(kl_divergence(Bernoulli(0), Bernoulli(0)), 0)
@@ -4287,9 +4287,9 @@ class TestKL(DistributionsTestCase):
continue
expected_shape = dist.batch_shape if dist.batch_shape else torch.Size()
self.assertEqual(kl.shape, expected_shape, msg='\n'.join([
- '{} example {}/{}'.format(Dist.__name__, i + 1, len(params)),
- 'Expected {}'.format(expected_shape),
- 'Actual {}'.format(kl.shape),
+ f'{Dist.__name__} example {i + 1}/{len(params)}',
+ f'Expected {expected_shape}',
+ f'Actual {kl.shape}',
]))

def test_kl_transformed(self):
@@ -4316,10 +4316,10 @@ class TestKL(DistributionsTestCase):
ignore = (expected == inf) | (expected == -inf)
expected[ignore] = actual[ignore]
self.assertEqual(actual, expected, atol=0.2, rtol=0, msg='\n'.join([
- '{} example {}/{}, incorrect .entropy().'.format(Dist.__name__, i + 1, len(params)),
- 'Expected (monte carlo) {}'.format(expected),
- 'Actual (analytic) {}'.format(actual),
- 'max error = {}'.format(torch.abs(actual - expected).max()),
+ f'{Dist.__name__} example {i + 1}/{len(params)}, incorrect .entropy().',
+ f'Expected (monte carlo) {expected}',
+ f'Actual (analytic) {actual}',
+ f'max error = {torch.abs(actual - expected).max()}',
]))

def test_entropy_exponential_family(self):
@@ -4337,10 +4337,10 @@ class TestKL(DistributionsTestCase):
except NotImplementedError:
continue
self.assertEqual(actual, expected, msg='\n'.join([
- '{} example {}/{}, incorrect .entropy().'.format(Dist.__name__, i + 1, len(params)),
- 'Expected (Bregman Divergence) {}'.format(expected),
- 'Actual (analytic) {}'.format(actual),
- 'max error = {}'.format(torch.abs(actual - expected).max())
+ f'{Dist.__name__} example {i + 1}/{len(params)}, incorrect .entropy().',
+ f'Expected (Bregman Divergence) {expected}',
+ f'Actual (analytic) {actual}',
+ f'max error = {torch.abs(actual - expected).max()}'
]))
@@ -4632,7 +4632,7 @@ class TestLazyLogitsInitialization(DistributionsTestCase):
dist = Dist(**param)
# Create new instance to generate a valid sample
dist.log_prob(Dist(**param).sample())
- message = 'Failed for {} example 0/{}'.format(Dist.__name__, len(params))
+ message = f'Failed for {Dist.__name__} example 0/{len(params)}'
self.assertNotIn('probs', dist.__dict__, msg=message)
try:
dist.enumerate_support()
@@ -4649,7 +4649,7 @@ class TestLazyLogitsInitialization(DistributionsTestCase):
continue
dist = Dist(**param)
dist.sample()
- message = 'Failed for {} example 0/{}'.format(Dist.__name__, len(params))
+ message = f'Failed for {Dist.__name__} example 0/{len(params)}'
self.assertNotIn('logits', dist.__dict__, msg=message)
try:
dist.enumerate_support()
@@ -5161,7 +5161,7 @@ class TestJit(DistributionsTestCase):
expected = f(sample, *values)
actual = traced_f(sample, *values)
self.assertEqual(expected, actual,
- msg='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual))
+ msg=f'{Dist.__name__}\nExpected:\n{expected}\nActual:\n{actual}')

def test_enumerate_support(self):
for Dist, keys, values, sample in self._examples():
@@ -5185,7 +5185,7 @@ class TestJit(DistributionsTestCase):
expected = f(*values)
actual = traced_f(*values)
self.assertEqual(expected, actual,
- msg='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual))
+ msg=f'{Dist.__name__}\nExpected:\n{expected}\nActual:\n{actual}')

def test_mean(self):
for Dist, keys, values, sample in self._examples():
@@ -5207,7 +5207,7 @@ class TestJit(DistributionsTestCase):
expected[expected == float('inf')] = 0.
actual[actual == float('inf')] = 0.
self.assertEqual(expected, actual,
- msg='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual))
+ msg=f'{Dist.__name__}\nExpected:\n{expected}\nActual:\n{actual}')

def test_variance(self):
for Dist, keys, values, sample in self._examples():
@@ -5231,7 +5231,7 @@ class TestJit(DistributionsTestCase):
expected[expected == float('inf')] = 0.
actual[actual == float('inf')] = 0.
self.assertEqual(expected, actual,
- msg='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual))
+ msg=f'{Dist.__name__}\nExpected:\n{expected}\nActual:\n{actual}')

def test_entropy(self):
for Dist, keys, values, sample in self._examples():
@@ -5255,7 +5255,7 @@ class TestJit(DistributionsTestCase):
expected = f(*values)
actual = traced_f(*values)
self.assertEqual(expected, actual,
- msg='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual))
+ msg=f'{Dist.__name__}\nExpected:\n{expected}\nActual:\n{actual}')

def test_cdf(self):
for Dist, keys, values, sample in self._examples():
@@ -5276,7 +5276,7 @@ class TestJit(DistributionsTestCase):
expected = f(sample, *values)
actual = traced_f(sample, *values)
self.assertEqual(expected, actual,
- msg='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual))
+ msg=f'{Dist.__name__}\nExpected:\n{expected}\nActual:\n{actual}')


if __name__ == '__main__' and torch._C.has_lapack:
@@ -156,7 +156,7 @@ def generate_data(transform):
x /= x.norm(dim=-1, keepdim=True)
x.diagonal(dim1=-1).copy_(x.diagonal(dim1=-1).abs())
return x
- raise ValueError('Unsupported domain: {}'.format(domain))
+ raise ValueError(f'Unsupported domain: {domain}')


TRANSFORMS_CACHE_ACTIVE = get_transforms(cache_size=1)
@@ -215,19 +215,19 @@ def test_forward_inverse(transform, test_cached):
if transform.bijective:
# verify function inverse
assert torch.allclose(x2, x, atol=1e-4, equal_nan=True), '\n'.join([
- '{} t.inv(t(-)) error'.format(transform),
- 'x = {}'.format(x),
- 'y = t(x) = {}'.format(y),
- 'x2 = t.inv(y) = {}'.format(x2),
+ f'{transform} t.inv(t(-)) error',
+ f'x = {x}',
+ f'y = t(x) = {y}',
+ f'x2 = t.inv(y) = {x2}',
])
else:
# verify weaker function pseudo-inverse
assert torch.allclose(y2, y, atol=1e-4, equal_nan=True), '\n'.join([
- '{} t(t.inv(t(-))) error'.format(transform),
- 'x = {}'.format(x),
- 'y = t(x) = {}'.format(y),
- 'x2 = t.inv(y) = {}'.format(x2),
- 'y2 = t(x2) = {}'.format(y2),
+ f'{transform} t(t.inv(t(-))) error',
+ f'x = {x}',
+ f'y = t(x) = {y}',
+ f'x2 = t.inv(y) = {x2}',
+ f'y2 = t(x2) = {y2}',
])
@@ -1701,8 +1701,8 @@ class TestOptim(TestCase):

num_tensors = 5
for functional_optim, amsgrad, no_grad_scale in itertools.product((adam.adam, adamw.adamw), (False, True), (False, True)):
- params, grads, exp_avgs, exp_avg_sqs = [
- [torch.ones((1,), device="cuda") for _ in range(num_tensors)] for _ in range(4)]
+ params, grads, exp_avgs, exp_avg_sqs = (
+ [torch.ones((1,), device="cuda") for _ in range(num_tensors)] for _ in range(4))
prev_params = [t.clone().detach() for t in params]
max_exp_avg_sqs = [torch.ones((1,), device="cuda") for _ in range(num_tensors)] if amsgrad else []
state_steps = [torch.ones((), dtype=torch.float32, device="cuda") for _ in range(num_tensors)]
@@ -258,7 +258,7 @@ class _IntegerInterval(Constraint):

def __repr__(self):
fmt_string = self.__class__.__name__[1:]
- fmt_string += '(lower_bound={}, upper_bound={})'.format(self.lower_bound, self.upper_bound)
+ fmt_string += f'(lower_bound={self.lower_bound}, upper_bound={self.upper_bound})'
return fmt_string


@@ -277,7 +277,7 @@ class _IntegerLessThan(Constraint):

def __repr__(self):
fmt_string = self.__class__.__name__[1:]
- fmt_string += '(upper_bound={})'.format(self.upper_bound)
+ fmt_string += f'(upper_bound={self.upper_bound})'
return fmt_string


@@ -296,7 +296,7 @@ class _IntegerGreaterThan(Constraint):

def __repr__(self):
fmt_string = self.__class__.__name__[1:]
- fmt_string += '(lower_bound={})'.format(self.lower_bound)
+ fmt_string += f'(lower_bound={self.lower_bound})'
return fmt_string


@@ -321,7 +321,7 @@ class _GreaterThan(Constraint):

def __repr__(self):
fmt_string = self.__class__.__name__[1:]
- fmt_string += '(lower_bound={})'.format(self.lower_bound)
+ fmt_string += f'(lower_bound={self.lower_bound})'
return fmt_string


@@ -338,7 +338,7 @@ class _GreaterThanEq(Constraint):

def __repr__(self):
fmt_string = self.__class__.__name__[1:]
- fmt_string += '(lower_bound={})'.format(self.lower_bound)
+ fmt_string += f'(lower_bound={self.lower_bound})'
return fmt_string


@@ -355,7 +355,7 @@ class _LessThan(Constraint):

def __repr__(self):
fmt_string = self.__class__.__name__[1:]
- fmt_string += '(upper_bound={})'.format(self.upper_bound)
+ fmt_string += f'(upper_bound={self.upper_bound})'
return fmt_string


@@ -373,7 +373,7 @@ class _Interval(Constraint):

def __repr__(self):
fmt_string = self.__class__.__name__[1:]
- fmt_string += '(lower_bound={}, upper_bound={})'.format(self.lower_bound, self.upper_bound)
+ fmt_string += f'(lower_bound={self.lower_bound}, upper_bound={self.upper_bound})'
return fmt_string


@@ -391,7 +391,7 @@ class _HalfOpenInterval(Constraint):

def __repr__(self):
fmt_string = self.__class__.__name__[1:]
- fmt_string += '(lower_bound={}, upper_bound={})'.format(self.lower_bound, self.upper_bound)
+ fmt_string += f'(lower_bound={self.lower_bound}, upper_bound={self.upper_bound})'
return fmt_string
@@ -109,4 +109,4 @@ class Independent(Distribution):
return self.base_dist.enumerate_support(expand=expand)

def __repr__(self):
- return self.__class__.__name__ + '({}, {})'.format(self.base_dist, self.reinterpreted_batch_ndims)
+ return self.__class__.__name__ + f'({self.base_dist}, {self.reinterpreted_batch_ndims})'
@@ -65,9 +65,9 @@ def register_kl(type_p, type_q):
type_q (type): A subclass of :class:`~torch.distributions.Distribution`.
"""
if not isinstance(type_p, type) and issubclass(type_p, Distribution):
- raise TypeError('Expected type_p to be a Distribution subclass but got {}'.format(type_p))
+ raise TypeError(f'Expected type_p to be a Distribution subclass but got {type_p}')
if not isinstance(type_q, type) and issubclass(type_q, Distribution):
- raise TypeError('Expected type_q to be a Distribution subclass but got {}'.format(type_q))
+ raise TypeError(f'Expected type_q to be a Distribution subclass but got {type_q}')

def decorator(fun):
_KL_REGISTRY[type_p, type_q] = fun
@@ -735,7 +735,7 @@ def _kl_uniform_beta(p, q):
common_term = p.high - p.low
t1 = torch.log(common_term)
t2 = (q.concentration1 - 1) * (_x_log_x(p.high) - _x_log_x(p.low) - common_term) / common_term
- t3 = (q.concentration0 - 1) * (_x_log_x((1 - p.high)) - _x_log_x((1 - p.low)) + common_term) / common_term
+ t3 = (q.concentration0 - 1) * (_x_log_x(1 - p.high) - _x_log_x(1 - p.low) + common_term) / common_term
t4 = q.concentration1.lgamma() + q.concentration0.lgamma() - (q.concentration1 + q.concentration0).lgamma()
result = t3 + t4 - t1 - t2
result[(p.high > q.support.upper_bound) | (p.low < q.support.lower_bound)] = inf
@@ -93,7 +93,7 @@ class LowRankMultivariateNormal(Distribution):
raise ValueError("cov_factor must be a batch of matrices with shape {} x m"
.format(event_shape[0]))
if cov_diag.shape[-1:] != event_shape:
- raise ValueError("cov_diag must be a batch of vectors with shape {}".format(event_shape))
+ raise ValueError(f"cov_diag must be a batch of vectors with shape {event_shape}")

loc_ = loc.unsqueeze(-1)
cov_diag_ = cov_diag.unsqueeze(-1)
@@ -71,17 +71,17 @@ class MixtureSameFamily(Distribution):
cdbs = self._component_distribution.batch_shape[:-1]
for size1, size2 in zip(reversed(mdbs), reversed(cdbs)):
if size1 != 1 and size2 != 1 and size1 != size2:
- raise ValueError("`mixture_distribution.batch_shape` ({0}) is not "
+ raise ValueError(f"`mixture_distribution.batch_shape` ({mdbs}) is not "
"compatible with `component_distribution."
- "batch_shape`({1})".format(mdbs, cdbs))
+ f"batch_shape`({cdbs})")

# Check that the number of mixture component matches
km = self._mixture_distribution.logits.shape[-1]
kc = self._component_distribution.batch_shape[-1]
if km is not None and kc is not None and km != kc:
- raise ValueError("`mixture_distribution component` ({0}) does not"
+ raise ValueError(f"`mixture_distribution component` ({km}) does not"
" equal `component_distribution.batch_shape[-1]`"
- " ({1})".format(km, kc))
+ f" ({kc})")
self._num_component = km

event_shape = self._component_distribution.event_shape
@@ -51,7 +51,7 @@ class TransformedDistribution(Distribution):
raise ValueError("transforms must be a Transform or a list of Transforms")
self.transforms = transforms
else:
- raise ValueError("transforms must be a Transform or list, but was {}".format(transforms))
+ raise ValueError(f"transforms must be a Transform or list, but was {transforms}")

# Reshape base_distribution according to transforms.
base_shape = base_distribution.batch_shape + base_distribution.event_shape
@@ -135,7 +135,7 @@ class Transform:
return self
if type(self).__init__ is Transform.__init__:
return type(self)(cache_size=cache_size)
- raise NotImplementedError("{}.with_cache is not implemented".format(type(self)))
+ raise NotImplementedError(f"{type(self)}.with_cache is not implemented")

def __eq__(self, other):
return self is other
@@ -506,7 +506,7 @@ class ReshapeTransform(Transform):
raise ValueError("Too few dimensions on input")
cut = len(shape) - len(self.in_shape)
if shape[cut:] != self.in_shape:
- raise ValueError("Shape mismatch: expected {} but got {}".format(shape[cut:], self.in_shape))
+ raise ValueError(f"Shape mismatch: expected {shape[cut:]} but got {self.in_shape}")
return shape[:cut] + self.out_shape

def inverse_shape(self, shape):
@@ -514,7 +514,7 @@ class ReshapeTransform(Transform):
raise ValueError("Too few dimensions on input")
cut = len(shape) - len(self.out_shape)
if shape[cut:] != self.out_shape:
- raise ValueError("Shape mismatch: expected {} but got {}".format(shape[cut:], self.out_shape))
+ raise ValueError(f"Shape mismatch: expected {shape[cut:]} but got {self.out_shape}")
return shape[:cut] + self.in_shape
@@ -22,13 +22,13 @@ class Adadelta(Optimizer):
differentiable: bool = False,
):
if not 0.0 <= lr:
- raise ValueError("Invalid learning rate: {}".format(lr))
+ raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= rho <= 1.0:
- raise ValueError("Invalid rho value: {}".format(rho))
+ raise ValueError(f"Invalid rho value: {rho}")
if not 0.0 <= eps:
- raise ValueError("Invalid epsilon value: {}".format(eps))
+ raise ValueError(f"Invalid epsilon value: {eps}")
if not 0.0 <= weight_decay:
- raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+ raise ValueError(f"Invalid weight_decay value: {weight_decay}")

defaults = dict(
lr=lr,
@@ -23,11 +23,11 @@ class Adagrad(Optimizer):
differentiable: bool = False,
):
if not 0.0 <= lr:
- raise ValueError("Invalid learning rate: {}".format(lr))
+ raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= lr_decay:
- raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
+ raise ValueError(f"Invalid lr_decay value: {lr_decay}")
if not 0.0 <= weight_decay:
- raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+ raise ValueError(f"Invalid weight_decay value: {weight_decay}")
if not 0.0 <= initial_accumulator_value:
raise ValueError(
"Invalid initial_accumulator_value value: {}".format(
@@ -35,7 +35,7 @@ class Adagrad(Optimizer):
)
)
if not 0.0 <= eps:
- raise ValueError("Invalid epsilon value: {}".format(eps))
+ raise ValueError(f"Invalid epsilon value: {eps}")

defaults = dict(
lr=lr,
@@ -16,15 +16,15 @@ class Adam(Optimizer):
maximize: bool = False, capturable: bool = False,
differentiable: bool = False, fused: Optional[bool] = None):
if not 0.0 <= lr:
- raise ValueError("Invalid learning rate: {}".format(lr))
+ raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= eps:
- raise ValueError("Invalid epsilon value: {}".format(eps))
+ raise ValueError(f"Invalid epsilon value: {eps}")
if not 0.0 <= betas[0] < 1.0:
- raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+ raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
if not 0.0 <= betas[1] < 1.0:
- raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+ raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
if not 0.0 <= weight_decay:
- raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+ raise ValueError(f"Invalid weight_decay value: {weight_decay}")

defaults = dict(lr=lr, betas=betas, eps=eps,
weight_decay=weight_decay, amsgrad=amsgrad,
@@ -22,15 +22,15 @@ class Adamax(Optimizer):
differentiable: bool = False,
):
if not 0.0 <= lr:
- raise ValueError("Invalid learning rate: {}".format(lr))
+ raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= eps:
- raise ValueError("Invalid epsilon value: {}".format(eps))
+ raise ValueError(f"Invalid epsilon value: {eps}")
if not 0.0 <= betas[0] < 1.0:
- raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+ raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
if not 0.0 <= betas[1] < 1.0:
- raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+ raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
if not 0.0 <= weight_decay:
- raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+ raise ValueError(f"Invalid weight_decay value: {weight_decay}")

defaults = dict(
lr=lr,
@@ -26,15 +26,15 @@ class AdamW(Optimizer):
fused: Optional[bool] = None,
):
if not 0.0 <= lr:
- raise ValueError("Invalid learning rate: {}".format(lr))
+ raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= eps:
- raise ValueError("Invalid epsilon value: {}".format(eps))
+ raise ValueError(f"Invalid epsilon value: {eps}")
if not 0.0 <= betas[0] < 1.0:
- raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+ raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
if not 0.0 <= betas[1] < 1.0:
- raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+ raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
if not 0.0 <= weight_decay:
- raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+ raise ValueError(f"Invalid weight_decay value: {weight_decay}")
defaults = dict(
lr=lr,
betas=betas,
@@ -28,9 +28,9 @@ class ASGD(Optimizer):
differentiable: bool = False,
):
if not 0.0 <= lr:
- raise ValueError("Invalid learning rate: {}".format(lr))
+ raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= weight_decay:
- raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+ raise ValueError(f"Invalid weight_decay value: {weight_decay}")

defaults = dict(
lr=lr,
@@ -1366,11 +1366,11 @@ class CosineAnnealingWarmRestarts(LRScheduler):

def __init__(self, optimizer, T_0, T_mult=1, eta_min=0, last_epoch=-1, verbose=False):
if T_0 <= 0 or not isinstance(T_0, int):
- raise ValueError("Expected positive integer T_0, but got {}".format(T_0))
+ raise ValueError(f"Expected positive integer T_0, but got {T_0}")
if T_mult < 1 or not isinstance(T_mult, int):
- raise ValueError("Expected integer T_mult >= 1, but got {}".format(T_mult))
+ raise ValueError(f"Expected integer T_mult >= 1, but got {T_mult}")
if not isinstance(eta_min, (float, int)):
- raise ValueError("Expected float or int eta_min, but got {} of type {}".format(eta_min, type(eta_min)))
+ raise ValueError(f"Expected float or int eta_min, but got {eta_min} of type {type(eta_min)}")
self.T_0 = T_0
self.T_i = T_0
self.T_mult = T_mult
@@ -1425,7 +1425,7 @@ class CosineAnnealingWarmRestarts(LRScheduler):
self.T_i = self.T_i * self.T_mult
else:
if epoch < 0:
- raise ValueError("Expected non-negative epoch, but got {}".format(epoch))
+ raise ValueError(f"Expected non-negative epoch, but got {epoch}")
if epoch >= self.T_0:
if self.T_mult == 1:
self.T_cur = epoch % self.T_0
@@ -1590,13 +1590,13 @@ class OneCycleLR(LRScheduler):
raise ValueError("You must define either total_steps OR (epochs AND steps_per_epoch)")
elif total_steps is not None:
if total_steps <= 0 or not isinstance(total_steps, int):
- raise ValueError("Expected positive integer total_steps, but got {}".format(total_steps))
+ raise ValueError(f"Expected positive integer total_steps, but got {total_steps}")
self.total_steps = total_steps
else:
if epochs <= 0 or not isinstance(epochs, int):
- raise ValueError("Expected positive integer epochs, but got {}".format(epochs))
+ raise ValueError(f"Expected positive integer epochs, but got {epochs}")
if steps_per_epoch <= 0 or not isinstance(steps_per_epoch, int):
- raise ValueError("Expected positive integer steps_per_epoch, but got {}".format(steps_per_epoch))
+ raise ValueError(f"Expected positive integer steps_per_epoch, but got {steps_per_epoch}")
self.total_steps = epochs * steps_per_epoch

if three_phase:
@@ -1643,11 +1643,11 @@ class OneCycleLR(LRScheduler):

# Validate pct_start
if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float):
- raise ValueError("Expected float between 0 and 1 pct_start, but got {}".format(pct_start))
+ raise ValueError(f"Expected float between 0 and 1 pct_start, but got {pct_start}")

# Validate anneal_strategy
if anneal_strategy not in ['cos', 'linear']:
- raise ValueError("anneal_strategy must by one of 'cos' or 'linear', instead got {}".format(anneal_strategy))
+ raise ValueError(f"anneal_strategy must by one of 'cos' or 'linear', instead got {anneal_strategy}")
elif anneal_strategy == 'cos':
self.anneal_func = self._annealing_cos
elif anneal_strategy == 'linear':
@@ -11,17 +11,17 @@ class NAdam(Optimizer):
weight_decay=0, momentum_decay=4e-3, *, foreach: Optional[bool] = None,
differentiable: bool = False):
if not 0.0 <= lr:
- raise ValueError("Invalid learning rate: {}".format(lr))
+ raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= eps:
- raise ValueError("Invalid epsilon value: {}".format(eps))
+ raise ValueError(f"Invalid epsilon value: {eps}")
if not 0.0 <= betas[0] < 1.0:
- raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+ raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
if not 0.0 <= betas[1] < 1.0:
- raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+ raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
if not 0.0 <= weight_decay:
- raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+ raise ValueError(f"Invalid weight_decay value: {weight_decay}")
if not 0.0 <= momentum_decay:
- raise ValueError("Invalid momentum_decay value: {}".format(momentum_decay))
+ raise ValueError(f"Invalid momentum_decay value: {momentum_decay}")
defaults = dict(lr=lr, betas=betas, eps=eps,
weight_decay=weight_decay, momentum_decay=momentum_decay,
foreach=foreach, differentiable=differentiable)
@@ -246,10 +246,10 @@ class Optimizer:
format_string = self.__class__.__name__ + ' ('
for i, group in enumerate(self.param_groups):
format_string += '\n'
- format_string += 'Parameter Group {0}\n'.format(i)
+ format_string += f'Parameter Group {i}\n'
for key in sorted(group.keys()):
if key != 'params':
- format_string += ' {0}: {1}\n'.format(key, group[key])
+ format_string += f' {key}: {group[key]}\n'
format_string += ')'
return format_string

@@ -304,7 +304,7 @@ class Optimizer:
@functools.wraps(func)
def wrapper(*args, **kwargs):
self, *_ = args
- profile_name = "Optimizer.step#{}.step".format(self.__class__.__name__)
+ profile_name = f"Optimizer.step#{self.__class__.__name__}.step"
with torch.autograd.profiler.record_function(profile_name):
# call optimizer step pre hooks
for pre_hook in chain(_global_optimizer_pre_hooks.values(), self._optimizer_step_pre_hooks.values()):
@@ -337,7 +337,7 @@ class Optimizer:
return _group_tensors_by_device_and_dtype(tensorlistlist, with_indices)

def _patch_step_function(self):
- self._zero_grad_profile_name = "Optimizer.zero_grad#{}.zero_grad".format(self.__class__.__name__)
+ self._zero_grad_profile_name = f"Optimizer.zero_grad#{self.__class__.__name__}.zero_grad"
hooked = getattr(self.__class__.step, "hooked", None)
if not hooked:
self.__class__.step = self.profile_hook_step(self.__class__.step) # type: ignore[method-assign]
@@ -468,8 +468,8 @@ class Optimizer:
"that doesn't match the size of optimizer's group")

# Update the state
- id_map = dict(zip(chain.from_iterable((g['params'] for g in saved_groups)),
- chain.from_iterable((g['params'] for g in groups))))
+ id_map = dict(zip(chain.from_iterable(g['params'] for g in saved_groups),
+ chain.from_iterable(g['params'] for g in groups)))

def cast(param, value, param_id=None, param_groups=None, key=None):
r"""Make a deep copy of value, casting all tensors to device of param."""
@@ -22,15 +22,15 @@ class RAdam(Optimizer):
differentiable: bool = False,
):
if not 0.0 <= lr:
- raise ValueError("Invalid learning rate: {}".format(lr))
+ raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= eps:
- raise ValueError("Invalid epsilon value: {}".format(eps))
+ raise ValueError(f"Invalid epsilon value: {eps}")
if not 0.0 <= betas[0] < 1.0:
- raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+ raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
if not 0.0 <= betas[1] < 1.0:
- raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+ raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
if not 0.0 <= weight_decay:
- raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+ raise ValueError(f"Invalid weight_decay value: {weight_decay}")
defaults = dict(
lr=lr,
betas=betas,
@@ -22,15 +22,15 @@ class RMSprop(Optimizer):
differentiable: bool = False,
):
if not 0.0 <= lr:
- raise ValueError("Invalid learning rate: {}".format(lr))
+ raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= eps:
- raise ValueError("Invalid epsilon value: {}".format(eps))
+ raise ValueError(f"Invalid epsilon value: {eps}")
if not 0.0 <= momentum:
- raise ValueError("Invalid momentum value: {}".format(momentum))
+ raise ValueError(f"Invalid momentum value: {momentum}")
if not 0.0 <= weight_decay:
- raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+ raise ValueError(f"Invalid weight_decay value: {weight_decay}")
if not 0.0 <= alpha:
- raise ValueError("Invalid alpha value: {}".format(alpha))
+ raise ValueError(f"Invalid alpha value: {alpha}")

defaults = dict(
lr=lr,
@ -20,9 +20,9 @@ class Rprop(Optimizer):
differentiable: bool = False,
):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 < etas[0] < 1.0 < etas[1]:
raise ValueError("Invalid eta values: {}, {}".format(etas[0], etas[1]))
raise ValueError(f"Invalid eta values: {etas[0]}, {etas[1]}")

defaults = dict(
lr=lr,
@ -11,11 +11,11 @@ class SGD(Optimizer):
weight_decay=0, nesterov=False, *, maximize: bool = False, foreach: Optional[bool] = None,
differentiable: bool = False):
if lr is not required and lr < 0.0:
raise ValueError("Invalid learning rate: {}".format(lr))
raise ValueError(f"Invalid learning rate: {lr}")
if momentum < 0.0:
raise ValueError("Invalid momentum value: {}".format(momentum))
raise ValueError(f"Invalid momentum value: {momentum}")
if weight_decay < 0.0:
raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
raise ValueError(f"Invalid weight_decay value: {weight_decay}")

defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
weight_decay=weight_decay, nesterov=nesterov,
@ -7,13 +7,13 @@ __all__ = ['SparseAdam']
class SparseAdam(Optimizer):
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, maximize: bool = False):
if not 0.0 < lr:
raise ValueError("Invalid learning rate: {}".format(lr))
raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 < eps:
raise ValueError("Invalid epsilon value: {}".format(eps))
raise ValueError(f"Invalid epsilon value: {eps}")
if not 0.0 <= betas[0] < 1.0:
raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
if not 0.0 <= betas[1] < 1.0:
raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")

params = list(params)
@ -31,13 +31,13 @@ def _resolve_name(name, package, level):
if len(bits) < level:
raise ValueError("attempted relative import beyond top-level package")
base = bits[0]
return "{}.{}".format(base, name) if name else base
return f"{base}.{name}" if name else base


def _sanity_check(name, package, level):
"""Verify arguments are "sane"."""
if not isinstance(name, str):
raise TypeError("module name must be str, not {}".format(type(name)))
raise TypeError(f"module name must be str, not {type(name)}")
if level < 0:
raise ValueError("level must be >= 0")
if level > 0:
@ -90,6 +90,6 @@ def _normalize_path(path):
"""
parent, file_name = os.path.split(path)
if parent:
raise ValueError("{!r} must be only a file name".format(path))
raise ValueError(f"{path!r} must be only a file name")
else:
return file_name
@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
from typing import Dict, List

from .glob_group import GlobGroup, GlobPattern
@ -79,7 +79,7 @@ class PackagingErrorReason(Enum):
"""

def __repr__(self):
return "<%s.%s>" % (self.__class__.__name__, self.name)
return f"<{self.__class__.__name__}.{self.name}>"

IS_EXTENSION_MODULE = (
"Module is a C extension module. torch.package supports Python modules only."
@ -156,14 +156,12 @@ class PackagingError(Exception):
message.write(f" Context: {error_context}\n")
if module_name in _DISALLOWED_MODULES:
message.write(
(
" Note: While we usually use modules in the python standard library "
f"from the local environment, `{module_name}` has a lot of system "
"level access and therefore can pose a security risk. We heavily "
f"recommend removing `{module_name}` from your packaged code. However, if that "
"is not possible, add it to the extern list by calling "
f'PackageExporter.extern("`{module_name}`")\n'
)
" Note: While we usually use modules in the python standard library "
f"from the local environment, `{module_name}` has a lot of system "
"level access and therefore can pose a security risk. We heavily "
f"recommend removing `{module_name}` from your packaged code. However, if that "
"is not possible, add it to the extern list by calling "
f'PackageExporter.extern("`{module_name}`")\n'
)
if debug:
module_path = dependency_graph.first_path(module_name)
@ -173,10 +171,8 @@ class PackagingError(Exception):
if not debug:
message.write("\n")
message.write(
(
"Set debug=True when invoking PackageExporter for a visualization of where "
"broken modules are coming from!\n"
)
"Set debug=True when invoking PackageExporter for a visualization of where "
"broken modules are coming from!\n"
)
# Save the dependency graph so that tooling can get at it.
self.dependency_graph = dependency_graph
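For reference, a tiny self-contained check (made-up strings, hypothetical module name) that dropping the extra parentheses around implicitly concatenated string literals, as in the two hunks above, does not change the resulting message:

module_name = "io"  # hypothetical, only for the comparison
wrapped = (
    (
        "Note: While we usually use modules in the python standard library "
        f"from the local environment, `{module_name}` has a lot of system access."
    )
)
flat = (
    "Note: While we usually use modules in the python standard library "
    f"from the local environment, `{module_name}` has a lot of system access."
)
assert wrapped == flat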
@ -539,7 +539,7 @@ class PackageImporter(Importer):
if not recursive and hasattr(module, "__all__"):
self._handle_fromlist(module, module.__all__, recursive=True)
elif not hasattr(module, x):
from_name = "{}.{}".format(module_name, x)
from_name = f"{module_name}.{x}"
try:
self._gcd_import(from_name)
except ModuleNotFoundError as exc:
@ -587,13 +587,13 @@ class PackageImporter(Importer):
"""
if hasattr(package, "__spec__"):
if package.__spec__.submodule_search_locations is None:
raise TypeError("{!r} is not a package".format(package.__spec__.name))
raise TypeError(f"{package.__spec__.name!r} is not a package")
else:
return package
else:
module = self.import_module(package)
if module.__spec__.submodule_search_locations is None:
raise TypeError("{!r} is not a package".format(package))
raise TypeError(f"{package!r} is not a package")
else:
return module
@ -738,11 +738,11 @@ class MemoryProfile:

for node in self._data_flow_graph.flow_nodes:
all_tensor_versions.update(((k, v) for k, (_, v) in node.inputs.items()))
all_tensor_versions.update(((key, 0) for key in node.intermediates))
all_tensor_versions.update((key, 0) for key in node.intermediates)
all_tensor_versions.update(node.outputs.items())

for i in self._categories._values.values():
all_tensor_versions.update(((key, 0) for key in i._by_id_keyset))
all_tensor_versions.update((key, 0) for key in i._by_id_keyset)

return {
(key, version): self._categories.get(key, version)
@ -642,7 +642,7 @@ def report_all_anti_patterns(prof,
json_report_path = os.path.join(json_report_dir,
"torchtidy_report.json")
if os.path.exists(json_report_path):
with open(json_report_path, "r") as f:
with open(json_report_path) as f:
exisiting_report = json.load(f)
exisiting_report.update(report_dict)
report_dict = exisiting_report
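A short illustrative snippet (temporary file, hypothetical contents) showing why dropping the explicit "r" above is safe: reading is already open()'s default mode:

import json
import os
import tempfile

with tempfile.TemporaryDirectory() as d:
    path = os.path.join(d, "torchtidy_report.json")
    with open(path, "w") as f:
        json.dump({"ok": True}, f)
    with open(path) as f:  # equivalent to open(path, "r")
        assert json.load(f) == {"ok": True}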
@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
from typing import Optional, Iterable

import torch
@ -136,28 +136,22 @@ class SparseSemiStructuredTensor(torch.Tensor):
# check device
if not original_tensor.is_cuda:
raise RuntimeError(
(
f"Error original_tensor.device= {original_tensor.device} is not supported! "
"Only CUDA tensors are currently supported."
)
f"Error original_tensor.device= {original_tensor.device} is not supported! "
"Only CUDA tensors are currently supported."
)

# check dim
if original_tensor.dim() != 2:
raise RuntimeError(
(
f"Error original_tensor.dim = {original_tensor.dim()} is not supported! "
"Only 2d tensors are currently supported."
)
f"Error original_tensor.dim = {original_tensor.dim()} is not supported! "
"Only 2d tensors are currently supported."
)

# check dtype
if original_tensor.dtype not in _DTYPE_TO_SEMI_STRUCTURED_SPARSE_CONFIG:
raise RuntimeError(
(
f"Error original_tensor.dtype {original_tensor.dtype} is not a supported dtype! "
"dtype must be one of: {_DTYPE_TO_SEMI_STRUCTURED_SPARSE_CONFIG}"
)
f"Error original_tensor.dtype {original_tensor.dtype} is not a supported dtype! "
"dtype must be one of: {_DTYPE_TO_SEMI_STRUCTURED_SPARSE_CONFIG}"
)

# check shape
@ -167,10 +161,8 @@ class SparseSemiStructuredTensor(torch.Tensor):
if m < min_rows or m % min_rows or n < min_cols or n % min_cols:
# TODO in the future we can add in padding to support dimensions that aren't perfect multiples
raise RuntimeError(
(
f"Error original_tensor.shape {original_tensor.shape} is not supported! "
"Both dimensions must be larger or equal than and a multiple of ({min_rows}, {min_cols})"
)
f"Error original_tensor.shape {original_tensor.shape} is not supported! "
"Both dimensions must be larger or equal than and a multiple of ({min_rows}, {min_cols})"
)

# This code calculates the size of the compressed tensor.