Pull in fairscale.nn.Pipe into PyTorch. (#44090)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44090

This is an initial commit pulling in the torchgpipe fork at
https://github.com/facebookresearch/fairscale.

The purpose of this commit is to just pull in the code and ensure all tests and
builds work fine. We will gradually modify it to match our intended API
mentioned in https://fb.quip.com/txurAV3zIFox#RPZACAfAKMq. Follow-up PRs will
address further changes needed on top of this initial commit.

We're pulling the code into the `torch.distributed._pipeline.sync` package. The
package is intentionally private since a lot of work (e.g., docs, API changes)
still needs to happen before we can officially support this.
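
For reference, here is a minimal sketch of the torchgpipe-style API as exercised
by the tests added in this commit; the module path and keyword arguments mirror
the test code in this diff, while the model and shapes are made up for
illustration, and the API may change before it is officially supported.

import torch
from torch import nn
from torch.distributed._pipeline.sync import Pipe

# Split a two-layer model into two partitions, one layer each.
model = nn.Sequential(nn.Linear(16, 16), nn.Linear(16, 1))
model = Pipe(model, balance=[1, 1], devices=["cpu", "cpu"], chunks=4)

# The input is scattered into `chunks` micro-batches, pipelined across
# the partitions, and gathered back into a single output batch.
x = torch.rand(8, 16)
y = model(x)
y.sum().backward()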
ghstack-source-id: 114864254

Test Plan:
1) waitforbuildbot
2) Ran all tests on my devgpu

Reviewed By: mrshenli

Differential Revision: D23493316

fbshipit-source-id: fe3c8b7dadeeb86abdc00e8a8652491b0b16743a
Author: Pritam Damania
Date: 2020-10-22 10:53:07 -07:00
Committed by: Facebook GitHub Bot
Parent: b63ddd6f57
Commit: 06d50b5eb0

53 changed files with 6532 additions and 7 deletions

LICENSE

@@ -16,23 +16,26 @@ Copyright (c) 2016-present, Facebook Inc. All rights reserved.
All contributions by Facebook:
Copyright (c) 2016 Facebook Inc.
All contributions by Google:
Copyright (c) 2015 Google Inc.
All rights reserved.
All contributions by Yangqing Jia:
Copyright (c) 2015 Yangqing Jia
All rights reserved.
All contributions by Kakao Brain:
Copyright 2019-2020 Kakao Brain
All contributions from Caffe:
Copyright(c) 2013, 2014, 2015, the respective contributors
All rights reserved.
All other contributions:
Copyright(c) 2015, 2016 the respective contributors
All rights reserved.
Caffe2 uses a copyright model similar to Caffe: each contributor holds
copyright over their contributions to Caffe2. The project versioning records
all such contribution and copyright details. If a contributor wants to further

NOTICE

@@ -22,6 +22,9 @@ All contributions by Yangqing Jia:
Copyright (c) 2015 Yangqing Jia
All rights reserved.
All contributions by Kakao Brain:
Copyright 2019-2020 Kakao Brain
All other contributions:
Copyright(c) 2015, 2016 the respective contributors
All rights reserved.

@@ -0,0 +1,27 @@
Copyright 2019-2020 Kakao Brain
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

@@ -0,0 +1,8 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
# tests/__init__.py makes it possible for pytest to import the application without a custom sys.path or PYTHONPATH.
# See also: https://docs.pytest.org/en/latest/goodpractices.html

@@ -0,0 +1,37 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import pytest
import torch
@pytest.fixture(autouse=True)
def manual_seed_zero():
torch.manual_seed(0)
@pytest.fixture(scope="session")
def cuda_sleep():
# Warm-up CUDA.
torch.empty(1, device="cuda")
# From test/test_cuda.py in PyTorch.
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
torch.cuda._sleep(1000000)
end.record()
end.synchronize()
cycles_per_ms = 1000000 / start.elapsed_time(end)
def cuda_sleep(seconds):
torch.cuda._sleep(int(seconds * cycles_per_ms * 1000))
return cuda_sleep
def pytest_report_header():
return f"torch: {torch.__version__}"

@@ -0,0 +1,6 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

@@ -0,0 +1,45 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import copy
from torch import nn
from torch.distributed._pipeline.sync.skip import Namespace, skippable, stash
def test_namespace_difference():
ns1 = Namespace()
ns2 = Namespace()
assert ns1 != ns2
def test_namespace_copy():
ns = Namespace()
assert copy.copy(ns) == ns
assert copy.copy(ns) is not ns
def test_skippable_repr():
@skippable(stash=["hello"])
class Hello(nn.Module):
def __init__(self):
super().__init__()
self.conv = nn.Conv2d(1, 1, 1)
def forward(self, x):
yield stash("hello", x)
return self.conv(x) # noqa
m = Hello()
assert (
repr(m)
== """
@skippable(Hello(
(conv): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
))
""".strip()
)

@@ -0,0 +1,106 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import pytest
import torch
from torch import nn
from torch.distributed._pipeline.sync import Pipe
from torch.distributed._pipeline.sync.skip import pop, skippable, stash
from torch.distributed._pipeline.sync.skip.portal import PortalBlue, PortalCopy, PortalOrange
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
@pytest.mark.parametrize("balance", [[3], [1, 2], [2, 1], [1, 1, 1]], ids=["3", "1:2", "2:1", "1:1:1"])
@pytest.mark.parametrize("checkpoint", ["never", "always", "except_last"])
def test_1to3(balance, checkpoint):
if torch.cuda.device_count() < len(balance):
pytest.skip("at least %d cuda devices required" % len(balance))
@skippable(stash=["1to3"])
class Layer1(nn.Module):
def __init__(self):
super().__init__()
self.conv = nn.Conv2d(3, 3, 1)
def forward(self, input):
yield stash("1to3", input)
output = self.conv(input)
return output # noqa
class Layer2(nn.Module):
def __init__(self):
super().__init__()
self.conv = nn.Conv2d(3, 3, 1)
def forward(self, input):
output = self.conv(input)
return output
@skippable(pop=["1to3"])
class Layer3(nn.Module):
def __init__(self):
super().__init__()
self.conv = nn.Conv2d(3, 3, 1)
def forward(self, input):
skip_1to3 = yield pop("1to3")
output = self.conv(input) + skip_1to3
return output
model = nn.Sequential(Layer1(), Layer2(), Layer3())
model = Pipe(model, balance, chunks=3, checkpoint=checkpoint)
in_device = model.devices[0]
out_device = model.devices[-1]
input = torch.rand(30, 3, 224, 224, device=in_device, requires_grad=True)
output = model(input)
loss = output.mean()
loss.backward()
assert torch.allclose(output.norm(), torch.tensor(1039.0, device=out_device), atol=6e-1)
assert torch.allclose(input.grad.norm(), torch.tensor(0.0004533053, device=in_device))
def test_none_skip():
@skippable(stash=["none"])
class Stash(nn.Module):
def forward(self, input):
yield stash("none", None)
return input # noqa
@skippable(pop=["none"])
class Pop(nn.Module):
def forward(self, input):
none = yield pop("none")
assert none is None
return input
model = nn.Sequential(Stash(), Pop())
model = Pipe(model, [1, 1], devices=["cpu", "cpu"], chunks=5)
input = torch.rand(10, requires_grad=True)
output = model(input)
def assert_grad_fn_is_not_portal(grad_fn, visited=None):
if visited is None:
visited = set()
if grad_fn in visited or grad_fn is None:
return
assert not isinstance(grad_fn, PortalBlue._backward_cls)
assert not isinstance(grad_fn, PortalCopy._backward_cls)
assert not isinstance(grad_fn, PortalOrange._backward_cls)
visited.add(grad_fn)
for next_grad_fn, _ in grad_fn.next_functions:
assert_grad_fn_is_not_portal(next_grad_fn, visited)
assert_grad_fn_is_not_portal(output.grad_fn)
output.sum().backward()
assert input.grad.mean().item() == 1

@@ -0,0 +1,111 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
from torch import nn
from torch.distributed._pipeline.sync.skip import Namespace, pop, skippable, stash
from torch.distributed._pipeline.sync.skip.layout import inspect_skip_layout
class Pass(nn.Module):
def forward(self, input):
return input
@skippable(stash=["foo"])
class StashFoo(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input # noqa
@skippable(pop=["foo"])
class PopFoo(nn.Module):
def forward(self, input):
foo = yield pop("foo")
return input + foo
@skippable(stash=["bar"])
class StashBar(nn.Module):
def forward(self, input):
yield stash("bar", input)
return input # noqa
@skippable(pop=["bar"])
class PopBar(nn.Module):
def forward(self, input):
bar = yield pop("bar")
return input + bar
def test_no_skippables():
p1 = nn.Sequential(Pass())
p2 = nn.Sequential(Pass())
layout = inspect_skip_layout([p1, p2])
policy = [list(layout.copy_policy(i)) for i in range(2)]
assert policy == [[], []]
def test_inner_partition():
p1 = nn.Sequential(StashFoo(), PopFoo())
p2 = nn.Sequential(Pass())
layout = inspect_skip_layout([p1, p2])
policy = [list(layout.copy_policy(i)) for i in range(2)]
assert policy == [[], []]
def test_adjoining_partitions():
p1 = nn.Sequential(StashFoo())
p2 = nn.Sequential(PopFoo())
layout = inspect_skip_layout([p1, p2])
policy = [list(layout.copy_policy(i)) for i in range(2)]
assert policy == [[], [(0, None, "foo")]]
def test_far_partitions():
p1 = nn.Sequential(StashFoo())
p2 = nn.Sequential(Pass())
p3 = nn.Sequential(PopFoo())
layout = inspect_skip_layout([p1, p2, p3])
policy = [list(layout.copy_policy(i)) for i in range(3)]
assert policy == [[], [], [(0, None, "foo")]]
def test_pop_2_from_different_partitions():
p1 = nn.Sequential(StashFoo())
p2 = nn.Sequential(StashBar())
p3 = nn.Sequential(PopBar(), PopFoo())
layout = inspect_skip_layout([p1, p2, p3])
policy = [list(layout.copy_policy(i)) for i in range(3)]
# p3 pops 'bar' before 'foo', but the plan is sorted by source partition index.
assert policy == [[], [], [(0, None, "foo"), (1, None, "bar")]]
def test_namespace():
ns1 = Namespace()
ns2 = Namespace()
p1 = nn.Sequential(StashFoo().isolate(ns1))
p2 = nn.Sequential(StashFoo().isolate(ns2))
p3 = nn.Sequential(PopFoo().isolate(ns2), PopFoo().isolate(ns1))
layout = inspect_skip_layout([p1, p2, p3])
policy = [list(layout.copy_policy(i)) for i in range(3)]
# p3 pops ns2's 'foo' before ns1's 'foo', but the plan is sorted by source partition index.
assert policy == [[], [], [(0, ns1, "foo"), (1, ns2, "foo")]]

@@ -0,0 +1,126 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import pytest
import torch
from torch import nn
from torch.distributed._pipeline.sync import Pipe, is_checkpointing, is_recomputing
from torch.distributed._pipeline.sync.skip import pop, skippable, stash
from torch.distributed._pipeline.sync.skip.tracker import current_skip_tracker
@skippable(stash=["skip"])
class Stash(nn.Module):
def forward(self, input):
yield stash("skip", input)
return input # noqa
@skippable(pop=["skip"])
class Pop(nn.Module):
def forward(self, input):
skip = yield pop("skip")
return input + skip
@pytest.mark.parametrize("train", [True, False], ids=["train", "eval"])
@pytest.mark.parametrize("checkpoint", ["always", "except_last", "never"])
def test_delete_portal_tensor(train, checkpoint):
# Without checkpointing:
# +- Stash --+ +--- Pop ----+ - - - layers
# | 2,blue,1 |--| 1,orange,0 | - - - tensor_life and portal function
# +----------+ +------------+
#
# With checkpointing:
# +- Stash --+ +--- Pop ----+ +--- Pop'----+ +- Stash'--+
# | 3,blue,2 |--| 2,orange,1 |--| 1,orange,0 |--| 1,blue,0 |
# +----------+ +------------+ +------------+ +----------+
def portal_tensor_life_is(tensor_life, skip_tracker=None):
if skip_tracker is None:
skip_tracker = current_skip_tracker()
# Get the current portal.
portal = list(skip_tracker.portals.values())[0]
if tensor_life == 0:
return portal.tensor_life == 0 and portal.tensor is None
else:
return portal.tensor_life == tensor_life and portal.tensor is not None
# Check the portal tensor after 'Stash'.
stash_ = Stash()
@stash_.register_forward_hook
def check_portal_tensor_after_stash(*_):
if is_checkpointing():
assert portal_tensor_life_is(2)
elif is_recomputing():
assert portal_tensor_life_is(0)
else:
assert portal_tensor_life_is(1)
pop_ = Pop()
@pop_.register_forward_hook
def check_portal_tensor_after_pop(*_):
if is_checkpointing():
assert portal_tensor_life_is(1)
elif is_recomputing():
assert portal_tensor_life_is(0)
else:
assert portal_tensor_life_is(0)
class NoPortalTensorAtBackward(nn.Module):
class F(torch.autograd.Function):
@staticmethod
def forward(ctx, input):
ctx.skip_tracker = current_skip_tracker()
return input.detach()
@staticmethod
def backward(ctx, grad):
assert portal_tensor_life_is(0, skip_tracker=ctx.skip_tracker)
return grad
def forward(self, input):
return self.F.apply(input)
model = nn.Sequential(NoPortalTensorAtBackward(), stash_, pop_)
model = Pipe(model, balance=[2, 1], devices=["cpu", "cpu"], chunks=2, checkpoint=checkpoint)
input = torch.rand(10, requires_grad=True)
if train:
model.train()
output = model(input)
output.norm().backward()
else:
model.eval()
with torch.no_grad():
model(input)
@pytest.mark.parametrize("train", [True, False], ids=["train", "eval"])
def test_no_portal_without_pipe(train, monkeypatch):
def deny(*args, **kwargs):
raise AssertionError("tried to create Portal without Pipe")
monkeypatch.setattr("torch.distributed._pipeline.sync.skip.portal.Portal.__init__", deny)
model = nn.Sequential(Stash(), Pop())
input = torch.rand(10, requires_grad=True)
if train:
model.train()
output = model(input)
output.norm().backward()
else:
model.eval()
with torch.no_grad():
model(input)

@@ -0,0 +1,155 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import pytest
import torch
from torch.distributed._pipeline.sync.dependency import fork, join
from torch.distributed._pipeline.sync.skip.portal import Portal
from torch.distributed._pipeline.sync.stream import default_stream
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def test_copy_returns_on_next_device():
portal = Portal(torch.rand(1), tensor_life=1)
prev_stream = default_stream(torch.device("cpu"))
next_stream = default_stream(torch.device("cuda"))
phony = torch.zeros(0, requires_grad=True)
assert phony.device.type == "cpu"
phony = portal.copy(prev_stream, next_stream, phony)
assert phony.device.type == "cuda"
def test_blue_orange():
tensor1 = torch.rand(1, requires_grad=True)
tensor2 = torch.rand(1, requires_grad=True)
# Same as: output = tensor1 * 2 + tensor2
#
# +----------------------+
# | |
# tensor2 -- PortalBlue -+ +- PortalOrange -+
# | | |
# tensor1 ------------ Join -- Fork --- Mul --- Add -- output
#
main = tensor1
portal = Portal(tensor2, tensor_life=2)
phony = portal.blue()
main = join(main, phony)
main, phony = fork(main)
sub = portal.orange(phony)
output = main * 2 + sub
output.backward()
assert torch.allclose(tensor1.grad, torch.tensor([2.0]))
assert torch.allclose(tensor2.grad, torch.tensor([1.0]))
def test_blue_orange_not_requires_grad():
tensor1 = torch.rand(1, requires_grad=True)
tensor2 = torch.rand(1)
# Same as: output = tensor1 * 2 + tensor2
#
# +----------------------+
# | |
# tensor2 -- PortalBlue -+ +- PortalOrange -+
# | | |
# tensor1 ------------ Join -- Fork --- Mul --- Add -- output
#
main = tensor1
portal = Portal(tensor2, tensor_life=2)
phony = portal.blue()
main = join(main, phony)
main, phony = fork(main)
sub = portal.orange(phony)
output = main * 2 + sub
output.backward()
assert torch.allclose(tensor1.grad, torch.tensor([2.0]))
assert tensor2.grad is None
def test_use_grad():
tensor = torch.rand(1, requires_grad=True)
portal = Portal(tensor, tensor_life=1)
portal.put_grad(tensor)
assert portal.use_grad() is tensor
# Gradient in a portal is ephemeral.
with pytest.raises(RuntimeError):
portal.use_grad()
class TestTensorLife:
@pytest.fixture
def new_portal(self):
portal = None
def new_portal(tensor_life):
nonlocal portal
tensor = torch.rand(1, requires_grad=True)
portal = Portal(tensor, tensor_life)
return portal, tensor
yield new_portal
# A test using this fixture must exhaust the tensor in the portal.
with pytest.raises(RuntimeError):
portal.check_tensor_life()
assert portal.tensor is None
def test_tensor_life_0(self, new_portal):
portal, tensor = new_portal(0)
assert portal.tensor is None
def test_tensor_life_1(self, new_portal):
portal, tensor = new_portal(1)
assert portal.tensor is tensor
portal.blue()
def test_tensor_life_2(self, new_portal):
portal, tensor = new_portal(2)
assert portal.tensor is tensor
phony = portal.blue()
assert portal.orange(phony).data_ptr() == tensor.data_ptr()
def test_tensor_life_3(self, new_portal):
portal, tensor = new_portal(3)
assert portal.tensor is tensor
phony = portal.blue()
assert portal.orange(phony).data_ptr() == tensor.data_ptr()
assert portal.orange(phony).data_ptr() == tensor.data_ptr()
def test_tensor_life_4(self, new_portal):
portal, tensor = new_portal(4)
assert portal.tensor is tensor
phony = portal.blue()
assert portal.orange(phony).data_ptr() == tensor.data_ptr()
assert portal.orange(phony).data_ptr() == tensor.data_ptr()
portal.blue()
def test_tensor_life_3_plus_1(self, new_portal):
portal, tensor = new_portal(3)
assert portal.tensor is tensor
phony = portal.blue()
assert portal.orange(phony).data_ptr() == tensor.data_ptr()
assert portal.orange(phony).data_ptr() == tensor.data_ptr()
another_tensor = torch.rand(1, requires_grad=True)
portal.put_tensor(another_tensor, tensor_life=1)
portal.blue()

@@ -0,0 +1,136 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import pytest
import torch
from torch import nn
from torch.distributed._pipeline.sync.skip import pop, skippable, stash
from torch.distributed._pipeline.sync.skip.tracker import SkipTracker, use_skip_tracker
@pytest.fixture(autouse=True)
def skip_tracker():
skip_tracker = SkipTracker()
with use_skip_tracker(skip_tracker):
yield skip_tracker
def test_stash(skip_tracker):
@skippable(stash=["foo"])
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
l1 = Stash()
assert len(skip_tracker.tensors) == 0
with use_skip_tracker(skip_tracker):
l1(torch.tensor(42))
assert len(skip_tracker.tensors) == 1
def test_pop():
@skippable(stash=["foo"])
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
@skippable(pop=["foo"])
class Pop(nn.Module):
def forward(self, input):
foo = yield pop("foo")
return foo # noqa
l1 = Stash()
l2 = Pop()
output = l2(l1(torch.tensor(42)))
assert output.item() == 42
def test_declare_but_not_use():
@skippable(stash=["foo"])
class Stash(nn.Module):
def forward(self, input):
return input * 2
@skippable(pop=["foo"])
class Pop(nn.Module):
def forward(self, input):
return input * 3
l1 = Stash()
l2 = Pop()
with pytest.raises(RuntimeError):
l1(torch.tensor(42))
with pytest.raises(RuntimeError):
l2(torch.tensor(42))
def test_stash_not_declared():
@skippable()
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
l1 = Stash()
with pytest.raises(RuntimeError):
l1(torch.tensor(42))
def test_pop_not_declared():
@skippable(stash=["foo"])
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
@skippable()
class Pop(nn.Module):
def forward(self, input):
foo = yield pop("foo")
return foo # noqa
l1 = Stash()
l2 = Pop()
latent = l1(torch.tensor(42))
with pytest.raises(RuntimeError):
l2(latent)
def test_pop_not_stashed():
@skippable(pop=["foo"])
class Pop(nn.Module):
def forward(self, input):
yield pop("foo")
l1 = Pop()
with pytest.raises(RuntimeError):
l1(torch.tensor(42))
def test_stash_none():
@skippable(stash=["foo"])
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", None)
return input * 2 # noqa
l1 = Stash()
l1(torch.tensor(42))

@@ -0,0 +1,127 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
from queue import Queue
import threading
import pytest
import torch
from torch import nn
from torch.distributed._pipeline.sync.checkpoint import enable_checkpointing, enable_recomputing
from torch.distributed._pipeline.sync.microbatch import Batch
from torch.distributed._pipeline.sync.skip import pop, skippable, stash
from torch.distributed._pipeline.sync.skip.layout import SkipLayout
from torch.distributed._pipeline.sync.skip.tracker import SkipTracker, SkipTrackerThroughPotals, current_skip_tracker
def test_default_skip_tracker():
q = Queue()
def f():
q.put(current_skip_tracker())
t = threading.Thread(target=f)
t.start()
t.join()
skip_tracker = q.get()
assert type(skip_tracker) is SkipTracker
assert type(skip_tracker) is not SkipTrackerThroughPotals
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def test_default_skip_tracker_by_data_parallel():
@skippable(stash=["foo"])
class Stash(nn.Module):
def forward(self, input):
yield stash("foo", input)
return input * 2 # noqa
@skippable(pop=["foo"])
class Pop(nn.Module):
def forward(self, input):
foo = yield pop("foo")
return foo
model = nn.Sequential(Stash(), Pop())
model = nn.DataParallel(model, device_ids=[0, 0], output_device=0)
input = torch.rand(10, device=0)
output = model(input)
assert torch.allclose(output, input)
def test_reuse_portal():
skip_layout = SkipLayout(num_partitions=2, skip_routes={(None, "test"): (0, 1)})
skip_tracker = SkipTrackerThroughPotals(skip_layout)
batch = Batch(torch.tensor([1.0]))
a = torch.tensor([2.0])
b = torch.tensor([2.0])
skip_tracker.save(batch, None, "test", a)
portal = skip_tracker.portals[(None, "test")]
skip_tracker.save(batch, None, "test", b)
assert portal is skip_tracker.portals[(None, "test")]
def test_no_copy_no_portal():
skip_layout = SkipLayout(num_partitions=2, skip_routes={(None, "copy"): (0, 1), (None, "not_copy"): (0, 0)})
skip_tracker = SkipTrackerThroughPotals(skip_layout)
batch = Batch(torch.tensor([1.0]))
a = torch.tensor([2.0])
b = torch.tensor([2.0])
skip_tracker.save(batch, None, "copy", a)
skip_tracker.save(batch, None, "not_copy", b)
assert (None, "copy") in skip_tracker.portals
assert (None, "copy") not in skip_tracker.tensors
assert (None, "not_copy") in skip_tracker.tensors
assert (None, "not_copy") not in skip_tracker.portals
def test_tensor_life_without_checkpointing():
skip_layout = SkipLayout(num_partitions=2, skip_routes={(None, "test"): (0, 1)})
skip_tracker = SkipTrackerThroughPotals(skip_layout)
batch = Batch(torch.tensor([1.0]))
tensor = torch.tensor([2.0])
skip_tracker.save(batch, None, "test", tensor)
assert skip_tracker.portals[(None, "test")].tensor_life == 1
skip_tracker.load(batch, None, "test")
assert skip_tracker.portals[(None, "test")].tensor_life == 0
def test_tensor_life_with_checkpointing():
skip_layout = SkipLayout(num_partitions=2, skip_routes={(None, "test"): (0, 1)})
skip_tracker = SkipTrackerThroughPotals(skip_layout)
batch = Batch(torch.tensor([1.0]))
tensor = torch.tensor([2.0])
with enable_checkpointing():
skip_tracker.save(batch, None, "test", tensor)
assert skip_tracker.portals[(None, "test")].tensor_life == 2
with enable_checkpointing():
skip_tracker.load(batch, None, "test")
assert skip_tracker.portals[(None, "test")].tensor_life == 1
with enable_recomputing():
skip_tracker.load(batch, None, "test")
assert skip_tracker.portals[(None, "test")].tensor_life == 0
with enable_recomputing():
skip_tracker.save(batch, None, "test", tensor)
assert skip_tracker.portals[(None, "test")].tensor_life == 0

@@ -0,0 +1,152 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import pytest
from torch import nn
from torch.distributed._pipeline.sync.skip import Namespace, skippable, verify_skippables
def test_matching():
@skippable(stash=["foo"])
class Layer1(nn.Module):
pass
@skippable(pop=["foo"])
class Layer2(nn.Module):
pass
verify_skippables(nn.Sequential(Layer1(), Layer2()))
def test_stash_not_pop():
@skippable(stash=["foo"])
class Layer1(nn.Module):
pass
with pytest.raises(TypeError) as e:
verify_skippables(nn.Sequential(Layer1()))
assert "no module declared 'foo' as poppable but stashed" in str(e.value)
def test_pop_unknown():
@skippable(pop=["foo"])
class Layer1(nn.Module):
pass
with pytest.raises(TypeError) as e:
verify_skippables(nn.Sequential(Layer1()))
assert "'0' declared 'foo' as poppable but it was not stashed" in str(e.value)
def test_stash_again():
@skippable(stash=["foo"])
class Layer1(nn.Module):
pass
@skippable(stash=["foo"])
class Layer2(nn.Module):
pass
@skippable(pop=["foo"])
class Layer3(nn.Module):
pass
with pytest.raises(TypeError) as e:
verify_skippables(nn.Sequential(Layer1(), Layer2(), Layer3()))
assert "'1' redeclared 'foo' as stashable" in str(e.value)
def test_pop_again():
@skippable(stash=["foo"])
class Layer1(nn.Module):
pass
@skippable(pop=["foo"])
class Layer2(nn.Module):
pass
@skippable(pop=["foo"])
class Layer3(nn.Module):
pass
with pytest.raises(TypeError) as e:
verify_skippables(nn.Sequential(Layer1(), Layer2(), Layer3()))
assert "'2' redeclared 'foo' as poppable" in str(e.value)
def test_stash_pop_together_different_names():
@skippable(stash=["foo"])
class Layer1(nn.Module):
pass
@skippable(pop=["foo"], stash=["bar"])
class Layer2(nn.Module):
pass
@skippable(pop=["bar"])
class Layer3(nn.Module):
pass
verify_skippables(nn.Sequential(Layer1(), Layer2(), Layer3()))
def test_stash_pop_together_same_name():
@skippable(stash=["foo"], pop=["foo"])
class Layer1(nn.Module):
pass
with pytest.raises(TypeError) as e:
verify_skippables(nn.Sequential(Layer1()))
assert "'0' declared 'foo' both as stashable and as poppable" in str(e.value)
def test_double_stash_pop():
@skippable(stash=["foo"])
class Layer1(nn.Module):
pass
@skippable(pop=["foo"])
class Layer2(nn.Module):
pass
@skippable(stash=["foo"])
class Layer3(nn.Module):
pass
@skippable(pop=["foo"])
class Layer4(nn.Module):
pass
with pytest.raises(TypeError) as e:
verify_skippables(nn.Sequential(Layer1(), Layer2(), Layer3(), Layer4()))
assert "'2' redeclared 'foo' as stashable" in str(e.value)
assert "'3' redeclared 'foo' as poppable" in str(e.value)
def test_double_stash_pop_but_isolated():
@skippable(stash=["foo"])
class Layer1(nn.Module):
pass
@skippable(pop=["foo"])
class Layer2(nn.Module):
pass
@skippable(stash=["foo"])
class Layer3(nn.Module):
pass
@skippable(pop=["foo"])
class Layer4(nn.Module):
pass
ns1 = Namespace()
ns2 = Namespace()
verify_skippables(
nn.Sequential(Layer1().isolate(ns1), Layer2().isolate(ns1), Layer3().isolate(ns2), Layer4().isolate(ns2),)
)

@@ -0,0 +1,225 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import time
import pytest
import torch
from torch import nn
from torch.distributed._pipeline.sync.balance import balance_by_size, balance_by_time, blockpartition
from torch.distributed._pipeline.sync.balance.profile import layerwise_sandbox
skip_if_no_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
devices = ["cpu"]
if torch.cuda.is_available():
devices.append("cuda")
def test_blockpartition():
assert blockpartition.solve([1, 2, 3, 4, 5, 6], partitions=2) == [[1, 2, 3, 4], [5, 6]]
def test_blockpartition_zeros():
assert blockpartition.solve([0, 0], partitions=2) == [[0], [0]]
def test_blockpartition_non_positive_partitions():
with pytest.raises(ValueError):
blockpartition.solve([42], partitions=0)
with pytest.raises(ValueError):
blockpartition.solve([42], partitions=-1)
def test_blockpartition_short_sequence():
with pytest.raises(ValueError):
blockpartition.solve([], partitions=1)
with pytest.raises(ValueError):
blockpartition.solve([42], partitions=2)
@pytest.mark.parametrize("device", devices)
@pytest.mark.skip(reason="Flaky due to time.sleep()")
def test_balance_by_time(device):
class Delay(nn.Module):
def __init__(self, seconds):
super().__init__()
self.seconds = seconds
def forward(self, x):
time.sleep(self.seconds)
return x
model = nn.Sequential(*[Delay(i / 10) for i in [1, 2, 3, 4, 5, 6]])
sample = torch.rand(1)
balance = balance_by_time(2, model, sample, device=device)
assert balance == [4, 2]
def test_balance_by_time_loop_resets_input():
# nn.Flatten was introduced in PyTorch 1.2.0.
class Flatten(nn.Module):
def forward(self, x):
return x.flatten(1)
model = nn.Sequential(nn.Conv2d(3, 2, 1), Flatten(), nn.Linear(128, 10))
sample = torch.rand(10, 3, 8, 8)
balance = balance_by_time(2, model, sample, device="cpu")
assert balance == [1, 2]
@skip_if_no_cuda
def test_balance_by_size_latent():
class Expand(nn.Module):
def __init__(self, times):
super().__init__()
self.times = times
def forward(self, x):
for i in range(self.times):
x = x + torch.rand_like(x, requires_grad=True)
return x
sample = torch.rand(10, 100, 100)
model = nn.Sequential(*[Expand(i) for i in [1, 2, 3, 4, 5, 6]])
balance = balance_by_size(2, model, sample)
assert balance == [4, 2]
model = nn.Sequential(*[Expand(i) for i in [6, 5, 4, 3, 2, 1]])
balance = balance_by_size(2, model, sample)
assert balance == [2, 4]
@skip_if_no_cuda
def test_balance_by_size_param():
model = nn.Sequential(*[nn.Linear(i + 1, i + 2) for i in range(6)])
sample = torch.rand(7, 1)
balance = balance_by_size(2, model, sample, param_scale=100)
assert balance == [4, 2]
model = nn.Sequential(*[nn.Linear(i + 2, i + 1) for i in reversed(range(6))])
sample = torch.rand(1, 7)
balance = balance_by_size(2, model, sample, param_scale=100)
assert balance == [2, 4]
@skip_if_no_cuda
def test_balance_by_size_param_scale():
class Tradeoff(nn.Module):
def __init__(self, param_size, latent_size):
super().__init__()
self.fc = nn.Linear(param_size, param_size)
self.latent_size = latent_size
def forward(self, x):
for i in range(self.latent_size):
x = x + torch.rand_like(x, requires_grad=True)
return x
model = nn.Sequential(
Tradeoff(param_size=1, latent_size=6),
Tradeoff(param_size=2, latent_size=5),
Tradeoff(param_size=3, latent_size=4),
Tradeoff(param_size=4, latent_size=3),
Tradeoff(param_size=5, latent_size=2),
Tradeoff(param_size=6, latent_size=1),
)
sample = torch.rand(1, requires_grad=True)
balance = balance_by_size(2, model, sample, param_scale=0)
assert balance == [2, 4]
balance = balance_by_size(2, model, sample, param_scale=100)
assert balance == [4, 2]
@pytest.mark.parametrize("device", devices)
def test_layerwise_sandbox(device):
model = nn.Sequential(nn.Conv2d(3, 3, 1), nn.BatchNorm2d(3))
model.eval()
for layer in layerwise_sandbox(model, torch.device(device)):
assert layer.training
assert all(p.device.type == device for p in layer.parameters())
assert all(not l.training for l in model)
assert all(p.device.type == "cpu" for p in model.parameters())
@pytest.mark.parametrize("device", devices)
def test_sandbox_during_profiling(device):
model = nn.Sequential(nn.BatchNorm2d(3))
before = {k: v.clone() for k, v in model.state_dict().items()}
sample = torch.rand(1, 3, 10, 10)
balance_by_time(1, model, sample, device=device)
after = model.state_dict()
assert before.keys() == after.keys()
for key, value in before.items():
assert torch.allclose(after[key], value), key
def test_not_training():
class AssertTraining(nn.Module):
def forward(self, x):
assert self.training
return x
model = nn.Sequential(AssertTraining())
model.eval()
assert not model.training
sample = torch.rand(1)
balance_by_time(1, model, sample, device="cpu")
assert not model.training
def test_balance_by_time_tuple():
class Twin(nn.Module):
def forward(self, x):
return x, x.detach()
class Add(nn.Module):
def forward(self, a_b):
a, b = a_b
return a + b
model = nn.Sequential(Twin(), Add())
sample = torch.rand(1, requires_grad=True)
balance_by_time(1, model, sample, device="cpu")
@skip_if_no_cuda
def test_balance_by_size_tuple():
class Twin(nn.Module):
def forward(self, x):
return x, x.detach()
class Add(nn.Module):
def forward(self, a_b):
a, b = a_b
return a + b
model = nn.Sequential(Twin(), Add())
sample = torch.rand(1, requires_grad=True)
balance_by_size(1, model, sample)
def test_already_has_grad():
model = nn.Sequential(nn.Conv2d(3, 3, 1))
sample = torch.rand(1, 3, 32, 32)
model(sample).norm().backward()
with pytest.raises(ValueError, match="some parameter already has gradient"):
balance_by_time(1, model, sample, device="cpu")

@@ -0,0 +1,128 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import pytest
import torch
from torch import nn
import torch.nn.functional as F
from torch.distributed._pipeline.sync import Pipe
def test_python_autograd_function():
# A Python autograd function might fail with this error:
#
# RuntimeError: Returning Variables sharing storage with other Variables
# that require grad is not supported in Python functions. Please submit a
# feature request if you hit this error.
#
# It doesn't look like an essential restriction, but it happens on the
# current PyTorch version. To avoid it, identity autograd functions such as
# Wait, Fork, and Join should detach the tensor before returning it.
#
class Identity(torch.autograd.Function):
@staticmethod
def forward(ctx, input):
return input
@staticmethod
def backward(ctx, grad):
return grad
class M(nn.Module):
def forward(self, input):
return Identity.apply(input)
model = nn.Sequential(M(), M())
model = Pipe(model, [1, 1], devices=["cpu", "cpu"], checkpoint="always")
x = torch.rand(42)
y = model(x)
assert torch.allclose(x, y)
def test_exception_no_hang():
# In v0.0.2, once a failed partition received a normal (non-closing)
# message for the next micro-batch, a hang occurred. The reason was that a
# failed partition didn't call in_queue.task_done() on a normal message, so
# the preceding partition stayed blocked at out_queue.join() for the
# micro-batch after next.
class ExpectedException(Exception):
pass
class Pass(nn.Module):
def forward(self, x):
return x
class Raise(nn.Module):
def forward(self, x):
raise ExpectedException()
model = nn.Sequential(Pass(), Pass(), Raise())
model = Pipe(model, [1, 1, 1], devices=["cpu", "cpu", "cpu"], chunks=3)
with pytest.raises(ExpectedException):
model(torch.rand(3))
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="2 cuda devices required")
def test_tuple_wait(cuda_sleep):
# In v0.0.3, Wait is applied to only the first tensor on a micro-batch.
# Under this behavior, if checkpointing was disabled, there's a possibility
# that gradient accumulations on other tensors are not synchronized
# properly to the copy stream.
class Sleep(torch.autograd.Function):
@staticmethod
def forward(ctx, x):
return x.detach()
@staticmethod
def backward(ctx, grad):
with torch.cuda.device(grad.device):
cuda_sleep(0.05)
return grad
class Layer1(nn.Module):
def forward(self, pair):
a, b = pair
return a * 1, b * 2, b * 3
class Layer2(nn.Module):
def forward(self, triple):
a, b, c = triple
b = Sleep.apply(b)
return a + b + c
model = nn.Sequential(Layer1(), Layer2())
model = Pipe(model, [1, 1], devices=[0, 1], chunks=32, checkpoint="never")
a = torch.rand(1024, 3, 32, 32, device=0, requires_grad=True)
b = torch.rand(1024, 3, 32, 32, device=0, requires_grad=True)
y = model((a, b))
y.norm().backward()
torch.cuda.synchronize(0)
torch.cuda.synchronize(1)
assert torch.isclose(b.grad.norm().cpu(), torch.tensor(5.000))
def test_parallel_randoms():
class Dropouts(nn.Module):
def forward(self, x):
for _ in range(100):
x = F.dropout(x, p=0.001)
return x
model = nn.Sequential(Dropouts(), Dropouts())
x = torch.rand(10, 10, requires_grad=True)
model = Pipe(model, [1, 1], devices=["cpu", "cpu"], chunks=10, checkpoint="always")
y = model(x)
y.norm().backward()
assert y.to(torch.bool).tolist() == x.grad.to(torch.bool).tolist()

@@ -0,0 +1,158 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
from functools import partial
import pytest
import torch
from torch import nn
import torch.cuda
from torch.distributed._pipeline.sync.checkpoint import Checkpointing, checkpoint, is_checkpointing, is_recomputing
from torch.distributed._pipeline.sync.dependency import fork, join
from torch.distributed._pipeline.sync.microbatch import Batch
devices = ["cpu"]
if torch.cuda.is_available():
devices.append("cuda")
@pytest.mark.parametrize("device", devices)
def test_serial_checkpoints(device):
# Copied from https://github.com/pytorch/pytorch/pull/18568.
timeline = []
class Log(torch.autograd.Function):
@staticmethod
def forward(ctx, name, x):
ctx.name = name
timeline.append(f"{name}:forward")
return x.detach()
@staticmethod
def backward(ctx, grad_output):
name = ctx.name
timeline.append(f"{name}:backward")
return None, grad_output
a = torch.rand(1, device=device, requires_grad=True)
b = torch.rand(1, device=device, requires_grad=True)
# Increase the next function sequence number.
_ = a + 1 + 2 + 3 + 4 + 5
a = checkpoint(partial(Log.apply, "a"), a)
a, phony = fork(a)
b = join(b, phony)
b = checkpoint(partial(Log.apply, "b"), b)
c = torch.cat((a, b))
out = c.sum()
# +--> {a} --Checkpoint(Log)--> {a}
# {out} --Sum--> {c} --Cat ^-----------------------------+
# +--> {b} --Checkpoint(Log)--> {b} --First--> {b}
out.backward()
assert timeline == ["a:forward", "b:forward", "b:forward", "b:backward", "a:forward", "a:backward"]
# |----------------------| |-----------------------| |-----------------------|
# forward pass Checkpoint(Log[b]) Checkpoint(Log[a])
def test_not_requires_grad():
x = Batch(torch.rand(1, requires_grad=False))
assert not x[0].requires_grad
def f(x):
return x * 2
chk = Checkpointing(f, x)
x = chk.checkpoint()
assert x[0].requires_grad
chk.recompute(x)
assert x[0].requires_grad
x.tensor.backward()
def test_not_requires_grad_with_parameter():
x = torch.rand(1, requires_grad=False)
a = torch.rand(1, requires_grad=True)
def f(x):
return x * a
y = checkpoint(f, x)
y.backward()
assert a.grad is not None
@pytest.mark.parametrize("device", devices)
def test_random_in_checkpoint(device):
dropout = nn.Dropout(p=0.5)
torch.manual_seed(0)
x = torch.randn(3, 3, device=device, requires_grad=True)
y = dropout(x)
y.norm().backward()
torch.manual_seed(0)
chk_x = torch.randn(3, 3, device=device, requires_grad=True)
chk_y = checkpoint(dropout, chk_x)
chk_y.norm().backward()
assert torch.allclose(x.grad, chk_x.grad)
def test_detect_checkpointing_recomputing():
logs = []
class Detect(nn.Module):
def forward(self, input):
logs.append((is_checkpointing(), is_recomputing()))
return input
model = Detect()
input = torch.rand(1, requires_grad=True)
output = checkpoint(model, input)
output.backward()
assert logs == [(True, False), (False, True)]
def test_detect_checkpointing_recomputing_without_checkpoint():
logs = []
class Detect(nn.Module):
def forward(self, input):
logs.append((is_checkpointing(), is_recomputing()))
return input
model = Detect()
input = torch.rand(1, requires_grad=True)
output = model(input)
output.backward()
assert logs == [(False, False)]
def test_non_grad_output():
class ForkNonGrad(nn.Module):
def forward(self, input):
return (input * 2, torch.rand(1))
model = ForkNonGrad()
input = torch.rand(1, requires_grad=True)
output = checkpoint(model, input)
output[0].backward()

@@ -0,0 +1,68 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import pytest
import torch
from torch.distributed._pipeline.sync.copy import Copy, Wait
from torch.distributed._pipeline.sync.stream import CPUStream, current_stream, get_device, is_cuda, new_stream, use_stream
skip_if_no_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def _test_copy_wait(prev_stream, next_stream, cuda_sleep=None):
device = get_device(prev_stream)
with use_stream(prev_stream):
if is_cuda(prev_stream):
cuda_sleep(0.5)
x = torch.ones(100, device=device, requires_grad=True)
(y,) = Copy.apply(prev_stream, next_stream, x)
(y,) = Wait.apply(prev_stream, next_stream, x)
with use_stream(next_stream):
assert torch.allclose(y.sum(), torch.tensor(100.0, device=device))
y.norm().backward()
with use_stream(prev_stream):
assert torch.allclose(x.grad.sum(), torch.tensor(10.0, device=device))
def test_copy_wait_cpu_cpu():
prev_stream = CPUStream
next_stream = CPUStream
_test_copy_wait(prev_stream, next_stream)
@skip_if_no_cuda
def test_copy_wait_cpu_cuda(cuda_sleep):
prev_stream = CPUStream
next_stream = current_stream(torch.device("cuda"))
_test_copy_wait(prev_stream, next_stream, cuda_sleep)
@skip_if_no_cuda
def test_copy_wait_cuda_cpu(cuda_sleep):
prev_stream = current_stream(torch.device("cuda"))
next_stream = CPUStream
_test_copy_wait(prev_stream, next_stream, cuda_sleep)
@skip_if_no_cuda
def test_copy_wait_cuda_cuda(cuda_sleep):
prev_stream = current_stream(torch.device("cuda"))
next_stream = new_stream(torch.device("cuda"))
_test_copy_wait(prev_stream, next_stream, cuda_sleep)
def test_wait_multiple_tensors():
a = torch.rand(1, requires_grad=True)
b = torch.rand(1, requires_grad=True)
a, b = Wait.apply(CPUStream, CPUStream, a, b)
assert a.grad_fn is b.grad_fn
assert a.grad_fn.__class__ is Wait._backward_cls

@@ -0,0 +1,192 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
from copy import deepcopy
from itertools import chain
import pytest
import torch
from torch import nn, optim
from torch.distributed._pipeline.sync.batchnorm import DeferredBatchNorm
CHUNKS = 4
def tilt_dist(input):
# Tilt variance by channel.
rgb = input.transpose(0, 1)
rgb[0] *= 1
rgb[1] *= 10
rgb[2] *= 100
# Tilt mean by single batch.
for i, single in enumerate(input):
single += 2 ** i
return input
def chunked_forward(model, input, chunks=CHUNKS):
output_chunks = []
for chunk in input.chunk(chunks):
output_chunks.append(model(chunk))
return torch.cat(output_chunks)
@pytest.mark.parametrize("chunks", [1, 4])
@pytest.mark.parametrize("input_requires_grad", [True, False])
def test_transparency(chunks, input_requires_grad):
bn = nn.BatchNorm2d(3)
dbn = DeferredBatchNorm.convert_deferred_batch_norm(deepcopy(bn), chunks=chunks)
input1 = torch.rand(16, 3, 224, 224)
input1 = tilt_dist(input1)
input2 = input1.clone()
input1.requires_grad = input_requires_grad
input2.requires_grad = input_requires_grad
output1 = chunked_forward(bn, input1, chunks=chunks)
output2 = chunked_forward(dbn, input2, chunks=chunks)
assert torch.allclose(output1, output2, atol=1e-4)
output1.mean().backward()
output2.mean().backward()
assert torch.allclose(bn.weight.grad, dbn.weight.grad, atol=1e-4)
if input_requires_grad:
assert input1.grad is not None
assert input2.grad is not None
assert torch.allclose(input1.grad, input2.grad, atol=1e-4)
@pytest.mark.parametrize("momentum", [0.1, None])
def test_running_stats(momentum):
bn = nn.BatchNorm2d(3, momentum=momentum)
dbn = DeferredBatchNorm.convert_deferred_batch_norm(deepcopy(bn), chunks=CHUNKS)
input = torch.rand(16, 3, 224, 224)
input = tilt_dist(input)
bn(input)
chunked_forward(dbn, input)
assert torch.allclose(bn.running_mean, dbn.running_mean, atol=1e-4)
assert torch.allclose(bn.running_var, dbn.running_var, atol=1e-4)
def test_convert_deferred_batch_norm():
bn = nn.BatchNorm2d(3, track_running_stats=False)
bn = DeferredBatchNorm.convert_deferred_batch_norm(bn, chunks=CHUNKS)
assert type(bn) is nn.BatchNorm2d # because of track_running_stats=False
dbn = DeferredBatchNorm(3, chunks=CHUNKS)
dbn_again = DeferredBatchNorm.convert_deferred_batch_norm(dbn, chunks=CHUNKS)
assert dbn is dbn_again
dbn_again = DeferredBatchNorm.convert_deferred_batch_norm(dbn, chunks=CHUNKS + 1)
assert dbn is not dbn_again # because of different chunks
def test_eval():
bn = nn.BatchNorm2d(3)
dbn = DeferredBatchNorm.convert_deferred_batch_norm(deepcopy(bn), chunks=CHUNKS)
input = torch.rand(16, 3, 224, 224)
input = tilt_dist(input)
bn(input)
chunked_forward(dbn, input)
bn.eval()
dbn.eval()
assert torch.allclose(bn(input), dbn(input), atol=1e-4)
def test_optimize():
bn = nn.BatchNorm2d(3)
dbn = DeferredBatchNorm.convert_deferred_batch_norm(deepcopy(bn), chunks=CHUNKS)
opt = optim.SGD(chain(bn.parameters(), dbn.parameters()), lr=1.0)
for i in range(5):
input = torch.rand(16, 3, 224, 224)
input = tilt_dist(input)
# train
y = bn(input)
a = y.sum()
a.backward()
y = chunked_forward(dbn, input)
b = y.sum()
b.backward()
opt.step()
# eval
bn.eval()
dbn.eval()
with torch.no_grad():
assert torch.allclose(bn(input), dbn(input), atol=1e-1 * (10 ** i))
def test_conv_bn():
bn = nn.Sequential(nn.Conv2d(3, 3, 1), nn.BatchNorm2d(3))
dbn = DeferredBatchNorm.convert_deferred_batch_norm(deepcopy(bn), chunks=CHUNKS)
input = torch.rand(16, 3, 224, 224)
input = tilt_dist(input)
opt = optim.SGD(chain(bn.parameters(), dbn.parameters()), lr=0.1)
# 1st step
a = bn(input)
b = chunked_forward(dbn, input)
# Outputs are different. (per-mini-batch vs. per-micro-batch)
assert not torch.allclose(a, b)
a.sum().backward()
b.sum().backward()
opt.step()
opt.zero_grad()
# Conv layers are also trained differently because of their different outputs.
assert not torch.allclose(bn[0].weight, dbn[0].weight)
# But BNs track identical running stats.
assert torch.allclose(bn[1].running_mean, dbn[1].running_mean, atol=1e-4)
assert torch.allclose(bn[1].running_var, dbn[1].running_var, atol=1e3)
# 2nd step
a = bn(input)
b = chunked_forward(dbn, input)
a.sum().backward()
b.sum().backward()
# BNs can't track identical running stats due to the different conv layers.
assert not torch.allclose(bn[1].running_mean, dbn[1].running_mean, atol=1e-4)
assert not torch.allclose(bn[1].running_var, dbn[1].running_var, atol=1e3)
def test_input_requiring_grad():
dbn = DeferredBatchNorm(3, chunks=CHUNKS)
input = torch.rand(16, 3, 224, 224)
input = tilt_dist(input)
input.requires_grad = True
chunked_forward(dbn, input)
assert not dbn.sum.requires_grad
assert dbn.sum.grad_fn is None

@@ -0,0 +1,144 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import weakref
import pytest
import torch
from torch.distributed._pipeline.sync.dependency import Fork, Join, fork, join
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def test_fork_join():
logs = []
class Log(torch.autograd.Function):
@staticmethod
def forward(ctx, number, tensor):
ctx.number = number
return tensor.detach()
@staticmethod
def backward(ctx, grad):
logs.append(ctx.number)
return None, grad
a = torch.rand(1, device="cpu", requires_grad=True)
b = torch.rand(1, device="cuda", requires_grad=True)
a = Log.apply(1, a)
a, phony = fork(a)
b = join(a, phony)
b = Log.apply(2, b)
b = b.to("cpu")
(a + b).backward()
assert logs == [2, 1]
def test_fork_join_enable_grad():
x = torch.rand(1, requires_grad=True)
with torch.enable_grad():
x2, p = fork(x)
assert p.requires_grad
assert x2 is not x
x = x2
assert x.requires_grad
assert p.requires_grad
assert x.grad_fn.__class__ is Fork._backward_cls
assert p.grad_fn.__class__ is Fork._backward_cls
with torch.enable_grad():
x2 = join(x, p)
assert x2 is not x
x = x2
assert x.requires_grad
assert x.grad_fn.__class__ is Join._backward_cls
def test_fork_join_no_grad(monkeypatch):
def do_not_apply(*args):
raise AssertionError("Function.apply called")
monkeypatch.setattr("torch.autograd.Function.apply", do_not_apply)
x = torch.rand(1, requires_grad=True)
with torch.no_grad():
x2, p = fork(x)
assert not p.requires_grad
assert x2 is x
x = x2
with torch.no_grad():
x2 = join(x, p)
assert x2 is x
x = x2
def test_fork_leak():
leak = None
class F(torch.autograd.Function):
@staticmethod
def forward(ctx, input):
return input
@staticmethod
def backward(ctx, grad):
nonlocal leak
leak = weakref.ref(ctx)
return grad
x = torch.rand(1, requires_grad=True)
x = F.apply(x)
x, phony = fork(x)
x = join(x, phony)
x.backward()
del x, phony
assert leak() is None
def test_join_when_fork_not_requires_grad():
x = torch.rand(2, 1)
a, b = x.chunk(2)
assert not a.requires_grad
a, p = fork(a)
assert not a.requires_grad
assert not p.requires_grad
assert not b.requires_grad
b = join(b, p)
assert not b.requires_grad
def test_join_when_fork_requires_grad():
x = torch.rand(2, 1)
a, b = x.chunk(2)
a.requires_grad_()
assert a.requires_grad
a, p = fork(a)
assert a.requires_grad
assert p.requires_grad
assert not b.requires_grad
b = join(b, p)
assert b.requires_grad

@@ -0,0 +1,71 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import pytest
import torch
from torch import nn
from torch.distributed._pipeline.sync import Pipe
def test_inplace_on_requires_grad():
model = nn.Sequential(nn.Linear(1, 1), nn.ReLU(inplace=True))
model = Pipe(model, [1, 1], devices=["cpu", "cpu"], checkpoint="always")
x = torch.rand(1)
y = model(x)
message = r"a leaf Variable that requires grad .* used in an in-place operation."
with pytest.raises(RuntimeError, match=message):
y.backward()
@pytest.mark.xfail(strict=True)
def test_inplace_on_not_requires_grad():
# In-place operation on a tensor not requiring grad doesn't cause a
# RuntimeError. Currently, we cannot detect this case.
model = nn.Sequential(nn.ReLU(inplace=True))
model = Pipe(model, [1], devices=["cpu"], checkpoint="always")
x = torch.rand(1)
y = model(x)
del model
message = r"a leaf Variable that requires grad .* used in an in-place operation."
with pytest.raises(RuntimeError, match=message):
y.backward()
@pytest.mark.xfail(strict=True)
def test_inplace_incorrect_grad():
class M(nn.Module):
def forward(self, foo_bar):
# 'foo' requires grad but 'bar' does not. In-place operation on
# 'bar' won't cause a RuntimeError.
foo, bar = foo_bar
# add_(1) is not idempotent, in contrast to relu_(). If it is
# executed multiple times, it will accumulate each difference onto
# 'bar'.
bar.add_(1)
# 'bar' is still captured by checkpointing. 'foo' will get
# incorrect grad.
return foo * bar
model = nn.Sequential(M())
model = Pipe(model, [1], devices=["cpu"], checkpoint="always")
foo = torch.tensor([1.0], requires_grad=True)
bar = torch.tensor([1.0])
output = model((foo, bar))
del model
output.backward()
# The gradient of 'foo' should be 2, but it is actually 3 because
# bar.add_(1) was executed twice due to checkpointing.
assert foo.grad.item() == 2.0

@@ -0,0 +1,138 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import pytest
import torch
import torch.cuda
from torch.distributed._pipeline.sync.microbatch import Batch, check, gather, scatter
def test_batch_atomic():
x = torch.tensor(42)
b = Batch(x)
assert b.atomic
assert b.tensor is x
with pytest.raises(AttributeError):
b.tensors
assert list(b) == [x]
assert len(b) == 1
assert b[0] is x
def test_batch_non_atomic():
x, y = torch.tensor(42), torch.tensor(21)
b = Batch((x, y))
assert not b.atomic
with pytest.raises(AttributeError):
b.tensor
assert b.tensors == (x, y)
assert list(b) == [x, y]
assert len(b) == 2
assert b[0] is x
assert b[1] is y
def test_batch_call():
a = Batch(torch.tensor(42))
b = Batch((torch.tensor(42), torch.tensor(21)))
def f(x):
return x
assert a.call(f).atomic
assert not b.call(f).atomic
def test_batch_setitem_by_index():
a = Batch(torch.tensor(42))
b = Batch((torch.tensor(42), torch.tensor(21)))
a[0] = torch.tensor(0)
b[0] = torch.tensor(0)
assert a.atomic
assert a[0].item() == 0
assert not b.atomic
assert len(b) == 2
assert b[0].item() == 0
assert b[1].item() == 21
def test_batch_setitem_by_slice():
a = Batch(torch.tensor(42))
b = Batch((torch.tensor(42), torch.tensor(21)))
a[:] = (torch.tensor(0),)
b[:] = (torch.tensor(0),)
assert a.atomic
assert a[0].item() == 0
assert not b.atomic
assert len(b) == 1
assert b[0].item() == 0
def test_check():
check(torch.tensor(42))
check((torch.tensor(4), torch.tensor(2)))
with pytest.raises(TypeError):
check(42)
with pytest.raises(TypeError):
check("str")
with pytest.raises(TypeError):
check((torch.tensor(4), 2))
def test_gather_tensors():
a = torch.zeros(1, 1)
b = torch.zeros(1, 1)
ab = gather([Batch(a), Batch(b)])
assert ab.size() == (2, 1)
def test_gather_tuples():
a = (torch.zeros(1, 1), torch.zeros(2, 2))
b = (torch.zeros(1, 1), torch.zeros(2, 2))
ab = gather([Batch(a), Batch(b)])
assert isinstance(ab, tuple)
assert ab[0].size() == (2, 1)
assert ab[1].size() == (4, 2)
def test_scatter_tensor():
ab = torch.zeros(2, 1)
a, b = scatter(ab, chunks=2)
assert a.tensor.size() == (1, 1)
assert b.tensor.size() == (1, 1)
def test_scatter_tuple():
ab = (torch.zeros(2, 1), torch.zeros(4, 2))
a, b = scatter(ab, chunks=2)
assert a.tensors[0].size() == (1, 1)
assert b.tensors[0].size() == (1, 1)
assert a.tensors[1].size() == (2, 2)
assert b.tensors[1].size() == (2, 2)

@@ -0,0 +1,50 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import torch
from torch.distributed._pipeline.sync.phony import get_phony
def test_phony_size():
p = get_phony(torch.device("cpu"), requires_grad=False)
assert p.size() == (0,)
def test_phony_requires_grad():
p1 = get_phony(torch.device("cpu"), requires_grad=True)
p2 = get_phony(torch.device("cpu"), requires_grad=False)
assert p1.requires_grad
assert not p2.requires_grad
def test_cached_phony():
p1 = get_phony(torch.device("cpu"), requires_grad=True)
p2 = get_phony(torch.device("cpu"), requires_grad=True)
assert p1 is p2
p3 = get_phony(torch.device("cpu"), requires_grad=False)
p4 = get_phony(torch.device("cpu"), requires_grad=False)
assert p3 is p4
assert p1 is not p3
def test_phony_in_autograd_function():
class Phonify(torch.autograd.Function):
@staticmethod
def forward(ctx, input):
phony = get_phony(input.device, requires_grad=False)
return phony.detach()
x = torch.rand(1, requires_grad=True)
p1 = Phonify.apply(x)
p2 = get_phony(torch.device("cpu"), requires_grad=True)
assert p1 is not p2
assert p1.grad_fn is not None
assert p2.grad_fn is None

View File

@ -0,0 +1,608 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
from collections import OrderedDict
from copy import deepcopy
import time
import pytest
import torch
from torch import nn
from torch.distributed._pipeline.sync import Pipe
skip_if_no_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
def test_parameters():
model = nn.Sequential(nn.Linear(1, 1))
pipe = Pipe(model, balance=[1], devices=["cpu"], chunks=1)
assert list(pipe.parameters()) != []
def test_public_attrs():
class MyString:
def __init__(self, value):
self.value = value
def __str__(self):
return self.value
model = nn.Sequential(nn.Linear(1, 1))
pipe = Pipe(model, balance=(1,), devices=("cpu",), chunks=42.000, checkpoint=MyString("always"))
assert pipe.balance == [1]
assert pipe.devices == [torch.device("cpu")]
assert pipe.chunks == 42
assert isinstance(pipe.chunks, int)
assert pipe.checkpoint == "always"
assert isinstance(pipe.checkpoint, str)
@pytest.mark.parametrize("balance", [[2], [1, 1]])
def test_sequential_like(balance):
a = nn.Linear(1, 1)
b = nn.Linear(1, 1)
model = nn.Sequential(a, b)
model = Pipe(model, balance, devices=["cpu", "cpu"])
assert len(model) == 2
assert list(model) == [a, b]
assert model[0] is a
assert model[1] is b
with pytest.raises(IndexError):
_ = model[2]
assert model[-1] is b
assert model[-2] is a
def test_balance_wrong_length():
a = nn.Linear(1, 1)
b = nn.Linear(1, 1)
model = nn.Sequential(a, b)
with pytest.raises(ValueError):
Pipe(model, balance=[1])
with pytest.raises(ValueError):
Pipe(model, balance=[3])
def test_balance_less_than_1():
a = nn.Linear(1, 1)
b = nn.Linear(1, 1)
model = nn.Sequential(a, b)
with pytest.raises(ValueError):
Pipe(model, balance=[0, 2])
with pytest.raises(ValueError):
Pipe(model, balance=[-1, 3])
def test_chunks_less_than_1():
model = nn.Sequential(nn.Linear(1, 1))
with pytest.raises(ValueError):
Pipe(model, balance=[1], devices=["cpu"], chunks=0)
with pytest.raises(ValueError):
Pipe(model, balance=[1], devices=["cpu"], chunks=-1)
def test_too_few_devices():
model = nn.Sequential(nn.Linear(1, 1), nn.Linear(1, 1), nn.Linear(1, 1), nn.Linear(1, 1))
with pytest.raises(IndexError):
# len(balance) > len(devices)
model = Pipe(model, balance=[1, 1, 1, 1], devices=["cpu"])
def test_batch_size_indivisible():
model = nn.Sequential(nn.Linear(1, 1))
model = Pipe(model, balance=[1], devices=["cpu"], chunks=4)
with pytest.warns(None) as record:
model(torch.rand(7, 1))
# Indivisible batch size is legal.
assert not record
def test_batch_size_small():
model = nn.Sequential(nn.Linear(1, 1))
model = Pipe(model, balance=[1], devices=["cpu"], chunks=4)
with pytest.warns(None) as record:
model(torch.rand(2, 1))
# Batch size smaller than chunks is legal.
assert not record
def test_checkpoint_mode():
def count_grad_fn(grad_fn, name, visited=None):
if visited is None:
visited = set()
if grad_fn in visited:
return 0
visited.add(grad_fn)
if grad_fn is None:
return 0
if grad_fn.__class__.__name__ == name:
return 1
counter = 0
for next_grad_fn, _ in grad_fn.next_functions:
counter += count_grad_fn(next_grad_fn, name, visited=visited)
return counter
model = nn.Sequential(nn.Linear(1, 1))
input = torch.rand(2, 1)
always = Pipe(model, balance=[1], devices=["cpu"], chunks=2, checkpoint="always")
except_last = Pipe(model, balance=[1], devices=["cpu"], chunks=2, checkpoint="except_last")
never = Pipe(model, balance=[1], devices=["cpu"], chunks=2, checkpoint="never")
always_output = always(input)
except_last_output = except_last(input)
never_output = never(input)
assert count_grad_fn(always_output.grad_fn, "CheckpointBackward") == 2
assert count_grad_fn(except_last_output.grad_fn, "CheckpointBackward") == 1
assert count_grad_fn(never_output.grad_fn, "CheckpointBackward") == 0
def test_checkpoint_mode_invalid():
model = nn.Sequential(nn.Linear(1, 1))
with pytest.raises(ValueError, match="checkpoint is not one of 'always', 'except_last', or 'never'"):
Pipe(model, balance=[1], devices=["cpu"], chunks=2, checkpoint="INVALID_CHECKPOINT")
def test_checkpoint_mode_when_chunks_1():
model = nn.Sequential(nn.Linear(1, 1))
# All checkpoint modes are fine.
Pipe(model, balance=[1], devices=["cpu"], chunks=1, checkpoint="except_last")
Pipe(model, balance=[1], devices=["cpu"], chunks=1, checkpoint="always")
Pipe(model, balance=[1], devices=["cpu"], chunks=1, checkpoint="never")
def test_checkpoint_eval():
model = nn.Sequential(nn.Linear(1, 1))
model = Pipe(model, balance=[1], devices=["cpu"], chunks=2)
input = torch.rand(2, 1)
def find_grad_fn(grad_fn, name):
if grad_fn is None:
return False
if grad_fn.__class__.__name__ == name:
return True
for next_grad_fn, _ in grad_fn.next_functions:
if find_grad_fn(next_grad_fn, name):
return True
return False
model.train()
train_output = model(input)
assert find_grad_fn(train_output.grad_fn, "CheckpointBackward")
assert find_grad_fn(train_output.grad_fn, "RecomputeBackward")
model.eval()
eval_output = model(input)
assert not find_grad_fn(eval_output.grad_fn, "CheckpointBackward")
assert not find_grad_fn(eval_output.grad_fn, "RecomputeBackward")
def test_checkpoint_non_float_input():
class ForkNonFloat(nn.Module):
def forward(self, input):
return (input * 2, torch.tensor([False]))
class JoinNonFloat(nn.Module):
def forward(self, input):
return input[0] * 2
model = nn.Sequential(ForkNonFloat(), JoinNonFloat())
model = Pipe(model, balance=[1, 1], devices=["cpu", "cpu"], chunks=1, checkpoint="always")
input = torch.rand(1, requires_grad=True)
output = model(input)
output.backward()
def test_no_grad():
model = nn.Sequential(nn.Linear(1, 1))
model = Pipe(model, balance=[1], devices=["cpu"], chunks=2)
input = torch.rand(2, 1)
latent = None
def hook(module, input, output):
_ = module
_ = input
nonlocal latent
latent = output
partition = model.partitions[0]
partition.register_forward_hook(hook)
with torch.no_grad():
model(input)
assert latent.grad_fn is None
def test_exception():
class ExpectedException(Exception):
pass
class Raise(nn.Module):
def forward(self, *_):
raise ExpectedException()
model = nn.Sequential(Raise())
model = Pipe(model, balance=[1], devices=["cpu"], chunks=1)
with pytest.raises(ExpectedException):
model(torch.rand(1))
def test_exception_early_stop_asap():
"""Even the first partitions have finished to process, the partition before
the failed partition should be killed as soon as possible.
"""
class ExpectedException(Exception):
pass
class Pass(nn.Module):
def forward(self, x):
return x
counter = 0
class Counter(nn.Module):
def forward(self, x):
time.sleep(0.1)
nonlocal counter
counter += 1
return x
class Raise(nn.Module):
def forward(self, x):
raise ExpectedException()
model = nn.Sequential(Pass(), Pass(), Counter(), Raise())
model = Pipe(model, [1, 1, 1, 1], devices=["cpu", "cpu", "cpu", "cpu"], chunks=3)
with pytest.raises(ExpectedException):
model(torch.rand(3))
# If the early stop doesn't work, it would be 3 instead.
assert counter == 2
def test_input_pair():
class Two(nn.Module):
def __init__(self):
super().__init__()
self.fc_a = nn.Linear(1, 1)
self.fc_b = nn.Linear(1, 1)
def forward(self, a_and_b):
a, b = a_and_b
return (self.fc_a(a), self.fc_b(b))
model = nn.Sequential(Two())
model = Pipe(model, balance=[1], devices=["cpu"], chunks=2)
a = torch.rand(10, 1, requires_grad=True)
b = torch.rand(10, 1, requires_grad=True)
a_out, b_out = model((a, b))
loss = (a_out + b_out).mean()
loss.backward()
assert a.grad is not None
assert b.grad is not None
def test_input_singleton():
class One(nn.Module):
def __init__(self):
super().__init__()
self.fc = nn.Linear(1, 1)
def forward(self, only_a):
(a,) = only_a
return (self.fc(a),)
model = nn.Sequential(One())
model = Pipe(model, balance=[1], devices=["cpu"], chunks=2)
a = torch.rand(10, 1, requires_grad=True)
(a_out,) = model((a,))
loss = a_out.mean()
loss.backward()
assert all(p.grad is not None for p in model.parameters())
assert a.grad is not None
def test_input_varargs():
model = nn.Sequential(nn.Linear(1, 1))
model = Pipe(model, balance=[1], devices=["cpu"])
a = torch.rand(1)
b = torch.rand(1)
# TypeError: forward() takes 2 positional arguments but 3 were given
with pytest.raises(TypeError):
model(a, b)
def test_non_tensor():
class NonTensor(nn.Module):
def forward(self, _):
return "hello"
model = nn.Sequential(NonTensor())
model = Pipe(model, balance=[1], devices=["cpu"])
x = torch.rand(1)
# TypeError: expected Tensor as element 0 in argument 0, but got str
with pytest.raises(TypeError):
model(x)
# TypeError: expected Tensor to scatter, but got str
with pytest.raises(TypeError):
model("hello")
def test_non_tensor_tuple():
class NonTensorTuple(nn.Module):
def forward(self, x):
return (x, "hello")
model = nn.Sequential(NonTensorTuple())
model = Pipe(model, balance=[1], devices=["cpu"])
x = torch.rand(1)
# TypeError: CheckpointBackward.forward: expected Variable (got str) for return value 1
with pytest.raises(TypeError):
model(x)
# TypeError: expected Tensor to scatter, but got str
with pytest.raises(TypeError):
model((x, "hello"))
@pytest.mark.parametrize("checkpoint", ["never", "always", "except_last"])
def test_deferred_batch_norm(checkpoint):
bn = nn.BatchNorm2d(3)
pipe_bn = deepcopy(bn)
pipe = Pipe(
nn.Sequential(pipe_bn), balance=[1], devices=["cpu"], chunks=2, checkpoint=checkpoint, deferred_batch_norm=True
)
x = torch.rand(4, 3, 10, 10)
pipe(x).mean().backward()
bn(x).mean().backward()
assert torch.allclose(pipe[0].running_mean, bn.running_mean, atol=1e-4)
assert torch.allclose(pipe[0].running_var, bn.running_var, atol=1e-4)
@pytest.mark.parametrize("checkpoint", ["never", "always"])
def test_deferred_batch_norm_params(checkpoint):
bn = nn.BatchNorm2d(3)
pipe_bn = deepcopy(bn)
pipe = Pipe(
nn.Sequential(pipe_bn), balance=[1], devices=["cpu"], chunks=1, checkpoint=checkpoint, deferred_batch_norm=True
)
x = torch.rand(4, 3, 10, 10)
pipe(x).mean().backward()
bn(x).mean().backward()
assert pipe[0].weight.grad is not None
assert pipe[0].bias.grad is not None
assert torch.allclose(pipe[0].weight.grad, bn.weight.grad, atol=1e-4)
assert torch.allclose(pipe[0].bias.grad, bn.bias.grad, atol=1e-4)
def test_devices():
a = nn.Linear(1, 1)
b = nn.Linear(1, 1)
c = nn.Linear(1, 1)
# There are two extra devices.
devices = ["cpu", "cpu", "cpu", "cpu", "cpu"]
model = nn.Sequential(a, b, c)
model = Pipe(model, [1, 1, 1], devices=devices)
cpu = torch.device("cpu")
# Extra devices must be discarded.
assert model.devices == [cpu, cpu, cpu]
def test_partitions():
a = nn.Linear(1, 1)
b = nn.Linear(1, 1)
model = nn.Sequential(a, b)
model = Pipe(model, [1, 1], devices=["cpu", "cpu"])
assert isinstance(model.partitions, nn.ModuleList)
assert isinstance(model.partitions[0], nn.Sequential)
assert isinstance(model.partitions[1], nn.Sequential)
assert "partitions.0.0.weight" in model.state_dict()
def test_deny_moving():
a = nn.Linear(1, 1)
b = nn.Linear(1, 1)
model = nn.Sequential(a, b)
model = Pipe(model, [1, 1], devices=["cpu", "cpu"])
# Moving is denied.
with pytest.raises(TypeError):
model.cuda()
with pytest.raises(TypeError):
model.cpu()
with pytest.raises(TypeError):
model.to(torch.device("cuda"))
with pytest.raises(TypeError):
model.to(0)
with pytest.raises(TypeError):
model.to("cuda")
with pytest.raises(TypeError):
model.to(device=0)
with pytest.raises(TypeError):
model.to(torch.rand(1))
with pytest.raises(TypeError):
model.to(tensor=torch.rand(1))
# Casting is allowed.
model.half()
model.to(torch.double)
model.to(dtype=torch.float)
def test_empty_module():
# Empty sequential module is not illegal.
model = nn.Sequential()
model = Pipe(model, [])
assert model(torch.tensor(42)) == torch.tensor(42)
assert model((torch.tensor(42),)) == (torch.tensor(42),)
# But only a tensor or a tuple of tensors is legal in Pipe.
with pytest.raises(TypeError):
model(42)
def test_named_children():
a = nn.Linear(1, 1)
b = nn.Linear(1, 1)
model = nn.Sequential(OrderedDict([("a", a), ("b", b)]))
model = Pipe(model, [1, 1], devices=["cpu", "cpu"])
names = set(n for n, _ in model.named_modules())
assert "partitions.0.a" in names
assert "partitions.1.b" in names
# Pipe doesn't support __getattr__. Unlike nn.Sequential, Pipe requires
# several methods in its namespace.
with pytest.raises(AttributeError):
model.a
def test_recommend_auto_balance():
with pytest.raises(ValueError, match="torch.distributed._pipeline.sync.balance"):
# balance is required
Pipe(nn.Sequential())
with pytest.raises(ValueError, match="torch.distributed._pipeline.sync.balance"):
# module and sum of balance have different length (module: 0, sum of balance: 1)
Pipe(nn.Sequential(), [1])
with pytest.raises(ValueError, match="torch.distributed._pipeline.sync.balance"):
# module and sum of balance have different length (module: 2, sum of balance: 1)
Pipe(nn.Sequential(nn.Linear(1, 1), nn.Linear(1, 1)), [1])
def test_verify_module_non_sequential():
with pytest.raises(TypeError, match="module must be nn.Sequential to be partitioned"):
Pipe(nn.Module(), [1])
def test_verify_module_duplicate_children():
conv = nn.Conv2d(3, 3, 1)
model = nn.Sequential(conv, conv)
with pytest.raises(ValueError, match="module with duplicate children is not supported"):
Pipe(model, [1, 1])
@skip_if_no_cuda
def test_verify_module_duplicate_parameters_on_distinct_devices():
class Surrogate(nn.Module):
def __init__(self, module):
super().__init__()
self.module = module
conv = nn.Conv2d(3, 3, 1)
model = nn.Sequential(Surrogate(conv), Surrogate(conv))
with pytest.raises(ValueError, match="module with duplicate parameters on distinct devices is not supported"):
Pipe(model, [1, 1], devices=["cpu", "cuda"])
def test_verify_module_duplicate_parameters_on_same_device():
class Surrogate(nn.Module):
def __init__(self, module):
super().__init__()
self.module = module
conv = nn.Conv2d(3, 3, 1)
model = nn.Sequential(Surrogate(conv), Surrogate(conv))
Pipe(model, [1, 1], devices=["cpu", "cpu"])
def test_forward_lockstep():
timeline = []
class DelayedLog(nn.Module):
def __init__(self, j, seconds):
super().__init__()
self.i = 0
self.j = j
self.seconds = seconds
def forward(self, x):
time.sleep(self.seconds)
timeline.append((self.i, self.j))
self.i += 1
return x
model = nn.Sequential(DelayedLog(0, seconds=0), DelayedLog(1, seconds=0.1))
model = Pipe(model, balance=[1, 1], devices=["cpu", "cpu"], chunks=3)
model(torch.rand(3, 1))
# Expected timeline: (Logs are recorded at !)
#
# Partition #0: 0! 1! 2!
# Partition #1: 000! 111! 222!
#
assert timeline == [(0, 0), (1, 0), (0, 1), (2, 0), (1, 1), (2, 1)]

View File

@ -0,0 +1,29 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
from torch.distributed._pipeline.sync.pipeline import clock_cycles
def test_clock_cycles():
assert list(clock_cycles(1, 1)) == [[(0, 0)]]
assert list(clock_cycles(1, 3)) == [[(0, 0)], [(0, 1)], [(0, 2)]]
assert list(clock_cycles(3, 1)) == [[(0, 0)], [(1, 0)], [(2, 0)]]
assert list(clock_cycles(3, 3)) == [ # noqa
[(0, 0)],
[(1, 0), (0, 1)],
[(2, 0), (1, 1), (0, 2)],
[(2, 1), (1, 2)],
[(2, 2)],
]
assert list(clock_cycles(4, 2)) == [ # noqa
[(0, 0)],
[(1, 0), (0, 1)],
[(2, 0), (1, 1)],
[(3, 0), (2, 1)],
[(3, 1)],
]

View File

@ -0,0 +1,188 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import pytest
import torch
from torch.distributed._pipeline.sync.stream import (
CPUStream,
current_stream,
default_stream,
get_device,
is_cuda,
new_stream,
record_stream,
use_device,
use_stream,
wait_stream,
)
skip_if_no_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
class TestNewStream:
def test_new_stream_cpu(self):
stream = new_stream(torch.device("cpu"))
assert stream is CPUStream
@skip_if_no_cuda
def test_new_stream_cuda(self):
stream = new_stream(torch.device("cuda"))
assert isinstance(stream, torch.cuda.Stream)
assert stream != torch.cuda.default_stream()
class TestCurrentStream:
def test_current_stream_cpu(self):
stream = current_stream(torch.device("cpu"))
assert stream is CPUStream
@skip_if_no_cuda
def test_current_stream_cuda(self):
stream = current_stream(torch.device("cuda"))
assert isinstance(stream, torch.cuda.Stream)
assert stream == torch.cuda.current_stream()
class TestDefaultStream:
def test_default_stream_cpu(self):
stream = default_stream(torch.device("cpu"))
assert stream is CPUStream
@skip_if_no_cuda
def test_default_stream_cuda(self):
stream = default_stream(torch.device("cuda"))
assert isinstance(stream, torch.cuda.Stream)
assert stream == torch.cuda.default_stream()
class TestUseDevice:
def test_use_device_cpu(self):
with use_device(torch.device("cpu")):
pass
@skip_if_no_cuda
def test_use_device_cuda(self):
with use_device(torch.device("cuda")):
pass
class TestUseStream:
def test_use_stream_cpu(self):
with use_stream(CPUStream):
pass
@skip_if_no_cuda
def test_use_stream_cuda(self):
stream = new_stream(torch.device("cuda"))
with use_stream(stream):
assert current_stream(torch.device("cuda")) == stream
class TestGetDevice:
def test_get_device_cpu(self):
assert get_device(CPUStream).type == "cpu"
@skip_if_no_cuda
def test_get_device_cuda(self):
stream = current_stream(torch.device("cuda"))
assert get_device(stream).type == "cuda"
class TestWaitStream:
def _test_wait_stream(self, source, target, cuda_sleep=None):
with use_stream(target):
if is_cuda(target):
cuda_sleep(0.5)
x = torch.ones(100, 100, device=get_device(target))
wait_stream(source, target)
with use_stream(source):
assert x.sum().item() == 10000
def test_wait_stream_cpu_cpu(self):
source = CPUStream
target = CPUStream
self._test_wait_stream(source, target)
@skip_if_no_cuda
def test_wait_stream_cpu_cuda(self, cuda_sleep):
source = CPUStream
target = new_stream(torch.device("cuda"))
self._test_wait_stream(source, target, cuda_sleep)
@skip_if_no_cuda
def test_wait_stream_cuda_cpu(self, cuda_sleep):
source = new_stream(torch.device("cuda"))
target = CPUStream
self._test_wait_stream(source, target, cuda_sleep)
@skip_if_no_cuda
def test_wait_stream_cuda_cuda(self, cuda_sleep):
source = current_stream(torch.device("cuda"))
target = new_stream(torch.device("cuda"))
self._test_wait_stream(source, target, cuda_sleep)
class TestRecordStream:
def test_record_stream_cpu(self):
# It should silently ignore CPU tensors.
x = torch.rand(1, device=torch.device("cpu"))
record_stream(x, CPUStream)
@skip_if_no_cuda
def test_record_stream_cuda(self, cuda_sleep):
# This test detects unexpected block reallocation. For a reliable test,
# the stream used to allocate tensors is isolated. The allocator will not
# reuse free blocks that were allocated on another stream.
stream_alloc = new_stream(torch.device("cuda"))
with torch.cuda.stream(stream_alloc):
x = torch.rand(1, device=torch.device("cuda"))
stream = new_stream(torch.device("cuda"))
record_stream(x, stream)
with use_stream(stream):
cuda_sleep(0.5)
# 'x' is deleted from Python's perspective, but the block of 'x' is still
# required by 'stream'. 'y' shouldn't be allocated into that block.
data_ptr = x.data_ptr()
del x
stream_alloc.synchronize()
with torch.cuda.stream(stream_alloc):
y = torch.rand(1, device=torch.device("cuda"))
assert y.data_ptr() != data_ptr
# Pause Python until 'stream' finishes its queued tasks. Now the block of
# 'x' is free to be reallocated.
wait_stream(CPUStream, stream)
with torch.cuda.stream(stream_alloc):
z = torch.rand(1, device=torch.device("cuda"))
assert z.data_ptr() == data_ptr
@skip_if_no_cuda
def test_record_stream_shifted_view(self, cuda_sleep):
# Issue: https://github.com/pytorch/pytorch/issues/27366
stream_alloc = new_stream(torch.device("cuda"))
with torch.cuda.stream(stream_alloc):
x = torch.rand(2, device=torch.device("cuda"))
y = x[1:]
assert y.data_ptr() > x.data_ptr()
stream = new_stream(torch.device("cuda"))
with use_stream(stream):
cuda_sleep(0.5)
record_stream(y, stream)
data_ptr = x.data_ptr()
del x, y
stream_alloc.synchronize()
with torch.cuda.stream(stream_alloc):
z = torch.rand(2, device=torch.device("cuda"))
assert z.data_ptr() != data_ptr

View File

@ -0,0 +1,43 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import torch
from torch import nn
from torch.distributed._pipeline.sync import Pipe
def test_simple_linears():
def sum_grad(parameters):
return sum([p.grad.sum() for p in parameters if p.grad is not None])
def zero_grad(parameters):
for p in parameters:
p.grad = None
inputs = torch.rand(8, 1)
model = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 4), nn.Linear(4, 2), nn.Linear(2, 1),)
# Without Pipe
outputs = model(inputs)
loss = outputs.mean()
loss.backward()
grad_without_pipe = sum_grad(model.parameters())
zero_grad(model.parameters())
# With Pipe
model = Pipe(model, [2, 2], devices=["cpu", "cpu"], chunks=4)
outputs = model(inputs)
loss = outputs.mean()
loss.backward()
grad_with_pipe = sum_grad(model.parameters())
# Both grads should be identical.
assert torch.allclose(grad_with_pipe, grad_without_pipe)

View File

@ -0,0 +1,163 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
import threading
import time
import pytest
import torch
from torch.distributed._pipeline.sync.microbatch import Batch
from torch.distributed._pipeline.sync.stream import CPUStream
from torch.distributed._pipeline.sync.worker import Task, spawn_workers
class fake_device:
"""A test double for :class:`torch.device`. Every fake device is different
with each other.
"""
type = "fake"
index = None
def test_join_running_workers():
count = 0
def counter():
nonlocal count
time.sleep(0.1)
count += 1
return Batch(())
with spawn_workers([fake_device() for _ in range(10)]) as (in_queues, out_queues):
def call_in_worker(i, f):
task = Task(CPUStream, compute=f, finalize=None)
in_queues[i].put(task)
for i in range(10):
call_in_worker(i, counter)
# There's no nondeterminism because 'spawn_workers' joins all running
# workers.
assert count == 10
def test_join_running_workers_with_exception():
class ExpectedException(Exception):
pass
count = 0
def counter():
nonlocal count
time.sleep(0.1)
count += 1
return Batch(())
with pytest.raises(ExpectedException):
with spawn_workers([fake_device() for _ in range(10)]) as (in_queues, out_queues):
def call_in_worker(i, f):
task = Task(CPUStream, compute=f, finalize=None)
in_queues[i].put(task)
for i in range(10):
call_in_worker(i, counter)
raise ExpectedException
# There's no nondeterminism because only 1 task can be placed in input
# queues.
assert count == 10
def test_compute_multithreading():
"""Task.compute should be executed on multiple threads."""
thread_ids = set()
def log_thread_id():
thread_id = threading.current_thread().ident
thread_ids.add(thread_id)
return Batch(())
with spawn_workers([fake_device() for _ in range(2)]) as (in_queues, out_queues):
for i in range(2):
t = Task(CPUStream, compute=log_thread_id, finalize=None)
in_queues[i].put(t)
for i in range(2):
out_queues[i].get()
assert len(thread_ids) == 2
def test_compute_success():
"""Task.compute returns (True, (task, batch)) on success."""
def _42():
return Batch(torch.tensor(42))
with spawn_workers([torch.device("cpu")]) as (in_queues, out_queues):
t = Task(CPUStream, compute=_42, finalize=None)
in_queues[0].put(t)
ok, (task, batch) = out_queues[0].get()
assert ok
assert task is t
assert isinstance(batch, Batch)
assert batch[0].item() == 42
def test_compute_exception():
"""Task.compute returns (False, exc_info) on failure."""
def zero_div():
0 / 0
with spawn_workers([torch.device("cpu")]) as (in_queues, out_queues):
t = Task(CPUStream, compute=zero_div, finalize=None)
in_queues[0].put(t)
ok, exc_info = out_queues[0].get()
assert not ok
assert isinstance(exc_info, tuple)
assert issubclass(exc_info[0], ZeroDivisionError)
@pytest.mark.parametrize("grad_mode", [True, False])
def test_grad_mode(grad_mode):
def detect_grad_enabled():
x = torch.rand(1, requires_grad=torch.is_grad_enabled())
return Batch(x)
with torch.set_grad_enabled(grad_mode):
with spawn_workers([torch.device("cpu")]) as (in_queues, out_queues):
task = Task(CPUStream, compute=detect_grad_enabled, finalize=None)
in_queues[0].put(task)
ok, (_, batch) = out_queues[0].get()
assert ok
assert batch[0].requires_grad == grad_mode
def test_worker_per_device():
cpu = torch.device("cpu")
cpu0 = torch.device("cpu", index=0)
fake1 = fake_device()
fake2 = fake_device()
with spawn_workers([cpu, cpu, cpu0, fake1, fake2]) as (in_queues, out_queues):
assert len(in_queues) == len(out_queues) == 5
# 0: cpu, 1: cpu, 2: cpu0
assert in_queues[0] is in_queues[1] is in_queues[2]
assert out_queues[0] is out_queues[1] is out_queues[2]
# 3: fake1, 4: fake2
assert in_queues[3] is not in_queues[4]
assert out_queues[3] is not out_queues[4]

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python
import argparse
import copy
from datetime import datetime
import importlib
import modulefinder
@ -95,6 +96,54 @@ TESTS = [
'test_fx_experimental',
'test_functional_autograd_benchmark',
'test_package',
'distributed/_pipeline/sync/skip/test_api',
'distributed/_pipeline/sync/skip/test_gpipe',
'distributed/_pipeline/sync/skip/test_inspect_skip_layout',
'distributed/_pipeline/sync/skip/test_leak',
'distributed/_pipeline/sync/skip/test_portal',
'distributed/_pipeline/sync/skip/test_stash_pop',
'distributed/_pipeline/sync/skip/test_tracker',
'distributed/_pipeline/sync/skip/test_verify_skippables',
'distributed/_pipeline/sync/test_balance',
'distributed/_pipeline/sync/test_bugs',
'distributed/_pipeline/sync/test_checkpoint',
'distributed/_pipeline/sync/test_copy',
'distributed/_pipeline/sync/test_deferred_batch_norm',
'distributed/_pipeline/sync/test_dependency',
'distributed/_pipeline/sync/test_inplace',
'distributed/_pipeline/sync/test_microbatch',
'distributed/_pipeline/sync/test_phony',
'distributed/_pipeline/sync/test_pipe',
'distributed/_pipeline/sync/test_pipeline',
'distributed/_pipeline/sync/test_stream',
'distributed/_pipeline/sync/test_transparency',
'distributed/_pipeline/sync/test_worker',
]
# Tests need to be run with pytest.
USE_PYTEST_LIST = [
'distributed/_pipeline/sync/skip/test_api',
'distributed/_pipeline/sync/skip/test_gpipe',
'distributed/_pipeline/sync/skip/test_inspect_skip_layout',
'distributed/_pipeline/sync/skip/test_leak',
'distributed/_pipeline/sync/skip/test_portal',
'distributed/_pipeline/sync/skip/test_stash_pop',
'distributed/_pipeline/sync/skip/test_tracker',
'distributed/_pipeline/sync/skip/test_verify_skippables',
'distributed/_pipeline/sync/test_balance',
'distributed/_pipeline/sync/test_bugs',
'distributed/_pipeline/sync/test_checkpoint',
'distributed/_pipeline/sync/test_copy',
'distributed/_pipeline/sync/test_deferred_batch_norm',
'distributed/_pipeline/sync/test_dependency',
'distributed/_pipeline/sync/test_inplace',
'distributed/_pipeline/sync/test_microbatch',
'distributed/_pipeline/sync/test_phony',
'distributed/_pipeline/sync/test_pipe',
'distributed/_pipeline/sync/test_pipeline',
'distributed/_pipeline/sync/test_stream',
'distributed/_pipeline/sync/test_transparency',
'distributed/_pipeline/sync/test_worker',
]
WINDOWS_BLOCKLIST = [
@ -170,6 +219,28 @@ SLOW_TESTS = [
'test_quantization',
'test_determination',
'test_futures',
'distributed/_pipeline/sync/skip/test_api',
'distributed/_pipeline/sync/skip/test_gpipe',
'distributed/_pipeline/sync/skip/test_inspect_skip_layout',
'distributed/_pipeline/sync/skip/test_leak',
'distributed/_pipeline/sync/skip/test_portal',
'distributed/_pipeline/sync/skip/test_stash_pop',
'distributed/_pipeline/sync/skip/test_tracker',
'distributed/_pipeline/sync/skip/test_verify_skippables',
'distributed/_pipeline/sync/test_balance',
'distributed/_pipeline/sync/test_bugs',
'distributed/_pipeline/sync/test_checkpoint',
'distributed/_pipeline/sync/test_copy',
'distributed/_pipeline/sync/test_deferred_batch_norm',
'distributed/_pipeline/sync/test_dependency',
'distributed/_pipeline/sync/test_inplace',
'distributed/_pipeline/sync/test_microbatch',
'distributed/_pipeline/sync/test_phony',
'distributed/_pipeline/sync/test_pipe',
'distributed/_pipeline/sync/test_pipeline',
'distributed/_pipeline/sync/test_stream',
'distributed/_pipeline/sync/test_transparency',
'distributed/_pipeline/sync/test_worker',
]
_DEP_MODULES_CACHE: Dict[str, set] = {}
@ -762,12 +833,15 @@ def main():
failure_messages = []
try:
for test in selected_tests:
options_clone = copy.deepcopy(options)
if test in USE_PYTEST_LIST:
options_clone.pytest = True
err_message = run_test_module(test, test_directory, options_clone)
if err_message is None:
continue
has_failed = True
failure_messages.append(err_message)
if not options_clone.continue_through_error:
raise RuntimeError(err_message)
print_to_stderr(err_message)
finally:

View File

View File

@ -0,0 +1,27 @@
Copyright 2019-2020 Kakao Brain
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,11 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""A Pipe implementation in PyTorch."""
from .checkpoint import is_checkpointing, is_recomputing
from .pipe import Pipe
__all__ = ["Pipe", "is_checkpointing", "is_recomputing"]

View File

@ -0,0 +1,164 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""A helper to roughly balance a sequential module.
Usage::
import torch
from torch.distributed._pipeline.sync import Pipe
from torch.distributed._pipeline.sync.balance import balance_by_time
sample = torch.empty(128, 3, 224, 224)
balance = balance_by_time(torch.cuda.device_count(), model, sample)
pipe = Pipe(model, balance, chunks=8)
"""
from typing import List, Tuple, Union
import torch
from torch import Tensor
import torch.nn as nn
from . import blockpartition
from .profile import profile_sizes, profile_times
__all__ = ["balance_by_time", "balance_by_size"]
Device = Union[torch.device, int, str]
Tensors = Tuple[Tensor, ...]
TensorOrTensors = Union[Tensor, Tensors]
def balance_cost(cost: List[int], partitions: int) -> List[int]:
partitioned = blockpartition.solve(cost, partitions)
return [len(p) for p in partitioned]
def balance_by_time(
partitions: int,
module: nn.Sequential,
sample: TensorOrTensors,
*,
timeout: float = 1.0,
device: Device = torch.device("cuda"),
) -> List[int]:
"""Naive automatic balancing by elapsed time per layer.
::
sample = torch.empty(128, 3, 224, 224)
balance = balance_by_time(torch.cuda.device_count(), model, sample)
pipe = Pipe(model, balance, chunks=8)
Args:
partitions (int):
intended number of partitions
module (torch.nn.Sequential):
sequential module to be partitioned
sample (torch.Tensor):
example input with arbitrary batch size
Keyword Args:
timeout (float):
profiling iterates again if the timeout (in seconds) has not been exceeded
(default: ``1.0``)
device ('cpu' or 'cuda' device):
CPU or CUDA device where each layer is profiled (default: the
current CUDA device)
Returns:
A list of the number of layers in each partition. Use it for the `balance`
parameter of :class:`~torchpipe.Pipe`.
.. note::
`module` and `sample` must be placed on the same device.
"""
times = profile_times(module, sample, timeout, torch.device(device))
return balance_cost(times, partitions)
def balance_by_size(
partitions: int,
module: nn.Sequential,
input: TensorOrTensors,
*,
chunks: int = 1,
param_scale: float = 2.0,
device: Device = torch.device("cuda"),
) -> List[int]:
"""Naive automatic balancing by CUDA memory usage per layer.
During training, the memory required for parameters depends on which optimizer
is used. Optimizers may use per-parameter buffers to track optimization
statistics internally, such as the momentum buffer in SGD.
To get a more reliable size-based balance, you should specify `param_scale`
according to your optimizer. The default `param_scale` is 2 instead of 1
because gradient accumulation is required by every optimizer.
Follow this guide to choose the correct `param_scale` for typical optimizers:
========= ============= =========================================
Optimizer `param_scale` Internal State
========= ============= =========================================
SGD 2--3 (momentum_buffer)
Adam 4--5 exp_avg, exp_avg_sq, (max_exp_avg_sq)
Adadelta 4 square_avg, acc_delta
Adagrad 3 sum
RMSprop 3--5 square_avg, (momentum_buffer), (grad_avg)
========= ============= =========================================
Here's a simple example with the Adam optimizer::
balance = balance_by_size(
torch.cuda.device_count(),
model,
# Same size as the mini-batch to train
torch.empty(1024, 3, 224, 224),
# Number of micro-batches to train with Pipe
chunks=8,
# 4 for Adam
param_scale=4.0,
)
pipe = Pipe(model, balance, chunks=8)
adam = Adam(pipe.parameters())
Args:
partitions (int):
intended number of partitions
module (torch.nn.Sequential):
sequential module to be partitioned
input (torch.Tensor):
example mini-batch with the same size as used for training
Keyword Args:
chunks (int):
number of micro-batches that will be used to train (default: ``1``)
param_scale (float):
how many copies of parameters will be allocated for training. It
depends on the optimizer. See the guide above. (default: ``2.0``)
device ('cuda' device):
CUDA device where each layer is profiled (default: the current CUDA
device)
Returns:
A list of the number of layers in each partition. Use it for the `balance`
parameter of :class:`~torchpipe.Pipe`.
.. note::
`module` and `input` must be placed on the same CUDA device.
"""
sizes = profile_sizes(module, input, chunks, param_scale, torch.device(device))
return balance_cost(sizes, partitions)

View File

@ -0,0 +1,95 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Implements "Block Partitions of Sequences" by Imre Bárány et al.
Paper: https://arxiv.org/pdf/1308.2452.pdf
"""
from typing import Iterator, List, Tuple
__all__ = ["solve"]
def solve(sequence: List[int], partitions: int = 1) -> List[List[int]]:
"""Splits a sequence into several partitions to minimize variance for each
partition.
The result might not be optimal. However, it can be computed in only O(kn³)
time, where k is the number of partitions and n is the length of the sequence.
"""
if partitions < 1:
raise ValueError(f"partitions must be a positive integer ({partitions} < 1)")
n = len(sequence)
if n < partitions:
raise ValueError(f"sequence is shorter than intended partitions ({n} < {partitions})")
# Normalize the sequence in [0, 1].
minimum = min(sequence)
maximum = max(sequence) - minimum
normal_sequence: List[float]
if maximum == 0:
normal_sequence = [0 for _ in sequence]
else:
normal_sequence = [(x - minimum) / maximum for x in sequence]
splits = [n // partitions * (x + 1) for x in range(partitions - 1)] + [n]
def block_size(i: int) -> float:
start = splits[i - 1] if i > 0 else 0
stop = splits[i]
return sum(normal_sequence[start:stop])
def leaderboard() -> Iterator[Tuple[float, int]]:
return ((block_size(i), i) for i in range(partitions))
while True:
"""
(1) Fix p ∈ [k] with M(P) = bp. So Bp is a maximal block of P.
"""
# max_size: M(P)
max_size, p = max(leaderboard())
while True:
"""
(2) If M(P) ≤ m(P) + 1, then stop.
"""
# min_size: m(P)
min_size, q = min(leaderboard())
if max_size <= min_size + 1:
return [sequence[i:j] for i, j in zip([0] + splits[:-1], splits)]
"""
(3) If M(P) > m(P) + 1, then let m(P) = bq for the q ∈ [k] which is
closest to p (ties broken arbitrarily). Thus Bq is a minimal block
of P. Let Bh be the block next to Bq between Bp and Bq. (Note that
Bh is a non-empty block: if it were empty, then m(P) = 0 and we should
have chosen Bh instead of Bq.)
"""
if p < q:
"""
So either p < q and then h = q - 1 and we define P' by moving
the last element from Bh = Bq-1 to Bq,
"""
h = q - 1
splits[h] -= 1
else:
"""
or q < p, and then h = q + 1 and P' is obtained by moving the
first element of Bh = Bq+1 to Bq.
"""
h = q + 1
splits[q] += 1
"""
Set P = P'. If p = h, then go to (1), else go to (2).
"""
if p == h:
break

View File

@ -0,0 +1,114 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Per-layer profilers."""
import copy
import time
from typing import Generator, List, Tuple, Union
import torch
from torch import Tensor
import torch.nn as nn
from ..microbatch import Batch
__all__: List[str] = []
Device = Union[torch.device, int, str]
Tensors = Tuple[Tensor, ...]
TensorOrTensors = Union[Tensor, Tensors]
def layerwise_sandbox(module: nn.Sequential, device: torch.device,) -> Generator[nn.Module, None, None]:
"""Copies layers for ease to profile. It doesn't modify the given
module.
"""
for layer in module:
layer_copy = copy.deepcopy(layer)
layer_copy.to(device)
layer_copy.train()
yield layer_copy
def detach(batch: Batch) -> None:
"""Detaches from autograd graph."""
for i, x in enumerate(batch):
batch[i] = x.detach().requires_grad_(x.requires_grad)
def profile_times(module: nn.Sequential, sample: TensorOrTensors, timeout: float, device: torch.device,) -> List[int]:
"""Profiles elapsed times per layer."""
if any(p.grad is not None for p in module.parameters()):
raise ValueError("some parameter already has gradient")
_batch = Batch(sample)
for i, x in enumerate(_batch):
_batch[i] = x.detach().to(device).requires_grad_(x.requires_grad)
time_bufs: List[List[float]] = [[] for _ in module]
begun_at = time.time()
while time.time() - begun_at < timeout:
batch = _batch
for i, layer in enumerate(layerwise_sandbox(module, device)):
detach(batch)
if device.type == "cuda":
torch.cuda.synchronize(device)
tick = time.time()
# Forward
batch = batch.call(layer)
# Backward
backward_tensors = tuple(y for y in batch if y.requires_grad)
if backward_tensors:
torch.autograd.backward(backward_tensors, backward_tensors)
if device.type == "cuda":
torch.cuda.synchronize(device)
tock = time.time()
time_bufs[i].append(tock - tick)
us = 1_000_000
return [sum(int(t * us) for t in buf) for buf in time_bufs]
def profile_sizes(
module: nn.Sequential, input: TensorOrTensors, chunks: int, param_scale: float, device: torch.device,
) -> List[int]:
"""Profiles CUDA memory usage per layer."""
if device.type != "cuda":
raise ValueError("size profiler supports only CUDA device")
batch = Batch(input)
sizes: List[int] = []
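# Only a single-sample slice (x[:1]) of the batch is run through each layer
# below, so 'latent_scale' (mini-batch size / chunks) rescales the measured
# activation memory back to the size of one micro-batch.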
latent_scale = batch[0].size(0) / chunks
for i, x in enumerate(batch):
batch[i] = x[:1].detach().to(device).requires_grad_(x.requires_grad)
for layer in layerwise_sandbox(module, device):
detach(batch)
# Detect memory usage at forward.
memory_before = torch.cuda.memory_allocated(device)
batch = batch.call(layer)
memory_after = torch.cuda.memory_allocated(device)
latent_size = memory_after - memory_before
# Analyze size of parameters.
param_size = sum(p.storage().size() * p.storage().element_size() for p in layer.parameters())
# Combine the sizes of parameters and activations with normalized scales.
size = latent_size * latent_scale + param_size * param_scale
sizes.append(int(size))
return sizes

View File

@ -0,0 +1,6 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

View File

@ -0,0 +1,159 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Tracks the running statistics per mini-batch instead of micro-batch."""
from typing import Optional, TypeVar, cast
import torch
from torch import Tensor, nn
import torch.nn.functional as F
from torch.nn.modules.batchnorm import _BatchNorm
from .checkpoint import is_recomputing
__all__ = ["DeferredBatchNorm"]
TModule = TypeVar("TModule", bound=nn.Module)
class DeferredBatchNorm(_BatchNorm):
"""A BatchNorm layer tracks multiple micro-batches to update running
statistics per mini-batch.
"""
sum: Tensor
sum_squares: Tensor
def __init__(
self,
num_features: int,
eps: float = 1e-5,
momentum: Optional[float] = 0.1,
affine: bool = True,
chunks: int = 1,
) -> None:
super().__init__(num_features, eps, momentum, affine, track_running_stats=True)
self.register_buffer("sum", torch.zeros_like(self.running_mean))
self.register_buffer("sum_squares", torch.zeros_like(self.running_var))
self.counter = 0
self.tracked = 0
self.chunks = chunks
def _check_input_dim(self, input: Tensor) -> None:
# It's the typical _check_input_dim() implementation in PyTorch.
if input.dim() <= 2:
raise ValueError("expected at least 3D input (got %dD input)" % input.dim())
def _track(self, input: Tensor) -> bool:
"""Tracks statistics of a micro-batch."""
# Dimensions except channel. For example, (0, 2, 3) is for BatchNorm2d.
dim = [0]
dim.extend(range(2, input.dim()))
with torch.no_grad():
self.sum += input.sum(dim)
self.sum_squares += (input ** 2).sum(dim)
size = input.size().numel() // input.size(1)
self.counter += size
self.tracked += 1
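# True once every micro-batch of the current mini-batch has been tracked.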
return self.tracked == self.chunks
def _commit(self) -> None:
"""Updates the running statistics of a mini-batch."""
exponential_average_factor = 0.0
self.num_batches_tracked += 1
if self.momentum is None: # use cumulative moving average
exponential_average_factor = 1.0 / float(self.num_batches_tracked)
else: # use exponential moving average
exponential_average_factor = self.momentum
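# Mean and (biased) variance over all tracked micro-batches:
# E[x] = sum / counter and Var[x] = E[x^2] - E[x]^2.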
mean = self.sum / self.counter
var = self.sum_squares / self.counter - mean ** 2
# Calculate the exponential moving average here.
m = exponential_average_factor
self.running_mean *= 1 - m
self.running_mean += mean * m
self.running_var *= 1 - m
self.running_var += var * m
self.sum.zero_()
self.sum_squares.zero_()
self.counter = 0
self.tracked = 0
def forward(self, input: Tensor) -> Tensor: # type: ignore
if not self.training:
# Don't train parameters in evaluation mode.
return F.batch_norm(
input,
running_mean=self.running_mean,
running_var=self.running_var,
weight=self.weight,
bias=self.bias,
training=False,
momentum=0.0,
eps=self.eps,
)
if not is_recomputing():
# Track a micro-batch in training mode,
# but not during recomputation.
tracked_enough = self._track(input)
# Update the running statistics for a mini-batch
# if it has tracked enough micro-batches.
if tracked_enough:
self._commit()
# Normalize a micro-batch and train the parameters.
return F.batch_norm(
input,
running_mean=None,
running_var=None,
weight=self.weight,
bias=self.bias,
training=True,
momentum=0.0,
eps=self.eps,
)
@classmethod
def convert_deferred_batch_norm(cls, module: TModule, chunks: int = 1) -> TModule:
"""Converts a :class:`nn.BatchNorm` or underlying
:class:`nn.BatchNorm`s into :class:`DeferredBatchNorm`::
from torchvision.models.resnet import resnet101
from torchpipe.batchnorm import DeferredBatchNorm
model = resnet101()
model = DeferredBatchNorm.convert_deferred_batch_norm(model)
"""
if isinstance(module, DeferredBatchNorm) and module.chunks == chunks:
return cast(TModule, module)
module_output: nn.Module = module
if isinstance(module, _BatchNorm) and module.track_running_stats:
module_output = DeferredBatchNorm(module.num_features, module.eps, module.momentum, module.affine, chunks)
if module.affine:
module_output.register_parameter("weight", module.weight)
module_output.register_parameter("bias", module.bias)
module_output.register_buffer("running_mean", module.running_mean)
module_output.register_buffer("running_var", module.running_var)
module_output.register_buffer("num_batches_tracked", module.num_batches_tracked)
for name, child in module.named_children():
module_output.add_module(name, cls.convert_deferred_batch_norm(child, chunks))
return cast(TModule, module_output)

View File

@ -0,0 +1,317 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Checkpointing with preceding recomputation.
PyTorch already provides the official checkpointing utilities in
:mod:`torch.utils.checkpoint`. The official checkpointing combines
recomputation and recursive backpropagation into one autograd function named
``CheckpointFunction``. Hence, the recomputation can be started only when the
gradients arrive to the function. In Pipe, the recomputation needs to precede
the gradient arrival to minimize the GPU idle time.
We solve this problem by introducing separate autograd functions named
:class:`Recompute` and :class:`Checkpoint`. Each function represents
recomputation and recursive backpropagation, respectively. With this pair of
functions, we can manipulate the control flow of both the autograd engine and
CUDA.
Specifically, we place CUDA stream synchronization between :class:`Recompute`
and :class:`Checkpoint` to delay only :class:`Checkpoint` until the gradient is
copied entirely.
"""
from collections import deque
from contextlib import contextmanager
import threading
from typing import TYPE_CHECKING, Deque, Generator, List, Optional, Tuple, Union
import torch
from torch import ByteTensor, Tensor
import torch.autograd
from .dependency import fork, join
from .microbatch import Batch
from .phony import get_phony
__all__ = ["is_checkpointing", "is_recomputing"]
Tensors = Tuple[Tensor, ...]
TensorOrTensors = Union[Tensor, Tensors]
# Types for shared memory between Checkpoint and Recompute.
Recomputed = Tuple[TensorOrTensors, Tensors] # (output, input_leaf)
RNGStates = Tuple[ByteTensor, Optional[ByteTensor]] # (cpu_rng_state, gpu_rng_state)
if TYPE_CHECKING:
from typing_extensions import Protocol
else:
Protocol = object
# Protocol with __call__ instead of Callable can be used as an attribute type.
# See: https://github.com/python/mypy/issues/708#issuecomment-561735949
class Function(Protocol):
def __call__(self, input: TensorOrTensors) -> TensorOrTensors:
...
def checkpoint(function: Function, input: TensorOrTensors) -> TensorOrTensors:
"""Makes a checkpoint with a simple interface like
:func:`torch.utils.checkpoint.checkpoint`. It's only used to test or debug
:class:`Checkpoint` and :class:`Recompute` without boilerplate.
"""
batch = Batch(input)
chk = Checkpointing(function, batch)
batch = chk.checkpoint()
chk.recompute(batch)
return batch.tensor_or_tensors
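# A minimal usage sketch (assumed for illustration only; 'layer' is a
# hypothetical module, not part of this file):
#
#     layer = torch.nn.Linear(3, 3)
#     x = torch.rand(4, 3, requires_grad=True)
#     y = checkpoint(layer, x)   # forward pass runs under torch.no_grad()
#     y.sum().backward()         # 'layer' is recomputed before backpropagation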
class Checkpointing:
"""Generates a pair of :class:`Checkpoint` and :class:`Recompute`."""
def __init__(self, function: Function, batch: Batch) -> None:
self.function = function
self.batch = batch
# Shared memory between Checkpoint and Recompute. A deque of length 1 is
# used for mutability and to limit the length.
self.recomputed: Deque[Recomputed] = deque(maxlen=1)
self.rng_states: Deque[RNGStates] = deque(maxlen=1)
def checkpoint(self) -> Batch:
"""Returns a batch applied by :class:`Checkpoint`."""
input_atomic = self.batch.atomic
input = tuple(self.batch)
# Use a phony which requires grad to ensure that Checkpoint can be
# tracked by the autograd engine even when none of the input tensors
# require grad.
phony = get_phony(self.batch[0].device, requires_grad=True)
output = Checkpoint.apply(phony, self.recomputed, self.rng_states, self.function, input_atomic, *input)
# Gradients are only supported for float Tensors.
if isinstance(output, tuple):
output = tuple([x if x.is_floating_point() else x.detach() for x in output])
return Batch(output)
def recompute(self, batch: Batch) -> None:
"""Applies :class:`Recompute` to the batch in place."""
input_atomic = self.batch.atomic
input = tuple(self.batch)
# batch[0] always requires grad because it has been passed through
# Checkpoint with a phony that requires grad.
batch[0], phony = fork(batch[0])
phony = Recompute.apply(phony, self.recomputed, self.rng_states, self.function, input_atomic, *input)
batch[0] = join(batch[0], phony)
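# fork()/join() splice a Recompute node into the autograd graph of batch[0],
# so recomputation is triggered during backward before the gradient reaches
# the corresponding Checkpoint node.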
class ThreadLocal(threading.local):
def __init__(self) -> None:
self.is_checkpointing = False
self.is_recomputing = False
thread_local = ThreadLocal()
@contextmanager
def enable_checkpointing() -> Generator[None, None, None]:
"""Makes :func:`is_checkpointing` return :data:`True` within a context."""
orig = thread_local.is_checkpointing
thread_local.is_checkpointing = True
try:
yield
finally:
thread_local.is_checkpointing = orig
@contextmanager
def enable_recomputing() -> Generator[None, None, None]:
"""Makes :func:`is_recomputing` return :data:`True` within a context."""
orig = thread_local.is_recomputing
thread_local.is_recomputing = True
try:
yield
finally:
thread_local.is_recomputing = orig
def is_checkpointing() -> bool:
"""Whether the current forward propagation is under checkpointing.
Returns:
bool: :data:`True` if it's under checkpointing.
"""
return thread_local.is_checkpointing
def is_recomputing() -> bool:
"""Whether the current forward propagation is under checkpoint
recomputation. Use this to prevent duplicated side-effects at forward
propagation::
class Counter(nn.Module):
def __init__(self):
super().__init__()
self.counter = 0
def forward(self, input):
if not is_recomputing():
self.counter += 1
return input
Returns:
bool: :data:`True` if it's under checkpoint recomputation.
.. seealso:: :ref:`Detecting Recomputation`
"""
return thread_local.is_recomputing
class Context:
"""The common interface between the :class:`Checkpoint` and
:class:`Recompute` context.
"""
recomputed: Deque[Recomputed]
rng_states: Deque[RNGStates]
function: Function
input_atomic: bool
saved_tensors: Tuple[Tensor, ...]
def save_for_backward(self, *tensors: Tensor) -> None: # pragma: no cover
pass
def save_rng_states(device: torch.device, rng_states: Deque[RNGStates],) -> None:
""":meth:`Checkpoint.forward` captures the current PyTorch's random number
generator states at CPU and GPU to reuse in :meth:`Recompute.backward`.
.. seealso:: :ref:`Referential Transparency`
"""
cpu_rng_state = torch.get_rng_state()
gpu_rng_state: Optional[ByteTensor]
if device.type == "cuda":
gpu_rng_state = torch.cuda.get_rng_state(device)
else:
gpu_rng_state = None
rng_states.append((cpu_rng_state, gpu_rng_state))
@contextmanager
def restore_rng_states(device: torch.device, rng_states: Deque[RNGStates],) -> Generator[None, None, None]:
""":meth:`Recompute.backward` restores the random number generator states
captured by :func:`save_rng_states` within its context.
.. seealso:: :ref:`Referential Transparency`
"""
cpu_rng_state, gpu_rng_state = rng_states.pop()
gpu_devices: List[torch.device] = []
if device.type == "cuda":
gpu_devices.append(device)
with torch.random.fork_rng(gpu_devices):
torch.set_rng_state(cpu_rng_state)
if gpu_rng_state is not None:
torch.cuda.set_rng_state(gpu_rng_state, device)
yield
class Checkpoint(torch.autograd.Function):
@staticmethod
# type: ignore
def forward(
ctx: Context,
phony: Tensor,
recomputed: Deque[Recomputed],
rng_states: Deque[RNGStates],
function: Function,
input_atomic: bool,
*input: Tensor,
) -> TensorOrTensors:
ctx.recomputed = recomputed
ctx.rng_states = rng_states
save_rng_states(input[0].device, ctx.rng_states)
ctx.function = function
ctx.input_atomic = input_atomic
ctx.save_for_backward(*input)
with torch.no_grad(), enable_checkpointing():
output = function(input[0] if input_atomic else input)
return output
@staticmethod
def backward(ctx: Context, *grad_output: Tensor,) -> Tuple[Optional[Tensor], ...]: # pragma: no cover
output, input_leaf = ctx.recomputed.pop()
if isinstance(output, tuple):
tensors = output
else:
tensors = (output,)
if any(y.requires_grad for y in tensors):
tensors = tuple([x for x in tensors if x.requires_grad])
torch.autograd.backward(tensors, grad_output)
grad_input: List[Optional[Tensor]] = [None, None, None, None, None]
grad_input.extend(x.grad for x in input_leaf)
return tuple(grad_input)
class Recompute(torch.autograd.Function):
@staticmethod
# type: ignore
def forward(
ctx: Context,
phony: Tensor,
recomputed: Deque[Recomputed],
rng_states: Deque[RNGStates],
function: Function,
input_atomic: bool,
*input: Tensor,
) -> Tensor:
ctx.recomputed = recomputed
ctx.rng_states = rng_states
ctx.function = function
ctx.input_atomic = input_atomic
ctx.save_for_backward(*input)
return phony
@staticmethod
def backward(ctx: Context, *grad_output: Tensor) -> Tuple[None, ...]: # pragma: no cover
input = ctx.saved_tensors
input_leaf = tuple(x.detach().requires_grad_(x.requires_grad) for x in input)
with restore_rng_states(input[0].device, ctx.rng_states):
with torch.enable_grad(), enable_recomputing():
output = ctx.function(input_leaf[0] if ctx.input_atomic else input_leaf)
ctx.recomputed.append((output, input_leaf))
grad_input: List[None] = [None, None, None, None, None]
grad_input.extend(None for _ in ctx.saved_tensors)
return tuple(grad_input)

View File

@ -0,0 +1,104 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Autograd functions for stream-aware CUDA copy. It is used to overlap copy
and computation on the same GPU.
"""
from collections import deque
from typing import Deque, List, Optional, Tuple
import torch
from torch import Tensor
from .stream import AbstractStream, current_stream, get_device, record_stream, use_stream, wait_stream
__all__: List[str] = []
Tensors = Tuple[Tensor, ...]
# Common interface between :class:`Copy` and :class:`Wait`.
class Context:
prev_stream: AbstractStream
next_stream: AbstractStream
class Copy(torch.autograd.Function):
"""Copies tensors on specific streams."""
@staticmethod
# type: ignore
def forward(ctx: Context, prev_stream: AbstractStream, next_stream: AbstractStream, *input: Tensor,) -> Tensors:
ctx.prev_stream = prev_stream
ctx.next_stream = next_stream
output = []
output_stream = current_stream(get_device(next_stream))
with use_stream(prev_stream), use_stream(next_stream):
for x in input:
y = x.to(get_device(next_stream), non_blocking=True)
output.append(y)
# 'prev_stream' is not where 'x' has been allocated.
record_stream(x, prev_stream)
# 'y' has been allocated on 'next_stream'.
# It might be used on the current stream captured as 'output_stream'.
record_stream(y, output_stream)
return tuple(output)
@staticmethod
def backward(ctx: Context, *grad_output: Tensor,) -> Tuple[Optional[Tensor], ...]:
prev_stream = ctx.prev_stream
next_stream = ctx.next_stream
grad_input: Deque[Tensor] = deque(maxlen=len(grad_output))
input_stream = current_stream(get_device(prev_stream))
with use_stream(prev_stream), use_stream(next_stream):
for x in reversed(grad_output):
y = x.to(get_device(prev_stream), non_blocking=True)
grad_input.appendleft(y)
# 'next_stream' is not where 'x' has been allocated.
record_stream(x, next_stream)
# 'y' has been allocated on 'prev_stream'.
# It might be used on the current stream captured as 'input_stream'.
record_stream(y, input_stream)
grad_streams: Tuple[Optional[Tensor], ...] = (None, None)
return grad_streams + tuple(grad_input)
class Wait(torch.autograd.Function):
"""Synchronizes a stream to another stream.
Place it just before you want to start an operation on the next stream,
provided that all operations on the previous stream are done.
"""
@staticmethod
# type: ignore
def forward(ctx: Context, prev_stream: AbstractStream, next_stream: AbstractStream, *input: Tensor,) -> Tensors:
ctx.prev_stream = prev_stream
ctx.next_stream = next_stream
wait_stream(next_stream, prev_stream)
return tuple(x.detach() for x in input)
@staticmethod
def backward(ctx: Context, *grad_input: Tensor,) -> Tuple[Optional[Tensor], ...]:
prev_stream = ctx.prev_stream
next_stream = ctx.next_stream
wait_stream(prev_stream, next_stream)
grad_streams: Tuple[Optional[Tensor], ...] = (None, None)
return grad_streams + grad_input

View File

@ -0,0 +1,54 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Arbitrary dependency between two autograd lanes."""
from typing import List, Tuple
import torch
from torch import Tensor
from .phony import get_phony
__all__: List[str] = []
def fork(input: Tensor) -> Tuple[Tensor, Tensor]:
"""Branches out from an autograd lane of the given tensor."""
if torch.is_grad_enabled() and input.requires_grad:
input, phony = Fork.apply(input)
else:
phony = get_phony(input.device, requires_grad=False)
return input, phony
class Fork(torch.autograd.Function):
@staticmethod
def forward(ctx: "Fork", input: Tensor) -> Tuple[Tensor, Tensor]: # type: ignore
phony = get_phony(input.device, requires_grad=False)
return input.detach(), phony.detach()
@staticmethod
def backward(ctx: "Fork", grad_input: Tensor, grad_grad: Tensor) -> Tensor: # type: ignore
return grad_input
def join(input: Tensor, phony: Tensor) -> Tensor:
"""Merges two autograd lanes."""
if torch.is_grad_enabled() and (input.requires_grad or phony.requires_grad):
input = Join.apply(input, phony)
return input
class Join(torch.autograd.Function):
@staticmethod
def forward(ctx: "Join", input: Tensor, phony: Tensor) -> Tensor: # type: ignore
return input.detach()
@staticmethod
def backward(ctx: "Join", grad_input: Tensor) -> Tuple[Tensor, None]: # type: ignore
return grad_input, None
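A hedged sketch of fork()/join() in action, assuming the module is importable as torch.distributed._pipeline.sync.dependency: the pair adds a pure ordering edge between two otherwise independent autograd lanes without moving any data.

import torch

from torch.distributed._pipeline.sync.dependency import fork, join

a = torch.rand(2, requires_grad=True)
b = torch.rand(2, requires_grad=True)

a_out = a * 2
b_out = b * 3

# Tie the lanes together: during backpropagation the Fork node on a_out's lane
# waits for the gradient of the phony, which only arrives once the backward
# pass has reached the Join node on b_out's lane.
a_out, phony = fork(a_out)
b_out = join(b_out, phony)

(a_out.sum() + b_out.sum()).backward()
assert a.grad is not None and b.grad is not None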

View File

@ -0,0 +1,185 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Manipulation of micro-batches."""
import typing
from typing import Callable, Iterable, Iterator, List, Tuple, Union, cast
import torch
from torch import Tensor
import torch.cuda.comm
__all__: List[str] = []
Tensors = Tuple[Tensor, ...]
TensorOrTensors = Union[Tensor, Tensors]
Function = Callable[[TensorOrTensors], TensorOrTensors]
class Batch:
"""An abstraction of an atomic tensor or a tuple of tensors. This
eliminates every boilerplate code to classify an atomic tensor or a tuple
of tensors.
::
x = generate_tensor_or_tensors()
x = Batch(x)
# in-place update
x[0] = F.apply(x[0])
x[:] = F.apply(*x)
# f(x) if x is a tensor.
# f(*x) if x is a tuple of tensors.
# y is also a batch.
y = x.call(f)
"""
def __init__(self, value: TensorOrTensors) -> None:
self.value = value
self.atomic = torch.is_tensor(value)
@property
def tensor(self) -> Tensor:
"""Retrieves the underlying tensor."""
if not self.atomic:
raise AttributeError("not atomic batch")
return cast(Tensor, self.value)
@property
def tensors(self) -> Tensors:
"""Retrieves the underlying tensors."""
if self.atomic:
raise AttributeError("batch is atomic")
return cast(Tensors, self.value)
@property
def tensor_or_tensors(self) -> TensorOrTensors:
"""Retrieves the underlying tensor or tensors regardless of type."""
return self.value
def call(self, function: Function) -> "Batch":
"""Calls a function by the underlying tensor or tensors. It also wraps
the output with :class:`Batch`.
"""
return Batch(function(self.value))
def __repr__(self) -> str:
return f"Batch[atomic={self.atomic!r}]({self.value!r})"
def __iter__(self) -> Iterator[Tensor]:
if self.atomic:
yield self.tensor
else:
yield from self.tensors
def __len__(self) -> int:
return 1 if self.atomic else len(self.tensors)
def __getitem__(self, index: int) -> Tensor:
if not self.atomic:
return self.tensors[index]
if index != 0:
raise IndexError("atomic batch allows index 0 only")
return self.tensor
# NOTE(sublee): pyflakes can't detect "overload" instead of "typing.overload".
@typing.overload
def __setitem__(self, index: int, value: Tensor) -> None:
...
@typing.overload
def __setitem__(self, index: slice, value: Tensors) -> None:
...
def __setitem__(self, index: Union[int, slice], value: TensorOrTensors) -> None:
if isinstance(index, int):
value = cast(Tensor, value)
self._setitem_by_index(index, value)
else:
value = cast(Tensors, value)
self._setitem_by_slice(index, value)
def _setitem_by_index(self, index: int, value: Tensor) -> None:
if not self.atomic:
i = index
self.value = self.value[:i] + (value,) + self.value[i + 1 :]
return
if index != 0:
raise IndexError("atomic batch allows index 0 only")
self.value = value
def _setitem_by_slice(self, index: slice, value: Tensors) -> None:
if not (index.start is index.stop is index.step is None):
raise NotImplementedError("only slice [:] supported")
if not self.atomic:
self.value = value
return
if len(value) != 1:
raise IndexError("atomic batch cannot be replaced with multiple tensors")
self.value = value[0]
def check(input: TensorOrTensors) -> None:
"""Checks whether the input is a tensor or tensors.
Raises:
TypeError: input is not a tensor or tensors.
"""
if isinstance(input, tuple):
for x in input:
check(x)
return
if not isinstance(input, Tensor):
raise TypeError(f"expected Tensor, but got {input.__class__.__name__}")
def scatter(input: TensorOrTensors, chunks: int) -> List[Batch]:
"""Splits an input mini-batch into multiple micro-batches."""
inputs: Iterable[TensorOrTensors]
if isinstance(input, Tensor):
inputs = input.chunk(chunks)
else:
rotated: List[Tensors] = []
for tensor in input:
tensors = tensor.chunk(chunks)
rotated.append(cast(Tensors, tensors))
inputs = zip(*rotated)
return [Batch(x) for x in inputs]
def gather(outputs: List[Batch]) -> TensorOrTensors:
"""Concatenates output micro-batches into a mini-batch."""
output: TensorOrTensors
if outputs[0].atomic:
tensors = tuple(b.tensor for b in outputs)
output = torch.cat(tensors)
else:
rotated = [b.tensors for b in outputs]
output_buf = []
for tensors in zip(*rotated):
output_buf.append(torch.cat(tensors))
output = tuple(output_buf)
return output
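A short sketch of the scatter()/gather() round trip with Batch, assuming the module is importable as torch.distributed._pipeline.sync.microbatch.

import torch

from torch.distributed._pipeline.sync import microbatch

x = torch.arange(8.0).view(8, 1)           # a mini-batch of 8 samples
batches = microbatch.scatter(x, chunks=4)  # 4 micro-batches of 2 samples each
assert len(batches) == 4 and batches[0].atomic

# Apply a function micro-batch by micro-batch, then merge back into a mini-batch.
doubled = [b.call(lambda t: t * 2) for b in batches]
assert torch.equal(microbatch.gather(doubled), x * 2)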

View File

@ -0,0 +1,49 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Provides phony for arbitrary dependency in a autograd graph."""
from typing import Dict, List, Tuple
import torch
from torch import Tensor
from .stream import default_stream, use_stream
__all__: List[str] = []
_phonies: Dict[Tuple[torch.device, bool], Tensor] = {}
def get_phony(device: torch.device, *, requires_grad: bool) -> Tensor:
"""Gets a phony. Phony is tensor without space. It is useful to make
arbitrary dependency in a autograd graph because it doesn't require any
gradient accumulation.
.. note::
Phonies for each device are cached. If an autograd function gets a phony
internally, the phony must be detached to be returned. Otherwise, the
autograd engine will mutate the cached phony in-place::
class Phonify(torch.autograd.Function):
@staticmethod
def forward(ctx, input):
phony = get_phony(input.device, requires_grad=False)
return phony.detach() # detach() is necessary.
"""
key = (device, requires_grad)
try:
phony = _phonies[key]
except KeyError:
with use_stream(default_stream(device)):
phony = torch.empty(0, device=device, requires_grad=requires_grad)
_phonies[key] = phony
return phony
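A tiny illustration of the caching behaviour described above, assuming the module is importable as torch.distributed._pipeline.sync.phony.

import torch

from torch.distributed._pipeline.sync.phony import get_phony

p1 = get_phony(torch.device("cpu"), requires_grad=False)
p2 = get_phony(torch.device("cpu"), requires_grad=False)

assert p1 is p2         # cached per (device, requires_grad) key
assert p1.numel() == 0  # a phony carries no data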

View File

@ -0,0 +1,394 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""The Pipe interface."""
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union, cast
import torch
from torch import Tensor, nn
import torch.autograd
import torch.cuda
from . import microbatch
from .batchnorm import DeferredBatchNorm
from .pipeline import Pipeline
from .skip.layout import inspect_skip_layout
from .skip.skippable import verify_skippables
from .stream import AbstractStream, new_stream
__all__ = ["Pipe"]
Device = Union[torch.device, int, str]
Devices = Union[Iterable[Device], List[Device]]
Tensors = Tuple[Tensor, ...]
TensorOrTensors = Union[Tensor, Tensors]
if TYPE_CHECKING:
Module = nn.Module[TensorOrTensors]
NamedModules = OrderedDict[str, Module]
else:
Module = nn.Module
NamedModules = OrderedDict
def recommend_auto_balance(message: str) -> str:
"""Expands a message with recommendation to :mod:`torchpipe.balance`."""
return f"""{message}
If your model is still under development, its optimal balance would change
frequently. In this case, we highly recommend 'torch.distributed._pipeline.sync.balance' for
naive automatic balancing:
from torch.distributed._pipeline.sync import Pipe
from torch.distributed._pipeline.sync.balance import balance_by_time
partitions = torch.cuda.device_count()
sample = torch.empty(...)
balance = balance_by_time(partitions, model, sample)
model = Pipe(model, balance, ...)
"""
def verify_module(module: nn.Sequential) -> None:
if not isinstance(module, nn.Sequential):
raise TypeError("module must be nn.Sequential to be partitioned")
named_children = list(module.named_children())
if len(named_children) != len(module):
raise ValueError("module with duplicate children is not supported")
def verify_splitting(
module: nn.Sequential, partitions: List[nn.Sequential], balance: Iterable[int], devices: List[torch.device]
) -> None:
num_parameters = len(list(module.parameters()))
num_child_parameters = sum(len(list(child.parameters())) for child in module.children())
if num_parameters == num_child_parameters:
return
for i in range(len(partitions)):
for j in range(i + 1, len(partitions)):
parti = partitions[i]
partj = partitions[j]
if devices[i] == devices[j]:
continue
for p in parti.parameters():
for q in partj.parameters():
if p is q:
raise ValueError("module with duplicate parameters on distinct devices is not supported")
class BalanceError(ValueError):
pass
def split_module(
module: nn.Sequential, balance: Iterable[int], devices: List[torch.device],
) -> Tuple[List[nn.Sequential], List[int], List[torch.device]]:
"""Splits a module into multiple partitions.
Returns:
A tuple of (partitions, balance, devices).
Partitions are represented as a :class:`~torch.nn.ModuleList`, each
item of which is a partition. All layers in a partition are placed on
the same device.
Raises:
BalanceError:
wrong balance
IndexError:
the number of devices is fewer than the number of partitions.
"""
balance = list(balance)
if len(module) != sum(balance):
raise BalanceError(
"module and sum of balance have different length "
f"(module: {len(module)}, sum of balance: {sum(balance)})"
)
if any(x <= 0 for x in balance):
raise BalanceError(f"all balance numbers must be positive integer (balance: {balance})")
if len(balance) > len(devices):
raise IndexError(
"too few devices to hold given partitions " f"(devices: {len(devices)}, partitions: {len(balance)})"
)
j = 0
partitions = []
layers: NamedModules = OrderedDict()
for name, layer in module.named_children():
layers[name] = layer
if len(layers) == balance[j]:
# Group buffered layers as a partition.
partition = nn.Sequential(layers)
device = devices[j]
partition.to(device)
partitions.append(partition)
# Prepare for the next partition.
layers.clear()
j += 1
partitions = cast(List[nn.Sequential], nn.ModuleList(partitions))
del devices[j:]
return partitions, balance, devices
MOVING_DENIED = TypeError("denied to move parameters and buffers, " "because Pipe should manage device placement")
class Pipe(Module):
"""Wraps an arbitrary :class:`nn.Sequential <torch.nn.Sequential>` module
to train on Pipe_. Pipe is most beneficial when the module is too large to
fit into the memory of a single device.
::
model = nn.Sequential(a, b, c, d)
model = Pipe(model, balance=[1, 1, 1, 1], chunks=8)
output = model(input)
.. _Pipe: https://arxiv.org/abs/1811.06965
Pipe combines pipeline parallelism with checkpointing to reduce peak
memory required to train while minimizing device under-utilization.
You should determine the balance when defining a :class:`Pipe` module, as
balancing will not be done automatically. The module will be partitioned
across multiple devices according to the given balance. You may rely on
heuristics to find your own optimal configuration.
Args:
module (torch.nn.Sequential):
sequential module to be parallelized
balance (ints):
list of number of layers in each partition
Keyword Args:
devices (iterable of devices):
devices to use (default: all CUDA devices)
chunks (int):
number of micro-batches (default: ``1``)
checkpoint (str):
when to enable checkpointing, one of ``'always'``,
``'except_last'``, or ``'never'`` (default: ``'except_last'``)
deferred_batch_norm (bool):
whether to use deferred BatchNorm moving statistics (default:
:data:`False`, see :ref:`Deferred Batch Normalization` for more
details)
Raises:
TypeError:
the module is not a :class:`nn.Sequential <torch.nn.Sequential>`.
ValueError:
invalid arguments, or wrong balance
IndexError:
the number of devices is fewer than the number of partitions.
"""
#: The number of layers in each partition.
balance: List[int] = []
# ^^
# The default value [] required for Sphinx's autoattribute.
#: The devices mapped to each partition.
#:
#: ``devices[-1]`` refers to the device of the last partition, which means
#: it is the output device. You will probably want to transfer the target
#: to this device when computing the loss, to avoid a device-mismatch
#: :exc:`RuntimeError`. For example::
#:
#: out_device = pipe.devices[-1]
#:
#: for input, target in loader:
#: target = target.to(out_device, non_blocking=True)
#: output = pipe(input)
#: loss = F.cross_entropy(output, target)
#:
devices: List[torch.device] = []
#: The number of micro-batches.
chunks: int = 1
#: The checkpoint mode to determine when to enable checkpointing. It is one
#: of ``'always'``, ``'except_last'``, or ``'never'``.
checkpoint: str = "except_last"
def __init__(
self,
module: nn.Sequential,
balance: Optional[Iterable[int]] = None,
*,
devices: Optional[Devices] = None,
chunks: int = chunks,
checkpoint: str = checkpoint,
deferred_batch_norm: bool = False,
) -> None:
super().__init__()
chunks = int(chunks)
checkpoint = str(checkpoint)
if balance is None:
raise ValueError(recommend_auto_balance("balance is required"))
if chunks <= 0:
raise ValueError("number of chunks must be positive integer")
if checkpoint not in ["always", "except_last", "never"]:
raise ValueError("checkpoint is not one of 'always', 'except_last', or 'never'")
verify_module(module)
# Verify if the underlying skippable modules satisfy integrity. The
# integrity can be verified before forward() because it is static.
verify_skippables(module)
self.chunks = chunks
self.checkpoint = checkpoint
if deferred_batch_norm:
module = DeferredBatchNorm.convert_deferred_batch_norm(module, chunks)
if devices is None:
devices = range(torch.cuda.device_count())
devices = [torch.device(d) for d in devices]
devices = cast(List[torch.device], devices)
try:
self.partitions, self.balance, self.devices = split_module(module, balance, devices)
except BalanceError as exc:
raise ValueError(recommend_auto_balance(str(exc)))
verify_splitting(module, self.partitions, self.balance, self.devices)
self._copy_streams: List[List[AbstractStream]] = []
self._skip_layout = inspect_skip_layout(self.partitions)
# Separate CUDA streams for copy.
copy_streams = self._ensure_copy_streams()
# The micro-batch index where the checkpointing stops.
checkpoint_stop = {"always": self.chunks, "except_last": self.chunks - 1, "never": 0}[self.checkpoint]
self.pipeline = Pipeline(self.partitions, self.devices, copy_streams, self._skip_layout, checkpoint_stop)
def __len__(self) -> int:
"""Counts the length of the underlying sequential module."""
return sum(len(p) for p in self.partitions)
def __getitem__(self, index: int) -> nn.Module:
"""Gets a layer in the underlying sequential module."""
partitions = self.partitions
if index < 0:
partitions = partitions[::-1]
for partition in partitions:
try:
return partition[index]
except IndexError:
pass
shift = len(partition)
if index < 0:
index += shift
else:
index -= shift
raise IndexError
def __iter__(self) -> Iterable[nn.Module]:
"""Iterates over children of the underlying sequential module."""
for partition in self.partitions:
yield from partition
# Pipe should manage the device of each partition.
# Deny cuda(), cpu(), and to() with device, by TypeError.
def cuda(self, device: Optional[Device] = None) -> "Pipe":
raise MOVING_DENIED
def cpu(self) -> "Pipe":
raise MOVING_DENIED
def to(self, *args: Any, **kwargs: Any) -> "Pipe":
# Deny these usages:
#
# - to(device[, dtype, non_blocking])
# - to(tensor[, non_blocking])
#
# But allow this:
#
# - to(dtype[, non_blocking])
#
if "device" in kwargs or "tensor" in kwargs:
raise MOVING_DENIED
if args:
if isinstance(args[0], (torch.device, int, str)):
raise MOVING_DENIED
if torch.is_tensor(args[0]):
raise MOVING_DENIED
return super().to(*args, **kwargs)
def _ensure_copy_streams(self) -> List[List[AbstractStream]]:
"""Ensures that :class:`Pipe` caches CUDA streams for copy.
Caching CUDA streams is worthwhile even though PyTorch already manages a
pool of pre-allocated CUDA streams, because it may reduce GPU memory
fragmentation when the number of micro-batches is small.
"""
if not self._copy_streams:
for device in self.devices:
self._copy_streams.append([new_stream(device) for _ in range(self.chunks)])
return self._copy_streams
def forward(self, input: TensorOrTensors) -> TensorOrTensors: # type: ignore
""":class:`Pipe` is a fairly transparent module wrapper. It doesn't
modify the input and output signature of the underlying module. But there
is a type restriction: the input and output have to be a
:class:`~torch.Tensor` or a tuple of tensors. This restriction is
applied at partition boundaries too.
Args:
input (torch.Tensor or tensors): input mini-batch
Returns:
tensor or tensors: output mini-batch
Raises:
TypeError: input is not a tensor or tensors.
"""
microbatch.check(input)
if not self.devices:
# Empty sequential module is not illegal.
return input
# Divide a mini-batch into micro-batches.
batches = microbatch.scatter(input, self.chunks)
# Run pipeline parallelism.
self.pipeline.run(batches)
# Merge the micro-batches into one mini-batch.
output = microbatch.gather(batches)
return output
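A hedged end-to-end sketch of the documented call pattern. The model, balance, chunks, and device list are illustrative; a machine with two CUDA devices is the intended target, and the two-"cpu" fallback is only an assumption for experimentation.

import torch
from torch import nn
import torch.nn.functional as F

from torch.distributed._pipeline.sync import Pipe

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 4))

# Three layers split 2 + 1 across two devices, 4 micro-batches per mini-batch.
devices = [0, 1] if torch.cuda.device_count() >= 2 else ["cpu", "cpu"]
pipe = Pipe(model, balance=[2, 1], devices=devices, chunks=4)

input = torch.rand(8, 16, device=pipe.devices[0])
target = torch.zeros(8, 4, device=pipe.devices[-1])  # output device, see 'devices' above

output = pipe(input)
loss = F.mse_loss(output, target)
loss.backward()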

View File

@ -0,0 +1,257 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""The pipeline parallelism of Pipe."""
from queue import Queue
from types import TracebackType
from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Type, Union, cast
import torch
from torch import Tensor, nn
from torch.autograd.profiler import record_function
from .checkpoint import Checkpointing
from .copy import Copy, Wait
from .dependency import fork, join
from .microbatch import Batch
from .skip.layout import SkipLayout
from .skip.tracker import SkipTrackerThroughPotals, use_skip_tracker
from .stream import AbstractStream, current_stream, use_device
from .worker import Task, create_workers, join_workers
__all__: List[str] = []
Tensors = Tuple[Tensor, ...]
TensorOrTensors = Union[Tensor, Tensors]
ExcInfo = Tuple[Type[BaseException], BaseException, TracebackType]
# Queue is generic only in stubs.
# https://mypy.readthedocs.io/en/latest/common_issues.html#using-classes-that-are-generic-in-stubs-but-not-at-runtime
if TYPE_CHECKING:
InQueue = Queue[Optional["Task"]]
OutQueue = Queue[Tuple[bool, Union[Tuple["Task", Batch], ExcInfo, None]]]
else:
InQueue = Queue
OutQueue = Queue
def depend(fork_from: Batch, join_to: Batch) -> None:
fork_from[0], phony = fork(fork_from[0])
join_to[0] = join(join_to[0], phony)
def copy(batch: Batch, prev_stream: AbstractStream, next_stream: AbstractStream) -> None:
batch[:] = Copy.apply(prev_stream, next_stream, *batch)
# Gradients are only supported for float Tensors.
batch[:] = tuple([x if x.is_floating_point() else x.detach() for x in batch])
def wait(batch: Batch, prev_stream: AbstractStream, next_stream: AbstractStream) -> None:
batch[:] = Wait.apply(prev_stream, next_stream, *batch)
# Gradients are only supported for float Tensors.
batch[:] = tuple([x if x.is_floating_point() else x.detach() for x in batch])
def clock_cycles(m: int, n: int) -> Iterable[List[Tuple[int, int]]]:
"""Generates schedules for each clock cycle."""
# m: number of micro-batches
# n: number of partitions
# i: index of micro-batch
# j: index of partition
# k: clock number
#
# k (i,j) (i,j) (i,j)
# - ----- ----- -----
# 0 (0,0)
# 1 (1,0) (0,1)
# 2 (2,0) (1,1) (0,2)
# 3 (2,1) (1,2)
# 4 (2,2)
for k in range(m + n - 1):
yield [(k - j, j) for j in range(max(1 + k - m, 0), min(1 + k, n))]
class Pipeline:
"""The pipeline parallelism for Pipe."""
def __init__(
self,
partitions: List[nn.Sequential],
devices: List[torch.device],
copy_streams: List[List[AbstractStream]],
skip_layout: SkipLayout,
checkpoint_stop: int,
) -> None:
self.partitions = partitions
self.devices = devices
self.copy_streams = copy_streams
self.skip_layout = skip_layout
self.checkpoint_stop = checkpoint_stop
(self.in_queues, self.out_queues) = create_workers(devices)
def __del__(self) -> None:
join_workers(self.in_queues, self.out_queues)
def run(self, batches: List[Batch]) -> None:
"""Runs pipeline parallelism.
It modifies the given batches in place.
"""
partitions = self.partitions
devices = self.devices
skip_layout = self.skip_layout
m = len(batches)
n = len(partitions)
skip_trackers = [SkipTrackerThroughPotals(skip_layout) for _ in batches]
for schedule in clock_cycles(m, n):
self.fence(batches, schedule, skip_trackers)
self.compute(batches, schedule, skip_trackers)
def fence(
self, batches: List[Batch], schedule: List[Tuple[int, int]], skip_trackers: List[SkipTrackerThroughPotals],
) -> None:
"""Copies micro-batches after computation for the previous
micro-batches.
"""
copy_streams = self.copy_streams
skip_layout = self.skip_layout
for i, j in schedule:
# Ensure that batches[i-1] is executed after batches[i] in
# backpropagation by an explicit dependency.
if i != 0 and j != 0:
depend(batches[i - 1], batches[i])
next_stream = copy_streams[j][i]
for prev_j, ns, name in skip_layout.copy_policy(j):
prev_stream = copy_streams[prev_j][i]
skip_trackers[i].copy(batches[i], prev_stream, next_stream, ns, name)
if j != 0:
prev_stream = copy_streams[j - 1][i]
copy(batches[i], prev_stream, next_stream)
def compute(
self, batches: List[Batch], schedule: List[Tuple[int, int]], skip_trackers: List[SkipTrackerThroughPotals],
) -> None:
"""Runs tasks with synchronization to copy streams."""
partitions = self.partitions
devices = self.devices
copy_streams = self.copy_streams
checkpoint_stop = self.checkpoint_stop
# Disable checkpointing if in eval mode.
if not self.partitions[0].training:
checkpoint_stop = 0
n = len(partitions)
streams = [current_stream(d) for d in devices]
exc_info: Optional[ExcInfo] = None
# With checkpointing, the autograd graph looks like this diagram:
# ┌─────┸──────┐
# │ Copy │
# └─────┰──────┘ (fence)
# ─ ─ ─ ╂ ─ ─ ─ ─ ─ ─ ─ ─ ─
# ┃ (compute)
# ┌─────┸──────┐
# │ Wait │ [1] Synchronize the current stream with the copy stream.
# └─────┰──────┘
# ┌─────┸──────┐
# │ Checkpoint │ [2] Compute a partition within checkpointing.
# └─────┰──────┘
# ┌─────┸──────┐
# │ Wait │ [3] Synchronize the copy stream with the current stream.
# └─────┰──────┘
# ┠ ─ ─ ─ ┐
# ┃ ┌─────┴─────┐
# ┃ │ Recompute │ [4] Schedule the recomputation at backpropagation.
# ┃ └─────┬─────┘
# ┠ ─ ─ ─ ┘
# ┃
# ─ ─ ─ ╂ ─ ─ ─ ─ ─ ─ ─ ─ ─
# ┌─────┸──────┐ (fence)
# │ Copy │
# └─────┰──────┘
for i, j in schedule:
batch = batches[i]
partition = partitions[j]
# Synchronize with the copied input. ([1] in the diagram)
if j != 0:
wait(batch, copy_streams[j][i], streams[j])
# Determine whether to checkpoint this micro-batch or not.
checkpoint = i < checkpoint_stop
if checkpoint:
def function(
input: TensorOrTensors,
partition: nn.Sequential = partition,
skip_tracker: SkipTrackerThroughPotals = skip_trackers[i],
chunk_id: int = i,
part_id: int = j,
) -> TensorOrTensors:
with use_skip_tracker(skip_tracker), record_function("chunk%d-part%d" % (chunk_id, part_id)):
return partition(input)
chk = Checkpointing(function, batch)
task = Task(streams[j], compute=chk.checkpoint, finalize=chk.recompute)
del function, chk
else:
def compute(
batch: Batch = batch,
partition: nn.Sequential = partition,
skip_tracker: SkipTrackerThroughPotals = skip_trackers[i],
chunk_id: int = i,
part_id: int = j,
) -> Batch:
with use_skip_tracker(skip_tracker), record_function("chunk%d-part%d" % (chunk_id, part_id)):
return batch.call(partition)
task = Task(streams[j], compute=compute, finalize=None)
del compute
# Compute tasks in parallel. ([2] in the diagram)
self.in_queues[j].put(task)
for i, j in schedule:
ok, payload = self.out_queues[j].get()
# Hold the first exception.
if exc_info is not None:
continue
elif not ok:
exc_info = cast(ExcInfo, payload)
continue
task, batch = cast(Tuple[Task, Batch], payload)
# The copy stream synchronizes to copy the output. ([3] in the
# diagram)
if j != n - 1:
wait(batch, streams[j], copy_streams[j][i])
# Finalize tasks. If checkpointing is enabled, here the
# recomputation is scheduled at backpropagation. ([4] in the
# diagram)
with use_device(devices[j]):
task.finalize(batch)
batches[i] = batch
# Fail at the first exception.
if exc_info is not None:
raise exc_info[0].with_traceback(exc_info[1], exc_info[2])
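A quick check of the clock_cycles() schedule defined earlier in this file, assuming it is importable as torch.distributed._pipeline.sync.pipeline: for 3 micro-batches and 3 partitions, the generated clock cycles reproduce the (i, j) table in the comment above.

from torch.distributed._pipeline.sync.pipeline import clock_cycles

assert list(clock_cycles(3, 3)) == [
    [(0, 0)],
    [(1, 0), (0, 1)],
    [(2, 0), (1, 1), (0, 2)],
    [(2, 1), (1, 2)],
    [(2, 2)],
]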

View File

@ -0,0 +1,6 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

View File

@ -0,0 +1,11 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Supports efficiency with skip connections."""
from .namespace import Namespace
from .skippable import pop, skippable, stash, verify_skippables
__all__ = ["skippable", "stash", "pop", "verify_skippables", "Namespace"]

View File

@ -0,0 +1,86 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Static skip connection layout of ``@skippable`` modules."""
from typing import Dict, Iterable, List, Tuple
from torch import nn
from .namespace import Namespace
__all__: List[str] = []
class SkipLayout:
"""Represents a skip connection layout across partitions."""
# Skip routes indexed by 'ns, name': {(ns, name): (prev_j, next_j), ...}
by_ns_name: Dict[Tuple[Namespace, str], Tuple[int, int]]
# Skip routes indexed by partition number 'j': [[next_j]: [(prev_j, ns, name), ...], ...]
by_partition: List[List[Tuple[int, Namespace, str]]]
def __init__(self, num_partitions: int, skip_routes: Dict[Tuple[Namespace, str], Tuple[int, int]],) -> None:
# The skip routes are already indexed by 'ns, name'.
self.by_ns_name = skip_routes
# Index skip routes by partition number 'j'.
self.by_partition = [[] for _ in range(num_partitions)]
for (ns, name), (prev_j, next_j) in skip_routes.items():
self.by_partition[next_j].append((prev_j, ns, name))
for p in self.by_partition:
p.sort()
def copy_policy(self, next_j: int) -> Iterable[Tuple[int, Namespace, str]]:
"""Generates skip routes for the given destination partition number.
The skip routes are sorted by source partition number in ascending
order.
Yields:
Each tuple of (source partition number, namespace, name).
"""
for prev_j, ns, name in self.by_partition[next_j]:
if prev_j == next_j:
# This skip tensor will be popped at the same partition where
# it is stashed. In this case, copy is not required.
continue
yield (prev_j, ns, name)
def requires_copy(self, ns: Namespace, name: str) -> bool:
"""Whether the given namespace and name requires partition-to-partition
copy or not.
"""
prev_j, next_j = self.by_ns_name.get((ns, name), (-1, -1))
return prev_j != next_j
def inspect_skip_layout(partitions: List[nn.Sequential]) -> SkipLayout:
"""Inspects the skip connection layout in the given partitions."""
# NOTE(sublee): Hide circular import inside this subroutine. Circular
# import is not ideal but placing this logic near to SkipLayout may
# increase cohesion of code.
from .skippable import Skippable
skip_routes: Dict[Tuple[Namespace, str], Tuple[int, int]] = {}
stashed_at: Dict[Tuple[Namespace, str], int] = {}
for j, partition in enumerate(partitions):
for layer in partition:
if not isinstance(layer, Skippable):
continue
for ns, name in layer.stashable():
stashed_at[(ns, name)] = j
for ns, name in layer.poppable():
prev_j = stashed_at.pop((ns, name))
skip_routes[(ns, name)] = (prev_j, j)
return SkipLayout(len(partitions), skip_routes)
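A hedged sketch of the SkipLayout bookkeeping, assuming the module is importable as torch.distributed._pipeline.sync.skip.layout. None stands in for the default namespace (see namespace.py below).

from torch.distributed._pipeline.sync.skip.layout import SkipLayout

# One skip tensor "1to3" stashed in partition 0 and popped in partition 2.
layout = SkipLayout(num_partitions=3, skip_routes={(None, "1to3"): (0, 2)})

assert layout.requires_copy(None, "1to3")
assert list(layout.copy_policy(2)) == [(0, None, "1to3")]  # copy from 0 into 2
assert list(layout.copy_policy(1)) == []                   # nothing arrives at 1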

View File

@ -0,0 +1,50 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Provides isolated namespace of skip tensors."""
import abc
from functools import total_ordering
from typing import Any
import uuid
__all__ = ["Namespace"]
@total_ordering
class Namespace(metaclass=abc.ABCMeta):
"""Namespace for isolating skip tensors used by :meth:`isolate()
<torchpipe.skip.skippable.Skippable.isolate>`.
"""
__slots__ = ("id",)
def __init__(self) -> None:
self.id = uuid.uuid4()
def __repr__(self) -> str:
return f"<Namespace '{self.id}'>"
def __hash__(self) -> int:
return hash(self.id)
# Namespaces should support ordering, since SkipLayout will sort tuples
# including a namespace. But the actual order between namespaces is not
# important. That's why they are ordered by version 4 UUIDs, which are
# randomly generated.
def __lt__(self, other: Any) -> bool:
if isinstance(other, Namespace):
return self.id < other.id
return False
def __eq__(self, other: Any) -> bool:
if isinstance(other, Namespace):
return self.id == other.id
return False
# 'None' is the default namespace,
# which means that 'isinstance(None, Namespace)' is 'True'.
Namespace.register(type(None))
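A tiny illustration of the Namespace semantics above, assuming the module is importable as torch.distributed._pipeline.sync.skip.namespace.

from torch.distributed._pipeline.sync.skip.namespace import Namespace

ns1, ns2 = Namespace(), Namespace()

assert ns1 != ns2                   # every namespace is unique
assert (ns1 < ns2) or (ns2 < ns1)   # still totally ordered, by random UUID
assert isinstance(None, Namespace)  # 'None' acts as the default namespace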

View File

@ -0,0 +1,231 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Portal keeps a tensor in the pocket plane. The tensor becomes hidden to the
autograd engine. The shared context of three functions (:class:`PortalBlue`,
:class:`PortalOrange`, and :class:`PortalCopy`) out of the computation graph is
one of the most important feature of :mod:`torchpipe.skip`.
The metaphor is inspired by Portal™ from Valve.
"""
from typing import List, Optional, Tuple
import torch
from torch import Tensor
from ..copy import Context as CopyContext
from ..copy import Copy
from ..phony import get_phony
from ..stream import AbstractStream, get_device
__all__: List[str] = []
class Portal:
"""A portal for a tensor."""
def __init__(self, tensor: Optional[Tensor], tensor_life: int) -> None:
self.put_tensor(tensor, tensor_life)
self.grad: Optional[Tensor] = None
def blue(self) -> Tensor:
"""Creates a :class:`PortalBlue` which hides the underlying tensor from
the autograd engine.
Join the returned phony to the main lane of the autograd graph to
ensure correct backpropagation::
PortalBlue --+
|
---------- Join --
"""
tensor = self.use_tensor()
if tensor is None:
return get_phony(torch.device("cpu"), requires_grad=False)
return PortalBlue.apply(self, tensor)
def orange(self, phony: Tensor) -> Optional[Tensor]:
"""Creates a :class:`PortalOrange` which retrieves the hidden tensor
without losing the ability to backpropagate.
Give a phony forked from the main lane of an autograd graph::
+-- PortalOrange --+
| |
-- Fork --------- f(a, b) --
"""
self.check_tensor_life()
if self.tensor is None:
return self.use_tensor()
return PortalOrange.apply(self, phony)
def copy(self, prev_stream: AbstractStream, next_stream: AbstractStream, phony: Tensor,) -> Tensor:
"""Copies the hidden tensor by a :class:`PortalCopy`.
Pass a phony in and use the returned phony to keep the backpropagation path connected::
+-- PortalCopy --+
| |
-- Fork ---------- Join --
"""
if self.tensor is None:
return get_phony(torch.device("cpu"), requires_grad=False)
return PortalCopy.apply(self, prev_stream, next_stream, phony)
def check_tensor_life(self) -> None:
if self.tensor_life <= 0:
raise RuntimeError("tensor in portal has been removed")
def put_tensor(self, tensor: Optional[Tensor], tensor_life: int) -> None:
"""Stores a tensor into this portal."""
# [Life of Tensor through Portal]
#
# The tensor can be retrieved by use_tensor() up to 'tensor_life'
# times. When the life becomes 0, the tensor will be deleted for
# deallocation in CUDA memory.
#
# The below events participate in a tensor through a portal.
# Note that [x] denotes the events which call use_tensor():
#
# 1. [x] blue()
# 2. [ ] PortalBlue.forward
# 3. [ ] copy()
# 4. [ ] PortalCopy.forward
# 5. [ ] orange()
# 6. [x] PortalOrange.forward
# - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 7. [ ] orange() (recomputed)
# 8. [x] PortalOrange.forward (recomputed)
# 9. [ ] PortalOrange.backward
# 10. [ ] PortalCopy.backward
# 11. [x] blue() (recomputed)
# 12. [ ] PortalBlue.forward (recomputed)
# 13. [ ] PortalBlue.backward
#
self.tensor_life = tensor_life
if tensor_life > 0:
self.tensor = tensor
else:
self.tensor = None
def use_tensor(self) -> Optional[Tensor]:
"""Retrieves the underlying tensor and decreases the tensor life. When
the life becomes 0, the tensor will be removed.
"""
self.check_tensor_life()
tensor = self.tensor
self.tensor_life -= 1
if self.tensor_life <= 0:
self.tensor = None
return tensor
def put_grad(self, grad: Tensor) -> None:
"""Stores a gradient into this portal."""
self.grad = grad
def use_grad(self) -> Tensor:
"""Retrieves and removes the underlying gradient. The gradient is
always ephemeral.
"""
if self.grad is None:
raise RuntimeError("grad in portal has been removed or never set")
grad = self.grad
self.grad = None
return grad
# Common interface between :class:`PortalBlue`, :class:`PortalOrange`, and
# :class:`PortalCopy`.
class Context(CopyContext):
portal: Portal
class PortalBlue(torch.autograd.Function):
"""Hides a tensor from the autograd engine by a :class:`Portal`."""
@staticmethod
# type: ignore
def forward(
ctx: Context,
portal: Portal,
# This tensor must be retrieved by portal.use_tensor().
tensor: Tensor,
) -> Tensor:
ctx.portal = portal
phony = get_phony(tensor.device, requires_grad=False)
return phony.detach()
@staticmethod
# type: ignore
def backward(ctx: Context, grad_phony: Tensor,) -> Tuple[None, Tensor]:
# The paired PortalOrange should keep the gradient.
grad = ctx.portal.use_grad()
return None, grad
class PortalOrange(torch.autograd.Function):
"""Retrieves the hidden tensor from a :class:`Portal`."""
@staticmethod
# type: ignore
def forward(ctx: Context, portal: Portal, phony: Tensor) -> Tensor:
ctx.portal = portal
tensor = portal.use_tensor()
assert tensor is not None
return tensor.detach()
@staticmethod
def backward(ctx: Context, grad: Tensor) -> Tuple[None, None]: # type: ignore
# The paired PortalBlue will use the gradient.
ctx.portal.put_grad(grad)
return None, None
class PortalCopy(torch.autograd.Function):
"""Copies the hidden tensor in a :class:`Portal`. It replaces the hidden
tensor with the copied one.
"""
@staticmethod
# type: ignore
def forward(
ctx: Context, portal: Portal, prev_stream: AbstractStream, next_stream: AbstractStream, phony: Tensor,
) -> Tensor:
ctx.portal = portal
assert portal.tensor is not None
(portal.tensor,) = Copy.forward(ctx, prev_stream, next_stream, portal.tensor)
phony = get_phony(get_device(next_stream), requires_grad=False)
return phony.detach()
@staticmethod
# type: ignore
def backward(ctx: Context, grad_phony: Tensor,) -> Tuple[None, None, None, None]:
portal = ctx.portal
assert portal.grad is not None
_, _, portal.grad = Copy.backward(ctx, portal.grad)
return None, None, None, None
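A hedged sketch of the tensor-life countdown documented in put_tensor() above, assuming the module is importable as torch.distributed._pipeline.sync.skip.portal.

import torch

from torch.distributed._pipeline.sync.skip.portal import Portal

portal = Portal(torch.ones(2), tensor_life=2)

assert portal.use_tensor() is not None  # life 2 -> 1
assert portal.use_tensor() is not None  # life 1 -> 0, the tensor is dropped

try:
    portal.use_tensor()                 # life exhausted
except RuntimeError:
    pass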

View File

@ -0,0 +1,439 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""The user interface to define skip connections."""
from typing import (
TYPE_CHECKING,
Any,
Callable,
ClassVar,
Dict,
FrozenSet,
Generator,
Iterable,
List,
Optional,
Set,
Tuple,
Type,
TypeVar,
Union,
cast,
)
from torch import Tensor, nn
from ..microbatch import Batch
from .namespace import Namespace
from .tracker import current_skip_tracker
__all__ = ["skippable", "stash", "pop", "verify_skippables"]
Tensors = Tuple[Tensor, ...]
TensorOrTensors = Union[Tensor, Tensors]
StashPop = Union["stash", "pop"]
StashPopGenerator = Generator[StashPop, Optional[Tensor], TensorOrTensors]
if TYPE_CHECKING:
SkippableModule = nn.Module[Union[StashPopGenerator, TensorOrTensors]]
else:
SkippableModule = nn.Module
T = TypeVar("T", bound="Skippable")
class Skippable(nn.Module):
"""The base class for skippable modules.
Do not use this class directly. Define a subclass by :func:`skippable`
instead.
"""
module_cls: ClassVar[Type[SkippableModule]]
stashable_names: ClassVar[FrozenSet[str]]
poppable_names: ClassVar[FrozenSet[str]]
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__()
self.module = self.module_cls(*args, **kwargs) # type: ignore
self.namespaces: Dict[str, Namespace] = {}
def __repr__(self) -> str:
return f"@skippable({self.module})"
def namespaced(self, name: str) -> Tuple[Namespace, str]:
"""Prepends namespace for the given skip name."""
ns = self.namespaces.get(name)
ns = cast(Namespace, ns)
return (ns, name)
def stashable(self) -> Iterable[Tuple[Namespace, str]]:
"""Iterates over namespaced skip names to be stashed."""
for name in self.stashable_names:
yield self.namespaced(name)
def poppable(self) -> Iterable[Tuple[Namespace, str]]:
"""Iterates over namespaced skip names to be popped."""
for name in self.poppable_names:
yield self.namespaced(name)
def isolate(self: T, ns: Namespace, *, only: Optional[Iterable[str]] = None) -> T:
r"""Isolates a specified subset or the whole set of skip tensors into a
namespace. In a single sequential module, skip tensors with the same
name are not allowed unless they are isolated by different namespaces.
Here's an example using the same name for skip tensors twice. Each pair
of ``Layer1`` and ``Layer3`` is isolated with its own namespace ``ns1``
and ``ns2``. There is no conflict anymore::
ns1 = Namespace()
ns2 = Namespace()
model = nn.Sequential(
Layer1().isolate(ns1),
Layer1().isolate(ns2),
Layer2(),
Layer3().isolate(ns2),
Layer3().isolate(ns1),
)
When the `only` parameter is omitted, all skip tensors are isolated. You
can isolate a subset of skip tensors by passing the `only` parameter::
ns_alice = Namespace()
ns_bob = Namespace()
model = nn.Sequential(
...
StashStashPop().isolate(ns_alice, only=['alice']) \
.isolate(ns_bob, only=['bob']),
...
)
Args:
ns (Namespace):
namespace for isolation
Keyword Args:
only (iterable of strs):
names of specific skip tensors to be isolated (omit this option
to isolate all skip tensors declared in this module)
Returns:
this module itself
"""
names: Iterable[str]
if only is None:
names = self.stashable_names | self.poppable_names
else:
names = set(only)
for name in names:
self.namespaces[name] = ns
return self
def dispatch(
self,
input: TensorOrTensors,
handle_stash: Callable[[str, Optional[Tensor]], None],
handle_pop: Callable[[str], Optional[Tensor]],
) -> TensorOrTensors:
"""Dispatches :class:`stash` or :class:`pop` commands generated by the
module's ``forward()``.
"""
generator = self.module(input)
if not isinstance(generator, Generator):
# The underlying module returned output without any yield.
output = generator
return output
try:
op = next(generator)
while True:
if isinstance(op, stash):
handle_stash(op.name, op.tensor)
op = next(generator)
continue
if isinstance(op, pop):
tensor = handle_pop(op.name)
op = generator.send(tensor)
continue
raise TypeError("%r is not a command from @skippable" % op)
except StopIteration as stop:
output = stop.args[0]
return output
def forward(self, input: TensorOrTensors) -> TensorOrTensors: # type: ignore
"""Performs the forward propagation. :class:`stash` or :class:`pop`
commands will be handled by portals silently. The portals won't be
exposed to users.
Raises:
RuntimeError:
illegal 'stash' or 'pop' is found.
"""
skip_tracker = current_skip_tracker()
stashed_tensors: Dict[str, Optional[Tensor]] = {}
# Load skip tensors that might be popped.
poppable_tensors = {}
batch = Batch(input)
for ns, name in self.poppable():
try:
poppable_tensors[name] = skip_tracker.load(batch, ns, name)
except KeyError:
raise RuntimeError(f"'{name}' has not been stashed")
input = batch.tensor_or_tensors
# Handle skip commands.
def handle_stash(name: str, tensor: Optional[Tensor]) -> None:
if name not in self.stashable_names:
raise RuntimeError(f"'{name}' has not been declared as stashable")
stashed_tensors[name] = tensor
def handle_pop(name: str) -> Optional[Tensor]:
if name not in self.poppable_names:
raise RuntimeError(f"'{name}' has not been declared as poppable")
return poppable_tensors.pop(name)
output = self.dispatch(input, handle_stash, handle_pop)
# All declared skips must be stashed or popped.
not_stashed = self.stashable_names - stashed_tensors.keys()
if not_stashed:
comma_names = ", ".join("'%s'" % n for n in not_stashed)
raise RuntimeError(f"{comma_names} must be stashed but have not")
not_popped = poppable_tensors.keys()
if not_popped:
comma_names = ", ".join("'%s'" % n for n in not_popped)
raise RuntimeError(f"{comma_names} must be popped but have not")
# Save stashed skip tensors.
batch = Batch(output)
for ns, name in self.stashable():
tensor = stashed_tensors[name]
skip_tracker.save(batch, ns, name, tensor)
output = batch.tensor_or_tensors
return output
# TODO(sublee): Move to above of Skippable class for better read flow.
def skippable(
stash: Iterable[str] = (), pop: Iterable[str] = (),
) -> Callable[[Type[SkippableModule]], Type[Skippable]]:
"""The decorator to define a :class:`nn.Module <torch.nn.Module>` with skip
connections. Decorated modules are called "skippable". This functionality
works perfectly fine even when the module is not wrapped by
:class:`~torchpipe.Pipe`.
Each skip tensor is managed by its name. Before manipulating skip tensors,
a skippable module must statically declare the names for skip tensors by
`stash` and/or `pop` parameters. Skip tensors with a pre-declared name can be
stashed by ``yield stash(name, tensor)`` or popped by ``tensor = yield
pop(name)``.
Here is an example with three layers. A skip tensor named "1to3" is stashed
and popped at the first and last layer, respectively::
@skippable(stash=['1to3'])
class Layer1(nn.Module):
def forward(self, input):
yield stash('1to3', input)
return f1(input)
class Layer2(nn.Module):
def forward(self, input):
return f2(input)
@skippable(pop=['1to3'])
class Layer3(nn.Module):
def forward(self, input):
skip_1to3 = yield pop('1to3')
return f3(input) + skip_1to3
model = nn.Sequential(Layer1(), Layer2(), Layer3())
One skippable module can stash or pop multiple skip tensors::
@skippable(stash=['alice', 'bob'], pop=['carol'])
class StashStashPop(nn.Module):
def forward(self, input):
yield stash('alice', f_alice(input))
yield stash('bob', f_bob(input))
carol = yield pop('carol')
return input + carol
Every skip tensor must be associated with exactly one pair of `stash` and
`pop`. :class:`~torchpipe.Pipe` checks this restriction automatically
when wrapping a module. You can also check the restriction by
:func:`~torchpipe.skip.verify_skippables` without
:class:`~torchpipe.Pipe`.
.. note::
:func:`@skippable <skippable>` changes the type of the wrapped class.
But currently (mypy v0.740), mypy could not understand class decorators
yet (`#3135 <https://github.com/python/mypy/issues/3135>`_).
There are two workarounds:
1. Naively ignore type errors by ``# type: ignore``.
2. Use ``skippable()()`` as a function instead of a decorator.
.. seealso:: :ref:`Long Skip Connections`
"""
stashable_names = frozenset(stash)
poppable_names = frozenset(pop)
def extend_skippable(module_cls: Type[SkippableModule]) -> Type[Skippable]:
name = module_cls.__name__
bases = (Skippable,)
attrs = {"module_cls": module_cls, "stashable_names": stashable_names, "poppable_names": poppable_names}
return type(name, bases, attrs)
return extend_skippable
class stash:
"""The command to stash a skip tensor.
::
def forward(self, input):
yield stash('name', input)
return f(input)
Args:
name (str): name of skip tensor
input (torch.Tensor or None): tensor to pass to the skip connection
"""
__slots__ = ("name", "tensor")
def __init__(self, name: str, tensor: Optional[Tensor]) -> None:
self.name = name
self.tensor = tensor
class pop:
"""The command to pop a skip tensor.
::
def forward(self, input):
skip = yield pop('name')
return f(input) + skip
Args:
name (str): name of skip tensor
Returns:
the skip tensor previously stashed by another layer under the same name
"""
__slots__ = ("name",)
def __init__(self, name: str) -> None:
self.name = name
def verify_skippables(module: nn.Sequential) -> None:
"""Verifies if the underlying skippable modules satisfy integrity.
Every skip tensor must have only one pair of `stash` and `pop`. If there
are one or more unmatched pairs, it will raise :exc:`TypeError` with the
detailed messages.
Here are a few failure cases. :func:`verify_skippables` will report failure
for these cases::
# Layer1 stashes "1to3".
# Layer3 pops "1to3".
nn.Sequential(Layer1(), Layer2())
# └──── ?
nn.Sequential(Layer2(), Layer3())
# ? ────┘
nn.Sequential(Layer1(), Layer2(), Layer3(), Layer3())
# └───────────────────┘ ^^^^^^
nn.Sequential(Layer1(), Layer1(), Layer2(), Layer3())
# ^^^^^^ └───────────────────┘
To use the same name for multiple skip tensors, they must be isolated by
different namespaces. See :meth:`isolate()
<torchpipe.skip.skippable.Skippable.isolate>`.
Raises:
TypeError:
one or more pairs of `stash` and `pop` are not matched.
"""
stashed: Set[Tuple[Namespace, str]] = set()
popped: Set[Tuple[Namespace, str]] = set()
msgs: List[str] = []
for layer_name, layer in module.named_children():
if not isinstance(layer, Skippable):
continue
for name in layer.stashable_names & layer.poppable_names:
msg = f"'{layer_name}' declared '{name}' both as stashable and as poppable"
msgs.append(msg)
for ns, name in layer.stashable():
if name in layer.poppable_names:
continue
if (ns, name) in stashed:
msg = f"'{layer_name}' redeclared '{name}' as stashable " "but not isolated by namespace"
msgs.append(msg)
continue
stashed.add((ns, name))
for ns, name in layer.poppable():
if name in layer.stashable_names:
continue
if (ns, name) in popped:
msg = f"'{layer_name}' redeclared '{name}' as poppable " "but not isolated by namespace"
msgs.append(msg)
continue
if (ns, name) not in stashed:
msg = f"'{layer_name}' declared '{name}' as poppable but it was not stashed"
msgs.append(msg)
continue
popped.add((ns, name))
for (_, name) in stashed - popped:
msg = f"no module declared '{name}' as poppable but stashed"
msgs.append(msg)
if msgs:
raise TypeError(
"one or more pairs of stash and pop do not match:\n\n%s" "" % "\n".join("* %s" % x for x in msgs)
)
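A hedged sketch of verify_skippables() at work, reusing the Layer1/Layer3 pattern from the skippable() docstring and assuming the package exposes skippable, stash, pop, and verify_skippables as in skip/__init__.py above.

from torch import nn

from torch.distributed._pipeline.sync.skip import pop, skippable, stash, verify_skippables


@skippable(stash=["1to3"])
class Layer1(nn.Module):
    def forward(self, input):
        yield stash("1to3", input)
        return input


@skippable(pop=["1to3"])
class Layer3(nn.Module):
    def forward(self, input):
        skip = yield pop("1to3")
        return input + skip


verify_skippables(nn.Sequential(Layer1(), Layer3()))      # matched pair: OK

try:
    verify_skippables(nn.Sequential(Layer1(), Layer1()))  # '1to3' is never popped
except TypeError as exc:
    print(exc)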

View File

@ -0,0 +1,177 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Tracks skip tensors on a thread."""
from contextlib import contextmanager
import threading
from typing import Dict, Generator, List, Optional, Tuple
from torch import Tensor
from ..checkpoint import is_checkpointing
from ..dependency import fork, join
from ..microbatch import Batch
from ..stream import AbstractStream
from .layout import SkipLayout
from .namespace import Namespace
from .portal import Portal
__all__: List[str] = []
class SkipTracker:
"""Tracks saved skip tensors.
It will update the given micro-batch in place. This is because when it
manipulates the underlying skip tensors, the current micro-batch also has
to be connected with the skip tensors.
One thread has one skip tracker. Call :func:`current_skip_tracker` to get
the skip tracker on the current thread.
"""
def __init__(self) -> None:
self.tensors: Dict[Tuple[Namespace, str], Optional[Tensor]] = {}
def save(self, batch: Batch, ns: Namespace, name: str, tensor: Optional[Tensor]) -> None:
self.tensors[(ns, name)] = tensor
def load(self, batch: Batch, ns: Namespace, name: str) -> Optional[Tensor]:
return self.tensors.pop((ns, name))
def copy(
self, batch: Batch, prev_stream: AbstractStream, next_stream: AbstractStream, ns: Namespace, name: str,
) -> None:
raise TypeError("copy is not supported for non-portal skip tensors")
class SkipTrackerThroughPotals(SkipTracker):
"""Tracks saved skip tensors through portals. The skip tensors will be
hidden in portals so that the autograd engine does not need to track them.
This tracker is only used when the training or evaluating module is wrapped
with :class:`torchpipe.Pipe`.
"""
def __init__(self, skip_layout: SkipLayout) -> None:
super().__init__()
self.skip_layout = skip_layout
self.portals: Dict[Tuple[Namespace, str], Portal] = {}
def save(self, batch: Batch, ns: Namespace, name: str, tensor: Optional[Tensor]) -> None:
"""Saves the stashed skip tensor in a portal. The portal is then
connected to the given micro-batch with :class:`Join`.
"""
if not self.skip_layout.requires_copy(ns, name):
super().save(batch, ns, name, tensor)
return
# See [Life of Tensor through Portal] at Portal.put_tensor() to understand the
# below tensor_life values. Here are the selected events which retrieve
# the tensor in portal:
#
# 1. [x] blue()
# ...
# 6. [x] PortalOrange.forward
# ...
# 8. [x] PortalOrange.forward (recomputed)
# ...
# 11. [x] blue() (recomputed)
#
if (ns, name) not in self.portals:
if is_checkpointing():
# Under checkpointing, the tensor used by the first
# PortalOrange should be alive in the portal. This tensor will
# be used again by the second PortalOrange during the
# recomputation.
tensor_life = 3 # Delete at [8. PortalOrange.forward (recomputed)]
else:
tensor_life = 2 # Delete at [6. PortalOrange.forward]
portal = Portal(tensor, tensor_life)
self.portals[(ns, name)] = portal
else:
# Under recomputation, the portal already exists.
portal = self.portals[(ns, name)]
# The existing tensor life already became 0. It should be reset to
# 1 to delete the tensor after the second PortalBlue immediately.
tensor_life = 1 # Delete at [11. blue() (recomputed)]
portal.put_tensor(tensor, tensor_life)
phony = portal.blue()
batch[0] = join(batch[0], phony)
def load(self, batch: Batch, ns: Namespace, name: str) -> Optional[Tensor]:
"""Loads a skip tensor from the corresponding portal to pop. The given
micro-batch is connected to the portal with :class:`Fork`.
"""
if not self.skip_layout.requires_copy(ns, name):
tensor = super().load(batch, ns, name)
return tensor
portal = self.portals[(ns, name)]
batch[0], phony = fork(batch[0])
tensor = portal.orange(phony)
return tensor
def copy(
self, batch: Batch, prev_stream: AbstractStream, next_stream: AbstractStream, ns: Namespace, name: str,
) -> None:
"""Copies the skip tensor in the corresponding portal. The given
micro-batch and the portal will be tied with :class:`Fork` and
:class:`Join`.
"""
assert self.skip_layout.requires_copy(ns, name)
batch[0], phony = fork(batch[0])
portal = self.portals[(ns, name)]
phony = portal.copy(prev_stream, next_stream, phony)
batch[0] = join(batch[0], phony)
class ThreadLocal(threading.local):
def __init__(self) -> None:
self.skip_tracker: Optional[SkipTracker] = None
thread_local = ThreadLocal()
@contextmanager
def use_skip_tracker(skip_tracker: SkipTracker) -> Generator[None, None, None]:
"""Registers the given skip tracker on the current thread within a
context::
with use_skip_tracker(my_skip_tracker):
...
"""
orig = thread_local.skip_tracker
thread_local.skip_tracker = skip_tracker
try:
yield
finally:
thread_local.skip_tracker = orig
def current_skip_tracker() -> SkipTracker:
"""Gets the skip tracker on the current thread."""
skip_tracker = thread_local.skip_tracker
if skip_tracker is None:
skip_tracker = SkipTracker()
thread_local.skip_tracker = skip_tracker
return skip_tracker
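A small sketch of the thread-local tracker helpers above, assuming the modules are importable as torch.distributed._pipeline.sync.microbatch and torch.distributed._pipeline.sync.skip.tracker. None again stands in for the default namespace.

import torch

from torch.distributed._pipeline.sync.microbatch import Batch
from torch.distributed._pipeline.sync.skip.tracker import (
    SkipTracker,
    current_skip_tracker,
    use_skip_tracker,
)

batch = Batch(torch.zeros(1))
tracker = SkipTracker()

with use_skip_tracker(tracker):
    assert current_skip_tracker() is tracker
    current_skip_tracker().save(batch, None, "x", torch.ones(1))
    assert torch.equal(tracker.load(batch, None, "x"), torch.ones(1))

assert current_skip_tracker() is not tracker  # a fresh default tracker afterwards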

View File

@ -0,0 +1,117 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Utilities for eliminating boilerplate code to handle abstract streams with
the CPU device.
"""
from contextlib import contextmanager
from typing import Generator, List, Union, cast
import torch
__all__: List[str] = []
class CPUStreamType:
pass
# The placeholder used in place of CUDA streams for the CPU device.
CPUStream = CPUStreamType()
# It represents both CUDA streams and the CPU stream.
AbstractStream = Union[torch.cuda.Stream, CPUStreamType]
def new_stream(device: torch.device) -> AbstractStream:
"""Creates a new stream for either CPU or CUDA device."""
if device.type != "cuda":
return CPUStream
return torch.cuda.Stream(device)
def current_stream(device: torch.device) -> AbstractStream:
""":func:`torch.cuda.current_stream` for either CPU or CUDA device."""
if device.type != "cuda":
return CPUStream
return torch.cuda.current_stream(device)
def default_stream(device: torch.device) -> AbstractStream:
""":func:`torch.cuda.default_stream` for either CPU or CUDA device."""
if device.type != "cuda":
return CPUStream
return torch.cuda.default_stream(device)
@contextmanager
def use_device(device: torch.device) -> Generator[None, None, None]:
""":func:`torch.cuda.device` for either CPU or CUDA device."""
if device.type != "cuda":
yield
return
with torch.cuda.device(device):
yield
@contextmanager
def use_stream(stream: AbstractStream) -> Generator[None, None, None]:
""":func:`torch.cuda.stream` for either CPU or CUDA stream."""
if not is_cuda(stream):
yield
return
with torch.cuda.stream(as_cuda(stream)):
yield
def get_device(stream: AbstractStream) -> torch.device:
"""Gets the device from CPU or CUDA stream."""
if is_cuda(stream):
return as_cuda(stream).device
return torch.device("cpu")
def wait_stream(source: AbstractStream, target: AbstractStream) -> None:
""":meth:`torch.cuda.Stream.wait_stream` for either CPU or CUDA stream. It
makes the source stream wait until the target stream completes work queued.
"""
if is_cuda(target):
if is_cuda(source):
# A CUDA stream waits on another CUDA stream.
as_cuda(source).wait_stream(as_cuda(target))
else:
# The CPU waits on a CUDA stream.
as_cuda(target).synchronize()
# If the target is CPU, synchronization is not required.
def record_stream(tensor: torch.Tensor, stream: AbstractStream) -> None:
""":meth:`torch.Tensor.record_stream` for either CPU or CUDA stream."""
if is_cuda(stream):
# NOTE(sublee): record_stream() on a shifted view tensor throws
# RuntimeError in PyTorch 1.1.0, and does nothing in 1.2.0. To safely
# protect the tensor against unexpected reallocation, here we use a
# temporary tensor associated with the same storage, without the shift,
# as a workaround.
#
# Issue: https://github.com/pytorch/pytorch/issues/27366
#
tensor = tensor.new_empty([0]).set_(tensor.storage())
tensor.record_stream(as_cuda(stream))
def is_cuda(stream: AbstractStream) -> bool:
"""Returns ``True`` if the given stream is a valid CUDA stream."""
return stream is not CPUStream
def as_cuda(stream: AbstractStream) -> torch.cuda.Stream:
"""Casts the given stream as :class:`torch.cuda.Stream`."""
return cast(torch.cuda.Stream, stream)
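A short sketch of how these wrappers let the same code path run on both CPU and CUDA; the import path is an assumption based on the package this PR introduces.

# Hypothetical sketch: the import path below is an assumption.
import torch

from torch.distributed._pipeline.sync.stream import (
    current_stream,
    new_stream,
    record_stream,
    use_stream,
    wait_stream,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = torch.ones(4, device=device)      # allocated on the current stream
copy_stream = new_stream(device)      # CPUStream on CPU, a real stream on CUDA

with use_stream(copy_stream):         # no-op on CPU
    y = x * 2                         # queued on copy_stream when on CUDA
    record_stream(x, copy_stream)     # x is consumed by a non-allocating stream

# Make the current stream wait for the copy stream before consuming `y`.
wait_stream(current_stream(device), copy_stream)
print(y.sum().item())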

View File

@ -0,0 +1,151 @@
# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Multithreading in pipeline parallelism."""
from contextlib import contextmanager
from queue import Queue
import sys
from threading import Thread
from types import TracebackType
from typing import TYPE_CHECKING, Callable, Dict, Generator, List, Optional, Tuple, Type, Union, cast
import torch
from .microbatch import Batch
from .stream import AbstractStream, use_device, use_stream
__all__: List[str] = []
ExcInfo = Tuple[Type[BaseException], BaseException, TracebackType]
# Queue is generic only in stubs.
# https://mypy.readthedocs.io/en/latest/common_issues.html#using-classes-that-are-generic-in-stubs-but-not-at-runtime
if TYPE_CHECKING:
InQueue = Queue[Optional["Task"]]
OutQueue = Queue[Tuple[bool, Union[Tuple["Task", Batch], ExcInfo, None]]]
else:
InQueue = Queue
OutQueue = Queue
class Task:
"""A task represents how to compute a micro-batch on a partition.
It consists of two parts: :meth:`compute` and :meth:`finalize`.
:meth:`compute` should be executed by worker threads concurrently.
:meth:`finalize` should be executed after the worker threads have finished
executing :meth:`compute`.
Running :meth:`compute` in worker threads helps throughput because user
code issues several CUDA API calls, and in PyTorch concurrent CUDA API
calls are not serialized by the GIL, so more than one call can be in
flight at the same time.
"""
def __init__(
self, stream: AbstractStream, *, compute: Callable[[], Batch], finalize: Optional[Callable[[Batch], None]],
) -> None:
self.stream = stream
self._compute = compute
self._finalize = finalize
self._grad_enabled = torch.is_grad_enabled()
def compute(self) -> Batch:
with use_stream(self.stream), torch.set_grad_enabled(self._grad_enabled):
return self._compute()
def finalize(self, batch: Batch) -> None:
if self._finalize is None:
return
with use_stream(self.stream), torch.set_grad_enabled(self._grad_enabled):
self._finalize(batch)
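A hypothetical sketch of the :class:`Task` contract in isolation; the import paths and the ``Batch(tensor)`` construction are assumptions based on the neighboring ``microbatch`` and ``stream`` modules.

# Hypothetical sketch: import paths and Batch(tensor) are assumptions.
import torch

from torch.distributed._pipeline.sync.microbatch import Batch
from torch.distributed._pipeline.sync.stream import CPUStream
from torch.distributed._pipeline.sync.worker import Task


def compute() -> Batch:
    # The forward computation for one micro-batch on one partition.
    return Batch(torch.ones(2, 2) * 3)


def finalize(batch: Batch) -> None:
    # Runs after compute(), e.g. to hand the result to the next stage.
    print("computed:", batch[0].sum().item())


task = Task(CPUStream, compute=compute, finalize=finalize)
batch = task.compute()    # normally executed inside a worker thread
task.finalize(batch)      # normally executed after the worker returns it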
def worker(in_queue: InQueue, out_queue: OutQueue, device: torch.device) -> None:
"""The main loop of a worker thread."""
with use_device(device):
while True:
task = in_queue.get()
if task is None:
break
try:
batch = task.compute()
except Exception:
exc_info = cast(ExcInfo, sys.exc_info())
out_queue.put((False, exc_info))
continue
out_queue.put((True, (task, batch)))
done = (False, None)
out_queue.put(done)
def create_workers(devices: List[torch.device],) -> Tuple[List[InQueue], List[OutQueue]]:
"""Spawns worker threads. A worker thread is bound to a device."""
in_queues: List[InQueue] = []
out_queues: List[OutQueue] = []
# Spawn workers.
workers: Dict[torch.device, Tuple[InQueue, OutQueue]] = {}
def normalize_device(device: torch.device) -> torch.device:
if device.type == "cuda" and device.index is None:
return torch.device("cuda", index=torch.cuda.current_device())
if device.type == "cpu" and device.index is not None:
return torch.device("cpu")
return device
for device in devices:
device = normalize_device(device)
try:
in_queue, out_queue = workers[device]
except KeyError:
in_queue = Queue()
out_queue = Queue()
workers[device] = (in_queue, out_queue)
t = Thread(target=worker, args=(in_queue, out_queue, device), daemon=True,)
t.start()
in_queues.append(in_queue)
out_queues.append(out_queue)
return (in_queues, out_queues)
def join_workers(in_queues: List[InQueue], out_queues: List[OutQueue]) -> None:
# Close workers.
for in_queue in set(in_queues):
in_queue.put(None)
# Join running workers.
running = set(out_queues)
while running:
out_queue = running.pop()
ok, payload = out_queue.get()
done = (False, None)
if (ok, payload) == done:
continue
running.add(out_queue)
@contextmanager
def spawn_workers(devices: List[torch.device],) -> Generator[Tuple[List[InQueue], List[OutQueue]], None, None]:
try:
(in_queues, out_queues) = create_workers(devices)
yield (in_queues, out_queues)
finally:
join_workers(in_queues, out_queues)
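A hypothetical end-to-end sketch of driving a worker through the queues returned by ``spawn_workers``; the import paths and the ``Batch(tensor)`` construction are assumptions, and in the real pipeline these queues are fed by the scheduler rather than by hand.

# Hypothetical sketch: import paths and Batch(tensor) are assumptions.
import torch

from torch.distributed._pipeline.sync.microbatch import Batch
from torch.distributed._pipeline.sync.stream import current_stream
from torch.distributed._pipeline.sync.worker import Task, spawn_workers

device = torch.device("cpu")

with spawn_workers([device]) as (in_queues, out_queues):
    task = Task(
        current_stream(device),
        compute=lambda: Batch(torch.ones(2, 2)),
        finalize=None,
    )
    in_queues[0].put(task)

    ok, payload = out_queues[0].get()
    assert ok, payload        # payload is exc_info if compute() raised
    task, batch = payload
    print("worker produced", batch[0].sum().item())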