|
1 | 1 | import logging |
| 2 | +import operator |
| 3 | +from collections import defaultdict |
| 4 | +from typing import Set |
2 | 5 |
|
3 | 6 | import torch |
| 7 | +from torch.fx import GraphModule |
| 8 | +from torch.fx.passes.backends.cudagraphs import partition_cudagraphs |
| 9 | +from torch.multiprocessing.reductions import StorageWeakRef |
| 10 | +from torch.nn import Module |
| 11 | +from torch.utils._pytree import tree_map |
4 | 12 |
|
| 13 | +import torchdynamo |
5 | 14 | from torchdynamo import config |
6 | 15 | from torchdynamo.utils import clone_inputs |
7 | 16 | from torchdynamo.utils import count_calls |
@@ -59,7 +68,7 @@ def __init__(self, gm: torch.fx.GraphModule, example_inputs): |
59 | 68 | # - data mutation of inputs (fixed when we stop recording the |
60 | 69 | # copy_ directly into the graph) |
61 | 70 | # - metadata mutation of inputs (fixed if we do an extra partition |
62 | | - # to avoid AOTAutograd on the mutated inputs, or if we some how |
| 71 | + # to avoid AotAutograd on the mutated inputs, or if we somehow |
63 | 72 | # get custom autograd function to reflect metadata changes to the |
64 | 73 | # original tensor) |
65 | 74 | mutated = has_mutation(self.gm, self.example_inputs, inputs_only=True) |
@@ -249,6 +258,153 @@ def candidate(self): |
249 | 258 | aot_prims_nvfuser = AotPrimsNvfuser.compile_fn |
250 | 259 |
|
251 | 260 |
|
| 261 | +def cloner(t): |
| 262 | + if isinstance(t, torch.Tensor): |
| 263 | + return t.clone() |
| 264 | + else: |
| 265 | + return t |
| 266 | + |
| 267 | + |
| 268 | +class CudaGraphModule(Module): |
| 269 | + gm: GraphModule |
| 270 | + mutated_inputs: Set[int] |
| 271 | + |
| 272 | + def __init__(self, gm, mutated_inputs): |
| 273 | + super().__init__() |
| 274 | + self.gm = gm |
| 275 | + self.mutated_inputs = mutated_inputs |
| 276 | + |
| 277 | + warmed_up = False |
| 278 | + |
| 279 | + # these are all None or all filled |
| 280 | + graph = None |
| 281 | + static_inputs = None |
| 282 | + static_outputs = None |
| 283 | + |
| 284 | + # NB: we override __call__ as we don't need any nn.Module machinery |
| 285 | + # and to reduce overhead |
| 286 | + def __call__(self, *args): |
| 287 | + # TODO: once we've recorded here, we'd like to replace the __call__ |
| 288 | + # implementation with compiled bytecode that copies into static, replays |
| 289 | + # the cuda graph, then copies out. The first branch is the hot path |
| 290 | + # and needs optimizing |
| 291 | + if self.graph is not None: |
| 292 | + assert len(args) == len(self.static_inputs) |
| 293 | + for dst, src in zip(self.static_inputs, args): |
| 294 | + dst.copy_(src) |
| 295 | + self.graph.replay() |
| 296 | + for i in self.mutated_inputs: |
| 297 | + args[i].copy_(self.static_inputs[i]) |
| 298 | + return tree_map(cloner, self.static_outputs) |
| 299 | + |
| 300 | + elif self.warmed_up: |
| 301 | + # record |
| 302 | + self.static_inputs = [x.clone() for x in args] |
| 303 | + self.graph = torch.cuda.CUDAGraph() |
| 304 | + with torch.cuda.graph(self.graph): |
| 305 | + self.static_outputs = self.gm(*self.static_inputs) |
| 306 | + # NB: recording doesn't actually run the operations, so |
| 307 | + # now we immediately replay the graph to serve up the result |
| 308 | + self.graph.replay() |
| 309 | + for i in self.mutated_inputs: |
| 310 | + args[i].copy_(self.static_inputs[i]) |
| 311 | + return tree_map(cloner, self.static_outputs) |
| 312 | + |
| 313 | + else: |
| 314 | + # warmup |
| 315 | + stream = torch.cuda.Stream() |
| 316 | + stream.wait_stream(torch.cuda.current_stream()) |
| 317 | + with torch.cuda.stream(stream): |
| 318 | + r = self.gm(*args) |
| 319 | + torch.cuda.current_stream().wait_stream(stream) |
| 320 | + self.warmed_up = True |
| 321 | + return r |
| 322 | + |
| 323 | + |
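As a rough usage sketch of the class above: across successive calls it moves through warmup, record, and replay phases. The toy module, shapes, and CUDA requirement below are illustrative assumptions, not part of this patch.

```python
# Hedged sketch: exercising CudaGraphModule's warmup -> record -> replay phases.
# Requires a CUDA device; the traced Toy module is made up for illustration.
import torch
from torch.fx import symbolic_trace


class Toy(torch.nn.Module):
    def forward(self, x):
        return x * 2 + 1


gm = symbolic_trace(Toy())
wrapped = CudaGraphModule(gm, mutated_inputs=set())

x = torch.randn(8, device="cuda")
out1 = wrapped(x)  # 1st call: warmup, runs eagerly on a side stream
out2 = wrapped(x)  # 2nd call: records a CUDA graph, then replays it once
out3 = wrapped(x)  # later calls: copy into static inputs and replay the graph
```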
| 324 | +# Interpreter versions of these passes can be found at |
| 325 | +# https://gist.github.com/ezyang/df2d746cac3b2c7d55c181e37c57ef23 |
| 326 | + |
| 327 | + |
| 328 | +def find_input_mutations(g): |
| 329 | + FK = "fake_result" |
| 330 | + inputs = defaultdict(set) |
| 331 | + input_idx = 0 |
| 332 | + mutated_inputs = set() |
| 333 | + for n in g.nodes: |
| 334 | + if n.op == "placeholder": |
| 335 | + inputs[StorageWeakRef(n.meta[FK].storage())].add(input_idx) |
| 336 | + input_idx += 1 |
| 337 | + elif n.op == "call_function": |
| 338 | + if n.target is operator.getitem: |
| 339 | + continue |
| 340 | + schema = n.target._schema |
| 341 | + for i, arg in enumerate(schema.arguments): |
| 342 | + if i < len(n.args): |
| 343 | + argument = n.args[i] |
| 344 | + else: |
| 345 | + if arg.name not in n.kwargs: |
| 346 | + continue |
| 347 | + argument = n.kwargs[arg.name] |
| 348 | + mut_arg = False |
| 349 | + if arg.alias_info: |
| 350 | + if arg.alias_info.is_write: |
| 351 | + mut_arg = True |
| 352 | + if mut_arg: |
| 353 | + # TODO: not correct for args that contain tensors in a struct |
| 354 | + # like list |
| 355 | + mutated_inputs |= inputs[ |
| 356 | + StorageWeakRef(argument.meta[FK].storage()) |
| 357 | + ] |
| 358 | + # TODO: error on unrecognized nodes |
| 359 | + return mutated_inputs |
| 360 | + |
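For intuition about the alias_info check in find_input_mutations: in-place ATen ops mark the argument they mutate as a write in their schema, and that annotation is what the pass keys on. A quick probe, using torch.ops.aten.add_.Tensor as the example op:

```python
# The in-place op's schema flags its `self` argument as a write (alias_info.is_write).
import torch

schema = torch.ops.aten.add_.Tensor._schema
for arg in schema.arguments:
    is_mutated = bool(arg.alias_info and arg.alias_info.is_write)
    print(arg.name, is_mutated)  # `self` prints True; the other arguments print False
```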
| 361 | + |
| 362 | +# Mutates input graph |
| 363 | +def apply_cuda_graphs(gm): |
| 364 | + for n in gm.graph.nodes: |
| 365 | + if n.op == "call_module": |
| 366 | + assert not n.kwargs |
| 367 | + submod = gm.get_submodule(n.target) |
| 368 | + gm.delete_submodule(n.target) |
| 369 | + mutated_inputs = find_input_mutations(submod.graph) |
| 370 | + gm.add_submodule(n.target, CudaGraphModule(submod, mutated_inputs)) |
| 371 | + # NB: we didn't actually change the graph, no need for recompile |
| 372 | + |
| 373 | + |
| 374 | +def cudagraphs(model, inputs): |
| 375 | + model = partition_cudagraphs(model, inputs) |
| 376 | + apply_cuda_graphs(model) |
| 377 | + return model |
| 378 | + |
| 379 | + |
| 380 | +def raw_aot_autograd_cudagraphs(model, inputs): |
| 381 | + kwargs = { |
| 382 | + # these are taken from memory_efficient_fusion() |
| 383 | + "fw_compiler": cudagraphs, |
| 384 | + "bw_compiler": cudagraphs, |
| 385 | + "hasher_type": "StaticShapeHasher", |
| 386 | + } |
| 387 | + |
| 388 | + def _wrapped_bw_compiler(*args, **kwargs): |
| 389 | + # stop TorchDynamo from trying to compile our generated backwards pass |
| 390 | + return torchdynamo.disable(bw_compiler(*args, **kwargs)) # type: ignore[operator] |
| 391 | + |
| 392 | + bw_compiler = kwargs.get("bw_compiler") or kwargs["fw_compiler"] |
| 393 | + kwargs["bw_compiler"] = _wrapped_bw_compiler |
| 394 | + |
| 395 | + from functorch.compile import aot_module_simplified # type: ignore[import] |
| 396 | + |
| 397 | + return aot_module_simplified(model, **kwargs) |
| 398 | + |
| 399 | + |
| 400 | +class AotAutogradCudaGraphs(AotAutogradStrategy): |
| 401 | + def candidate(self): |
| 402 | + return raw_aot_autograd_cudagraphs(self.gm, self.example_inputs) |
| 403 | + |
| 404 | + |
| 405 | +aot_cudagraphs = AotAutogradCudaGraphs.compile_fn |
| 406 | + |
| 407 | + |
252 | 408 | def create_aot_backends(): |
253 | 409 | """ |
254 | 410 | Register aliases for the AOT backends |
@@ -280,3 +436,7 @@ def create_aot_backends(): |
280 | 436 |     # without worrying about the impact of decompositions. More details at |
281 | 437 | # https://github.com/pytorch/torchdynamo/issues/611 |
282 | 438 | BACKENDS["aot_nvfuser_nodecomps"] = aot_mem_efficient_fusion_no_decomp |
| 439 | + |
| 440 | + # aot_cudagraphs only applies CUDA graphs to the graph. It is also helpful |
| 441 | + # for debugging and can serve as a perf baseline. |
| 442 | + BACKENDS["aot_cudagraphs"] = aot_cudagraphs |
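A hedged end-to-end sketch of using the newly registered backend; the toy model, shapes, and CUDA requirement are illustrative assumptions rather than part of this patch:

```python
# Run a model under TorchDynamo with the "aot_cudagraphs" backend registered above.
# Assumes a CUDA device; the Linear model and shapes are placeholders.
import torch
import torchdynamo

model = torch.nn.Linear(128, 64).cuda()
inputs = torch.randn(32, 128, device="cuda")

with torchdynamo.optimize("aot_cudagraphs"):
    out = model(inputs)
    out.sum().backward()
```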