Skip to content

Commit 5375cea

Browse files
Krovatkin authored and facebook-github-bot committed
run optimizations on pre-profiled graph (pytorch#31392)
Summary: This is the first stab at running profile-insensitive optimizations on pre-profiled graphs. Running those optimizations has the potential to simplify graphs greatly before GuardElimination, so GuardElimination should be able to remove more guards. Pull Request resolved: pytorch#31392. Differential Revision: D19173639. Pulled By: Krovatkin. fbshipit-source-id: 2485a2a598c10f9b5445efb30b16439ad4551b3f
1 parent 256db1e commit 5375cea

File tree

3 files changed

+92
-86
lines changed

3 files changed

+92
-86
lines changed

test/test_jit.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7421,31 +7421,24 @@ def func():
74217421
self.assertEqual(t1.device, t2.device)
74227422

74237423

7424-
@unittest.skipIf(GRAPH_EXECUTOR == ProfilingMode.SIMPLE, "Simple Executor doesn't have any shapes to propagate")
7424+
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, "Simple Executor doesn't have any shapes to propagate")
74257425
def test_tensor_as_tensor_shape_prop(self):
74267426
tensor_template = dedent('''
74277427
def func():
74287428
return torch.{tensor_op}({input})
74297429
''')
74307430
ops = ['tensor', 'as_tensor']
74317431
inputs = ['[1]', '[False]', '[2.5]', '0.5', '1', 'False', '[[1]]']
7432-
if GRAPH_EXECUTOR == ProfilingMode.PROFILING:
7433-
expected_shape = ["Long(1)", "Bool(1)", "Double(1)", "Double()", "Long()", "Bool()", "Long(1, 1)"]
7434-
else:
7435-
expected_shape = ["Long(*)", ("Bool(*)"), "Double(*)", "Double()", "Long()", "Bool()", "Long(*, *)"]
7432+
expected_shape = ["Long(*)", ("Bool(*)"), "Double(*)", "Double()", "Long()", "Bool()", "Long(*, *)"]
74367433

74377434
for op in ops:
74387435
for inp, expect in zip(inputs, expected_shape):
74397436
code = tensor_template.format(tensor_op=op, input=inp)
74407437
scope = {}
74417438
exec(code, globals(), scope)
7442-
if GRAPH_EXECUTOR == ProfilingMode.PROFILING:
7443-
fn = self.checkScript(code, ())
7444-
FileCheck().check(expect).check("aten::{tensor_op}".format(tensor_op=op)).run(fn.graph_for())
7445-
else:
7446-
cu = torch.jit.CompilationUnit(code)
7447-
torch._C._jit_pass_complete_shape_analysis(cu.func.graph, (), False)
7448-
FileCheck().check(expect).check("aten::{tensor_op}".format(tensor_op=op)).run(cu.func.graph)
7439+
cu = torch.jit.CompilationUnit(code)
7440+
torch._C._jit_pass_complete_shape_analysis(cu.func.graph, (), False)
7441+
FileCheck().check(expect).check("aten::{tensor_op}".format(tensor_op=op)).run(cu.func.graph)
74497442

74507443
@torch.jit.script
74517444
def test_dtype(inp_dtype):

torch/csrc/jit/profiling_graph_executor_impl.cpp

Lines changed: 85 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,18 @@
22
#include <torch/csrc/jit/passes/bailout_graph.h>
33
#include <torch/csrc/jit/passes/canonicalize_ops.h>
44
#include <torch/csrc/jit/passes/clear_undefinedness.h>
5+
#include <torch/csrc/jit/passes/common_subexpression_elimination.h>
6+
#include <torch/csrc/jit/passes/constant_pooling.h>
57
#include <torch/csrc/jit/passes/constant_propagation.h>
68
#include <torch/csrc/jit/passes/create_autodiff_subgraphs.h>
79
#include <torch/csrc/jit/passes/dead_code_elimination.h>
810
#include <torch/csrc/jit/passes/graph_fuser.h>
911
#include <torch/csrc/jit/passes/guard_elimination.h>
1012
#include <torch/csrc/jit/passes/inline_autodiff_subgraphs.h>
13+
#include <torch/csrc/jit/passes/inplace_check.h>
1114
#include <torch/csrc/jit/passes/insert_guards.h>
1215
#include <torch/csrc/jit/passes/lower_grad_of.h>
16+
#include <torch/csrc/jit/passes/peephole.h>
1317
#include <torch/csrc/jit/passes/remove_expands.h>
1418
#include <torch/csrc/jit/passes/requires_grad_analysis.h>
1519
#include <torch/csrc/jit/passes/shape_analysis.h>
@@ -53,11 +57,67 @@ static bool needsGradientInProfilingMode(Block* b) {
5357
return false;
5458
}
5559

56-
std::shared_ptr<Graph> ProfilingGraphExecutorImpl::prepareGraph(
57-
const std::shared_ptr<Graph>& graph,
58-
Stack& stack) {
59-
auto g = graph->copy();
60-
return g;
60+
void ProfilingGraphExecutorImpl::runProfilingOptimizations(
61+
std::shared_ptr<Graph>& copy) {
62+
if (!getGraphExecutorOptimize()) {
63+
LowerGradOf(*copy);
64+
runRequiredPasses(copy);
65+
return;
66+
}
67+
68+
InsertGuards(copy);
69+
LowerGradOf(*copy);
70+
EliminateRedundantGuards(copy);
71+
InsertBailOuts(copy);
72+
GRAPH_DUMP("After InsertBailOuts: ", copy);
73+
specializeAutogradZero(*copy);
74+
75+
runRequiredPasses(copy);
76+
ConstantPropagation(copy);
77+
runOptimization(copy);
78+
79+
if (needsGradientInProfilingMode(copy->block())) {
80+
auto diff_nodes = CreateAutodiffSubgraphs(
81+
copy,
82+
getAutodiffSubgraphInlining() ? autodiffSubgraphNodeThreshold : 1);
83+
for (Node* dnode : diff_nodes) {
84+
auto diff_graph = std::move(dnode->g(attr::Subgraph));
85+
Gradient gradient = differentiate(diff_graph);
86+
runOptimization(gradient.f);
87+
// run non diff optimization on the forward graph
88+
runNondiffOptimization(gradient.f);
89+
packGradient(gradient, dnode);
90+
}
91+
InlineAutodiffSubgraphs(
92+
copy,
93+
getAutodiffSubgraphInlining() ? autodiffSubgraphInlineThreshold : 1);
94+
95+
} else {
96+
runNondiffOptimization(copy);
97+
}
98+
EliminateDeadCode(copy);
99+
GRAPH_DUMP("Optimized Graph : ", copy);
100+
}
101+
102+
void ProfilingGraphExecutorImpl::runProfilingInsensitiveOptimizations(
103+
std::shared_ptr<Graph>& copy) {
104+
LowerGradOf(*copy);
105+
GRAPH_DUMP("runProfilingInsensitiveOptimizations", copy);
106+
if (getProfilingMode()) {
107+
ClearUndefinedness(copy);
108+
}
109+
runRequiredPasses(copy);
110+
if (!getGraphExecutorOptimize()) {
111+
return;
112+
}
113+
114+
ConstantPropagation(copy);
115+
EliminateDeadCode(copy);
116+
EliminateCommonSubexpression(copy);
117+
ConstantPooling(copy);
118+
PeepholeOptimize(copy);
119+
EliminateDeadCode(copy);
120+
CheckInplace(copy);
61121
}
62122

63123
ProfilingGraphExecutorImpl::ProfilingGraphExecutorImpl(
@@ -67,89 +127,43 @@ ProfilingGraphExecutorImpl::ProfilingGraphExecutorImpl(
67127
ExecutionPlan ProfilingGraphExecutorImpl::getPlanFor(Stack& stack) {
68128
std::lock_guard<std::mutex> lock(compile_mutex);
69129
GRAPH_DEBUG("Running ProfilingGraphExecutorImpl ", this);
130+
70131
if (optimized_plan_) {
71132
return *optimized_plan_;
72133
}
73134

74-
std::shared_ptr<Graph> copy;
75-
if (getProfilingMode()) {
76-
if (!pr_) {
77-
pr_ = ProfilingRecord::instrumentGraph(prepareGraph(graph, stack));
78-
auto copy = pr_->graph()->copy();
79-
LowerGradOf(*copy);
80-
specializeAutogradZero(*copy);
81-
runRequiredPasses(copy);
82-
GRAPH_DUMP("Profiled Graph: ", copy);
83-
profiling_plan_ = ExecutionPlan(copy);
84-
// fall-through
85-
}
86-
87-
if (!pr_->ready()) {
88-
return *profiling_plan_;
89-
}
90-
copy = pr_->graph()->copy();
91-
92-
} else {
93-
copy = graph->copy();
94-
}
95-
96-
if (!getGraphExecutorOptimize()) {
97-
runRequiredPasses(copy);
135+
// simple executor
136+
if (!getProfilingMode()) {
137+
auto copy = graph->copy();
138+
runProfilingInsensitiveOptimizations(copy);
139+
GRAPH_DUMP("Optimized SimpleExecutor Graph : ", copy);
98140
optimized_plan_ = ExecutionPlan(copy);
99141
return *optimized_plan_;
100142
}
101143

102-
InsertGuards(copy);
103-
LowerGradOf(*copy);
104-
if (getProfilingMode()) {
105-
EliminateRedundantGuards(copy);
106-
InsertBailOuts(copy);
107-
GRAPH_DUMP("After InsertBailOuts: ", copy);
144+
// if a profiling graph hasn't been created yet
145+
if (!pr_) {
146+
auto copy = graph->copy();
147+
runProfilingInsensitiveOptimizations(copy);
148+
pr_ = ProfilingRecord::instrumentGraph(copy);
149+
auto pr_copy = pr_->graph()->copy();
150+
GRAPH_DUMP("Profiled Graph: ", pr_copy);
151+
profiling_plan_ = ExecutionPlan(pr_copy);
152+
// fall-through
108153
}
109154

110-
specializeAutogradZero(*copy);
111-
if (!getProfilingMode()) {
112-
ClearUndefinedness(copy);
155+
// profile until a graph is ready
156+
if (!pr_->ready()) {
157+
return *profiling_plan_;
113158
}
114159

115-
runRequiredPasses(copy);
116-
ConstantPropagation(copy);
117-
runOptimization(copy);
118-
119-
// TODO: insert grad propagation
120-
bool needs_gradient = getProfilingMode()
121-
? needsGradientInProfilingMode(copy->block())
122-
: true;
123-
if (needs_gradient) {
124-
// for Simple Executor skip creating autodiff graphs
125-
// and let autograd handle backward for us
126-
if (getProfilingMode()) {
127-
auto diff_nodes = CreateAutodiffSubgraphs(
128-
copy,
129-
getAutodiffSubgraphInlining() ? autodiffSubgraphNodeThreshold : 1);
130-
for (Node *dnode : diff_nodes) {
131-
auto diff_graph = std::move(dnode->g(attr::Subgraph));
132-
Gradient gradient = differentiate(diff_graph);
133-
runOptimization(gradient.f);
134-
// run non diff optimization on the forward graph
135-
runNondiffOptimization(gradient.f);
136-
packGradient(gradient, dnode);
137-
}
138-
InlineAutodiffSubgraphs(copy, getAutodiffSubgraphInlining()
139-
? autodiffSubgraphInlineThreshold
140-
: 1);
141-
}
142-
} else {
143-
runNondiffOptimization(copy);
144-
}
145-
EliminateDeadCode(copy);
146-
GRAPH_DUMP("Optimized Graph : ", copy);
160+
auto copy = pr_->graph()->copy();
161+
runProfilingOptimizations(copy);
147162
// cache
148163
optimized_plan_ = ExecutionPlan(copy);
149164
return *optimized_plan_;
150165
}
151166

152-
153167
GraphExecutorState ProfilingGraphExecutorImpl::getDebugState() {
154168
GraphExecutorState state;
155169
TORCH_INTERNAL_ASSERT(optimized_plan_);

torch/csrc/jit/profiling_graph_executor_impl.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,8 @@ struct ProfilingGraphExecutorImpl : public GraphExecutorImplBase {
1212
~ProfilingGraphExecutorImpl() override = default;
1313

1414
private:
15-
std::shared_ptr<Graph> prepareGraph(
16-
const std::shared_ptr<Graph>& graph,
17-
Stack& stack);
15+
void runProfilingInsensitiveOptimizations(std::shared_ptr<Graph>& graph);
16+
void runProfilingOptimizations(std::shared_ptr<Graph>& graph);
1817
std::unique_ptr<ProfilingRecord> pr_;
1918
c10::optional<ExecutionPlan>
2019
profiling_plan_; // plan to run in order to profiling the code

0 commit comments

Comments (0)