tensorflow
diff --git a/mesh_tensorflow/transformer/moe.py b/mesh_tensorflow/transformer/moe.py
Lines changed: 2 additions & 0 deletions
@@ -526,6 +526,8 @@ def _compute_output(hidden, layer_name):
  # Extra reshape reduces communication cost for model-parallel versions.
  # For model-parallel versions, this reshape causes an mtf.slice and for non-
  # model-parallel versions, this has no effect.
+ d_model_split_dim = mtf.Dimension(
+ "d_model_split", expert_output.shape[-1].size)
  expert_output = mtf.reshape(
  expert_output,
  mtf.Shape([