Commit e7ef57f

[data] Update streaming_repartition and map_batches_fusion (#59476)
Analysis of the two operator patterns:

## Streaming_repartition → map_batches

|               | Number of `map_batches` tasks |
|---------------|-------------------------------|
| **Fused**     | `num_input_blocks` (which is ≤ number of output blocks of StreamingRepartition) |
| **Not fused** | number of output blocks of StreamingRepartition |

When fused, the number of tasks equals the number of input blocks, which is ≤ the number of output blocks of StreamingRepartition. If StreamingRepartition is supposed to break down blocks to increase parallelism, that won't happen when fused. So we don't fuse.

---

## Map_batches → streaming_repartition

When `batch_size % target_num_rows == 0`:

|               | Number of `map_batches` tasks |
|---------------|-------------------------------|
| **Fused**     | == total_rows / batch_size    |
| **Not fused** | == total_rows / batch_size    |

So the fusion doesn't affect the parallelism.

---

Thus, we currently disable the `Streaming_repartition → map_batches` fusion, and enable the `Map_batches → streaming_repartition` fusion when `batch_size % target_num_rows == 0`.

---------

Signed-off-by: Xinyuan <43737116+xinyuangui2@users.noreply.github.com>
Signed-off-by: xgui <xgui@anyscale.com>
Signed-off-by: Alexey Kudinkin <alexey.kudinkin@gmail.com>
Co-authored-by: Alexey Kudinkin <alexey.kudinkin@gmail.com>
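A small worked example of the second table (the numbers below are made up for illustration; they are not taken from the PR): when `batch_size` is a multiple of `target_num_rows`, the `map_batches` task count is the same with or without fusion.

```python
# Illustrative numbers only; not from the PR.
total_rows = 1_000
batch_size = 40        # MapBatches batch_size
target_num_rows = 20   # StreamingRepartition target_num_rows_per_block

# The condition this PR requires before fusing MapBatches -> StreamingRepartition.
assert batch_size % target_num_rows == 0

# The number of map_batches tasks is driven by batch_size either way,
# so fusing does not reduce parallelism.
tasks_not_fused = total_rows // batch_size  # 25
tasks_fused = total_rows // batch_size      # 25
assert tasks_fused == tasks_not_fused
```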
1 parent 196c678 commit e7ef57f

File tree: 3 files changed, +49 -9 lines changed


python/ray/data/_internal/logical/rules/operator_fusion.py

Lines changed: 37 additions & 2 deletions
@@ -90,6 +90,30 @@ def _fuse_streaming_repartition_operators_in_dag(
 
         This will ensure the map_batch's function receive the correct number of rows.
         We also ensure the output rows is `batch_size`.
+
+        Why don't we fuse `StreamingRepartition -> MapBatches`?
+
+        ----------------------------------------------------------------------------------------------------
+        |                      | Number of `map_batches` tasks                                             |
+        |----------------------|---------------------------------------------------------------------------|
+        | Fused                | num_input_blocks (which is <= num output blocks of StreamingRepartition) |
+        | Not fused            | num output blocks of StreamingRepartition                                |
+        ----------------------------------------------------------------------------------------------------
+
+        When fused, the number of tasks equals the number of input blocks, which is
+        <= the number of output blocks of StreamingRepartition. If StreamingRepartition
+        is supposed to break down blocks to increase parallelism, that won't happen
+        when fused. So we don't fuse.
+
+        Why do we fuse `MapBatches -> StreamingRepartition` (when `batch_size % target_num_rows == 0`)?
+        ----------------------------------------------------------
+        |                      | Number of `map_batches` tasks |
+        |----------------------|-------------------------------|
+        | Fused                | total_rows / batch_size       |
+        | Not fused            | total_rows / batch_size       |
+        ----------------------------------------------------------
+
+        Parallelism is unchanged, so we fuse to avoid intermediate materialization.
         """
         upstream_ops = dag.input_dependencies
         while (
@@ -252,8 +276,17 @@ def _can_fuse(self, down_op: PhysicalOperator, up_op: PhysicalOperator) -> bool:
         if isinstance(down_logical_op, StreamingRepartition):
             return (
                 isinstance(up_logical_op, MapBatches)
+                and up_logical_op._batch_size is not None
+                and down_logical_op.target_num_rows_per_block is not None
+                and down_logical_op.target_num_rows_per_block > 0
+                # When batch_size is a multiple of target_num_rows_per_block, fusing still produces an identical sequence of blocks.
+                # See the `_fuse_streaming_repartition_operators_in_dag` docstring for details.
+                # TODO: when StreamingRepartition supports none_strict_mode, we can fuse
+                # `MapBatches -> StreamingRepartition` no matter what `batch_size` and `target_num_rows` are.
+                # https://anyscale1.atlassian.net/browse/DATA-1731
                 and up_logical_op._batch_size
-                == down_logical_op.target_num_rows_per_block
+                % down_logical_op.target_num_rows_per_block
+                == 0
             )
         # Other operators cannot fuse with StreamingRepartition.
         if isinstance(up_logical_op, StreamingRepartition):
@@ -276,7 +309,9 @@ def _get_fused_streaming_repartition_operator(
         up_logical_op = self._op_map.pop(up_op)
         assert isinstance(up_logical_op, MapBatches)
         assert isinstance(down_logical_op, StreamingRepartition)
-        assert up_logical_op._batch_size == down_logical_op.target_num_rows_per_block
+        assert (
+            up_logical_op._batch_size % down_logical_op.target_num_rows_per_block == 0
+        )
         batch_size = up_logical_op._batch_size
 
         compute = self._fuse_compute_strategy(
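For context, a minimal sketch of the pipeline shape this rule targets, using the same public Ray Data calls as the e2e test below (the identity function and the sizes are illustrative, not taken from the PR):

```python
import ray

# batch_size (40) is a multiple of target_num_rows_per_block (20), so the optimizer
# may fuse MapBatches with the downstream StreamingRepartition instead of
# materializing intermediate blocks between the two operators.
ds = ray.data.range(1_000)
ds = ds.map_batches(lambda batch: batch, batch_size=40)
ds = ds.repartition(target_num_rows_per_block=20)
```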

python/ray/data/tests/test_operator_fusion.py

Lines changed: 2 additions & 1 deletion
@@ -745,9 +745,10 @@ def test_zero_copy_fusion_eliminate_build_output_blocks(
 @pytest.mark.parametrize(
     "order,target_num_rows,batch_size,should_fuse",
     [
-        # map_batches -> streaming_repartition: fuse when batch_size == target_num_rows
+        # map_batches -> streaming_repartition: fuse when batch_size is a multiple of target_num_rows
         ("map_then_sr", 20, 20, True),
         ("map_then_sr", 20, 10, False),
+        ("map_then_sr", 20, 40, True),
         ("map_then_sr", 20, None, False),
         # streaming_repartition -> map_batches: not fused
         ("sr_then_map", 20, 20, False),

python/ray/data/tests/test_repartition_e2e.py

Lines changed: 10 additions & 6 deletions
@@ -233,22 +233,25 @@ def test_repartition_empty_datasets(ray_start_regular_shared_2_cpus, shuffle):
 
 
 @pytest.mark.parametrize("streaming_repartition_first", [True, False])
+@pytest.mark.parametrize("n_target_num_rows", [1, 5])
 def test_streaming_repartition_write_with_operator_fusion(
     ray_start_regular_shared_2_cpus,
     tmp_path,
     disable_fallback_to_object_extension,
     streaming_repartition_first,
+    n_target_num_rows,
 ):
     """Test that write with streaming repartition produces exact partitions
     with operator fusion.
     This test verifies:
     * StreamingRepartition and MapBatches operators are fused, with both orders
     """
+    target_num_rows = 20
 
     def fn(batch):
         # Get number of rows from the first column (batch is a dict of column_name -> array)
         num_rows = len(batch["id"])
-        assert num_rows == 20, f"Expected batch size 20, got {num_rows}"
+        assert num_rows == b_s, f"Expected batch size {b_s}, got {num_rows}"
         return batch
 
     # Configure shuffle strategy
@@ -270,12 +273,13 @@ def fn(batch):
     ds = ds.repartition(target_num_rows_per_block=30)
 
     # Verify fusion of StreamingRepartition and MapBatches operators
+    b_s = target_num_rows * n_target_num_rows
     if streaming_repartition_first:
-        ds = ds.repartition(target_num_rows_per_block=20)
-        ds = ds.map_batches(fn, batch_size=20)
+        ds = ds.repartition(target_num_rows_per_block=target_num_rows)
+        ds = ds.map_batches(fn, batch_size=b_s)
     else:
-        ds = ds.map_batches(fn, batch_size=20)
-        ds = ds.repartition(target_num_rows_per_block=20)
+        ds = ds.map_batches(fn, batch_size=b_s)
+        ds = ds.repartition(target_num_rows_per_block=target_num_rows)
     planner = create_planner()
     physical_plan = planner.plan(ds._logical_plan)
     physical_plan = PhysicalOptimizer().optimize(physical_plan)
@@ -286,7 +290,7 @@ def fn(batch):
     else:
         assert (
             physical_op.name
-            == "MapBatches(fn)->StreamingRepartition[num_rows_per_block=20]"
+            == f"MapBatches(fn)->StreamingRepartition[num_rows_per_block={target_num_rows}]"
         )
 
     # Write output to local Parquet files partitioned by key
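Spelling out the arithmetic the parametrized e2e test relies on (values taken directly from the test above):

```python
target_num_rows = 20
for n_target_num_rows in (1, 5):                # the new parametrize values
    b_s = target_num_rows * n_target_num_rows  # batch sizes 20 and 100
    # Both batch sizes are multiples of target_num_rows, so the
    # MapBatches(fn) -> StreamingRepartition pair stays eligible for fusion
    # and fn still receives full batches of b_s rows.
    assert b_s % target_num_rows == 0
```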
