Skip to content

Combine SSA optimization passes to reduce CFG traversals #295

@jserv

Description

The SSA optimizer currently makes multiple independent passes over the CFG, each reading and modifying instructions. Combining related optimizations into single traversals would improve cache locality and reduce compilation time by 25-30%.

Current Implementation

// src/ssa.c - Current multiple passes
void ssa_optimize(func_t *func)
{
    sccp(func);            /* Pass 1: Full CFG traversal */
    cse(func);             /* Pass 2: Full CFG traversal */
    dce(func);             /* Pass 3: Full CFG traversal */
    strength_reduce(func); /* Pass 4: Full CFG traversal */
    /* Each pass separately iterates all blocks and instructions */
}

Performance profile for 10,000 instruction function:

  • 4 separate passes × 10,000 instructions = 40,000 instruction visits
  • Poor cache locality between passes
  • Redundant liveness/dominance recalculation

Proposed Combined Pass Architecture

1. Single-Pass Local Optimizations

// Combine all local (within basic block) optimizations typedef struct { bool changed; int constants_propagated; int expressions_eliminated; int strength_reductions; int algebraic_simplifications; } local_opt_stats_t; local_opt_stats_t optimize_basic_block(basic_block_t *bb) { local_opt_stats_t stats = {0}; // Single pass through instructions for (insn_t *insn = bb->first; insn; insn = insn->next) { // Try all applicable optimizations on this instruction // 1. Constant propagation if (try_constant_propagation(insn)) { stats.constants_propagated++; stats.changed = true; } // 2. Algebraic simplification (may enable more constant prop) if (try_algebraic_simplification(insn)) { stats.algebraic_simplifications++; stats.changed = true; // Retry constant propagation after simplification if (try_constant_propagation(insn)) { stats.constants_propagated++; } } // 3. Strength reduction if (try_strength_reduction(insn)) { stats.strength_reductions++; stats.changed = true; } // 4. Common subexpression elimination if (try_eliminate_common_subexpr(insn, bb)) { stats.expressions_eliminated++; stats.changed = true; } // 5. Peephole optimization if (try_peephole_optimization(insn)) { stats.changed = true; } } return stats; }

2. Worklist-Driven Global Optimization

// Process blocks in intelligent order typedef struct { basic_block_t **blocks; int count; int capacity; bool *in_worklist; // Bitmap for O(1) membership test } worklist_t; void optimize_function_worklist(func_t *func) { worklist_t *worklist = create_worklist(func->bb_count); // Add all blocks initially for (int i = 0; i < func->bb_count; i++) { worklist_add(worklist, func->bbs[i]); } // Process until convergence while (!worklist_empty(worklist)) { basic_block_t *bb = worklist_remove(worklist); // Apply all local optimizations local_opt_stats_t stats = optimize_basic_block(bb); if (stats.changed) { // Add successors to worklist if this block changed for (int i = 0; i < bb->succ_count; i++) { if (!worklist->in_worklist[bb->successors[i]->id]) { worklist_add(worklist, bb->successors[i]); } } // Add users of defined variables (for CSE) add_users_to_worklist(bb, worklist); } } // One final pass for dead code elimination eliminate_dead_code_global(func); }

3. Optimization Context Sharing

// Share analysis results between optimizations
typedef struct {
    /* Constant propagation state */
    hashmap_t *known_constants;
    /* CSE state */
    hashmap_t *available_expressions;
    /* Liveness information (computed once) */
    bitset_t **live_in;
    bitset_t **live_out;
    /* Dominance information (computed once) */
    int *idom;                     /* Immediate dominators */
    bitset_t **dominance_frontier;
    /* Statistics */
    opt_statistics_t stats;
} optimization_context_t;

void optimize_with_context(func_t *func)
{
    optimization_context_t *ctx = create_opt_context(func);

    /* Compute analyses once */
    compute_liveness(func, ctx->live_in, ctx->live_out);
    compute_dominance(func, ctx->idom, ctx->dominance_frontier);

    /* Run optimizations sharing context */
    bool changed;
    int iteration = 0;
    do {
        changed = false;
        for (int i = 0; i < func->bb_count; i++) {
            basic_block_t *bb = func->bbs[i];
            /* All optimizations use shared context */
            changed |= propagate_constants_bb(bb, ctx);
            changed |= eliminate_expressions_bb(bb, ctx);
            changed |= simplify_algebraic_bb(bb, ctx);
        }
        iteration++;
    } while (changed && iteration < MAX_ITERATIONS);

    /* Final cleanup */
    remove_dead_code(func, ctx);
    print_optimization_stats(&ctx->stats);
}

4. Cascading Optimizations

// Enable cascading optimizations in single pass bool optimize_instruction_cascade(insn_t *insn, optimization_context_t *ctx) { bool changed = false; bool local_changed; // Keep applying optimizations until fixed point do { local_changed = false; // Constant folding might enable simplification if (fold_constants(insn, ctx)) { local_changed = true; ctx->stats.constants_folded++; } // Simplification might enable more folding if (local_changed || simplify_algebraic(insn, ctx)) { local_changed = true; ctx->stats.algebraic_simplified++; // Try constant folding again if (fold_constants(insn, ctx)) { ctx->stats.constants_folded++; } } // Strength reduction on simplified expression if (reduce_strength(insn, ctx)) { local_changed = true; ctx->stats.strength_reduced++; } changed |= local_changed; } while (local_changed); return changed; }

5. Fast Pattern Matching

// Optimize pattern matching for common cases typedef struct { opcode_t op; bool (*matcher)(insn_t *); void (*optimizer)(insn_t *, optimization_context_t *); } optimization_pattern_t; // Pre-sorted by frequency for better branch prediction optimization_pattern_t patterns[] = { {OP_ADD, match_add_zero, optimize_identity}, {OP_MUL, match_mul_power2, optimize_mul_to_shift}, {OP_SUB, match_sub_self, optimize_to_zero}, {OP_DIV, match_div_power2, optimize_div_to_shift}, // ... more patterns }; void apply_patterns(insn_t *insn, optimization_context_t *ctx) { // Fast path for common opcodes if (insn->op >= OP_ADD && insn->op <= OP_XOR) { int pattern_idx = pattern_lookup_table[insn->op]; if (pattern_idx >= 0) { optimization_pattern_t *p = &patterns[pattern_idx]; if (p->matcher(insn)) { p->optimizer(insn, ctx); ctx->stats.patterns_matched++; } } } }

6. Batch Processing for Cache Efficiency

// Process multiple instructions together for better cache usage
void optimize_instruction_batch(insn_t **batch, int count, optimization_context_t *ctx)
{
    /* Prefetch ahead while processing the current batch.
     * Note: batch[4] is only a valid element when count > 4,
     * so the guard must be strict to avoid an out-of-bounds read. */
    if (count > 4) {
        __builtin_prefetch(batch[4], 0, 1);
    }
    /* Process batch */
    for (int i = 0; i < count; i++) {
        insn_t *insn = batch[i];
        /* All optimizations on same instruction (cache hot) */
        optimize_instruction_cascade(insn, ctx);
    }
}

void optimize_with_batching(func_t *func, optimization_context_t *ctx)
{
    enum { BATCH_SIZE = 8 };
    insn_t *batch[BATCH_SIZE];
    int batch_count = 0;
    for (int i = 0; i < func->bb_count; i++) {
        basic_block_t *bb = func->bbs[i];
        for (insn_t *insn = bb->first; insn; insn = insn->next) {
            batch[batch_count++] = insn;
            if (batch_count == BATCH_SIZE) {
                optimize_instruction_batch(batch, batch_count, ctx);
                batch_count = 0;
            }
        }
    }
    /* Process remaining */
    if (batch_count > 0) {
        optimize_instruction_batch(batch, batch_count, ctx);
    }
}

Performance Comparison

Benchmark Results

// Test with 10,000 instruction function void benchmark_optimization_passes() { func_t *func = generate_large_function(10000); // Old approach clock_t start = clock(); ssa_optimize_old(func); clock_t old_time = clock() - start; // New combined approach func = generate_large_function(10000); // Reset start = clock(); optimize_with_context(func); clock_t new_time = clock() - start; printf("Old approach: %.3f ms\n", old_time * 1000.0 / CLOCKS_PER_SEC); printf("New approach: %.3f ms\n", new_time * 1000.0 / CLOCKS_PER_SEC); printf("Speedup: %.2fx\n", (float)old_time / new_time); }

Integration Plan

  1. Phase 1: Implement combined local optimizations
  2. Phase 2: Add worklist infrastructure
  3. Phase 3: Share optimization context
  4. Phase 4: Benchmark and tune
  5. Phase 5: Remove old separate passes

Testing Strategy

# Correctness: ensure same optimization results
make test-opt-equivalence
# Performance: measure improvement
make benchmark-optimization
# Quality: ensure no regression in generated code
make test-code-quality

Success Criteria

  • 25%+ reduction in optimization time
  • Same or better code quality
  • Reduced memory usage
  • All tests pass
  • Maintainable code structure

Metadata

Assignees

No one assigned

    Labels

    enhancement — New feature or request

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions