Skip to content

Combine SSA optimization passes to reduce CFG traversals #295

@jserv

Description

The SSA optimizer currently makes multiple independent passes over the CFG, each reading and modifying instructions. Combining related optimizations into single traversals would improve cache locality and reduce compilation time by 25-30%.

Current Implementation

// src/ssa.c - Current multiple passes
void ssa_optimize(func_t *func)
{
    sccp(func);            /* Pass 1: Full CFG traversal */
    cse(func);             /* Pass 2: Full CFG traversal */
    dce(func);             /* Pass 3: Full CFG traversal */
    strength_reduce(func); /* Pass 4: Full CFG traversal */
    /* Each pass separately iterates all blocks and instructions */
}

Performance profile for 10,000 instruction function:

  • 4 separate passes × 10,000 instructions = 40,000 instruction visits
  • Poor cache locality between passes
  • Redundant liveness/dominance recalculation

Proposed Combined Pass Architecture

1. Single-Pass Local Optimizations

// Combine all local (within basic block) optimizations typedef struct { bool changed; int constants_propagated; int expressions_eliminated; int strength_reductions; int algebraic_simplifications; } local_opt_stats_t; local_opt_stats_t optimize_basic_block(basic_block_t *bb) { local_opt_stats_t stats = {0}; // Single pass through instructions for (insn_t *insn = bb->first; insn; insn = insn->next) { // Try all applicable optimizations on this instruction // 1. Constant propagation if (try_constant_propagation(insn)) { stats.constants_propagated++; stats.changed = true; } // 2. Algebraic simplification (may enable more constant prop) if (try_algebraic_simplification(insn)) { stats.algebraic_simplifications++; stats.changed = true; // Retry constant propagation after simplification if (try_constant_propagation(insn)) { stats.constants_propagated++; } } // 3. Strength reduction if (try_strength_reduction(insn)) { stats.strength_reductions++; stats.changed = true; } // 4. Common subexpression elimination if (try_eliminate_common_subexpr(insn, bb)) { stats.expressions_eliminated++; stats.changed = true; } // 5. Peephole optimization if (try_peephole_optimization(insn)) { stats.changed = true; } } return stats; }

2. Worklist-Driven Global Optimization

// Process blocks in intelligent order typedef struct { basic_block_t **blocks; int count; int capacity; bool *in_worklist; // Bitmap for O(1) membership test } worklist_t; void optimize_function_worklist(func_t *func) { worklist_t *worklist = create_worklist(func->bb_count); // Add all blocks initially for (int i = 0; i < func->bb_count; i++) { worklist_add(worklist, func->bbs[i]); } // Process until convergence while (!worklist_empty(worklist)) { basic_block_t *bb = worklist_remove(worklist); // Apply all local optimizations local_opt_stats_t stats = optimize_basic_block(bb); if (stats.changed) { // Add successors to worklist if this block changed for (int i = 0; i < bb->succ_count; i++) { if (!worklist->in_worklist[bb->successors[i]->id]) { worklist_add(worklist, bb->successors[i]); } } // Add users of defined variables (for CSE) add_users_to_worklist(bb, worklist); } } // One final pass for dead code elimination eliminate_dead_code_global(func); }

3. Optimization Context Sharing

// Share analysis results between optimizations
typedef struct {
    /* Constant propagation state */
    hashmap_t *known_constants;
    /* CSE state */
    hashmap_t *available_expressions;
    /* Liveness information (computed once) */
    bitset_t **live_in;
    bitset_t **live_out;
    /* Dominance information (computed once) */
    int *idom;                     /* Immediate dominators */
    bitset_t **dominance_frontier;
    /* Statistics */
    opt_statistics_t stats;
} optimization_context_t;

void optimize_with_context(func_t *func)
{
    optimization_context_t *ctx = create_opt_context(func);

    /* Compute analyses once */
    compute_liveness(func, ctx->live_in, ctx->live_out);
    compute_dominance(func, ctx->idom, ctx->dominance_frontier);

    /* Run optimizations sharing context */
    bool changed;
    int iteration = 0;
    do {
        changed = false;
        for (int i = 0; i < func->bb_count; i++) {
            basic_block_t *bb = func->bbs[i];
            /* All optimizations use shared context */
            changed |= propagate_constants_bb(bb, ctx);
            changed |= eliminate_expressions_bb(bb, ctx);
            changed |= simplify_algebraic_bb(bb, ctx);
        }
        iteration++;
    } while (changed && iteration < MAX_ITERATIONS);

    /* Final cleanup */
    remove_dead_code(func, ctx);
    print_optimization_stats(&ctx->stats);
}

4. Cascading Optimizations

// Enable cascading optimizations in single pass bool optimize_instruction_cascade(insn_t *insn, optimization_context_t *ctx) { bool changed = false; bool local_changed; // Keep applying optimizations until fixed point do { local_changed = false; // Constant folding might enable simplification if (fold_constants(insn, ctx)) { local_changed = true; ctx->stats.constants_folded++; } // Simplification might enable more folding if (local_changed || simplify_algebraic(insn, ctx)) { local_changed = true; ctx->stats.algebraic_simplified++; // Try constant folding again if (fold_constants(insn, ctx)) { ctx->stats.constants_folded++; } } // Strength reduction on simplified expression if (reduce_strength(insn, ctx)) { local_changed = true; ctx->stats.strength_reduced++; } changed |= local_changed; } while (local_changed); return changed; }

5. Fast Pattern Matching

// Optimize pattern matching for common cases typedef struct { opcode_t op; bool (*matcher)(insn_t *); void (*optimizer)(insn_t *, optimization_context_t *); } optimization_pattern_t; // Pre-sorted by frequency for better branch prediction optimization_pattern_t patterns[] = { {OP_ADD, match_add_zero, optimize_identity}, {OP_MUL, match_mul_power2, optimize_mul_to_shift}, {OP_SUB, match_sub_self, optimize_to_zero}, {OP_DIV, match_div_power2, optimize_div_to_shift}, // ... more patterns }; void apply_patterns(insn_t *insn, optimization_context_t *ctx) { // Fast path for common opcodes if (insn->op >= OP_ADD && insn->op <= OP_XOR) { int pattern_idx = pattern_lookup_table[insn->op]; if (pattern_idx >= 0) { optimization_pattern_t *p = &patterns[pattern_idx]; if (p->matcher(insn)) { p->optimizer(insn, ctx); ctx->stats.patterns_matched++; } } } }

6. Batch Processing for Cache Efficiency

// Process multiple instructions together for better cache usage
void optimize_instruction_batch(insn_t **batch, int count, optimization_context_t *ctx)
{
    /* Prefetch ahead while processing the current batch.
     * Note: batch[4] is only a valid element when count > 4,
     * so the guard must be strict to avoid an out-of-bounds read. */
    if (count > 4) {
        __builtin_prefetch(batch[4], 0, 1);
    }
    /* Process batch */
    for (int i = 0; i < count; i++) {
        insn_t *insn = batch[i];
        /* All optimizations on same instruction (cache hot) */
        optimize_instruction_cascade(insn, ctx);
    }
}

void optimize_with_batching(func_t *func, optimization_context_t *ctx)
{
    enum { BATCH_SIZE = 8 };
    insn_t *batch[BATCH_SIZE];
    int batch_count = 0;
    for (int i = 0; i < func->bb_count; i++) {
        basic_block_t *bb = func->bbs[i];
        for (insn_t *insn = bb->first; insn; insn = insn->next) {
            batch[batch_count++] = insn;
            if (batch_count == BATCH_SIZE) {
                optimize_instruction_batch(batch, batch_count, ctx);
                batch_count = 0;
            }
        }
    }
    /* Process remaining */
    if (batch_count > 0) {
        optimize_instruction_batch(batch, batch_count, ctx);
    }
}

Performance Comparison

Benchmark Results

// Test with 10,000 instruction function void benchmark_optimization_passes() { func_t *func = generate_large_function(10000); // Old approach clock_t start = clock(); ssa_optimize_old(func); clock_t old_time = clock() - start; // New combined approach func = generate_large_function(10000); // Reset start = clock(); optimize_with_context(func); clock_t new_time = clock() - start; printf("Old approach: %.3f ms\n", old_time * 1000.0 / CLOCKS_PER_SEC); printf("New approach: %.3f ms\n", new_time * 1000.0 / CLOCKS_PER_SEC); printf("Speedup: %.2fx\n", (float)old_time / new_time); }

Integration Plan

  1. Phase 1: Implement combined local optimizations
  2. Phase 2: Add worklist infrastructure
  3. Phase 3: Share optimization context
  4. Phase 4: Benchmark and tune
  5. Phase 5: Remove old separate passes

Testing Strategy

# Correctness: ensure same optimization results
make test-opt-equivalence
# Performance: measure improvement
make benchmark-optimization
# Quality: ensure no regression in generated code
make test-code-quality

Success Criteria

  • 25%+ reduction in optimization time
  • Same or better code quality
  • Reduced memory usage
  • All tests pass
  • Maintainable code structure

Metadata

Assignees

No one assigned

    Labels

    enhancement — New feature or request

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions