Skip to content

Replace linear symbol lookup with hash table for O(1) performance #294

@jserv

Description

@jserv

Description

Symbol lookup currently uses linear search through linked lists, consuming 35% of compilation time for large programs. Implementing a hash table would reduce lookup complexity from O(n) to O(1) average case.

Current Implementation

// src/globals.c - Current linear search symbol_t *find_symbol(char *name) { for (symbol_t *sym = symbols; sym; sym = sym->next) { if (strcmp(sym->name, name) == 0) return sym; } return NULL; }

Performance profile shows:

  • 10,000 symbols: ~350ms lookup time (35% of total)
  • 1,000 symbols: ~35ms lookup time (20% of total)
  • 100 symbols: ~3ms lookup time (5% of total)

Proposed Hash Table Implementation

1. Data Structures

// Add to src/defs.h #define SYMBOL_HASH_SIZE 1021 // Prime number for better distribution typedef struct symbol_entry { symbol_t *symbol; struct symbol_entry *next; // Chain for collision resolution } symbol_entry_t; typedef struct { symbol_entry_t *buckets[SYMBOL_HASH_SIZE]; int total_symbols; int max_chain_length; // For statistics // Statistics int lookups; int collisions; int hits; int misses; } symbol_table_t; // Global symbol tables typedef struct { symbol_table_t *global_symbols; symbol_table_t *local_symbols; // Current function scope symbol_table_t *types; // Type definitions } symbol_context_t;

2. Hash Function

// src/symbol_table.c - New file // DJB2 hash function - simple and effective uint32_t hash_string(const char *str) { uint32_t hash = 5381; int c; while ((c = *str++)) { hash = ((hash << 5) + hash) + c; // hash * 33 + c } return hash; } int get_bucket_index(const char *name) { return hash_string(name) % SYMBOL_HASH_SIZE; }

3. Symbol Table Operations

symbol_table_t *create_symbol_table(void) { symbol_table_t *table = calloc(1, sizeof(symbol_table_t)); return table; } void insert_symbol(symbol_table_t *table, symbol_t *symbol) { int index = get_bucket_index(symbol->name); // Create new entry symbol_entry_t *entry = malloc(sizeof(symbol_entry_t)); entry->symbol = symbol; // Insert at head of chain (O(1) insertion) entry->next = table->buckets[index]; table->buckets[index] = entry; // Update statistics table->total_symbols++; if (entry->next) { table->collisions++; } // Track max chain length int chain_len = 0; for (symbol_entry_t *e = entry; e; e = e->next) { chain_len++; } if (chain_len > table->max_chain_length) { table->max_chain_length = chain_len; } } symbol_t *lookup_symbol(symbol_table_t *table, const char *name) { table->lookups++; int index = get_bucket_index(name); symbol_entry_t *entry = table->buckets[index]; // Search chain while (entry) { if (strcmp(entry->symbol->name, name) == 0) { table->hits++; return entry->symbol; } entry = entry->next; } table->misses++; return NULL; } // Replace global find_symbol symbol_t *find_symbol_fast(const char *name) { symbol_t *sym = NULL; // Check local scope first if (current_context->local_symbols) { sym = lookup_symbol(current_context->local_symbols, name); if (sym) return sym; } // Check global scope return lookup_symbol(current_context->global_symbols, name); }

4. Dynamic Resizing (Optional Enhancement)

void resize_symbol_table(symbol_table_t *table) { // Only resize if load factor > 0.75 float load_factor = (float)table->total_symbols / SYMBOL_HASH_SIZE; if (load_factor < 0.75) return; // Save old buckets symbol_entry_t *old_buckets[SYMBOL_HASH_SIZE]; memcpy(old_buckets, table->buckets, sizeof(old_buckets)); // Clear table memset(table->buckets, 0, sizeof(table->buckets)); table->total_symbols = 0; table->collisions = 0; // Rehash all symbols with new size for (int i = 0; i < SYMBOL_HASH_SIZE; i++) { symbol_entry_t *entry = old_buckets[i]; while (entry) { symbol_entry_t *next = entry->next; entry->next = NULL; // Reinsert with new hash int new_index = get_bucket_index(entry->symbol->name); entry->next = table->buckets[new_index]; table->buckets[new_index] = entry; table->total_symbols++; entry = next; } } }

5. Integration Points

// Update src/parser.c void enter_scope(void) { // Create new local symbol table current_context->local_symbols = create_symbol_table(); } void exit_scope(void) { // Print statistics if verbose if (verbose_mode) { print_symbol_table_stats(current_context->local_symbols); } // Free local symbol table free_symbol_table(current_context->local_symbols); current_context->local_symbols = NULL; } // Update symbol creation symbol_t *create_symbol(const char *name, type_t *type) { symbol_t *sym = malloc(sizeof(symbol_t)); sym->name = strdup(name); sym->type = type; // Insert into appropriate table if (current_scope == SCOPE_GLOBAL) { insert_symbol(current_context->global_symbols, sym); } else { insert_symbol(current_context->local_symbols, sym); } return sym; }

6. Performance Monitoring

void print_symbol_table_stats(symbol_table_t *table) { printf("=== Symbol Table Statistics ===\n"); printf("Total symbols: %d\n", table->total_symbols); printf("Hash buckets: %d\n", SYMBOL_HASH_SIZE); printf("Load factor: %.2f\n", (float)table->total_symbols / SYMBOL_HASH_SIZE); printf("Collisions: %d (%.1f%%)\n", table->collisions, table->collisions * 100.0 / table->total_symbols); printf("Max chain length: %d\n", table->max_chain_length); printf("Total lookups: %d\n", table->lookups); printf("Hit rate: %.1f%%\n", table->hits * 100.0 / table->lookups); // Distribution analysis int empty = 0, chains = 0; for (int i = 0; i < SYMBOL_HASH_SIZE; i++) { if (!table->buckets[i]) { empty++; } else if (table->buckets[i]->next) { chains++; } } printf("Empty buckets: %d (%.1f%%)\n", empty, empty * 100.0 / SYMBOL_HASH_SIZE); printf("Chains: %d\n", chains); }

Testing

Performance Benchmark

// tests/perf/symbol_lookup.c #define NUM_SYMBOLS 10000 void benchmark_symbol_lookup() { clock_t start, end; // Create many symbols for (int i = 0; i < NUM_SYMBOLS; i++) { char name[32]; sprintf(name, "symbol_%d", i); create_symbol(name, &type_int); } // Benchmark lookups start = clock(); for (int i = 0; i < NUM_SYMBOLS * 10; i++) { char name[32]; sprintf(name, "symbol_%d", rand() % NUM_SYMBOLS); find_symbol_fast(name); } end = clock(); double time_spent = ((double)(end - start)) / CLOCKS_PER_SEC; printf("100,000 lookups in %.3f seconds\n", time_spent); printf("Average: %.3f microseconds per lookup\n", time_spent * 1000000 / (NUM_SYMBOLS * 10)); }

Correctness Tests

// Verify hash table behaves identically to linear search void test_symbol_table_correctness() { // Test insertion and lookup symbol_t *s1 = create_symbol("test1", &type_int); symbol_t *s2 = create_symbol("test2", &type_char); assert(find_symbol_fast("test1") == s1); assert(find_symbol_fast("test2") == s2); assert(find_symbol_fast("nonexistent") == NULL); // Test collision handling // Force collisions by creating symbols with same hash for (int i = 0; i < 100; i++) { char name[32]; sprintf(name, "collision_%d", i * SYMBOL_HASH_SIZE); create_symbol(name, &type_int); } // Verify all can be found for (int i = 0; i < 100; i++) { char name[32]; sprintf(name, "collision_%d", i * SYMBOL_HASH_SIZE); assert(find_symbol_fast(name) != NULL); } }

Migration Plan

  1. Phase 1: Implement hash table alongside existing code
  2. Phase 2: Add compatibility layer with fallback
  3. Phase 3: Migrate all symbol lookups
  4. Phase 4: Remove old linear search code

Success Criteria

  • O(1) average lookup time
  • No functionality regression
  • 30%+ speedup for large programs
  • Hash collision rate < 10%
  • All existing tests pass

Metadata

Metadata

Assignees

No one assigned

    Labels

    enhancementNew feature or request

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions