- Notifications
You must be signed in to change notification settings - Fork 143
Open
Labels
enhancementNew feature or requestNew feature or request
Description
Description
Symbol lookup currently uses linear search through linked lists, consuming 35% of compilation time for large programs. Implementing a hash table would reduce lookup complexity from O(n) to O(1) average case.
Current Implementation
// src/globals.c - Current linear search symbol_t *find_symbol(char *name) { for (symbol_t *sym = symbols; sym; sym = sym->next) { if (strcmp(sym->name, name) == 0) return sym; } return NULL; }
Performance profile shows:
- 10,000 symbols: ~350ms lookup time (35% of total)
- 1,000 symbols: ~35ms lookup time (20% of total)
- 100 symbols: ~3ms lookup time (5% of total)
Proposed Hash Table Implementation
1. Data Structures
// Add to src/defs.h #define SYMBOL_HASH_SIZE 1021 // Prime number for better distribution typedef struct symbol_entry { symbol_t *symbol; struct symbol_entry *next; // Chain for collision resolution } symbol_entry_t; typedef struct { symbol_entry_t *buckets[SYMBOL_HASH_SIZE]; int total_symbols; int max_chain_length; // For statistics // Statistics int lookups; int collisions; int hits; int misses; } symbol_table_t; // Global symbol tables typedef struct { symbol_table_t *global_symbols; symbol_table_t *local_symbols; // Current function scope symbol_table_t *types; // Type definitions } symbol_context_t;
2. Hash Function
// src/symbol_table.c - New file // DJB2 hash function - simple and effective uint32_t hash_string(const char *str) { uint32_t hash = 5381; int c; while ((c = *str++)) { hash = ((hash << 5) + hash) + c; // hash * 33 + c } return hash; } int get_bucket_index(const char *name) { return hash_string(name) % SYMBOL_HASH_SIZE; }
3. Symbol Table Operations
symbol_table_t *create_symbol_table(void) { symbol_table_t *table = calloc(1, sizeof(symbol_table_t)); return table; } void insert_symbol(symbol_table_t *table, symbol_t *symbol) { int index = get_bucket_index(symbol->name); // Create new entry symbol_entry_t *entry = malloc(sizeof(symbol_entry_t)); entry->symbol = symbol; // Insert at head of chain (O(1) insertion) entry->next = table->buckets[index]; table->buckets[index] = entry; // Update statistics table->total_symbols++; if (entry->next) { table->collisions++; } // Track max chain length int chain_len = 0; for (symbol_entry_t *e = entry; e; e = e->next) { chain_len++; } if (chain_len > table->max_chain_length) { table->max_chain_length = chain_len; } } symbol_t *lookup_symbol(symbol_table_t *table, const char *name) { table->lookups++; int index = get_bucket_index(name); symbol_entry_t *entry = table->buckets[index]; // Search chain while (entry) { if (strcmp(entry->symbol->name, name) == 0) { table->hits++; return entry->symbol; } entry = entry->next; } table->misses++; return NULL; } // Replace global find_symbol symbol_t *find_symbol_fast(const char *name) { symbol_t *sym = NULL; // Check local scope first if (current_context->local_symbols) { sym = lookup_symbol(current_context->local_symbols, name); if (sym) return sym; } // Check global scope return lookup_symbol(current_context->global_symbols, name); }
4. Dynamic Resizing (Optional Enhancement)
void resize_symbol_table(symbol_table_t *table) { // Only resize if load factor > 0.75 float load_factor = (float)table->total_symbols / SYMBOL_HASH_SIZE; if (load_factor < 0.75) return; // Save old buckets symbol_entry_t *old_buckets[SYMBOL_HASH_SIZE]; memcpy(old_buckets, table->buckets, sizeof(old_buckets)); // Clear table memset(table->buckets, 0, sizeof(table->buckets)); table->total_symbols = 0; table->collisions = 0; // Rehash all symbols with new size for (int i = 0; i < SYMBOL_HASH_SIZE; i++) { symbol_entry_t *entry = old_buckets[i]; while (entry) { symbol_entry_t *next = entry->next; entry->next = NULL; // Reinsert with new hash int new_index = get_bucket_index(entry->symbol->name); entry->next = table->buckets[new_index]; table->buckets[new_index] = entry; table->total_symbols++; entry = next; } } }
5. Integration Points
// Update src/parser.c void enter_scope(void) { // Create new local symbol table current_context->local_symbols = create_symbol_table(); } void exit_scope(void) { // Print statistics if verbose if (verbose_mode) { print_symbol_table_stats(current_context->local_symbols); } // Free local symbol table free_symbol_table(current_context->local_symbols); current_context->local_symbols = NULL; } // Update symbol creation symbol_t *create_symbol(const char *name, type_t *type) { symbol_t *sym = malloc(sizeof(symbol_t)); sym->name = strdup(name); sym->type = type; // Insert into appropriate table if (current_scope == SCOPE_GLOBAL) { insert_symbol(current_context->global_symbols, sym); } else { insert_symbol(current_context->local_symbols, sym); } return sym; }
6. Performance Monitoring
void print_symbol_table_stats(symbol_table_t *table) { printf("=== Symbol Table Statistics ===\n"); printf("Total symbols: %d\n", table->total_symbols); printf("Hash buckets: %d\n", SYMBOL_HASH_SIZE); printf("Load factor: %.2f\n", (float)table->total_symbols / SYMBOL_HASH_SIZE); printf("Collisions: %d (%.1f%%)\n", table->collisions, table->collisions * 100.0 / table->total_symbols); printf("Max chain length: %d\n", table->max_chain_length); printf("Total lookups: %d\n", table->lookups); printf("Hit rate: %.1f%%\n", table->hits * 100.0 / table->lookups); // Distribution analysis int empty = 0, chains = 0; for (int i = 0; i < SYMBOL_HASH_SIZE; i++) { if (!table->buckets[i]) { empty++; } else if (table->buckets[i]->next) { chains++; } } printf("Empty buckets: %d (%.1f%%)\n", empty, empty * 100.0 / SYMBOL_HASH_SIZE); printf("Chains: %d\n", chains); }
Testing
Performance Benchmark
// tests/perf/symbol_lookup.c #define NUM_SYMBOLS 10000 void benchmark_symbol_lookup() { clock_t start, end; // Create many symbols for (int i = 0; i < NUM_SYMBOLS; i++) { char name[32]; sprintf(name, "symbol_%d", i); create_symbol(name, &type_int); } // Benchmark lookups start = clock(); for (int i = 0; i < NUM_SYMBOLS * 10; i++) { char name[32]; sprintf(name, "symbol_%d", rand() % NUM_SYMBOLS); find_symbol_fast(name); } end = clock(); double time_spent = ((double)(end - start)) / CLOCKS_PER_SEC; printf("100,000 lookups in %.3f seconds\n", time_spent); printf("Average: %.3f microseconds per lookup\n", time_spent * 1000000 / (NUM_SYMBOLS * 10)); }
Correctness Tests
// Verify hash table behaves identically to linear search void test_symbol_table_correctness() { // Test insertion and lookup symbol_t *s1 = create_symbol("test1", &type_int); symbol_t *s2 = create_symbol("test2", &type_char); assert(find_symbol_fast("test1") == s1); assert(find_symbol_fast("test2") == s2); assert(find_symbol_fast("nonexistent") == NULL); // Test collision handling // Force collisions by creating symbols with same hash for (int i = 0; i < 100; i++) { char name[32]; sprintf(name, "collision_%d", i * SYMBOL_HASH_SIZE); create_symbol(name, &type_int); } // Verify all can be found for (int i = 0; i < 100; i++) { char name[32]; sprintf(name, "collision_%d", i * SYMBOL_HASH_SIZE); assert(find_symbol_fast(name) != NULL); } }
Migration Plan
- Phase 1: Implement hash table alongside existing code
- Phase 2: Add compatibility layer with fallback
- Phase 3: Migrate all symbol lookups
- Phase 4: Remove old linear search code
Success Criteria
- O(1) average lookup time
- No functionality regression
- 30%+ speedup for large programs
- Hash collision rate < 10%
- All existing tests pass
Metadata
Metadata
Assignees
Labels
enhancementNew feature or requestNew feature or request