-
Notifications
You must be signed in to change notification settings - Fork 146
Open
Labels
enhancementNew feature or requestNew feature or request
Description
Description
Symbol lookup currently uses linear search through linked lists, consuming 35% of compilation time for large programs. Implementing a hash table would reduce lookup complexity from O(n) to O(1) average case.
Current Implementation
// src/globals.c - Current linear search
symbol_t *find_symbol(char *name) {
for (symbol_t *sym = symbols; sym; sym = sym->next) {
if (strcmp(sym->name, name) == 0)
return sym;
}
return NULL;
}Performance profile shows:
- 10,000 symbols: ~350ms lookup time (35% of total)
- 1,000 symbols: ~35ms lookup time (20% of total)
- 100 symbols: ~3ms lookup time (5% of total)
Proposed Hash Table Implementation
1. Data Structures
// Add to src/defs.h
#define SYMBOL_HASH_SIZE 1021 // Prime number for better distribution
typedef struct symbol_entry {
symbol_t *symbol;
struct symbol_entry *next; // Chain for collision resolution
} symbol_entry_t;
typedef struct {
symbol_entry_t *buckets[SYMBOL_HASH_SIZE];
int total_symbols;
int max_chain_length; // For statistics
// Statistics
int lookups;
int collisions;
int hits;
int misses;
} symbol_table_t;
// Global symbol tables
typedef struct {
symbol_table_t *global_symbols;
symbol_table_t *local_symbols; // Current function scope
symbol_table_t *types; // Type definitions
} symbol_context_t;2. Hash Function
// src/symbol_table.c - New file
// DJB2 hash function - simple and effective
uint32_t hash_string(const char *str) {
uint32_t hash = 5381;
int c;
while ((c = *str++)) {
hash = ((hash << 5) + hash) + c; // hash * 33 + c
}
return hash;
}
int get_bucket_index(const char *name) {
return hash_string(name) % SYMBOL_HASH_SIZE;
}3. Symbol Table Operations
symbol_table_t *create_symbol_table(void) {
symbol_table_t *table = calloc(1, sizeof(symbol_table_t));
return table;
}
void insert_symbol(symbol_table_t *table, symbol_t *symbol) {
int index = get_bucket_index(symbol->name);
// Create new entry
symbol_entry_t *entry = malloc(sizeof(symbol_entry_t));
entry->symbol = symbol;
// Insert at head of chain (O(1) insertion)
entry->next = table->buckets[index];
table->buckets[index] = entry;
// Update statistics
table->total_symbols++;
if (entry->next) {
table->collisions++;
}
// Track max chain length
int chain_len = 0;
for (symbol_entry_t *e = entry; e; e = e->next) {
chain_len++;
}
if (chain_len > table->max_chain_length) {
table->max_chain_length = chain_len;
}
}
symbol_t *lookup_symbol(symbol_table_t *table, const char *name) {
table->lookups++;
int index = get_bucket_index(name);
symbol_entry_t *entry = table->buckets[index];
// Search chain
while (entry) {
if (strcmp(entry->symbol->name, name) == 0) {
table->hits++;
return entry->symbol;
}
entry = entry->next;
}
table->misses++;
return NULL;
}
// Replace global find_symbol
symbol_t *find_symbol_fast(const char *name) {
symbol_t *sym = NULL;
// Check local scope first
if (current_context->local_symbols) {
sym = lookup_symbol(current_context->local_symbols, name);
if (sym) return sym;
}
// Check global scope
return lookup_symbol(current_context->global_symbols, name);
}4. Dynamic Resizing (Optional Enhancement)
void resize_symbol_table(symbol_table_t *table) {
// Only resize if load factor > 0.75
float load_factor = (float)table->total_symbols / SYMBOL_HASH_SIZE;
if (load_factor < 0.75) return;
// Save old buckets
symbol_entry_t *old_buckets[SYMBOL_HASH_SIZE];
memcpy(old_buckets, table->buckets, sizeof(old_buckets));
// Clear table
memset(table->buckets, 0, sizeof(table->buckets));
table->total_symbols = 0;
table->collisions = 0;
// Rehash all symbols with new size
for (int i = 0; i < SYMBOL_HASH_SIZE; i++) {
symbol_entry_t *entry = old_buckets[i];
while (entry) {
symbol_entry_t *next = entry->next;
entry->next = NULL;
// Reinsert with new hash
int new_index = get_bucket_index(entry->symbol->name);
entry->next = table->buckets[new_index];
table->buckets[new_index] = entry;
table->total_symbols++;
entry = next;
}
}
}5. Integration Points
// Update src/parser.c
void enter_scope(void) {
// Create new local symbol table
current_context->local_symbols = create_symbol_table();
}
void exit_scope(void) {
// Print statistics if verbose
if (verbose_mode) {
print_symbol_table_stats(current_context->local_symbols);
}
// Free local symbol table
free_symbol_table(current_context->local_symbols);
current_context->local_symbols = NULL;
}
// Update symbol creation
symbol_t *create_symbol(const char *name, type_t *type) {
symbol_t *sym = malloc(sizeof(symbol_t));
sym->name = strdup(name);
sym->type = type;
// Insert into appropriate table
if (current_scope == SCOPE_GLOBAL) {
insert_symbol(current_context->global_symbols, sym);
} else {
insert_symbol(current_context->local_symbols, sym);
}
return sym;
}6. Performance Monitoring
void print_symbol_table_stats(symbol_table_t *table) {
printf("=== Symbol Table Statistics ===\n");
printf("Total symbols: %d\n", table->total_symbols);
printf("Hash buckets: %d\n", SYMBOL_HASH_SIZE);
printf("Load factor: %.2f\n",
(float)table->total_symbols / SYMBOL_HASH_SIZE);
printf("Collisions: %d (%.1f%%)\n",
table->collisions,
table->collisions * 100.0 / table->total_symbols);
printf("Max chain length: %d\n", table->max_chain_length);
printf("Total lookups: %d\n", table->lookups);
printf("Hit rate: %.1f%%\n",
table->hits * 100.0 / table->lookups);
// Distribution analysis
int empty = 0, chains = 0;
for (int i = 0; i < SYMBOL_HASH_SIZE; i++) {
if (!table->buckets[i]) {
empty++;
} else if (table->buckets[i]->next) {
chains++;
}
}
printf("Empty buckets: %d (%.1f%%)\n",
empty, empty * 100.0 / SYMBOL_HASH_SIZE);
printf("Chains: %d\n", chains);
}Testing
Performance Benchmark
// tests/perf/symbol_lookup.c
#define NUM_SYMBOLS 10000
void benchmark_symbol_lookup() {
clock_t start, end;
// Create many symbols
for (int i = 0; i < NUM_SYMBOLS; i++) {
char name[32];
sprintf(name, "symbol_%d", i);
create_symbol(name, &type_int);
}
// Benchmark lookups
start = clock();
for (int i = 0; i < NUM_SYMBOLS * 10; i++) {
char name[32];
sprintf(name, "symbol_%d", rand() % NUM_SYMBOLS);
find_symbol_fast(name);
}
end = clock();
double time_spent = ((double)(end - start)) / CLOCKS_PER_SEC;
printf("100,000 lookups in %.3f seconds\n", time_spent);
printf("Average: %.3f microseconds per lookup\n",
time_spent * 1000000 / (NUM_SYMBOLS * 10));
}Correctness Tests
// Verify hash table behaves identically to linear search
void test_symbol_table_correctness() {
// Test insertion and lookup
symbol_t *s1 = create_symbol("test1", &type_int);
symbol_t *s2 = create_symbol("test2", &type_char);
assert(find_symbol_fast("test1") == s1);
assert(find_symbol_fast("test2") == s2);
assert(find_symbol_fast("nonexistent") == NULL);
// Test collision handling
// Force collisions by creating symbols with same hash
for (int i = 0; i < 100; i++) {
char name[32];
sprintf(name, "collision_%d", i * SYMBOL_HASH_SIZE);
create_symbol(name, &type_int);
}
// Verify all can be found
for (int i = 0; i < 100; i++) {
char name[32];
sprintf(name, "collision_%d", i * SYMBOL_HASH_SIZE);
assert(find_symbol_fast(name) != NULL);
}
}Migration Plan
- Phase 1: Implement hash table alongside existing code
- Phase 2: Add compatibility layer with fallback
- Phase 3: Migrate all symbol lookups
- Phase 4: Remove old linear search code
Success Criteria
- O(1) average lookup time
- No functionality regression
- 30%+ speedup for large programs
- Hash collision rate < 10%
- All existing tests pass
Metadata
Metadata
Assignees
Labels
enhancementNew feature or requestNew feature or request