#!/usr/bin/env python3
"""Example demonstrating Cache-Aware Data Structures."""

import os
import random
import sys
import time

# Make the package importable when running this example from a repo checkout
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from cache_aware_structures import (
    AdaptiveMap,
    CompressedTrie,
    MemoryHierarchy,
    create_optimized_structure,
)


def demonstrate_adaptive_behavior():
    """Show how AdaptiveMap adapts to different sizes"""
    print("=" * 60)
    print("Adaptive Map Behavior")
    print("=" * 60)

    # Create adaptive map
    amap = AdaptiveMap[int, str]()

    # Track adaptations
    print("\nInserting data and watching adaptations:")
    print("-" * 50)

    sizes = [1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000]
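
    # NOTE: the progression below depends on the AdaptiveMap internals in
    # cache_aware_structures. The expectation (an assumption, not a verified
    # contract) is that the map migrates its backing store as it grows --
    # e.g. compact array -> hash table -> tree -- and reports the current
    # choice through the 'implementation' and 'memory_level' stats.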
    for target_size in sizes:
        # Insert to reach target size
        current = amap.size()
        for i in range(current, target_size):
            amap.put(i, f"value_{i}")

        stats = amap.get_stats()
        if stats['size'] in sizes:  # Only print at milestones
            print(f"Size: {stats['size']:>6} | "
                  f"Implementation: {stats['implementation']:>10} | "
                  f"Memory: {stats['memory_level']:>5}")

    # Test different access patterns
    print("\n\nTesting access patterns:")
    print("-" * 50)
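
    # get_stats() also exposes an access-pattern summary. sequential_ratio is
    # presumably the fraction of recent lookups whose key was adjacent to the
    # previous one, so the scan below should push it toward 1.0 and the
    # random probes afterwards should pull it back down.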

    # Sequential access
    print("Sequential access pattern...")
    for i in range(100):
        amap.get(i)

    stats = amap.get_stats()
    print(f"  Sequential ratio: {stats['access_pattern']['sequential_ratio']:.2f}")

    # Random access
    print("\nRandom access pattern...")
    for _ in range(100):
        amap.get(random.randint(0, 999))

    stats = amap.get_stats()
    print(f"  Sequential ratio: {stats['access_pattern']['sequential_ratio']:.2f}")


def benchmark_structures():
    """Compare performance of different structures"""
    print("\n\n" + "=" * 60)
    print("Performance Comparison")
    print("=" * 60)

    sizes = [100, 1000, 10000, 100000]

    print(f"\n{'Size':>8} | {'Dict':>8} | {'Adaptive':>8} | {'Speedup':>8}")
    print("-" * 40)
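
    # Caveat on methodology: a single wall-clock measurement per size is
    # noisy; repeating each run and taking the minimum would be more robust.
    # Expect speedups below 1.0x at small sizes, too -- a map doing adaptive
    # bookkeeping in Python is unlikely to beat the C-implemented built-in
    # dict until its cache-friendly layout starts to matter.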
    for n in sizes:
        # Generate test data
        keys = [f"key_{i:06d}" for i in range(n)]
        values = [f"value_{i}" for i in range(n)]

        # Benchmark standard dict (perf_counter is monotonic and has higher
        # resolution than time.time, so it is the right clock for timing)
        start = time.perf_counter()
        std_dict = {}
        for k, v in zip(keys, values):
            std_dict[k] = v
        for k in keys[:1000]:  # Sample lookups
            _ = std_dict.get(k)
        dict_time = time.perf_counter() - start

        # Benchmark adaptive map
        start = time.perf_counter()
        adaptive = AdaptiveMap[str, str]()
        for k, v in zip(keys, values):
            adaptive.put(k, v)
        for k in keys[:1000]:  # Sample lookups
            _ = adaptive.get(k)
        adaptive_time = time.perf_counter() - start

        speedup = dict_time / adaptive_time
        print(f"{n:>8} | {dict_time:>8.3f} | {adaptive_time:>8.3f} | {speedup:>8.2f}x")


def demonstrate_cache_optimization():
    """Show cache line optimization benefits"""
    print("\n\n" + "=" * 60)
    print("Cache Line Optimization")
    print("=" * 60)

    hierarchy = MemoryHierarchy.detect_system()
    cache_line_size = 64  # bytes; the norm on x86-64 and most ARM cores

    print("\nSystem Information:")
    print(f"  Cache line size: {cache_line_size} bytes")
    print(f"  L1 cache: {hierarchy.l1_size / 1024:.0f}KB")
    print(f"  L2 cache: {hierarchy.l2_size / 1024:.0f}KB")
    print(f"  L3 cache: {hierarchy.l3_size / 1024 / 1024:.1f}MB")

    # Calculate optimal parameters
    print("\nOptimal Structure Parameters:")

    # For different key/value sizes
    configs = [
        ("Small (4B key, 4B value)", 4, 4),
        ("Medium (8B key, 8B value)", 8, 8),
        ("Large (16B key, 32B value)", 16, 32),
    ]
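
    # Worked example (medium config): with 16-byte entries, one 64-byte cache
    # line holds 64 // 16 = 4 entries. The B-tree node then keeps
    # 4 - 1 = 3 keys, reserving one slot's worth of space for child pointers,
    # so visiting a node costs a single cache-line fill.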
    for name, key_size, value_size in configs:
        entry_size = key_size + value_size
        entries_per_line = cache_line_size // entry_size

        # B-tree node size
        btree_keys = entries_per_line - 1  # Leave room for child pointers

        # Hash table bucket
        hash_entries = cache_line_size // entry_size

        print(f"\n{name}:")
        print(f"  Entries per cache line: {entries_per_line}")
        print(f"  B-tree keys per node: {btree_keys}")
        print(f"  Hash bucket capacity: {hash_entries}")

        # Calculate memory efficiency
        utilization = (entries_per_line * entry_size) / cache_line_size * 100
        print(f"  Cache utilization: {utilization:.1f}%")


def demonstrate_compressed_trie():
    """Show compressed trie benefits for strings"""
    print("\n\n" + "=" * 60)
    print("Compressed Trie for String Data")
    print("=" * 60)

    # Create trie
    trie = CompressedTrie()
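
    # A compressed (radix) trie collapses chains of single-child nodes into
    # one edge, so a shared prefix like "/api/v1/users/" is stored once and
    # nodes are only created where keys actually diverge.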

    # Common prefixes scenario (URLs, file paths, etc.)
    test_data = [
        # API endpoints
        ("/api/v1/users/list", "list_users"),
        ("/api/v1/users/get", "get_user"),
        ("/api/v1/users/create", "create_user"),
        ("/api/v1/users/update", "update_user"),
        ("/api/v1/users/delete", "delete_user"),
        ("/api/v1/products/list", "list_products"),
        ("/api/v1/products/get", "get_product"),
        ("/api/v2/users/list", "list_users_v2"),
        ("/api/v2/analytics/events", "analytics_events"),
        ("/api/v2/analytics/metrics", "analytics_metrics"),
    ]

    print("\nInserting API endpoints:")
    for path, handler in test_data:
        trie.insert(path, handler)
        print(f"  {path} -> {handler}")

    # Memory comparison (rough back-of-envelope estimates, not measured)
    print("\n\nMemory Comparison:")

    # Trie size estimation (simplified)
    trie_nodes = 50  # Approximate node count after path compression
    trie_memory = trie_nodes * 64  # Assume 64 bytes per node

    # Dict size: ~50B key + ~20B value per entry, doubled for object overhead
    dict_memory = len(test_data) * (50 + 20) * 2

    ratio = dict_memory / trie_memory
    print(f"  Standard dict: ~{dict_memory} bytes")
    print(f"  Compressed trie: ~{trie_memory} bytes")
    print(f"  Compression ratio: {ratio:.1f}x")
    if ratio < 1:
        print("  (With only a handful of keys, per-node overhead dominates;")
        print("   the trie pays off as key count and shared prefixes grow.)")

    # Search demonstration
    print("\n\nSearching:")
    search_keys = [
        "/api/v1/users/list",
        "/api/v2/analytics/events",
        "/api/v3/users/list",  # Not found
    ]

    for key in search_keys:
        result = trie.search(key)
        status = "Found" if result else "Not found"
        print(f"  {key}: {status}" + (f" -> {result}" if result else ""))


def demonstrate_external_memory():
    """Show external memory map with √n buffers"""
    print("\n\n" + "=" * 60)
    print("External Memory Map (Disk-backed)")
    print("=" * 60)

    # Create external map with an explicit hint (created here to show the
    # API; the numbers below are simulated rather than measured on emap)
    emap = create_optimized_structure(
        hint_type='external',
        hint_memory_limit=1024 * 1024,  # 1MB buffer limit
    )
print("\nSimulating large dataset that doesn't fit in memory:")
|
|
|
|
# Insert large dataset
|
|
n = 1000000 # 1M entries
|
|
print(f" Dataset size: {n:,} entries")
|
|
print(f" Estimated size: {n * 20 / 1e6:.1f}MB")
|
|
|
|
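
    # Why √n? With n entries on disk and an in-memory buffer of b entries,
    # a full scan needs about n / b buffer refills while the buffer itself
    # costs b entries of RAM; b = √n balances the two. This is the classic
    # external-memory compromise the structure presumably implements.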

    # Buffer size calculation
    sqrt_n = int(n ** 0.5)
    buffer_entries = sqrt_n
    buffer_memory = buffer_entries * 20  # 20 bytes per entry

    print("\n√n Buffer Configuration:")
    print(f"  Buffer entries: {buffer_entries:,} (√{n:,})")
    print(f"  Buffer memory: {buffer_memory / 1024:.1f}KB")
    print(f"  Memory reduction: {(1 - sqrt_n / n) * 100:.1f}%")

    # Simulate access patterns (a synthetic model -- no real I/O happens here)
    print("\n\nAccess Pattern Analysis:")

    # Sequential scan: treat the first 100 entries of each √n-sized block
    # as resident in the buffer
    sequential_hits = 0
    for i in range(1000):
        if i % sqrt_n < 100:  # Inside the buffered window
            sequential_hits += 1

    print(f"  Sequential scan: {sequential_hits / 10:.1f}% buffer hit rate")

    # Random access: a uniformly random key is buffered with probability √n/n
    random_hits = 0
    for _ in range(1000):
        if random.random() < sqrt_n / n:  # Key happens to be buffered
            random_hits += 1

    print(f"  Random access: {random_hits / 10:.1f}% buffer hit rate")

    # Recommendations
    print("\n\nRecommendations:")
    print("  - Use sequential access when possible (better cache hits)")
    print("  - Group related keys together (spatial locality)")
    print("  - Consider compression for values (reduce I/O)")


def main():
    """Run all demonstrations"""
    demonstrate_adaptive_behavior()
    benchmark_structures()
    demonstrate_cache_optimization()
    demonstrate_compressed_trie()
    demonstrate_external_memory()

    print("\n\n" + "=" * 60)
    print("Cache-Aware Data Structures Complete!")
    print("=" * 60)
    print("\nKey Takeaways:")
    print("- Structures adapt to data size automatically")
    print("- Cache line alignment improves performance")
    print("- √n buffers enable huge datasets with limited memory")
    print("- Compression trades CPU for memory")
    print("=" * 60)


if __name__ == "__main__":
    main()