""" SQLite Buffer Pool Space-Time Tradeoff Experiment Demonstrates how SQLite's page cache size affects query performance, validating Williams' √n space-time tradeoff in a real production database. Key parameters: - cache_size: Number of pages in memory (default 2000) - page_size: Size of each page (default 4096 bytes) This experiment shows: 1. Full cache (O(n) space): Fast queries 2. √n cache: Moderate slowdown 3. Minimal cache: Extreme slowdown """ import sqlite3 import time import os import numpy as np import matplotlib.pyplot as plt from typing import Dict, List, Tuple import json import tempfile import shutil class SQLiteExperiment: """Test SQLite performance with different cache sizes""" def __init__(self, num_rows: int, page_size: int = 4096): self.num_rows = num_rows self.page_size = page_size self.temp_dir = tempfile.mkdtemp() self.db_path = os.path.join(self.temp_dir, 'test.db') def cleanup(self): """Clean up temporary files""" if os.path.exists(self.temp_dir): shutil.rmtree(self.temp_dir) def setup_database(self): """Create and populate the test database""" conn = sqlite3.connect(self.db_path) conn.execute(f'PRAGMA page_size = {self.page_size}') conn.commit() # Create tables simulating a real app conn.execute(''' CREATE TABLE users ( id INTEGER PRIMARY KEY, name TEXT, email TEXT, created_at INTEGER, data BLOB ) ''') conn.execute(''' CREATE TABLE posts ( id INTEGER PRIMARY KEY, user_id INTEGER, title TEXT, content TEXT, created_at INTEGER, FOREIGN KEY (user_id) REFERENCES users(id) ) ''') # Insert data print(f"Populating database with {self.num_rows:,} users...") # Batch insert for efficiency batch_size = 1000 for i in range(0, self.num_rows, batch_size): batch = [] for j in range(min(batch_size, self.num_rows - i)): user_id = i + j # Add some data to make pages more realistic data = os.urandom(200) # 200 bytes of data per user batch.append(( user_id, f'User {user_id}', f'user{user_id}@example.com', int(time.time()) - user_id, data )) conn.executemany( 'INSERT INTO users VALUES (?, ?, ?, ?, ?)', batch ) # Insert 3 posts per user post_batch = [] for user in batch: user_id = user[0] for k in range(3): post_batch.append(( user_id * 3 + k, user_id, f'Post {k} by user {user_id}', f'Content of post {k}' * 10, # Make content larger int(time.time()) - user_id + k )) conn.executemany( 'INSERT INTO posts VALUES (?, ?, ?, ?, ?)', post_batch ) # Create indexes (common in real apps) conn.execute('CREATE INDEX idx_users_email ON users(email)') conn.execute('CREATE INDEX idx_posts_user ON posts(user_id)') conn.execute('CREATE INDEX idx_posts_created ON posts(created_at)') conn.commit() conn.close() # Get database size db_size = os.path.getsize(self.db_path) print(f"Database size: {db_size / 1024 / 1024:.1f} MB") return db_size def run_queries(self, cache_size: int, num_queries: int = 100) -> Dict: """Run queries with specified cache size""" conn = sqlite3.connect(self.db_path) # Set cache size (in pages) conn.execute(f'PRAGMA cache_size = {cache_size}') # Clear OS cache by reading another file (best effort) dummy_data = os.urandom(50 * 1024 * 1024) # 50MB del dummy_data # Get actual cache size in bytes cache_bytes = cache_size * self.page_size # Query patterns that simulate real usage query_times = { 'point_lookups': [], 'range_scans': [], 'joins': [], 'aggregations': [] } # Warm up conn.execute('SELECT COUNT(*) FROM users').fetchone() # 1. 

    def run_queries(self, cache_size: int, num_queries: int = 100) -> Dict:
        """Run queries with specified cache size"""
        conn = sqlite3.connect(self.db_path)

        # Set cache size (in pages)
        conn.execute(f'PRAGMA cache_size = {cache_size}')

        # Allocate and discard 50MB to put some pressure on memory
        # (best effort; this does not reliably evict the OS page cache)
        dummy_data = os.urandom(50 * 1024 * 1024)  # 50MB
        del dummy_data

        # Get actual cache size in bytes
        cache_bytes = cache_size * self.page_size

        # Query patterns that simulate real usage
        query_times = {
            'point_lookups': [],
            'range_scans': [],
            'joins': [],
            'aggregations': []
        }

        # Warm up
        conn.execute('SELECT COUNT(*) FROM users').fetchone()

        # 1. Point lookups (random access pattern)
        for _ in range(num_queries):
            user_id = np.random.randint(1, self.num_rows)
            start = time.time()
            conn.execute(
                'SELECT * FROM users WHERE id = ?', (user_id,)
            ).fetchone()
            query_times['point_lookups'].append(time.time() - start)

        # 2. Range scans
        for _ in range(num_queries // 10):  # Fewer range scans
            max_start = max(1, self.num_rows - 100)
            start_id = np.random.randint(1, max_start + 1)
            start = time.time()
            conn.execute(
                'SELECT * FROM users WHERE id BETWEEN ? AND ?',
                (start_id, min(start_id + 100, self.num_rows))
            ).fetchall()
            query_times['range_scans'].append(time.time() - start)

        # 3. Joins (most expensive)
        for _ in range(num_queries // 20):  # Even fewer joins
            user_id = np.random.randint(1, self.num_rows)
            start = time.time()
            conn.execute('''
                SELECT u.*, p.*
                FROM users u
                JOIN posts p ON u.id = p.user_id
                WHERE u.id = ?
            ''', (user_id,)).fetchall()
            query_times['joins'].append(time.time() - start)

        # 4. Aggregations
        for _ in range(num_queries // 20):
            start_time = int(time.time()) - np.random.randint(0, self.num_rows)
            start = time.time()
            conn.execute('''
                SELECT COUNT(*), AVG(LENGTH(content))
                FROM posts
                WHERE created_at > ?
            ''', (start_time,)).fetchone()
            query_times['aggregations'].append(time.time() - start)

        conn.close()

        return {
            'cache_size': cache_size,
            'cache_bytes': cache_bytes,
            'query_times': query_times,
            'avg_point_lookup': np.mean(query_times['point_lookups']),
            'avg_range_scan': np.mean(query_times['range_scans']),
            'avg_join': np.mean(query_times['joins']),
            'avg_aggregation': np.mean(query_times['aggregations'])
        }

    def analyze_page_distribution(self) -> Dict:
        """Analyze how data is distributed across pages"""
        conn = sqlite3.connect(self.db_path)

        # Get page count
        page_count = conn.execute('PRAGMA page_count').fetchone()[0]

        # Get various statistics
        stats = {
            'page_count': page_count,
            'page_size': self.page_size,
            'total_size': page_count * self.page_size,
            'users_count': conn.execute('SELECT COUNT(*) FROM users').fetchone()[0],
            'posts_count': conn.execute('SELECT COUNT(*) FROM posts').fetchone()[0]
        }

        conn.close()
        return stats
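
# Illustrative sketch (an added helper, not used by run_sqlite_experiment below):
# PRAGMA cache_size accepts either a positive page count or a negative value
# interpreted as a KiB budget. This converts a byte budget into the equivalent
# page count, using the page size reported by the database itself.
def cache_pages_for_budget(conn: sqlite3.Connection, budget_bytes: int) -> int:
    """Return the number of cache pages that fits within budget_bytes."""
    page_size = conn.execute('PRAGMA page_size').fetchone()[0]
    return max(1, budget_bytes // page_size)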

def run_sqlite_experiment():
    """Run the complete SQLite buffer pool experiment"""
    print("="*60)
    print("SQLite Buffer Pool Space-Time Tradeoff Experiment")
    print("="*60)

    # Test with different database sizes
    sizes = [10000, 50000, 100000]  # Number of users

    results = {}

    for num_users in sizes:
        print(f"\n{'='*40}")
        print(f"Testing with {num_users:,} users")
        print(f"{'='*40}")

        exp = SQLiteExperiment(num_users)
        db_size = exp.setup_database()
        stats = exp.analyze_page_distribution()

        print(f"Database pages: {stats['page_count']:,}")
        print(f"Page size: {stats['page_size']} bytes")

        # Test different cache sizes:
        # Full cache, √n cache, minimal cache
        cache_configs = [
            ('Full O(n)', stats['page_count']),               # All pages in memory
            ('√n cache', int(np.sqrt(stats['page_count']))),  # √n pages
            ('Minimal', 10)                                   # Almost no cache
        ]

        user_results = []
        for label, cache_size in cache_configs:
            print(f"\nTesting {label}: {cache_size} pages ({cache_size * stats['page_size'] / 1024:.1f} KB)")

            result = exp.run_queries(cache_size, num_queries=50)
            result['label'] = label
            user_results.append(result)

            print(f"  Point lookups: {result['avg_point_lookup'] * 1000:.2f} ms")
            print(f"  Range scans: {result['avg_range_scan'] * 1000:.2f} ms")
            print(f"  Joins: {result['avg_join'] * 1000:.2f} ms")

        results[num_users] = {
            'stats': stats,
            'experiments': user_results
        }

        exp.cleanup()

    # Create visualizations
    create_sqlite_plots(results)

    # Save results
    with open('sqlite_results.json', 'w') as f:
        # Convert numpy types for JSON serialization
        def convert(o):
            if isinstance(o, np.integer):
                return int(o)
            if isinstance(o, np.floating):
                return float(o)
            if isinstance(o, np.ndarray):
                return o.tolist()
            return o

        json.dump(results, f, indent=2, default=convert)

    print("\n" + "="*60)
    print("EXPERIMENT COMPLETE")
    print("Generated files:")
    print("  - sqlite_results.json")
    print("  - sqlite_buffer_pool_analysis.png")
    print("="*60)

    return results
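
# Optional quick check (an added sketch, not part of the original flow): build a
# small database, size the cache at √n pages, and time one short query batch.
# The try/finally ensures the temporary database directory is removed even if a
# query fails.
def run_quick_check(num_users: int = 5000) -> Dict:
    """Run a single small √n-cache configuration as a smoke test."""
    exp = SQLiteExperiment(num_users)
    try:
        exp.setup_database()
        stats = exp.analyze_page_distribution()
        sqrt_pages = max(1, int(np.sqrt(stats['page_count'])))
        return exp.run_queries(sqrt_pages, num_queries=10)
    finally:
        exp.cleanup()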

def create_sqlite_plots(results: Dict):
    """Create publication-quality plots for SQLite experiment"""
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(14, 10))

    # Plot 1: Point lookup performance vs cache size
    sizes = sorted(results.keys())
    for size in sizes:
        experiments = results[size]['experiments']
        cache_sizes = [e['cache_size'] for e in experiments]
        point_times = [e['avg_point_lookup'] * 1000 for e in experiments]  # Convert to ms

        ax1.plot(cache_sizes, point_times, 'o-', label=f'{size:,} users',
                 linewidth=2, markersize=8)

    ax1.set_xlabel('Cache Size (pages)', fontsize=12)
    ax1.set_ylabel('Avg Point Lookup Time (ms)', fontsize=12)
    ax1.set_title('Point Lookup Performance vs Cache Size', fontsize=14)
    ax1.set_xscale('log')
    ax1.set_yscale('log')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Plot 2: Slowdown factors
    base_size = sizes[1]  # Use 50k as reference
    base_results = results[base_size]['experiments']
    full_cache_time = base_results[0]['avg_point_lookup']
    sqrt_cache_time = base_results[1]['avg_point_lookup']
    min_cache_time = base_results[2]['avg_point_lookup']

    categories = ['Full\nO(n)', '√n\nCache', 'Minimal\nO(1)']
    slowdowns = [1, sqrt_cache_time / full_cache_time, min_cache_time / full_cache_time]

    bars = ax2.bar(categories, slowdowns, color=['green', 'orange', 'red'])
    ax2.set_ylabel('Slowdown Factor', fontsize=12)
    ax2.set_title(f'Query Slowdown vs Cache Size ({base_size:,} users)', fontsize=14)

    # Add value labels on bars
    for bar, val in zip(bars, slowdowns):
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width() / 2., height,
                 f'{val:.1f}×', ha='center', va='bottom', fontsize=11)

    ax2.grid(True, alpha=0.3, axis='y')

    # Plot 3: Memory usage efficiency
    for size in sizes:
        experiments = results[size]['experiments']
        cache_mb = [e['cache_bytes'] / 1024 / 1024 for e in experiments]
        query_speed = [1 / e['avg_point_lookup'] for e in experiments]  # Queries per second

        ax3.plot(cache_mb, query_speed, 's-', label=f'{size:,} users',
                 linewidth=2, markersize=8)

    ax3.set_xlabel('Cache Size (MB)', fontsize=12)
    ax3.set_ylabel('Queries per Second', fontsize=12)
    ax3.set_title('Memory Efficiency: Speed vs Cache Size', fontsize=14)
    ax3.set_xscale('log')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # Plot 4: Different query types (grouped bars for the reference database size)
    query_types = ['Point\nLookup', 'Range\nScan', 'Join\nQuery']
    x = np.arange(len(query_types))
    width = 0.25

    for i, result in enumerate(base_results):
        times = [
            result['avg_point_lookup'] * 1000,
            result['avg_range_scan'] * 1000,
            result['avg_join'] * 1000
        ]
        ax4.bar(x + i * width, times, width, label=result['label'])

    ax4.set_xlabel('Query Type', fontsize=12)
    ax4.set_ylabel('Average Time (ms)', fontsize=12)
    ax4.set_title('Query Performance by Type and Cache Size', fontsize=14)
    ax4.set_xticks(x + width)
    ax4.set_xticklabels(query_types)
    ax4.legend()
    ax4.grid(True, alpha=0.3, axis='y')
    ax4.set_yscale('log')

    plt.suptitle('SQLite Buffer Pool: Space-Time Tradeoffs', fontsize=16)
    plt.tight_layout()
    plt.savefig('sqlite_buffer_pool_analysis.png', dpi=300, bbox_inches='tight')
    plt.close()


if __name__ == "__main__":
    run_sqlite_experiment()