sqrtspace-experiments/experiments/database_buffer_pool/sqlite_heavy_experiment.py
"""
SQLite experiment with heavier workload to demonstrate space-time tradeoffs
Uses larger data and more complex queries to stress the buffer pool
"""
import sqlite3
import time
import os
import numpy as np
import matplotlib.pyplot as plt
import json
import tempfile
import shutil
import gc


class SQLiteHeavyExperiment:
    """SQLite experiment with larger data to force real I/O"""

    def __init__(self, scale_factor: int = 100000):
        self.scale_factor = scale_factor
        self.temp_dir = tempfile.mkdtemp()
        self.db_path = os.path.join(self.temp_dir, 'heavy.db')

    def cleanup(self):
        """Clean up temporary files"""
        if os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def setup_database(self):
        """Create a database that's too large for small caches"""
        conn = sqlite3.connect(self.db_path)
        # Use larger pages for efficiency
        conn.execute('PRAGMA page_size = 8192')
        conn.execute('PRAGMA journal_mode = WAL')  # Write-ahead logging
        conn.commit()
        # Create tables that simulate real-world complexity
        conn.execute('''
            CREATE TABLE documents (
                id INTEGER PRIMARY KEY,
                user_id INTEGER,
                title TEXT,
                content TEXT,
                tags TEXT,
                created_at INTEGER,
                updated_at INTEGER,
                view_count INTEGER,
                data BLOB
            )
        ''')
        conn.execute('''
            CREATE TABLE analytics (
                id INTEGER PRIMARY KEY,
                doc_id INTEGER,
                event_type TEXT,
                user_id INTEGER,
                timestamp INTEGER,
                metadata TEXT,
                FOREIGN KEY (doc_id) REFERENCES documents(id)
            )
        ''')
        print("Populating database (this will take a moment)...")
        # Insert documents with realistic data
        batch_size = 1000
        total_docs = self.scale_factor
        for i in range(0, total_docs, batch_size):
            batch = []
            for j in range(min(batch_size, total_docs - i)):
                doc_id = i + j
                # Create variable-length content to simulate real documents
                content_length = np.random.randint(100, 2000)
                content = 'x' * content_length  # Simplified for speed
                # Random binary data to increase row size
                data_size = np.random.randint(500, 2000)
                data = os.urandom(data_size)
                batch.append((
                    doc_id,
                    np.random.randint(1, 10000),  # user_id
                    f'Document {doc_id}',
                    content,
                    f'tag{doc_id % 100},tag{doc_id % 50}',
                    int(time.time()) - doc_id,
                    int(time.time()) - doc_id // 2,
                    np.random.randint(0, 10000),
                    data
                ))
            conn.executemany(
                'INSERT INTO documents VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
                batch
            )
            # Insert analytics events (3-5 per document)
            analytics_batch = []
            for doc in batch:
                doc_id = doc[0]
                num_events = np.random.randint(3, 6)
                for k in range(num_events):
                    analytics_batch.append((
                        doc_id * 5 + k,
                        doc_id,
                        np.random.choice(['view', 'click', 'share', 'like']),
                        np.random.randint(1, 10000),
                        int(time.time()) - np.random.randint(0, 86400 * 30),
                        f'{{"source": "web", "version": {k}}}'
                    ))
            conn.executemany(
                'INSERT INTO analytics VALUES (?, ?, ?, ?, ?, ?)',
                analytics_batch
            )
            if (i + batch_size) % 10000 == 0:
                print(f" Inserted {i + batch_size:,} / {total_docs:,} documents...")
        conn.commit()
        # Create indexes to make queries more realistic
        print("Creating indexes...")
        conn.execute('CREATE INDEX idx_docs_user ON documents(user_id)')
        conn.execute('CREATE INDEX idx_docs_created ON documents(created_at)')
        conn.execute('CREATE INDEX idx_analytics_doc ON analytics(doc_id)')
        conn.execute('CREATE INDEX idx_analytics_time ON analytics(timestamp)')
        conn.commit()
        # Analyze to update statistics
        conn.execute('ANALYZE')
        conn.close()
        # Get database size
        db_size = os.path.getsize(self.db_path)
        print(f"Database size: {db_size / 1024 / 1024:.1f} MB")
        return db_size

    def force_cache_clear(self):
        """Best-effort attempt to push database pages out of the OS cache"""
        # Allocating and touching ~100 MB only puts pressure on the OS page
        # cache; it does not guarantee the database file's pages are evicted.
        try:
            dummy = np.zeros((100, 1024, 1024), dtype=np.uint8)  # 100 MB
            dummy[:] = np.random.randint(0, 256, size=dummy.shape, dtype=np.uint8)
            del dummy
            gc.collect()
        except Exception:
            pass

    def run_heavy_queries(self, cache_pages: int) -> dict:
        """Run queries that stress the cache"""
        conn = sqlite3.connect(self.db_path)
        # Set the page cache: a negative cache_size is interpreted as KiB,
        # so cache_pages * 8 KiB matches the 8192-byte pages set above.
        conn.execute(f'PRAGMA cache_size = -{cache_pages * 8}')
        # Make the connection read-only so no query can modify the database
        conn.execute('PRAGMA query_only = ON')
        results = {
            'random_reads': [],
            'sequential_scan': [],
            'complex_join': [],
            'aggregation': []
        }
        # 1. Random point queries (cache-unfriendly)
        print(" Running random reads...")
        for _ in range(50):
            doc_id = np.random.randint(1, self.scale_factor)
            start = time.time()
            conn.execute(
                'SELECT * FROM documents WHERE id = ?',
                (doc_id,)
            ).fetchone()
            results['random_reads'].append(time.time() - start)
        # 2. Sequential scan with filter
        print(" Running sequential scans...")
        for _ in range(5):
            min_views = np.random.randint(1000, 5000)
            start = time.time()
            conn.execute(
                'SELECT COUNT(*) FROM documents WHERE view_count > ?',
                (min_views,)
            ).fetchone()
            results['sequential_scan'].append(time.time() - start)
        # 3. Complex join queries
        print(" Running complex joins...")
        for _ in range(5):
            user_id = np.random.randint(1, 10000)
            start = time.time()
            conn.execute('''
                SELECT d.*, COUNT(a.id) as events
                FROM documents d
                LEFT JOIN analytics a ON d.id = a.doc_id
                WHERE d.user_id = ?
                GROUP BY d.id
                LIMIT 10
            ''', (user_id,)).fetchall()
            results['complex_join'].append(time.time() - start)
        # 4. Time-based aggregation
        print(" Running aggregations...")
        for _ in range(5):
            days_back = np.random.randint(1, 30)
            start_time = int(time.time()) - (days_back * 86400)
            start = time.time()
            conn.execute('''
                SELECT
                    event_type,
                    COUNT(*) as count,
                    COUNT(DISTINCT user_id) as unique_users
                FROM analytics
                WHERE timestamp > ?
                GROUP BY event_type
            ''', (start_time,)).fetchall()
            results['aggregation'].append(time.time() - start)
        conn.close()
        return {
            'cache_pages': cache_pages,
            'avg_random_read': np.mean(results['random_reads']),
            'avg_sequential': np.mean(results['sequential_scan']),
            'avg_join': np.mean(results['complex_join']),
            'avg_aggregation': np.mean(results['aggregation']),
            'p95_random_read': np.percentile(results['random_reads'], 95),
            'raw_results': results
        }


def run_heavy_experiment():
    """Run the heavy SQLite experiment"""
    print("="*60)
    print("SQLite Heavy Workload Experiment")
    print("Demonstrating space-time tradeoffs with real I/O pressure")
    print("="*60)
    # Create large database
    scale = 50000  # 50k documents = ~200 MB database
    exp = SQLiteHeavyExperiment(scale)
    db_size = exp.setup_database()
    # Calculate page count
    page_size = 8192
    total_pages = db_size // page_size
    print("\nDatabase created:")
    print(f" Documents: {scale:,}")
    print(f" Size: {db_size / 1024 / 1024:.1f} MB")
    print(f" Pages: {total_pages:,}")
    # Test different cache sizes
    cache_configs = [
        ('O(n) Full', min(total_pages, 10000)),  # Cap at 10k pages for memory
        ('O(√n)', int(np.sqrt(total_pages))),
        ('O(log n)', int(np.log2(total_pages))),
        ('O(1)', 10)
    ]
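    # Rationale for these levels: with N total pages, the O(√n) cache holds
    # only about √N pages (well under 1% of the database here), while the
    # O(log n) and O(1) caches leave almost every page access dependent on
    # disk or the OS page cache.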
    results = []
    for label, cache_pages in cache_configs:
        cache_mb = cache_pages * page_size / 1024 / 1024
        print(f"\nTesting {label}: {cache_pages} pages ({cache_mb:.1f} MB)")
        # Clear cache between runs
        exp.force_cache_clear()
        time.sleep(1)  # Let system settle
        result = exp.run_heavy_queries(cache_pages)
        result['label'] = label
        result['cache_mb'] = cache_mb
        results.append(result)
        print(f" Random read: {result['avg_random_read']*1000:.2f} ms")
        print(f" Sequential: {result['avg_sequential']*1000:.2f} ms")
        print(f" Complex join: {result['avg_join']*1000:.2f} ms")
    # Create visualization
    create_heavy_experiment_plot(results, db_size)
    # Calculate slowdowns
    base = results[0]['avg_random_read']
    for r in results:
        r['slowdown'] = r['avg_random_read'] / base
    # Save results
    with open('sqlite_heavy_results.json', 'w') as f:
        save_data = {
            'scale_factor': scale,
            'db_size_mb': db_size / 1024 / 1024,
            'results': [
                {
                    'label': r['label'],
                    'cache_mb': r['cache_mb'],
                    'avg_random_ms': r['avg_random_read'] * 1000,
                    'slowdown': r['slowdown']
                }
                for r in results
            ]
        }
        json.dump(save_data, f, indent=2)
    exp.cleanup()
    print("\n" + "="*60)
    print("RESULTS SUMMARY")
    print("="*60)
    for r in results:
        print(f"{r['label']:15} | Slowdown: {r['slowdown']:6.1f}x | "
              f"Random: {r['avg_random_read']*1000:6.2f} ms | "
              f"Join: {r['avg_join']*1000:6.2f} ms")
    print("\nFiles generated:")
    print(" - sqlite_heavy_experiment.png")
    print(" - sqlite_heavy_results.json")
    print("="*60)


def create_heavy_experiment_plot(results, db_size):
    """Create plot for heavy experiment"""
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(14, 10))
    # Extract data
    labels = [r['label'] for r in results]
    cache_mb = [r['cache_mb'] for r in results]
    random_times = [r['avg_random_read'] * 1000 for r in results]
    join_times = [r['avg_join'] * 1000 for r in results]
    # Plot 1: Random read performance
    colors = ['green', 'orange', 'red', 'darkred']
    ax1.bar(labels, random_times, color=colors, edgecolor='black', linewidth=1.5)
    ax1.set_ylabel('Time (ms)', fontsize=12)
    ax1.set_title('Random Read Performance', fontsize=14)
    ax1.grid(True, alpha=0.3, axis='y')
    # Add value labels
    for bar, val in zip(ax1.patches, random_times):
        ax1.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
                 f'{val:.1f}', ha='center', va='bottom', fontsize=10)
    # Plot 2: Join query performance
    ax2.bar(labels, join_times, color=colors, edgecolor='black', linewidth=1.5)
    ax2.set_ylabel('Time (ms)', fontsize=12)
    ax2.set_title('Complex Join Performance', fontsize=14)
    ax2.grid(True, alpha=0.3, axis='y')
    # Plot 3: Cache efficiency
    db_mb = db_size / 1024 / 1024
    cache_pct = [(c / db_mb) * 100 for c in cache_mb]
    slowdowns = [r['avg_random_read'] / results[0]['avg_random_read'] for r in results]
    ax3.scatter(cache_pct, slowdowns, s=200, c=colors, edgecolor='black', linewidth=2)
    # Add theoretical √n curve
    x_theory = np.linspace(0.1, 100, 100)
    y_theory = 1 / np.sqrt(x_theory / 100)
    ax3.plot(x_theory, y_theory, 'b--', alpha=0.5, label='Theoretical 1/√x')
    ax3.set_xlabel('Cache Size (% of Database)', fontsize=12)
    ax3.set_ylabel('Slowdown Factor', fontsize=12)
    ax3.set_title('Space-Time Tradeoff', fontsize=14)
    ax3.set_xscale('log')
    ax3.set_yscale('log')
    ax3.legend()
    ax3.grid(True, alpha=0.3)
    # Plot 4: All query types comparison
    query_types = ['Random\nRead', 'Sequential\nScan', 'Complex\nJoin', 'Aggregation']
    x = np.arange(len(query_types))
    width = 0.2
    for i, r in enumerate(results):
        times = [
            r['avg_random_read'] * 1000,
            r['avg_sequential'] * 1000,
            r['avg_join'] * 1000,
            r['avg_aggregation'] * 1000
        ]
        ax4.bar(x + i*width, times, width, label=r['label'], color=colors[i])
    ax4.set_xlabel('Query Type', fontsize=12)
    ax4.set_ylabel('Time (ms)', fontsize=12)
    ax4.set_title('Performance by Query Type', fontsize=14)
    ax4.set_xticks(x + width * 1.5)
    ax4.set_xticklabels(query_types)
    ax4.legend(fontsize=10)
    ax4.grid(True, alpha=0.3, axis='y')
    ax4.set_yscale('log')
    plt.suptitle('SQLite Buffer Pool: Heavy Workload Analysis', fontsize=16)
    plt.tight_layout()
    plt.savefig('sqlite_heavy_experiment.png', dpi=300, bbox_inches='tight')
    plt.close()


if __name__ == "__main__":
    run_heavy_experiment()