sqrtspace-experiments/experiments/checkpointed_sorting/rigorous_experiment.py
"""
Rigorous sorting experiment with comprehensive statistical analysis
Addresses all concerns from RIGOR.txt:
- Multiple trials with statistical significance
- Multiple input sizes to show scaling
- Hardware/software environment logging
- Cache effects measurement
- RAM disk experiments to isolate I/O
"""
import os
import sys
import time
import tempfile
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import platform
import psutil
import json
from datetime import datetime
import subprocess
import shutil
from typing import List, Dict, Tuple
import tracemalloc
class ExperimentEnvironment:
"""Capture and log experimental environment"""
@staticmethod
def get_environment():
"""Get comprehensive environment information"""
env = {
'timestamp': datetime.now().isoformat(),
'platform': platform.platform(),
'processor': platform.processor(),
'python_version': platform.python_version(),
'cpu_count': psutil.cpu_count(logical=False),
'cpu_count_logical': psutil.cpu_count(logical=True),
'memory_total': psutil.virtual_memory().total,
'memory_available': psutil.virtual_memory().available,
'disk_usage': psutil.disk_usage('/').percent,
}
# Try to get CPU frequency
try:
cpu_freq = psutil.cpu_freq()
if cpu_freq:
env['cpu_freq_current'] = cpu_freq.current
env['cpu_freq_max'] = cpu_freq.max
        except Exception:
            # psutil.cpu_freq() is unavailable on some platforms
            pass
        # Get CPU cache sizes via sysctl (implemented for macOS only)
        try:
            if platform.system() == 'Darwin':
result = subprocess.run(['sysctl', '-n', 'hw.l1icachesize'],
capture_output=True, text=True)
if result.returncode == 0:
env['l1_cache'] = int(result.stdout.strip())
result = subprocess.run(['sysctl', '-n', 'hw.l2cachesize'],
capture_output=True, text=True)
if result.returncode == 0:
env['l2_cache'] = int(result.stdout.strip())
result = subprocess.run(['sysctl', '-n', 'hw.l3cachesize'],
capture_output=True, text=True)
if result.returncode == 0:
env['l3_cache'] = int(result.stdout.strip())
        except Exception:
            # sysctl may be missing or return unexpected output
            pass
return env
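# Hedged sketch (not wired into get_environment above): on Linux, per-level CPU
# cache sizes can usually be read from sysfs. Exact paths and contents vary by
# kernel and CPU, so treat this as an illustrative helper, not a guaranteed API.
def _linux_cache_sizes():
    """Best-effort cache-size probe via /sys (Linux only); returns {} elsewhere."""
    caches = {}
    base = '/sys/devices/system/cpu/cpu0/cache'
    if not os.path.isdir(base):
        return caches
    for entry in sorted(os.listdir(base)):
        if not entry.startswith('index'):
            continue
        try:
            with open(os.path.join(base, entry, 'level')) as fh:
                level = fh.read().strip()
            with open(os.path.join(base, entry, 'type')) as fh:
                kind = fh.read().strip().lower()   # 'data', 'instruction', 'unified'
            with open(os.path.join(base, entry, 'size')) as fh:
                size = fh.read().strip()           # e.g. '32K', '8192K'
            caches[f'l{level}_{kind}_cache'] = size
        except OSError:
            continue
    return caches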
class MemoryTrackedSort:
"""Sorting with detailed memory tracking"""
def __init__(self, data_size: int):
self.data_size = data_size
self.data = np.random.rand(data_size).astype(np.float32)
self.temp_dir = tempfile.mkdtemp()
self.memory_measurements = []
def cleanup(self):
"""Clean up temporary files"""
if os.path.exists(self.temp_dir):
shutil.rmtree(self.temp_dir)
def measure_memory(self, label: str):
"""Record current memory usage"""
current, peak = tracemalloc.get_traced_memory()
self.memory_measurements.append({
'label': label,
'current': current,
'peak': peak,
'timestamp': time.time()
})
def in_memory_sort(self) -> Tuple[np.ndarray, Dict]:
"""Standard in-memory sorting with memory tracking"""
tracemalloc.start()
self.memory_measurements = []
self.measure_memory('start')
        result = np.sort(self.data)  # np.sort returns a sorted copy; self.data is left untouched
self.measure_memory('after_sort')
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
return result, {
'peak_memory': peak,
'measurements': self.memory_measurements
}
def checkpoint_sort(self, memory_limit: int, use_ramdisk: bool = False) -> Tuple[np.ndarray, Dict]:
"""External merge sort with checkpointing"""
tracemalloc.start()
self.memory_measurements = []
        # Use a RAM-backed temp directory if requested, to isolate disk I/O cost
        if use_ramdisk:
            # On Linux, /dev/shm is tmpfs (RAM-backed); a dedicated tmpfs mount
            # would need sudo. macOS has no tmpfs path without creating a RAM
            # disk, so /tmp is used as a best effort (it is still disk-backed).
            if platform.system() == 'Darwin':
                self.temp_dir = tempfile.mkdtemp(dir='/tmp')
            elif os.path.exists('/dev/shm'):
                self.temp_dir = tempfile.mkdtemp(dir='/dev/shm')
        chunk_size = max(1, memory_limit // 4)  # byte budget -> float32 element count (4 bytes each)
num_chunks = (self.data_size + chunk_size - 1) // chunk_size
self.measure_memory('start')
# Phase 1: Sort chunks and write to disk
chunk_files = []
for i in range(num_chunks):
start_idx = i * chunk_size
end_idx = min((i + 1) * chunk_size, self.data_size)
# Sort chunk in memory
chunk = np.sort(self.data[start_idx:end_idx])
# Write to disk (checkpoint)
filename = os.path.join(self.temp_dir, f'chunk_{i}.npy')
np.save(filename, chunk)
chunk_files.append(filename)
# Clear chunk from memory
del chunk
if i % 10 == 0:
self.measure_memory(f'after_chunk_{i}')
# Phase 2: K-way merge with limited memory
result = self._k_way_merge(chunk_files, memory_limit)
self.measure_memory('after_merge')
# Cleanup
for f in chunk_files:
if os.path.exists(f):
os.remove(f)
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
return result, {
'peak_memory': peak,
'num_chunks': num_chunks,
'chunk_size': chunk_size,
'use_ramdisk': use_ramdisk,
'measurements': self.memory_measurements
}
def _k_way_merge(self, chunk_files: List[str], memory_limit: int) -> np.ndarray:
"""Merge sorted chunks with limited memory"""
import heapq
num_chunks = len(chunk_files)
buffer_size = max(1, memory_limit // (4 * num_chunks))
# Open chunks and create initial buffers
chunks = []
buffers = []
positions = []
for i, filename in enumerate(chunk_files):
            # Memory-map the chunk so only the buffer windows sliced below are read in
            chunk_data = np.load(filename, mmap_mode='r')
chunks.append(chunk_data)
buffer_end = min(buffer_size, len(chunk_data))
buffers.append(chunk_data[:buffer_end])
positions.append(buffer_end)
# Priority queue for merge
heap = []
for i, buffer in enumerate(buffers):
if len(buffer) > 0:
heapq.heappush(heap, (buffer[0], i, 0))
result = []
while heap:
val, chunk_idx, buffer_idx = heapq.heappop(heap)
result.append(val)
# Move to next element
buffer_idx += 1
# Refill buffer if needed
if buffer_idx >= len(buffers[chunk_idx]):
pos = positions[chunk_idx]
if pos < len(chunks[chunk_idx]):
# Load next batch
new_end = min(pos + buffer_size, len(chunks[chunk_idx]))
buffers[chunk_idx] = chunks[chunk_idx][pos:new_end]
positions[chunk_idx] = new_end
buffer_idx = 0
else:
continue
# Add next element to heap
if buffer_idx < len(buffers[chunk_idx]):
heapq.heappush(heap, (buffers[chunk_idx][buffer_idx], chunk_idx, buffer_idx))
return np.array(result, dtype=np.float32)
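# Minimal usage sketch (illustrative only, not part of the measured pipeline):
# sort a small random array under a ~sqrt(n)-byte budget and confirm the output
# matches NumPy's in-memory sort. The helper name is ours, not the original API.
def _smoke_test_checkpoint_sort(n: int = 10_000) -> int:
    """Return the number of chunks used; raise AssertionError on a mismatch."""
    sorter = MemoryTrackedSort(n)
    try:
        expected, _ = sorter.in_memory_sort()
        got, info = sorter.checkpoint_sort(memory_limit=int(np.sqrt(n) * 4))
        assert np.allclose(expected, got), "checkpoint sort mismatch"
        return info['num_chunks']
    finally:
        sorter.cleanup()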
def run_single_experiment(size: int, num_trials: int = 20) -> Dict:
"""Run experiment for a single input size"""
print(f"\nRunning experiment for n={size:,} with {num_trials} trials...")
results = {
'size': size,
'trials': {
'in_memory': [],
'checkpoint': [],
'checkpoint_ramdisk': []
},
'memory': {
'in_memory': [],
'checkpoint': [],
'checkpoint_ramdisk': []
}
}
for trial in range(num_trials):
if trial % 5 == 0:
print(f" Trial {trial+1}/{num_trials}...")
exp = MemoryTrackedSort(size)
# 1. In-memory sort
start = time.time()
result_mem, mem_stats = exp.in_memory_sort()
time_mem = time.time() - start
results['trials']['in_memory'].append(time_mem)
results['memory']['in_memory'].append(mem_stats['peak_memory'])
# 2. Checkpointed sort (disk)
        memory_limit = int(np.sqrt(size) * 4)  # ~sqrt(n) float32 elements, i.e. sqrt(n)*4 bytes
start = time.time()
result_check, check_stats = exp.checkpoint_sort(memory_limit, use_ramdisk=False)
time_check = time.time() - start
results['trials']['checkpoint'].append(time_check)
results['memory']['checkpoint'].append(check_stats['peak_memory'])
# 3. Checkpointed sort (RAM disk) - only on first trial to save time
if trial == 0:
start = time.time()
result_ramdisk, ramdisk_stats = exp.checkpoint_sort(memory_limit, use_ramdisk=True)
time_ramdisk = time.time() - start
results['trials']['checkpoint_ramdisk'].append(time_ramdisk)
results['memory']['checkpoint_ramdisk'].append(ramdisk_stats['peak_memory'])
# Verify correctness
assert np.allclose(result_mem, result_check), "Disk checkpoint failed"
assert np.allclose(result_mem, result_ramdisk), "RAM disk checkpoint failed"
print(f" ✓ Correctness verified for all algorithms")
exp.cleanup()
# Calculate statistics
for method in ['in_memory', 'checkpoint']:
times = results['trials'][method]
results[f'{method}_mean'] = np.mean(times)
results[f'{method}_std'] = np.std(times)
results[f'{method}_sem'] = stats.sem(times)
results[f'{method}_ci'] = stats.t.interval(0.95, len(times)-1,
loc=np.mean(times),
scale=stats.sem(times))
mems = results['memory'][method]
results[f'{method}_memory_mean'] = np.mean(mems)
results[f'{method}_memory_std'] = np.std(mems)
# RAM disk stats (only one trial)
if results['trials']['checkpoint_ramdisk']:
results['checkpoint_ramdisk_mean'] = results['trials']['checkpoint_ramdisk'][0]
results['checkpoint_ramdisk_memory'] = results['memory']['checkpoint_ramdisk'][0]
# Calculate slowdowns
results['slowdown_disk'] = results['checkpoint_mean'] / results['in_memory_mean']
if 'checkpoint_ramdisk_mean' in results:
results['slowdown_ramdisk'] = results['checkpoint_ramdisk_mean'] / results['in_memory_mean']
results['io_overhead_factor'] = results['checkpoint_mean'] / results['checkpoint_ramdisk_mean']
return results
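# Note on the derived metrics above: 'slowdown_disk' compares the checkpointed
# sort (spilling to the temp directory) against the in-memory baseline, while
# 'io_overhead_factor' divides the disk-backed time by the RAM-backed time for
# the same algorithm, separating file-I/O cost from the extra passes the
# O(sqrt(n))-space strategy itself requires.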
def create_comprehensive_plots(all_results: List[Dict]):
"""Create publication-quality plots with error bars"""
# Sort results by size
all_results.sort(key=lambda x: x['size'])
sizes = [r['size'] for r in all_results]
# Figure 1: Time scaling with error bars
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# Extract data
in_memory_means = [r['in_memory_mean'] for r in all_results]
    in_memory_errors = [r['in_memory_sem'] * 1.96 for r in all_results]  # ~95% CI (normal approximation)
checkpoint_means = [r['checkpoint_mean'] for r in all_results]
checkpoint_errors = [r['checkpoint_sem'] * 1.96 for r in all_results]
# Plot with error bars
ax1.errorbar(sizes, in_memory_means, yerr=in_memory_errors,
fmt='o-', label='In-memory O(n)',
color='blue', capsize=5, capthick=2, linewidth=2, markersize=8)
ax1.errorbar(sizes, checkpoint_means, yerr=checkpoint_errors,
fmt='s-', label='Checkpointed O(√n)',
color='red', capsize=5, capthick=2, linewidth=2, markersize=8)
# Add RAM disk results where available
ramdisk_sizes = []
ramdisk_means = []
for r in all_results:
if 'checkpoint_ramdisk_mean' in r:
ramdisk_sizes.append(r['size'])
ramdisk_means.append(r['checkpoint_ramdisk_mean'])
if ramdisk_means:
ax1.plot(ramdisk_sizes, ramdisk_means, 'D-',
label='Checkpointed (RAM disk)',
color='green', linewidth=2, markersize=8)
# Theoretical curves
sizes_theory = np.logspace(np.log10(min(sizes)), np.log10(max(sizes)), 100)
# Fit power laws
from scipy.optimize import curve_fit
def power_law(x, a, b):
return a * x**b
# Fit in-memory times
popt_mem, _ = curve_fit(power_law, sizes, in_memory_means)
theory_mem = power_law(sizes_theory, *popt_mem)
    ax1.plot(sizes_theory, theory_mem, 'b--', alpha=0.5,
             label=f'Fit: $O(n^{{{popt_mem[1]:.2f}}})$')
# Fit checkpoint times
popt_check, _ = curve_fit(power_law, sizes, checkpoint_means)
theory_check = power_law(sizes_theory, *popt_check)
    ax1.plot(sizes_theory, theory_check, 'r--', alpha=0.5,
             label=f'Fit: $O(n^{{{popt_check[1]:.2f}}})$')
ax1.set_xlabel('Input Size (n)', fontsize=12)
ax1.set_ylabel('Time (seconds)', fontsize=12)
ax1.set_title('Sorting Time Complexity\n(20 trials per point, 95% CI)', fontsize=14)
ax1.set_xscale('log')
ax1.set_yscale('log')
ax1.legend(loc='upper left')
ax1.grid(True, alpha=0.3)
# Subplot 2: Slowdown factors
slowdowns_disk = [r['slowdown_disk'] for r in all_results]
ax2.plot(sizes, slowdowns_disk, 'o-', color='red',
linewidth=2, markersize=8, label='Disk I/O')
# Add I/O overhead factor where available
if ramdisk_sizes:
io_factors = []
for r in all_results:
if 'io_overhead_factor' in r:
io_factors.append(r['io_overhead_factor'])
if io_factors:
ax2.plot(ramdisk_sizes[:len(io_factors)], io_factors, 's-',
color='orange', linewidth=2, markersize=8,
label='Pure I/O overhead')
# Theoretical √n line
theory_slowdown = np.sqrt(sizes_theory / sizes[0])
ax2.plot(sizes_theory, theory_slowdown, 'k--', alpha=0.5,
label='Theoretical √n')
ax2.set_xlabel('Input Size (n)', fontsize=12)
ax2.set_ylabel('Slowdown Factor', fontsize=12)
ax2.set_title('Space-Time Tradeoff Cost', fontsize=14)
ax2.set_xscale('log')
ax2.set_yscale('log')
ax2.legend()
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('rigorous_sorting_analysis.png', dpi=300, bbox_inches='tight')
plt.close()
# Figure 2: Memory usage analysis
fig, ax = plt.subplots(figsize=(10, 6))
mem_theory = sizes_theory * 4 # 4 bytes per float
mem_checkpoint = np.sqrt(sizes_theory) * 4
ax.plot(sizes_theory, mem_theory, '-', label='Theoretical O(n)',
color='blue', linewidth=2)
ax.plot(sizes_theory, mem_checkpoint, '-', label='Theoretical O(√n)',
color='red', linewidth=2)
# Actual measured memory
actual_mem_full = [r['in_memory_memory_mean'] for r in all_results]
actual_mem_check = [r['checkpoint_memory_mean'] for r in all_results]
ax.plot(sizes, actual_mem_full, 'o', label='Measured in-memory',
color='blue', markersize=8)
ax.plot(sizes, actual_mem_check, 's', label='Measured checkpoint',
color='red', markersize=8)
ax.set_xlabel('Input Size (n)', fontsize=12)
ax.set_ylabel('Memory Usage (bytes)', fontsize=12)
ax.set_title('Memory Usage: Theory vs Practice', fontsize=14)
ax.set_xscale('log')
ax.set_yscale('log')
ax.legend()
ax.grid(True, alpha=0.3)
# Format y-axis
ax.yaxis.set_major_formatter(plt.FuncFormatter(
lambda y, _: f'{y/1e6:.0f}MB' if y >= 1e6 else f'{y/1e3:.0f}KB'
))
plt.tight_layout()
plt.savefig('memory_usage_analysis.png', dpi=300, bbox_inches='tight')
plt.close()
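# Interpretation note: comparison sorting is O(n log n), which over this modest
# size range is nearly indistinguishable from a power law with an exponent just
# above 1 on a log-log plot, so the fitted exponents in the legend are empirical
# summaries rather than asymptotic claims.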
def main():
"""Run comprehensive experiments"""
print("="*60)
print("RIGOROUS SPACE-TIME TRADEOFF EXPERIMENT")
print("="*60)
# Log environment
env = ExperimentEnvironment.get_environment()
print("\nExperimental Environment:")
for key, value in env.items():
if 'memory' in key or 'cache' in key:
if isinstance(value, (int, float)):
print(f" {key}: {value:,}")
else:
print(f" {key}: {value}")
# Save environment
with open('experiment_environment.json', 'w') as f:
json.dump(env, f, indent=2)
# Run experiments with multiple sizes
sizes = [1000, 2000, 5000, 10000, 20000] # Reasonable sizes for demo
all_results = []
for size in sizes:
result = run_single_experiment(size, num_trials=20)
all_results.append(result)
# Print summary
print(f"\nResults for n={size:,}:")
print(f" In-memory: {result['in_memory_mean']:.4f}s ± {result['in_memory_std']:.4f}s")
print(f" Checkpoint (disk): {result['checkpoint_mean']:.4f}s ± {result['checkpoint_std']:.4f}s")
if 'checkpoint_ramdisk_mean' in result:
print(f" Checkpoint (RAM): {result['checkpoint_ramdisk_mean']:.4f}s")
print(f" Pure I/O overhead: {result['io_overhead_factor']:.1f}x")
print(f" Total slowdown: {result['slowdown_disk']:.1f}x")
# Save raw results
with open('experiment_results.json', 'w') as f:
        json.dump(all_results, f, indent=2, default=float)  # default=float serializes NumPy scalars
# Create plots
create_comprehensive_plots(all_results)
print("\n" + "="*60)
print("EXPERIMENT COMPLETE")
print("Generated files:")
print(" - experiment_environment.json")
print(" - experiment_results.json")
print(" - rigorous_sorting_analysis.png")
print(" - memory_usage_analysis.png")
print("="*60)
if __name__ == "__main__":
main()