sqrtspace-experiments/experiments/checkpointed_sorting/rigorous_experiment.py
"""
Rigorous sorting experiment with comprehensive statistical analysis
Addresses all concerns from RIGOR.txt:
- Multiple trials with statistical significance
- Multiple input sizes to show scaling
- Hardware/software environment logging
- Cache effects measurement
- RAM disk experiments to isolate I/O
"""
import os
import sys
import time
import tempfile
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import platform
import psutil
import json
from datetime import datetime
import subprocess
import shutil
from typing import List, Dict, Tuple
import tracemalloc
class ExperimentEnvironment:
"""Capture and log experimental environment"""
@staticmethod
def get_environment():
"""Get comprehensive environment information"""
env = {
'timestamp': datetime.now().isoformat(),
'platform': platform.platform(),
'processor': platform.processor(),
'python_version': platform.python_version(),
'cpu_count': psutil.cpu_count(logical=False),
'cpu_count_logical': psutil.cpu_count(logical=True),
'memory_total': psutil.virtual_memory().total,
'memory_available': psutil.virtual_memory().available,
'disk_usage': psutil.disk_usage('/').percent,
}
# Try to get CPU frequency
try:
cpu_freq = psutil.cpu_freq()
if cpu_freq:
env['cpu_freq_current'] = cpu_freq.current
env['cpu_freq_max'] = cpu_freq.max
        except Exception:
            # psutil.cpu_freq() is unavailable on some platforms
            pass
        # Get CPU cache sizes via sysctl (implemented for macOS only)
        try:
            if platform.system() == 'Darwin':
result = subprocess.run(['sysctl', '-n', 'hw.l1icachesize'],
capture_output=True, text=True)
if result.returncode == 0:
env['l1_cache'] = int(result.stdout.strip())
result = subprocess.run(['sysctl', '-n', 'hw.l2cachesize'],
capture_output=True, text=True)
if result.returncode == 0:
env['l2_cache'] = int(result.stdout.strip())
result = subprocess.run(['sysctl', '-n', 'hw.l3cachesize'],
capture_output=True, text=True)
if result.returncode == 0:
env['l3_cache'] = int(result.stdout.strip())
        except Exception:
            # sysctl may be missing or return unexpected output
            pass
return env
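# Hedged sketch (not wired into get_environment above): on Linux, per-level CPU
# cache sizes can usually be read from sysfs. Exact paths and contents vary by
# kernel and CPU, so treat this as an illustrative helper, not a guaranteed API.
def _linux_cache_sizes():
    """Best-effort cache-size probe via /sys (Linux only); returns {} elsewhere."""
    caches = {}
    base = '/sys/devices/system/cpu/cpu0/cache'
    if not os.path.isdir(base):
        return caches
    for entry in sorted(os.listdir(base)):
        if not entry.startswith('index'):
            continue
        try:
            with open(os.path.join(base, entry, 'level')) as fh:
                level = fh.read().strip()
            with open(os.path.join(base, entry, 'type')) as fh:
                kind = fh.read().strip().lower()   # 'data', 'instruction', 'unified'
            with open(os.path.join(base, entry, 'size')) as fh:
                size = fh.read().strip()           # e.g. '32K', '8192K'
            caches[f'l{level}_{kind}_cache'] = size
        except OSError:
            continue
    return caches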
class MemoryTrackedSort:
"""Sorting with detailed memory tracking"""
def __init__(self, data_size: int):
self.data_size = data_size
self.data = np.random.rand(data_size).astype(np.float32)
self.temp_dir = tempfile.mkdtemp()
self.memory_measurements = []
def cleanup(self):
"""Clean up temporary files"""
if os.path.exists(self.temp_dir):
shutil.rmtree(self.temp_dir)
def measure_memory(self, label: str):
"""Record current memory usage"""
current, peak = tracemalloc.get_traced_memory()
self.memory_measurements.append({
'label': label,
'current': current,
'peak': peak,
'timestamp': time.time()
})
def in_memory_sort(self) -> Tuple[np.ndarray, Dict]:
"""Standard in-memory sorting with memory tracking"""
tracemalloc.start()
self.memory_measurements = []
self.measure_memory('start')
        result = np.sort(self.data)  # np.sort returns a sorted copy; self.data is left untouched
self.measure_memory('after_sort')
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
return result, {
'peak_memory': peak,
'measurements': self.memory_measurements
}
def checkpoint_sort(self, memory_limit: int, use_ramdisk: bool = False) -> Tuple[np.ndarray, Dict]:
"""External merge sort with checkpointing"""
tracemalloc.start()
self.memory_measurements = []
        # Use a RAM-backed temp directory if requested, to isolate disk I/O cost
        if use_ramdisk:
            # On Linux, /dev/shm is tmpfs (RAM-backed); a dedicated tmpfs mount
            # would need sudo. macOS has no tmpfs path without creating a RAM
            # disk, so /tmp is used as a best effort (it is still disk-backed).
            if platform.system() == 'Darwin':
                self.temp_dir = tempfile.mkdtemp(dir='/tmp')
            elif os.path.exists('/dev/shm'):
                self.temp_dir = tempfile.mkdtemp(dir='/dev/shm')
        chunk_size = max(1, memory_limit // 4)  # byte budget -> float32 element count (4 bytes each)
num_chunks = (self.data_size + chunk_size - 1) // chunk_size
self.measure_memory('start')
# Phase 1: Sort chunks and write to disk
chunk_files = []
for i in range(num_chunks):
start_idx = i * chunk_size
end_idx = min((i + 1) * chunk_size, self.data_size)
# Sort chunk in memory
chunk = np.sort(self.data[start_idx:end_idx])
# Write to disk (checkpoint)
filename = os.path.join(self.temp_dir, f'chunk_{i}.npy')
np.save(filename, chunk)
chunk_files.append(filename)
# Clear chunk from memory
del chunk
if i % 10 == 0:
self.measure_memory(f'after_chunk_{i}')
# Phase 2: K-way merge with limited memory
result = self._k_way_merge(chunk_files, memory_limit)
self.measure_memory('after_merge')
# Cleanup
for f in chunk_files:
if os.path.exists(f):
os.remove(f)
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
return result, {
'peak_memory': peak,
'num_chunks': num_chunks,
'chunk_size': chunk_size,
'use_ramdisk': use_ramdisk,
'measurements': self.memory_measurements
}
def _k_way_merge(self, chunk_files: List[str], memory_limit: int) -> np.ndarray:
"""Merge sorted chunks with limited memory"""
import heapq
num_chunks = len(chunk_files)
buffer_size = max(1, memory_limit // (4 * num_chunks))
# Open chunks and create initial buffers
chunks = []
buffers = []
positions = []
for i, filename in enumerate(chunk_files):
            # Memory-map the chunk so only the buffer windows sliced below are read in
            chunk_data = np.load(filename, mmap_mode='r')
chunks.append(chunk_data)
buffer_end = min(buffer_size, len(chunk_data))
buffers.append(chunk_data[:buffer_end])
positions.append(buffer_end)
# Priority queue for merge
heap = []
for i, buffer in enumerate(buffers):
if len(buffer) > 0:
heapq.heappush(heap, (buffer[0], i, 0))
result = []
while heap:
val, chunk_idx, buffer_idx = heapq.heappop(heap)
result.append(val)
# Move to next element
buffer_idx += 1
# Refill buffer if needed
if buffer_idx >= len(buffers[chunk_idx]):
pos = positions[chunk_idx]
if pos < len(chunks[chunk_idx]):
# Load next batch
new_end = min(pos + buffer_size, len(chunks[chunk_idx]))
buffers[chunk_idx] = chunks[chunk_idx][pos:new_end]
positions[chunk_idx] = new_end
buffer_idx = 0
else:
continue
# Add next element to heap
if buffer_idx < len(buffers[chunk_idx]):
heapq.heappush(heap, (buffers[chunk_idx][buffer_idx], chunk_idx, buffer_idx))
return np.array(result, dtype=np.float32)
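# Minimal usage sketch (illustrative only, not part of the measured pipeline):
# sort a small random array under a ~sqrt(n)-byte budget and confirm the output
# matches NumPy's in-memory sort. The helper name is ours, not the original API.
def _smoke_test_checkpoint_sort(n: int = 10_000) -> int:
    """Return the number of chunks used; raise AssertionError on a mismatch."""
    sorter = MemoryTrackedSort(n)
    try:
        expected, _ = sorter.in_memory_sort()
        got, info = sorter.checkpoint_sort(memory_limit=int(np.sqrt(n) * 4))
        assert np.allclose(expected, got), "checkpoint sort mismatch"
        return info['num_chunks']
    finally:
        sorter.cleanup()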
def run_single_experiment(size: int, num_trials: int = 20) -> Dict:
"""Run experiment for a single input size"""
print(f"\nRunning experiment for n={size:,} with {num_trials} trials...")
results = {
'size': size,
'trials': {
'in_memory': [],
'checkpoint': [],
'checkpoint_ramdisk': []
},
'memory': {
'in_memory': [],
'checkpoint': [],
'checkpoint_ramdisk': []
}
}
for trial in range(num_trials):
if trial % 5 == 0:
print(f" Trial {trial+1}/{num_trials}...")
exp = MemoryTrackedSort(size)
# 1. In-memory sort
start = time.time()
result_mem, mem_stats = exp.in_memory_sort()
time_mem = time.time() - start
results['trials']['in_memory'].append(time_mem)
results['memory']['in_memory'].append(mem_stats['peak_memory'])
# 2. Checkpointed sort (disk)
        memory_limit = int(np.sqrt(size) * 4)  # ~sqrt(n) float32 elements, i.e. sqrt(n)*4 bytes
start = time.time()
result_check, check_stats = exp.checkpoint_sort(memory_limit, use_ramdisk=False)
time_check = time.time() - start
results['trials']['checkpoint'].append(time_check)
results['memory']['checkpoint'].append(check_stats['peak_memory'])
# 3. Checkpointed sort (RAM disk) - only on first trial to save time
if trial == 0:
start = time.time()
result_ramdisk, ramdisk_stats = exp.checkpoint_sort(memory_limit, use_ramdisk=True)
time_ramdisk = time.time() - start
results['trials']['checkpoint_ramdisk'].append(time_ramdisk)
results['memory']['checkpoint_ramdisk'].append(ramdisk_stats['peak_memory'])
# Verify correctness
assert np.allclose(result_mem, result_check), "Disk checkpoint failed"
assert np.allclose(result_mem, result_ramdisk), "RAM disk checkpoint failed"
print(f" ✓ Correctness verified for all algorithms")
exp.cleanup()
# Calculate statistics
for method in ['in_memory', 'checkpoint']:
times = results['trials'][method]
results[f'{method}_mean'] = np.mean(times)
results[f'{method}_std'] = np.std(times)
results[f'{method}_sem'] = stats.sem(times)
results[f'{method}_ci'] = stats.t.interval(0.95, len(times)-1,
loc=np.mean(times),
scale=stats.sem(times))
mems = results['memory'][method]
results[f'{method}_memory_mean'] = np.mean(mems)
results[f'{method}_memory_std'] = np.std(mems)
# RAM disk stats (only one trial)
if results['trials']['checkpoint_ramdisk']:
results['checkpoint_ramdisk_mean'] = results['trials']['checkpoint_ramdisk'][0]
results['checkpoint_ramdisk_memory'] = results['memory']['checkpoint_ramdisk'][0]
# Calculate slowdowns
results['slowdown_disk'] = results['checkpoint_mean'] / results['in_memory_mean']
if 'checkpoint_ramdisk_mean' in results:
results['slowdown_ramdisk'] = results['checkpoint_ramdisk_mean'] / results['in_memory_mean']
results['io_overhead_factor'] = results['checkpoint_mean'] / results['checkpoint_ramdisk_mean']
return results
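# Note on the derived metrics above: 'slowdown_disk' compares the checkpointed
# sort (spilling to the temp directory) against the in-memory baseline, while
# 'io_overhead_factor' divides the disk-backed time by the RAM-backed time for
# the same algorithm, separating file-I/O cost from the extra passes the
# O(sqrt(n))-space strategy itself requires.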
def create_comprehensive_plots(all_results: List[Dict]):
"""Create publication-quality plots with error bars"""
# Sort results by size
all_results.sort(key=lambda x: x['size'])
sizes = [r['size'] for r in all_results]
# Figure 1: Time scaling with error bars
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# Extract data
in_memory_means = [r['in_memory_mean'] for r in all_results]
    in_memory_errors = [r['in_memory_sem'] * 1.96 for r in all_results]  # ~95% CI (normal approximation)
checkpoint_means = [r['checkpoint_mean'] for r in all_results]
checkpoint_errors = [r['checkpoint_sem'] * 1.96 for r in all_results]
# Plot with error bars
ax1.errorbar(sizes, in_memory_means, yerr=in_memory_errors,
fmt='o-', label='In-memory O(n)',
color='blue', capsize=5, capthick=2, linewidth=2, markersize=8)
ax1.errorbar(sizes, checkpoint_means, yerr=checkpoint_errors,
fmt='s-', label='Checkpointed O(√n)',
color='red', capsize=5, capthick=2, linewidth=2, markersize=8)
# Add RAM disk results where available
ramdisk_sizes = []
ramdisk_means = []
for r in all_results:
if 'checkpoint_ramdisk_mean' in r:
ramdisk_sizes.append(r['size'])
ramdisk_means.append(r['checkpoint_ramdisk_mean'])
if ramdisk_means:
ax1.plot(ramdisk_sizes, ramdisk_means, 'D-',
label='Checkpointed (RAM disk)',
color='green', linewidth=2, markersize=8)
# Theoretical curves
sizes_theory = np.logspace(np.log10(min(sizes)), np.log10(max(sizes)), 100)
# Fit power laws
from scipy.optimize import curve_fit
def power_law(x, a, b):
return a * x**b
# Fit in-memory times
popt_mem, _ = curve_fit(power_law, sizes, in_memory_means)
theory_mem = power_law(sizes_theory, *popt_mem)
    ax1.plot(sizes_theory, theory_mem, 'b--', alpha=0.5,
             label=f'Fit: $O(n^{{{popt_mem[1]:.2f}}})$')
# Fit checkpoint times
popt_check, _ = curve_fit(power_law, sizes, checkpoint_means)
theory_check = power_law(sizes_theory, *popt_check)
    ax1.plot(sizes_theory, theory_check, 'r--', alpha=0.5,
             label=f'Fit: $O(n^{{{popt_check[1]:.2f}}})$')
ax1.set_xlabel('Input Size (n)', fontsize=12)
ax1.set_ylabel('Time (seconds)', fontsize=12)
ax1.set_title('Sorting Time Complexity\n(20 trials per point, 95% CI)', fontsize=14)
ax1.set_xscale('log')
ax1.set_yscale('log')
ax1.legend(loc='upper left')
ax1.grid(True, alpha=0.3)
# Subplot 2: Slowdown factors
slowdowns_disk = [r['slowdown_disk'] for r in all_results]
ax2.plot(sizes, slowdowns_disk, 'o-', color='red',
linewidth=2, markersize=8, label='Disk I/O')
# Add I/O overhead factor where available
if ramdisk_sizes:
io_factors = []
for r in all_results:
if 'io_overhead_factor' in r:
io_factors.append(r['io_overhead_factor'])
if io_factors:
ax2.plot(ramdisk_sizes[:len(io_factors)], io_factors, 's-',
color='orange', linewidth=2, markersize=8,
label='Pure I/O overhead')
# Theoretical √n line
theory_slowdown = np.sqrt(sizes_theory / sizes[0])
ax2.plot(sizes_theory, theory_slowdown, 'k--', alpha=0.5,
label='Theoretical √n')
ax2.set_xlabel('Input Size (n)', fontsize=12)
ax2.set_ylabel('Slowdown Factor', fontsize=12)
ax2.set_title('Space-Time Tradeoff Cost', fontsize=14)
ax2.set_xscale('log')
ax2.set_yscale('log')
ax2.legend()
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('rigorous_sorting_analysis.png', dpi=300, bbox_inches='tight')
plt.close()
# Figure 2: Memory usage analysis
fig, ax = plt.subplots(figsize=(10, 6))
mem_theory = sizes_theory * 4 # 4 bytes per float
mem_checkpoint = np.sqrt(sizes_theory) * 4
ax.plot(sizes_theory, mem_theory, '-', label='Theoretical O(n)',
color='blue', linewidth=2)
ax.plot(sizes_theory, mem_checkpoint, '-', label='Theoretical O(√n)',
color='red', linewidth=2)
# Actual measured memory
actual_mem_full = [r['in_memory_memory_mean'] for r in all_results]
actual_mem_check = [r['checkpoint_memory_mean'] for r in all_results]
ax.plot(sizes, actual_mem_full, 'o', label='Measured in-memory',
color='blue', markersize=8)
ax.plot(sizes, actual_mem_check, 's', label='Measured checkpoint',
color='red', markersize=8)
ax.set_xlabel('Input Size (n)', fontsize=12)
ax.set_ylabel('Memory Usage (bytes)', fontsize=12)
ax.set_title('Memory Usage: Theory vs Practice', fontsize=14)
ax.set_xscale('log')
ax.set_yscale('log')
ax.legend()
ax.grid(True, alpha=0.3)
# Format y-axis
ax.yaxis.set_major_formatter(plt.FuncFormatter(
lambda y, _: f'{y/1e6:.0f}MB' if y >= 1e6 else f'{y/1e3:.0f}KB'
))
plt.tight_layout()
plt.savefig('memory_usage_analysis.png', dpi=300, bbox_inches='tight')
plt.close()
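# Interpretation note: comparison sorting is O(n log n), which over this modest
# size range is nearly indistinguishable from a power law with an exponent just
# above 1 on a log-log plot, so the fitted exponents in the legend are empirical
# summaries rather than asymptotic claims.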
def main():
"""Run comprehensive experiments"""
print("="*60)
print("RIGOROUS SPACE-TIME TRADEOFF EXPERIMENT")
print("="*60)
# Log environment
env = ExperimentEnvironment.get_environment()
print("\nExperimental Environment:")
for key, value in env.items():
if 'memory' in key or 'cache' in key:
if isinstance(value, (int, float)):
print(f" {key}: {value:,}")
else:
print(f" {key}: {value}")
# Save environment
with open('experiment_environment.json', 'w') as f:
json.dump(env, f, indent=2)
# Run experiments with multiple sizes
sizes = [1000, 2000, 5000, 10000, 20000] # Reasonable sizes for demo
all_results = []
for size in sizes:
result = run_single_experiment(size, num_trials=20)
all_results.append(result)
# Print summary
print(f"\nResults for n={size:,}:")
print(f" In-memory: {result['in_memory_mean']:.4f}s ± {result['in_memory_std']:.4f}s")
print(f" Checkpoint (disk): {result['checkpoint_mean']:.4f}s ± {result['checkpoint_std']:.4f}s")
if 'checkpoint_ramdisk_mean' in result:
print(f" Checkpoint (RAM): {result['checkpoint_ramdisk_mean']:.4f}s")
print(f" Pure I/O overhead: {result['io_overhead_factor']:.1f}x")
print(f" Total slowdown: {result['slowdown_disk']:.1f}x")
# Save raw results
with open('experiment_results.json', 'w') as f:
        json.dump(all_results, f, indent=2, default=float)  # default=float serializes NumPy scalars
# Create plots
create_comprehensive_plots(all_results)
print("\n" + "="*60)
print("EXPERIMENT COMPLETE")
print("Generated files:")
print(" - experiment_environment.json")
print(" - experiment_results.json")
print(" - rigorous_sorting_analysis.png")
print(" - memory_usage_analysis.png")
print("="*60)
if __name__ == "__main__":
main()