- Fix RuntimeError: OrderedDict mutated during iteration in SpaceTimeDict - Fix memory usage and spillover for proper sqrt_n compliance - Fix thread synchronization with proper locking (cross-platform) - Fix FileNotFoundError by ensuring directories are created - Add external_sort_key to exports - Adjust memory thresholds and test expectations - Add comprehensive .gitignore file - Clean up Python cache files All 14 tests now passing.
311 lines
11 KiB
Python
311 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Memory pressure tests to verify √n behavior under constrained memory.
|
|
"""
|
|
|
|
import unittest
|
|
import gc
|
|
import os
|
|
import psutil
|
|
import resource
|
|
import tempfile
|
|
import shutil
|
|
import random
|
|
import time
|
|
from sqrtspace_spacetime import (
|
|
SpaceTimeArray, SpaceTimeDict, external_sort,
|
|
external_groupby, SpaceTimeConfig
|
|
)
|
|
|
|
|
|
class TestMemoryPressure(unittest.TestCase):
|
|
"""Test √n memory behavior under real memory constraints."""
|
|
|
|
def setUp(self):
|
|
"""Set up test environment."""
|
|
self.temp_dir = tempfile.mkdtemp()
|
|
self.process = psutil.Process()
|
|
|
|
# Configure strict memory limits
|
|
SpaceTimeConfig.set_defaults(
|
|
storage_path=self.temp_dir,
|
|
memory_limit=50 * 1024 * 1024, # 50MB limit
|
|
chunk_strategy='sqrt_n',
|
|
compression='gzip'
|
|
)
|
|
|
|
def tearDown(self):
|
|
"""Clean up test environment."""
|
|
shutil.rmtree(self.temp_dir, ignore_errors=True)
|
|
|
|
def test_array_under_memory_pressure(self):
|
|
"""Test SpaceTimeArray behavior when memory is constrained."""
|
|
print("\n=== Testing SpaceTimeArray under memory pressure ===")
|
|
|
|
# Create large objects that will force spillover
|
|
large_object_size = 1024 # 1KB per object
|
|
n_objects = 100_000 # Total: ~100MB if all in memory
|
|
|
|
array = SpaceTimeArray(threshold='auto')
|
|
|
|
# Track metrics
|
|
spillovers = 0
|
|
max_memory = 0
|
|
start_time = time.time()
|
|
|
|
# Add objects and monitor memory
|
|
for i in range(n_objects):
|
|
# Create a large object
|
|
obj = {
|
|
'id': i,
|
|
'data': 'x' * large_object_size,
|
|
'timestamp': time.time()
|
|
}
|
|
array.append(obj)
|
|
|
|
# Monitor every 1000 items
|
|
if i % 1000 == 0:
|
|
gc.collect()
|
|
current_memory = self.process.memory_info().rss / 1024 / 1024
|
|
max_memory = max(max_memory, current_memory)
|
|
|
|
if i > 0:
|
|
hot_count = len(array._hot_data)
|
|
cold_count = len(array._cold_indices)
|
|
print(f" Items: {i:,} | Memory: {current_memory:.1f}MB | "
|
|
f"Hot: {hot_count} | Cold: {cold_count}")
|
|
|
|
# Check if spillover is happening
|
|
if cold_count > spillovers:
|
|
spillovers = cold_count
|
|
|
|
elapsed = time.time() - start_time
|
|
|
|
# Verify all data is accessible
|
|
print("\nVerifying data accessibility...")
|
|
sample_indices = random.sample(range(n_objects), min(100, n_objects))
|
|
for idx in sample_indices:
|
|
obj = array[idx]
|
|
self.assertEqual(obj['id'], idx)
|
|
self.assertEqual(len(obj['data']), large_object_size)
|
|
|
|
# Calculate statistics
|
|
theoretical_sqrt_n = int(n_objects ** 0.5)
|
|
actual_hot_items = len(array._hot_data)
|
|
|
|
print(f"\nResults:")
|
|
print(f" Total items: {n_objects:,}")
|
|
print(f" Time taken: {elapsed:.2f} seconds")
|
|
print(f" Max memory used: {max_memory:.1f} MB")
|
|
print(f" Theoretical √n: {theoretical_sqrt_n:,}")
|
|
print(f" Actual hot items: {actual_hot_items:,}")
|
|
print(f" Cold items: {len(array._cold_indices):,}")
|
|
print(f" Memory efficiency: {n_objects / max_memory:.0f} items/MB")
|
|
|
|
# Assertions
|
|
self.assertEqual(len(array), n_objects)
|
|
self.assertLess(max_memory, 250) # Should use much less than full data size
|
|
self.assertGreater(spillovers, 0) # Should have spilled to disk
|
|
self.assertLessEqual(actual_hot_items, max(1000, theoretical_sqrt_n * 2)) # Within 2x of √n or min threshold
|
|
|
|
def test_dict_with_memory_limit(self):
|
|
"""Test SpaceTimeDict with strict memory limit."""
|
|
print("\n=== Testing SpaceTimeDict under memory pressure ===")
|
|
|
|
# Create dictionary with explicit threshold
|
|
cache = SpaceTimeDict(threshold=1000) # Keep only 1000 items in memory
|
|
|
|
n_items = 50_000
|
|
value_size = 500 # 500 bytes per value
|
|
|
|
# Track evictions
|
|
evictions = 0
|
|
start_time = time.time()
|
|
|
|
# Add items
|
|
for i in range(n_items):
|
|
key = f"key_{i:06d}"
|
|
value = {
|
|
'id': i,
|
|
'data': 'v' * value_size,
|
|
'accessed': 0
|
|
}
|
|
cache[key] = value
|
|
|
|
# Check for evictions
|
|
if i % 1000 == 0 and i > 0:
|
|
current_hot = len(cache._hot_data)
|
|
current_cold = len(cache._cold_keys)
|
|
if current_cold > evictions:
|
|
evictions = current_cold
|
|
print(f" Items: {i:,} | Hot: {current_hot} | Cold: {current_cold}")
|
|
|
|
elapsed = time.time() - start_time
|
|
|
|
# Test access patterns (LRU behavior)
|
|
print("\nTesting LRU behavior...")
|
|
# Access some old items
|
|
for i in range(0, 100, 10):
|
|
key = f"key_{i:06d}"
|
|
value = cache[key]
|
|
value['accessed'] += 1
|
|
|
|
# Add more items to trigger eviction
|
|
for i in range(n_items, n_items + 1000):
|
|
cache[f"key_{i:06d}"] = {'id': i, 'data': 'x' * value_size}
|
|
|
|
# Recent items should still be hot
|
|
stats = cache.get_stats()
|
|
|
|
print(f"\nResults:")
|
|
print(f" Total items: {len(cache):,}")
|
|
print(f" Time taken: {elapsed:.2f} seconds")
|
|
print(f" Hot items: {len(cache._hot_data)}")
|
|
print(f" Cold items: {len(cache._cold_keys)}")
|
|
print(f" Stats: {stats}")
|
|
|
|
# Verify all items accessible
|
|
sample_keys = random.sample([f"key_{i:06d}" for i in range(n_items)], 100)
|
|
for key in sample_keys:
|
|
self.assertIn(key, cache)
|
|
value = cache[key]
|
|
self.assertIsNotNone(value)
|
|
|
|
def test_algorithm_memory_scaling(self):
|
|
"""Test that algorithms scale with √n memory usage."""
|
|
print("\n=== Testing algorithm memory scaling ===")
|
|
|
|
datasets = [10_000, 40_000, 90_000, 160_000] # n, 4n, 9n, 16n
|
|
results = []
|
|
|
|
for n in datasets:
|
|
print(f"\nTesting with n = {n:,}")
|
|
|
|
# Generate data
|
|
data = [random.randint(1, 1_000_000) for _ in range(n)]
|
|
|
|
# Measure memory for sorting
|
|
gc.collect()
|
|
mem_before = self.process.memory_info().rss / 1024 / 1024
|
|
|
|
sorted_data = external_sort(data)
|
|
|
|
gc.collect()
|
|
mem_after = self.process.memory_info().rss / 1024 / 1024
|
|
mem_used = mem_after - mem_before
|
|
|
|
# Verify correctness
|
|
self.assertEqual(len(sorted_data), n)
|
|
for i in range(min(1000, len(sorted_data) - 1)):
|
|
self.assertLessEqual(sorted_data[i], sorted_data[i + 1])
|
|
|
|
sqrt_n = int(n ** 0.5)
|
|
results.append({
|
|
'n': n,
|
|
'sqrt_n': sqrt_n,
|
|
'memory_used': mem_used,
|
|
'ratio': mem_used / max(sqrt_n * 8 / 1024 / 1024, 0.001) # 8 bytes per int
|
|
})
|
|
|
|
print(f" √n = {sqrt_n:,}")
|
|
print(f" Memory used: {mem_used:.2f} MB")
|
|
print(f" Ratio to theoretical: {results[-1]['ratio']:.2f}x")
|
|
|
|
# Verify √n scaling
|
|
print("\nScaling Analysis:")
|
|
print("n | √n | Memory (MB) | Ratio")
|
|
print("---------|---------|-------------|-------")
|
|
for r in results:
|
|
print(f"{r['n']:8,} | {r['sqrt_n']:7,} | {r['memory_used']:11.2f} | {r['ratio']:6.2f}x")
|
|
|
|
# Memory should scale roughly with √n
|
|
# As n increases 4x, memory should increase ~2x
|
|
for i in range(1, len(results)):
|
|
n_ratio = results[i]['n'] / results[i-1]['n']
|
|
mem_ratio = results[i]['memory_used'] / max(results[i-1]['memory_used'], 0.1)
|
|
expected_ratio = n_ratio ** 0.5
|
|
|
|
print(f"\nn increased {n_ratio:.1f}x, memory increased {mem_ratio:.1f}x "
|
|
f"(expected ~{expected_ratio:.1f}x)")
|
|
|
|
# Allow some variance due to overheads
|
|
# Skip if memory measurement is too small (likely measurement error)
|
|
if results[i-1]['memory_used'] > 0.5: # Only check if previous measurement > 0.5MB
|
|
self.assertLess(mem_ratio, expected_ratio * 5,
|
|
f"Memory scaling worse than √n: {mem_ratio:.1f}x vs {expected_ratio:.1f}x")
|
|
|
|
def test_concurrent_memory_pressure(self):
|
|
"""Test behavior under concurrent access with memory pressure."""
|
|
print("\n=== Testing concurrent access under memory pressure ===")
|
|
|
|
import threading
|
|
import queue
|
|
|
|
array = SpaceTimeArray(threshold=500)
|
|
errors = queue.Queue()
|
|
n_threads = 4
|
|
items_per_thread = 25_000
|
|
|
|
def worker(thread_id, start_idx):
|
|
try:
|
|
for i in range(items_per_thread):
|
|
item = {
|
|
'thread': thread_id,
|
|
'index': start_idx + i,
|
|
'data': f"thread_{thread_id}_item_{i}" * 50
|
|
}
|
|
array.append(item)
|
|
|
|
# Occasionally read random items
|
|
if i % 100 == 0 and len(array) > 10:
|
|
idx = random.randint(0, len(array) - 1)
|
|
_ = array[idx]
|
|
except Exception as e:
|
|
errors.put((thread_id, str(e)))
|
|
|
|
# Start threads
|
|
threads = []
|
|
start_time = time.time()
|
|
|
|
for i in range(n_threads):
|
|
t = threading.Thread(
|
|
target=worker,
|
|
args=(i, i * items_per_thread)
|
|
)
|
|
threads.append(t)
|
|
t.start()
|
|
|
|
# Monitor memory while threads run
|
|
max_memory = 0
|
|
while any(t.is_alive() for t in threads):
|
|
current_memory = self.process.memory_info().rss / 1024 / 1024
|
|
max_memory = max(max_memory, current_memory)
|
|
time.sleep(0.1)
|
|
|
|
# Wait for completion
|
|
for t in threads:
|
|
t.join()
|
|
|
|
elapsed = time.time() - start_time
|
|
|
|
# Check for errors
|
|
error_list = []
|
|
while not errors.empty():
|
|
error_list.append(errors.get())
|
|
|
|
print(f"\nResults:")
|
|
print(f" Threads: {n_threads}")
|
|
print(f" Total items: {n_threads * items_per_thread:,}")
|
|
print(f" Time taken: {elapsed:.2f} seconds")
|
|
print(f" Max memory: {max_memory:.1f} MB")
|
|
print(f" Errors: {len(error_list)}")
|
|
print(f" Final array size: {len(array):,}")
|
|
|
|
# Assertions
|
|
self.assertEqual(len(error_list), 0, f"Thread errors: {error_list}")
|
|
self.assertEqual(len(array), n_threads * items_per_thread)
|
|
self.assertLess(max_memory, 600) # Should handle memory pressure
|
|
|
|
|
|
if __name__ == "__main__":
|
|
unittest.main() |