- Fix RuntimeError ("OrderedDict mutated during iteration") in SpaceTimeDict
- Fix memory usage and spillover for proper sqrt_n compliance
- Fix thread synchronization with proper locking (cross-platform)
- Fix FileNotFoundError by ensuring directories are created
- Add external_sort_key to exports
- Adjust memory thresholds and test expectations
- Add comprehensive .gitignore file
- Clean up Python cache files

All 14 tests now passing.
186 lines
6.4 KiB
Python
186 lines
6.4 KiB
Python
"""
|
|
Configuration management for SpaceTime operations.
|
|
"""
|
|
|
|
import math
import os
import tempfile
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, ClassVar, Dict, Optional, Union

import psutil
|
|
|
|
|
|
class ChunkStrategy(Enum):
    """Available policies for choosing how many items go into one chunk."""

    # Chunk size follows the square root of the total size.
    SQRT_N = "sqrt_n"
    # Chunk size is derived from the memory currently available.
    MEMORY_BASED = "memory_based"
    # Chunk size is a configured constant.
    FIXED = "fixed"
    # Starts from sqrt(n) and is adjusted according to memory pressure.
    ADAPTIVE = "adaptive"
|
|
|
|
|
|
class CompressionType(Enum):
    """Codecs that can be selected for data written to external storage."""

    # No compression.
    NONE = "none"
    # Standard-library gzip.
    GZIP = "gzip"
    # The remaining codecs rely on optional third-party packages.
    LZ4 = "lz4"
    ZSTD = "zstd"
    SNAPPY = "snappy"
|
|
|
|
|
|
@dataclass
class MemoryHierarchy:
    """Capacity of each tier of the memory hierarchy.

    The cache tiers default to fixed sizes; the RAM and disk totals are
    queried from ``psutil`` when an instance is created without explicit
    values for them.
    """

    l1_cache: int = field(default_factory=lambda: 32 * 1024)  # 32KB
    l2_cache: int = field(default_factory=lambda: 256 * 1024)  # 256KB
    l3_cache: int = field(default_factory=lambda: 8 * 1024 * 1024)  # 8MB
    ram: int = field(default_factory=lambda: psutil.virtual_memory().total)
    disk: int = field(default_factory=lambda: psutil.disk_usage('/').total)

    def get_optimal_buffer_size(self, total_size: int) -> int:
        """Return a buffer size of roughly ``sqrt(total_size)``.

        Args:
            total_size: Size of the data set the buffer will serve.

        Returns:
            The integer square root of ``total_size`` when that fits in
            the L3 cache; otherwise the smaller of that square root and
            10% of the RAM currently reported as available.
        """
        root = int(math.sqrt(total_size))

        if root > self.l3_cache:
            # Does not fit in L3: cap the buffer at a tenth of free RAM.
            ram_budget = int(psutil.virtual_memory().available * 0.1)
            return min(root, ram_budget)

        return root
|
|
|
|
|
|
@dataclass
class SpaceTimeConfig:
    """Global configuration for SpaceTime operations.

    Instances can be created directly, but most code is expected to share
    the process-wide singleton returned by :meth:`get_instance`.
    Constructing an instance creates ``external_storage_path`` on disk if
    it does not exist yet.
    """

    # Memory limits
    memory_limit: int = field(
        default_factory=lambda: int(psutil.virtual_memory().total * 0.8)
    )
    memory_threshold: float = 0.8  # Trigger spillover at 80% usage

    # Storage
    external_storage_path: str = field(
        default_factory=lambda: os.path.join(tempfile.gettempdir(), "spacetime")
    )
    compression: CompressionType = CompressionType.GZIP
    compression_level: int = 6

    # Chunking
    chunk_strategy: ChunkStrategy = ChunkStrategy.SQRT_N
    fixed_chunk_size: int = 1000
    min_chunk_size: int = 10
    max_chunk_size: int = 10_000

    # Checkpointing
    enable_checkpointing: bool = True
    checkpoint_interval: int = 60  # seconds
    checkpoint_storage: str = "file"  # "file", "redis", "s3"

    # Performance
    enable_profiling: bool = False
    parallel_workers: int = field(
        default_factory=lambda: min(4, os.cpu_count() or 1)
    )
    prefetch_size: int = 2  # Number of chunks to prefetch

    # Memory hierarchy
    hierarchy: MemoryHierarchy = field(default_factory=MemoryHierarchy)

    # Singleton storage. Declared as ClassVar so the dataclass machinery does
    # not turn it into a per-instance field (an __init__ parameter that also
    # shows up in __repr__, __eq__ and fields()).
    _instance: ClassVar[Optional['SpaceTimeConfig']] = None

    def __post_init__(self):
        """Create the external storage directory if it is missing."""
        os.makedirs(self.external_storage_path, exist_ok=True)

    @classmethod
    def get_instance(cls) -> 'SpaceTimeConfig':
        """Return the shared instance, creating it on first use.

        No locking is performed around the lazy creation.
        """
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    @classmethod
    def set_defaults(cls, **kwargs) -> None:
        """Overwrite attributes of the shared instance.

        Args:
            **kwargs: Attribute names mapped to their new values. Names
                that are not attributes of the instance are ignored.

        Raises:
            OSError: If ``external_storage_path`` is updated and the new
                directory cannot be created.
        """
        instance = cls.get_instance()
        for key, value in kwargs.items():
            if hasattr(instance, key):
                setattr(instance, key, value)

        # __post_init__ only ran for the original path; keep its guarantee
        # that the storage directory exists when the path is changed here.
        if "external_storage_path" in kwargs:
            os.makedirs(instance.external_storage_path, exist_ok=True)

    def _clamp_chunk_size(self, size: int) -> int:
        """Clamp *size* into ``[min_chunk_size, max_chunk_size]``."""
        return max(self.min_chunk_size, min(size, self.max_chunk_size))

    def calculate_chunk_size(self, total_size: int) -> int:
        """Calculate the chunk size dictated by ``chunk_strategy``.

        Args:
            total_size: Number of items to be processed. Negative values
                are treated as 0.

        Returns:
            ``fixed_chunk_size`` for the FIXED strategy (and for any
            unrecognised strategy value); for every other strategy a
            value within ``[min_chunk_size, max_chunk_size]``.
        """
        strategy = self.chunk_strategy

        if strategy == ChunkStrategy.FIXED:
            return self.fixed_chunk_size

        if strategy == ChunkStrategy.SQRT_N:
            # max(..., 0) keeps math.sqrt from raising on negative input.
            sqrt_n = int(math.sqrt(max(total_size, 0)))
            return self._clamp_chunk_size(sqrt_n)

        if strategy == ChunkStrategy.MEMORY_BASED:
            available = psutil.virtual_memory().available
            # Use 10% of available memory for chunks
            chunk_size = int(available * 0.1 / 8)  # Assume 8 bytes per item
            return self._clamp_chunk_size(chunk_size)

        if strategy == ChunkStrategy.ADAPTIVE:
            # Start with sqrt(n) and adjust based on memory pressure
            base_size = int(math.sqrt(max(total_size, 0)))
            memory_percent = psutil.virtual_memory().percent

            if memory_percent > 90:
                # Very high pressure: use minimum size
                return self.min_chunk_size
            if memory_percent > 70:
                # High pressure: halve the chunk size. Clamped on both sides
                # so a large input cannot exceed max_chunk_size exactly when
                # memory is scarce.
                return self._clamp_chunk_size(base_size // 2)
            if memory_percent < 30:
                # Low pressure: double the chunk size, still honouring
                # min_chunk_size for small inputs.
                return self._clamp_chunk_size(base_size * 2)
            # Normal pressure: use sqrt(n)
            return self._clamp_chunk_size(base_size)

        # Unrecognised strategy value: fall back to the fixed size.
        return self.fixed_chunk_size

    def get_compression_module(self):
        """Return the module implementing the configured compression.

        Returns:
            ``gzip`` for GZIP; ``lz4.frame``, ``zstandard`` or ``snappy``
            for the matching setting when that package can be imported,
            otherwise ``gzip`` as a fallback; ``None`` for any other
            setting (including NONE).
        """
        if self.compression == CompressionType.GZIP:
            import gzip
            return gzip
        elif self.compression == CompressionType.LZ4:
            try:
                import lz4.frame
                return lz4.frame
            except ImportError:
                import gzip
                return gzip
        elif self.compression == CompressionType.ZSTD:
            try:
                import zstandard
                return zstandard
            except ImportError:
                import gzip
                return gzip
        elif self.compression == CompressionType.SNAPPY:
            try:
                import snappy
                return snappy
            except ImportError:
                import gzip
                return gzip
        else:
            return None

    def format_bytes(self, bytes: int) -> str:
        """Format a byte count as a human-readable string.

        Args:
            bytes: Number of bytes. May be negative (for example a
                delta); the magnitude decides the unit.

        Returns:
            The value with two decimals and a unit from B to PB, using
            1024 as the step between units, e.g. ``"1.50 KB"``.
        """
        size = bytes
        for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
            # Compare the magnitude so negative values are scaled too.
            if abs(size) < 1024.0:
                return f"{size:.2f} {unit}"
            size /= 1024.0
        return f"{size:.2f} PB"

    def get_williams_bound(self, time_complexity: int) -> int:
        """Calculate Williams' space bound: SPACE[√(t log t)].

        Args:
            time_complexity: The time bound ``t``.

        Returns:
            ``int(sqrt(t * log2(max(2, t))))``, or 1 when ``t`` is not
            positive.
        """
        if time_complexity <= 0:
            return 1
        log_t = math.log2(max(2, time_complexity))
        return int(math.sqrt(time_complexity * log_t))
|
|
|
|
|
|
# Module-level handle on the shared SpaceTimeConfig singleton.
config = SpaceTimeConfig.get_instance()