#!/usr/bin/env python3
"""
Distributed Shuffle Optimizer: Optimize shuffle operations in distributed computing

Features:
- Buffer Sizing: Calculate optimal buffer sizes per node
- Spill Strategy: Decide when to spill based on memory pressure
- Aggregation Trees: Build √n-height aggregation trees
- Network Awareness: Consider network topology in optimization
- AI Explanations: Clear reasoning for optimization decisions
"""

import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import json
import time
from dataclasses import dataclass, asdict
from enum import Enum
from typing import Dict, List, Optional, Tuple

import numpy as np

# Import core components
from core.spacetime_core import MemoryHierarchy, SqrtNCalculator


class ShuffleStrategy(Enum):
    """Shuffle strategies for distributed systems"""
    ALL_TO_ALL = "all_to_all"            # Every node to every node
    TREE_AGGREGATE = "tree_aggregate"    # Hierarchical aggregation
    HASH_PARTITION = "hash_partition"    # Hash-based partitioning
    RANGE_PARTITION = "range_partition"  # Range-based partitioning
    COMBINER_BASED = "combiner_based"    # Local combining first


class CompressionType(Enum):
    """Compression algorithms for shuffle data"""
    NONE = "none"
    SNAPPY = "snappy"  # Fast, moderate compression
    ZLIB = "zlib"      # Slower, better compression
    LZ4 = "lz4"        # Very fast, light compression


@dataclass
class NodeInfo:
    """Information about a compute node"""
    node_id: str
    hostname: str
    cpu_cores: int
    memory_gb: float
    network_bandwidth_gbps: float
    storage_type: str  # 'ssd' or 'hdd'
    rack_id: Optional[str] = None


@dataclass
class ShuffleTask:
    """A shuffle task specification"""
    task_id: str
    input_partitions: int
    output_partitions: int
    data_size_gb: float
    key_distribution: str  # 'uniform', 'skewed', 'heavy_hitters'
    value_size_avg: int    # Average value size in bytes
    combiner_function: Optional[str] = None  # 'sum', 'max', 'collect', etc.

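# NOTE: SqrtNCalculator is provided by core.spacetime_core; this module only
# assumes that calculate_interval(n) returns an interval of roughly sqrt(n)
# items (the usual sqrt-space/recompute tradeoff), i.e. something like
# max(1, int(math.isqrt(n))). Worked example for the buffer sizing further
# down: 10 GB of 100-byte records is n = 1e8 records, so sqrt(n) = 1e4
# records, a ~1 MB flush unit -- buffer memory grows with the square root
# of the data instead of linearly.
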
@dataclass
class ShufflePlan:
    """Optimized shuffle execution plan"""
    strategy: ShuffleStrategy
    buffer_sizes: Dict[str, int]        # node_id -> buffer size in bytes
    spill_thresholds: Dict[str, float]  # node_id -> spill threshold in bytes
    aggregation_tree: Optional[Dict[str, List[str]]]  # parent -> children
    compression: CompressionType
    partition_assignment: Dict[int, str]  # partition -> node_id
    estimated_time: float
    estimated_network_usage: float
    memory_usage: Dict[str, float]
    explanation: str


@dataclass
class ShuffleMetrics:
    """Metrics from shuffle execution"""
    total_time: float
    network_bytes: int
    disk_spills: int
    memory_peak: int
    compression_ratio: float
    skew_factor: float  # Max/avg partition size


class NetworkTopology:
    """Model network topology for optimization"""

    def __init__(self, nodes: List[NodeInfo]):
        self.nodes = {n.node_id: n for n in nodes}
        self.racks = self._group_by_rack(nodes)
        self.bandwidth_matrix = self._build_bandwidth_matrix()

    def _group_by_rack(self, nodes: List[NodeInfo]) -> Dict[str, List[str]]:
        """Group nodes by rack"""
        racks: Dict[str, List[str]] = {}
        for node in nodes:
            rack = node.rack_id or 'default'
            if rack not in racks:
                racks[rack] = []
            racks[rack].append(node.node_id)
        return racks

    def _build_bandwidth_matrix(self) -> Dict[Tuple[str, str], float]:
        """Build a pairwise bandwidth matrix between nodes"""
        matrix = {}
        for n1 in self.nodes:
            for n2 in self.nodes:
                if n1 == n2:
                    matrix[(n1, n2)] = float('inf')  # Local: no network hop
                elif self._same_rack(n1, n2):
                    # Same rack: limited by the slower NIC
                    matrix[(n1, n2)] = min(
                        self.nodes[n1].network_bandwidth_gbps,
                        self.nodes[n2].network_bandwidth_gbps
                    )
                else:
                    # Cross-rack: assume 50% of node bandwidth
                    matrix[(n1, n2)] = min(
                        self.nodes[n1].network_bandwidth_gbps,
                        self.nodes[n2].network_bandwidth_gbps
                    ) * 0.5
        return matrix

    def _same_rack(self, node1: str, node2: str) -> bool:
        """Check if two nodes are in the same rack"""
        r1 = self.nodes[node1].rack_id or 'default'
        r2 = self.nodes[node2].rack_id or 'default'
        return r1 == r2

    def get_bandwidth(self, src: str, dst: str) -> float:
        """Get bandwidth between two nodes in Gbps"""
        return self.bandwidth_matrix.get((src, dst), 1.0)

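# Illustrative-only sanity check of the topology model above. The cluster
# below is hypothetical; the expected numbers simply restate the assumptions
# baked into _build_bandwidth_matrix (min NIC speed in-rack, 50% cross-rack).
def _demo_topology() -> None:
    """Tiny NetworkTopology walkthrough; not used by the optimizer itself."""
    demo_nodes = [
        NodeInfo("a", "a.local", 8, 32.0, 10.0, "ssd", rack_id="rack0"),
        NodeInfo("b", "b.local", 8, 32.0, 10.0, "ssd", rack_id="rack0"),
        NodeInfo("c", "c.local", 8, 32.0, 10.0, "ssd", rack_id="rack1"),
    ]
    topo = NetworkTopology(demo_nodes)
    assert topo.get_bandwidth("a", "b") == 10.0          # same rack: min NIC speed
    assert topo.get_bandwidth("a", "c") == 5.0           # cross-rack: 50% penalty
    assert topo.get_bandwidth("a", "a") == float("inf")  # local access, no hop
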
class CostModel:
    """Cost model for shuffle operations"""

    def __init__(self, topology: NetworkTopology):
        self.topology = topology
        self.hierarchy = MemoryHierarchy.detect_system()

    def estimate_shuffle_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate shuffle execution time"""
        network_time = self._estimate_network_time(task, plan)
        io_time = self._estimate_io_time(task, plan)    # Disk I/O (if spilling)
        cpu_time = self._estimate_cpu_time(task, plan)  # Serialization, compression

        # Network and disk transfers overlap, so take their max; assume ~10%
        # of the CPU work cannot be hidden behind I/O.
        return max(network_time, io_time) + cpu_time * 0.1

    def _avg_bandwidth_gbps(self) -> float:
        """Average inter-node bandwidth, excluding the infinite self-links"""
        remote = [
            bw for (src, dst), bw in self.topology.bandwidth_matrix.items()
            if src != dst
        ]
        return float(np.mean(remote)) if remote else 1.0

    def _estimate_network_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate network transfer time"""
        bytes_per_partition = task.data_size_gb * 1e9 / task.input_partitions
        avg_bandwidth = self._avg_bandwidth_gbps()

        if plan.strategy == ShuffleStrategy.ALL_TO_ALL:
            # Every partition travels to every node
            total_bytes = task.data_size_gb * 1e9
            return total_bytes / (avg_bandwidth * 1e9)
        elif plan.strategy == ShuffleStrategy.TREE_AGGREGATE:
            # √n levels in the tree, matching the aggregation tree built below
            num_nodes = len(self.topology.nodes)
            tree_height = np.sqrt(num_nodes)
            bytes_per_level = task.data_size_gb * 1e9 / tree_height
            return tree_height * bytes_per_level / (avg_bandwidth * 1e9)
        else:
            # Hash/range partition: each partition goes to one node
            return bytes_per_partition * task.output_partitions / (avg_bandwidth * 1e9)

    def _estimate_io_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate disk I/O time if spilling"""
        total_time = 0.0
        for node_id, buffer_size in plan.buffer_sizes.items():
            node = self.topology.nodes[node_id]
            # Estimate how much this node spills past its buffer
            node_data = task.data_size_gb * 1e9 / len(self.topology.nodes)
            if node_data > buffer_size:
                spill_amount = node_data - buffer_size
                # Assume 500MB/s for SSD, 200MB/s for HDD
                io_speed = 500e6 if node.storage_type == 'ssd' else 200e6
                total_time += spill_amount / io_speed
        return total_time

    def _estimate_cpu_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate CPU time for serialization and compression"""
        total_cores = sum(n.cpu_cores for n in self.topology.nodes.values())

        # Serialization cost
        serialize_rate = 1e9  # 1GB/s per core
        serialize_time = task.data_size_gb * 1e9 / (serialize_rate * total_cores)

        # Compression cost
        if plan.compression != CompressionType.NONE:
            if plan.compression == CompressionType.ZLIB:
                compress_rate = 100e6  # 100MB/s per core
            elif plan.compression == CompressionType.SNAPPY:
                compress_rate = 500e6  # 500MB/s per core
            else:  # LZ4
                compress_rate = 1e9    # 1GB/s per core
            compress_time = task.data_size_gb * 1e9 / (compress_rate * total_cores)
        else:
            compress_time = 0.0

        return serialize_time + compress_time

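# Back-of-envelope check of _estimate_cpu_time, using only the constants
# above and the 4-node test cluster defined at the bottom of this file
# (4 x 16 = 64 cores), for a 100 GB shuffle with SNAPPY:
#   serialize: 100e9 / (1e9 * 64)   ~= 1.6 s
#   compress:  100e9 / (500e6 * 64) ~= 3.1 s
# i.e. ~4.7 s of CPU work, of which only the 10% non-overlapped share
# (~0.5 s) reaches the final estimate in estimate_shuffle_time.
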
class ShuffleOptimizer:
    """Main distributed shuffle optimizer"""

    def __init__(self, nodes: List[NodeInfo], memory_limit_fraction: float = 0.5):
        self.topology = NetworkTopology(nodes)
        self.cost_model = CostModel(self.topology)
        self.memory_limit_fraction = memory_limit_fraction
        self.sqrt_calc = SqrtNCalculator()

    def optimize_shuffle(self, task: ShuffleTask) -> ShufflePlan:
        """Generate an optimized shuffle plan"""
        # Choose strategy based on task characteristics
        strategy = self._choose_strategy(task)

        # Calculate buffer sizes using the √n principle
        buffer_sizes = self._calculate_buffer_sizes(task)

        # Determine spill thresholds
        spill_thresholds = self._calculate_spill_thresholds(task, buffer_sizes)

        # Build aggregation tree if needed
        aggregation_tree = None
        if strategy == ShuffleStrategy.TREE_AGGREGATE:
            aggregation_tree = self._build_aggregation_tree()

        # Choose compression
        compression = self._choose_compression(task)

        # Assign partitions to nodes
        partition_assignment = self._assign_partitions(task, strategy)

        plan = ShufflePlan(
            strategy=strategy,
            buffer_sizes=buffer_sizes,
            spill_thresholds=spill_thresholds,
            aggregation_tree=aggregation_tree,
            compression=compression,
            partition_assignment=partition_assignment,
            estimated_time=0.0,
            estimated_network_usage=0.0,
            memory_usage={},
            explanation=""
        )

        # Fill in the estimates and the explanation
        plan.estimated_time = self.cost_model.estimate_shuffle_time(task, plan)
        plan.estimated_network_usage = self._estimate_network_usage(task, plan)
        plan.memory_usage = self._estimate_memory_usage(task, plan)
        plan.explanation = self._generate_explanation(task, plan)

        return plan

    def _choose_strategy(self, task: ShuffleTask) -> ShuffleStrategy:
        """Choose shuffle strategy based on task characteristics"""
        # Small data: all-to-all is fine
        if task.data_size_gb < 1:
            return ShuffleStrategy.ALL_TO_ALL

        # Has combiner: use combining strategy
        if task.combiner_function:
            return ShuffleStrategy.COMBINER_BASED

        # Many nodes: use tree aggregation
        if len(self.topology.nodes) > 10:
            return ShuffleStrategy.TREE_AGGREGATE

        # Skewed data: use range partitioning
        if task.key_distribution == 'skewed':
            return ShuffleStrategy.RANGE_PARTITION

        # Default: hash partitioning
        return ShuffleStrategy.HASH_PARTITION

    def _calculate_buffer_sizes(self, task: ShuffleTask) -> Dict[str, int]:
        """Calculate optimal buffer sizes using the √n principle"""
        buffer_sizes = {}
        for node_id, node in self.topology.nodes.items():
            # Memory available to the shuffle on this node
            available_memory = node.memory_gb * 1e9 * self.memory_limit_fraction

            # Data expected on this node
            data_per_node = task.data_size_gb * 1e9 / len(self.topology.nodes)

            if data_per_node <= available_memory:
                # Everything fits in memory
                buffer_size = int(data_per_node)
            else:
                # Use a √n buffer: √(record count) * record size
                sqrt_buffer = self.sqrt_calc.calculate_interval(
                    int(data_per_node / task.value_size_avg)
                ) * task.value_size_avg
                buffer_size = min(int(sqrt_buffer), int(available_memory))

            buffer_sizes[node_id] = buffer_size
        return buffer_sizes

    def _calculate_spill_thresholds(self, task: ShuffleTask,
                                    buffer_sizes: Dict[str, int]) -> Dict[str, float]:
        """Calculate memory thresholds for spilling"""
        thresholds = {}
        for node_id, buffer_size in buffer_sizes.items():
            # Spill at 80% of the buffer to leave headroom
            thresholds[node_id] = buffer_size * 0.8
        return thresholds

    def _build_aggregation_tree(self) -> Dict[str, List[str]]:
        """Build a √n-height aggregation tree"""
        nodes = list(self.topology.nodes.keys())
        n = len(nodes)

        # Choose a branching factor that yields ~√n height
        height = int(np.sqrt(n))
        branching_factor = int(np.ceil(n ** (1 / height)))

        tree: Dict[str, List[str]] = {}
        # Build the tree level by level, promoting one node per group
        current_level = nodes[:]
        while len(current_level) > 1:
            next_level = []
            for i in range(0, len(current_level), branching_factor):
                group = current_level[i:i + branching_factor]
                if len(group) > 1:
                    parent = group[0]         # First node acts as parent
                    tree[parent] = group[1:]  # Remaining nodes are its children
                    next_level.append(parent)
                elif group:
                    next_level.append(group[0])
            current_level = next_level
        return tree

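    # Worked trace of _build_aggregation_tree for the 16-node example at the
    # bottom of this file: height = int(sqrt(16)) = 4 and branching_factor =
    # ceil(16 ** (1/4)) = 2, so the while-loop halves each level,
    # 16 -> 8 -> 4 -> 2 -> 1, leaving a single root after 4 = sqrt(16)
    # aggregation rounds.
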
    def _choose_compression(self, task: ShuffleTask) -> CompressionType:
        """Choose compression based on data characteristics and network"""
        # Average network bandwidth across nodes
        avg_bandwidth = np.mean([
            n.network_bandwidth_gbps for n in self.topology.nodes.values()
        ])

        # High bandwidth: the network is not the bottleneck, skip compression
        if avg_bandwidth > 10:  # 10+ Gbps
            return CompressionType.NONE

        # Large values: compress well, use the stronger codec
        if task.value_size_avg > 1000:
            return CompressionType.ZLIB

        # Medium bandwidth (1-10 Gbps): balanced compression
        if avg_bandwidth > 1:
            return CompressionType.SNAPPY

        # Low bandwidth: every byte counts, spend CPU on better compression
        return CompressionType.ZLIB

    def _assign_partitions(self, task: ShuffleTask,
                           strategy: ShuffleStrategy) -> Dict[int, str]:
        """Assign output partitions to nodes"""
        nodes = list(self.topology.nodes.keys())
        assignment = {}

        if strategy == ShuffleStrategy.HASH_PARTITION:
            # Round-robin assignment
            for i in range(task.output_partitions):
                assignment[i] = nodes[i % len(nodes)]
        elif strategy == ShuffleStrategy.RANGE_PARTITION:
            # Assign contiguous ranges to nodes
            partitions_per_node = task.output_partitions // len(nodes)
            for i, node in enumerate(nodes):
                start = i * partitions_per_node
                end = start + partitions_per_node
                if i == len(nodes) - 1:
                    end = task.output_partitions  # Last node takes the remainder
                for p in range(start, end):
                    assignment[p] = node
        else:
            # Default: even round-robin distribution
            for i in range(task.output_partitions):
                assignment[i] = nodes[i % len(nodes)]

        return assignment

    def _estimate_network_usage(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate total network bytes"""
        base_bytes = task.data_size_gb * 1e9

        # Apply compression ratio
        base_bytes *= self._compression_factor(plan.compression)

        # Apply strategy multiplier
        if plan.strategy == ShuffleStrategy.ALL_TO_ALL:
            n = len(self.topology.nodes)
            base_bytes *= (n - 1) / n  # Each node keeps its local share
        elif plan.strategy == ShuffleStrategy.TREE_AGGREGATE:
            # √n levels, matching the aggregation tree built above
            base_bytes *= np.sqrt(len(self.topology.nodes))

        return base_bytes

    @staticmethod
    def _compression_factor(compression: CompressionType) -> float:
        """Approximate size multiplier after compression"""
        return {
            CompressionType.ZLIB: 0.3,    # ~70% reduction
            CompressionType.SNAPPY: 0.5,  # ~50% reduction
            CompressionType.LZ4: 0.7,     # ~30% reduction
        }.get(compression, 1.0)

    def _estimate_memory_usage(self, task: ShuffleTask,
                               plan: ShufflePlan) -> Dict[str, float]:
        """Estimate memory usage per node"""
        memory_usage = {}
        for node_id in self.topology.nodes:
            # Buffer memory
            buffer_mem = plan.buffer_sizes[node_id]

            # Overhead for metadata and indices
            overhead = buffer_mem * 0.1

            # Compression scratch buffers, if used
            compress_mem = 0.0
            if plan.compression != CompressionType.NONE:
                compress_mem = min(buffer_mem * 0.1, 100 * 1024 * 1024)  # Cap at 100MB

            memory_usage[node_id] = buffer_mem + overhead + compress_mem
        return memory_usage

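    # Example of the accounting above: a 1 GB buffer with compression enabled
    # is reported as 1 GB + 100 MB metadata overhead + 100 MB (capped)
    # compression scratch, roughly 1.2 GB, so callers should budget ~20%
    # beyond the raw buffer size per node.
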
    def _generate_explanation(self, task: ShuffleTask, plan: ShufflePlan) -> str:
        """Generate a human-readable explanation of the plan"""
        explanations = []

        # Strategy rationale
        strategy_reasons = {
            ShuffleStrategy.ALL_TO_ALL: "small data size allows full exchange",
            ShuffleStrategy.TREE_AGGREGATE:
                f"√n-height tree reduces network hops to "
                f"{int(np.sqrt(len(self.topology.nodes)))}",
            ShuffleStrategy.HASH_PARTITION: "uniform data distribution suits hash partitioning",
            ShuffleStrategy.RANGE_PARTITION: "skewed data benefits from range partitioning",
            ShuffleStrategy.COMBINER_BASED: "combiner function enables local aggregation"
        }
        explanations.append(
            f"Using {plan.strategy.value} strategy because {strategy_reasons[plan.strategy]}."
        )

        # Buffer sizing
        avg_buffer_mb = np.mean(list(plan.buffer_sizes.values())) / 1e6
        explanations.append(
            f"Allocated {avg_buffer_mb:.0f}MB buffers per node using the √n principle "
            f"to balance memory usage and I/O."
        )

        # Compression (reduction comes from the codec factor alone, so the
        # strategy multiplier cannot skew it)
        if plan.compression != CompressionType.NONE:
            reduction = (1 - self._compression_factor(plan.compression)) * 100
            explanations.append(
                f"Applied {plan.compression.value} compression to reduce network "
                f"traffic by ~{reduction:.0f}%."
            )

        # Performance estimate
        explanations.append(
            f"Estimated completion time: {plan.estimated_time:.1f}s with "
            f"{plan.estimated_network_usage / 1e9:.1f}GB network transfer."
        )

        return " ".join(explanations)

    def execute_shuffle(self, task: ShuffleTask, plan: ShufflePlan) -> ShuffleMetrics:
        """Simulate shuffle execution (for testing)"""
        start_time = time.time()

        # Simulate some work
        time.sleep(0.1)

        per_node_data = task.data_size_gb * 1e9 / len(self.topology.nodes)
        metrics = ShuffleMetrics(
            total_time=time.time() - start_time,
            network_bytes=int(plan.estimated_network_usage),
            disk_spills=sum(
                1 for b in plan.buffer_sizes.values() if b < per_node_data
            ),
            memory_peak=int(max(plan.memory_usage.values())),
            compression_ratio=1.0,
            skew_factor=1.0
        )

        if plan.compression == CompressionType.ZLIB:
            metrics.compression_ratio = 3.3
        elif plan.compression == CompressionType.SNAPPY:
            metrics.compression_ratio = 2.0
        elif plan.compression == CompressionType.LZ4:
            metrics.compression_ratio = 1.4

        return metrics


def create_test_cluster(num_nodes: int = 4) -> List[NodeInfo]:
    """Create a test cluster configuration"""
    nodes = []
    for i in range(num_nodes):
        node = NodeInfo(
            node_id=f"node{i}",
            hostname=f"worker{i}.cluster.local",
            cpu_cores=16,
            memory_gb=64,
            network_bandwidth_gbps=10.0,
            storage_type='ssd',
            rack_id=f"rack{i // 2}"  # 2 nodes per rack
        )
        nodes.append(node)
    return nodes


# Example usage
if __name__ == "__main__":
    print("Distributed Shuffle Optimizer Example")
    print("=" * 60)

    # Create test cluster
    nodes = create_test_cluster(4)
    optimizer = ShuffleOptimizer(nodes)

    # Example 1: Small uniform shuffle
    print("\nExample 1: Small uniform shuffle")
    task1 = ShuffleTask(
        task_id="shuffle_1",
        input_partitions=100,
        output_partitions=100,
        data_size_gb=0.5,
        key_distribution='uniform',
        value_size_avg=100
    )
    plan1 = optimizer.optimize_shuffle(task1)
    print(f"Strategy: {plan1.strategy.value}")
    print(f"Compression: {plan1.compression.value}")
    print(f"Estimated time: {plan1.estimated_time:.2f}s")
    print(f"Explanation: {plan1.explanation}")

    # Example 2: Large skewed shuffle with a combiner
    print("\n\nExample 2: Large skewed shuffle with combiner")
    task2 = ShuffleTask(
        task_id="shuffle_2",
        input_partitions=1000,
        output_partitions=500,
        data_size_gb=100,
        key_distribution='skewed',
        value_size_avg=1000,
        combiner_function='sum'
    )
    plan2 = optimizer.optimize_shuffle(task2)
    print(f"Strategy: {plan2.strategy.value}")
    print(f"Buffer sizes: {list(plan2.buffer_sizes.values())[0] / 1e9:.1f}GB per node")
    print(f"Network usage: {plan2.estimated_network_usage / 1e9:.1f}GB")
    print(f"Explanation: {plan2.explanation}")

    # Example 3: Many nodes with tree aggregation
    # (no combiner here, so the node count drives strategy selection)
    print("\n\nExample 3: Many nodes with tree aggregation")
    large_cluster = create_test_cluster(16)
    large_optimizer = ShuffleOptimizer(large_cluster)
    task3 = ShuffleTask(
        task_id="shuffle_3",
        input_partitions=10000,
        output_partitions=16,
        data_size_gb=50,
        key_distribution='uniform',
        value_size_avg=200
    )
    plan3 = large_optimizer.optimize_shuffle(task3)
    print(f"Strategy: {plan3.strategy.value}")
    if plan3.aggregation_tree:
        print(f"Tree height: {int(np.sqrt(len(large_cluster)))}")
        print(f"Tree structure sample: {list(plan3.aggregation_tree.items())[:3]}")
    print(f"Explanation: {plan3.explanation}")

    # Simulate execution
    print("\n\nSimulating shuffle execution...")
    metrics = optimizer.execute_shuffle(task1, plan1)
    print(f"Execution time: {metrics.total_time:.3f}s")
    print(f"Network bytes: {metrics.network_bytes / 1e6:.1f}MB")
    print(f"Compression ratio: {metrics.compression_ratio:.1f}x")
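    # Hypothetical hand-off step (not part of the pipeline above): serialize
    # the plan so an external scheduler could consume it. asdict() flattens
    # the dataclass; the two enum fields are swapped for their string values
    # so the dict is JSON-safe.
    plan_dict = asdict(plan1)
    plan_dict["strategy"] = plan1.strategy.value
    plan_dict["compression"] = plan1.compression.value
    print("\nPlan 1 as JSON (first 200 chars):")
    print(json.dumps(plan_dict)[:200] + "...")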