#!/usr/bin/env python3
"""
Distributed Shuffle Optimizer: Optimize shuffle operations in distributed computing
Features:
- Buffer Sizing: Calculate optimal buffer sizes per node
- Spill Strategy: Decide when to spill based on memory pressure
- Aggregation Trees: Build √n-height aggregation trees
- Network Awareness: Consider network topology in optimization
- AI Explanations: Clear reasoning for optimization decisions
"""
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import numpy as np
import json
import time
import psutil
import socket
from dataclasses import dataclass, asdict
from typing import Dict, List, Tuple, Optional, Any, Union
from enum import Enum
import heapq
import zlib
# Import core components
from core.spacetime_core import (
    MemoryHierarchy,
    SqrtNCalculator,
    OptimizationStrategy,
    MemoryProfiler
)


class ShuffleStrategy(Enum):
    """Shuffle strategies for distributed systems"""
    ALL_TO_ALL = "all_to_all"            # Every node to every node
    TREE_AGGREGATE = "tree_aggregate"    # Hierarchical aggregation
    HASH_PARTITION = "hash_partition"    # Hash-based partitioning
    RANGE_PARTITION = "range_partition"  # Range-based partitioning
    COMBINER_BASED = "combiner_based"    # Local combining first


class CompressionType(Enum):
    """Compression algorithms for shuffle data"""
    NONE = "none"
    SNAPPY = "snappy"  # Fast, moderate compression
    ZLIB = "zlib"      # Slower, better compression
    LZ4 = "lz4"        # Very fast, light compression


@dataclass
class NodeInfo:
    """Information about a compute node"""
    node_id: str
    hostname: str
    cpu_cores: int
    memory_gb: float
    network_bandwidth_gbps: float
    storage_type: str  # 'ssd' or 'hdd'
    rack_id: Optional[str] = None


@dataclass
class ShuffleTask:
    """A shuffle task specification"""
    task_id: str
    input_partitions: int
    output_partitions: int
    data_size_gb: float
    key_distribution: str  # 'uniform', 'skewed', 'heavy_hitters'
    value_size_avg: int    # Average value size in bytes
    combiner_function: Optional[str] = None  # 'sum', 'max', 'collect', etc.


@dataclass
class ShufflePlan:
    """Optimized shuffle execution plan"""
    strategy: ShuffleStrategy
    buffer_sizes: Dict[str, int]        # node_id -> buffer_size
    spill_thresholds: Dict[str, float]  # node_id -> threshold
    aggregation_tree: Optional[Dict[str, List[str]]]  # parent -> children
    compression: CompressionType
    partition_assignment: Dict[int, str]  # partition -> node_id
    estimated_time: float
    estimated_network_usage: float
    memory_usage: Dict[str, float]
    explanation: str


@dataclass
class ShuffleMetrics:
    """Metrics from shuffle execution"""
    total_time: float
    network_bytes: int
    disk_spills: int
    memory_peak: int
    compression_ratio: float
    skew_factor: float  # Max/avg partition size


class NetworkTopology:
    """Model network topology for optimization"""

    def __init__(self, nodes: List[NodeInfo]):
        self.nodes = {n.node_id: n for n in nodes}
        self.racks = self._group_by_rack(nodes)
        self.bandwidth_matrix = self._build_bandwidth_matrix()

    def _group_by_rack(self, nodes: List[NodeInfo]) -> Dict[str, List[str]]:
        """Group nodes by rack"""
        racks = {}
        for node in nodes:
            rack = node.rack_id or 'default'
            if rack not in racks:
                racks[rack] = []
            racks[rack].append(node.node_id)
        return racks

    def _build_bandwidth_matrix(self) -> Dict[Tuple[str, str], float]:
        """Build bandwidth matrix between nodes"""
        matrix = {}
        for n1 in self.nodes:
            for n2 in self.nodes:
                if n1 == n2:
                    matrix[(n1, n2)] = float('inf')  # Local: no network hop
                elif self._same_rack(n1, n2):
                    # Same rack: limited by the slower node's link
                    matrix[(n1, n2)] = min(
                        self.nodes[n1].network_bandwidth_gbps,
                        self.nodes[n2].network_bandwidth_gbps
                    )
                else:
                    # Cross-rack: assume 50% of node bandwidth
                    matrix[(n1, n2)] = min(
                        self.nodes[n1].network_bandwidth_gbps,
                        self.nodes[n2].network_bandwidth_gbps
                    ) * 0.5
        return matrix

    def _same_rack(self, node1: str, node2: str) -> bool:
        """Check if two nodes are in the same rack"""
        r1 = self.nodes[node1].rack_id or 'default'
        r2 = self.nodes[node2].rack_id or 'default'
        return r1 == r2

    def get_bandwidth(self, src: str, dst: str) -> float:
        """Get bandwidth between two nodes in Gbps"""
        return self.bandwidth_matrix.get((src, dst), 1.0)
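

# A minimal usage sketch of NetworkTopology (hypothetical node specs, for
# illustration only): same-rack bandwidth is the slower node's link, and
# cross-rack links take the 50% penalty encoded in _build_bandwidth_matrix.
def _demo_network_topology() -> None:
    nodes = [
        NodeInfo("a", "a.local", 8, 32.0, 10.0, "ssd", rack_id="rack0"),
        NodeInfo("b", "b.local", 8, 32.0, 10.0, "ssd", rack_id="rack0"),
        NodeInfo("c", "c.local", 8, 32.0, 10.0, "ssd", rack_id="rack1"),
    ]
    topo = NetworkTopology(nodes)
    assert topo.get_bandwidth("a", "b") == 10.0  # same rack: min of node links
    assert topo.get_bandwidth("a", "c") == 5.0   # cross rack: 50% penalty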


class CostModel:
    """Cost model for shuffle operations"""

    def __init__(self, topology: NetworkTopology):
        self.topology = topology
        self.hierarchy = MemoryHierarchy.detect_system()

    def estimate_shuffle_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate shuffle execution time"""
        # Network transfer time
        network_time = self._estimate_network_time(task, plan)
        # Disk I/O time (if spilling)
        io_time = self._estimate_io_time(task, plan)
        # CPU time (serialization, compression)
        cpu_time = self._estimate_cpu_time(task, plan)
        # Network and disk I/O can overlap, so take their max; CPU work mostly
        # overlaps too, so only a small fraction lands on the critical path
        return max(network_time, io_time) + cpu_time * 0.1

    def _avg_bandwidth_bytes_per_sec(self) -> float:
        """Average inter-node bandwidth in bytes/s (self-links excluded)"""
        links = [bw for (src, dst), bw in self.topology.bandwidth_matrix.items()
                 if src != dst]
        if not links:
            return float('inf')  # Single node: nothing crosses the network
        # Convert Gbps to bytes/s (8 bits per byte)
        return np.mean(links) * 1e9 / 8

    def _estimate_network_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate network transfer time"""
        total_bytes = task.data_size_gb * 1e9
        avg_bandwidth = self._avg_bandwidth_bytes_per_sec()
        if plan.strategy == ShuffleStrategy.ALL_TO_ALL:
            # Every partition is exchanged with every node
            return total_bytes / avg_bandwidth
        elif plan.strategy == ShuffleStrategy.TREE_AGGREGATE:
            # log2(n) levels in the tree; the model spreads the data evenly
            # across levels, so the total transferred equals the input size
            num_nodes = len(self.topology.nodes)
            tree_height = max(1.0, np.log2(num_nodes))
            bytes_per_level = total_bytes / tree_height
            return tree_height * bytes_per_level / avg_bandwidth
        else:
            # Hash/range partition: each byte travels to exactly one node
            return total_bytes / avg_bandwidth

    def _estimate_io_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate disk I/O time if spilling"""
        total_time = 0.0
        for node_id, buffer_size in plan.buffer_sizes.items():
            node = self.topology.nodes[node_id]
            # Data arriving at this node beyond its buffer must spill to disk
            node_data = task.data_size_gb * 1e9 / len(self.topology.nodes)
            if node_data > buffer_size:
                spill_amount = node_data - buffer_size
                # Assume 500MB/s for SSD, 200MB/s for HDD
                io_speed = 500e6 if node.storage_type == 'ssd' else 200e6
                total_time += spill_amount / io_speed
        return total_time

    def _estimate_cpu_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate CPU time for serialization and compression"""
        total_cores = sum(n.cpu_cores for n in self.topology.nodes.values())
        # Serialization cost
        serialize_rate = 1e9  # 1GB/s per core
        serialize_time = task.data_size_gb * 1e9 / (serialize_rate * total_cores)
        # Compression cost
        if plan.compression != CompressionType.NONE:
            if plan.compression == CompressionType.ZLIB:
                compress_rate = 100e6  # 100MB/s per core
            elif plan.compression == CompressionType.SNAPPY:
                compress_rate = 500e6  # 500MB/s per core
            else:  # LZ4
                compress_rate = 1e9  # 1GB/s per core
            compress_time = task.data_size_gb * 1e9 / (compress_rate * total_cores)
        else:
            compress_time = 0.0
        return serialize_time + compress_time
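

# Worked example of the cost arithmetic above (assumed round numbers): a 10 GB
# hash-partition shuffle over links averaging 5 Gbps (= 0.625 GB/s) gives a
# network term of 10e9 / 0.625e9 = 16 s. With snappy at the assumed 500 MB/s
# per core across 64 total cores, compression adds 10e9 / (500e6 * 64) ≈ 0.31 s
# of CPU time, of which estimate_shuffle_time charges 10% to the critical path.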


class ShuffleOptimizer:
    """Main distributed shuffle optimizer"""

    def __init__(self, nodes: List[NodeInfo], memory_limit_fraction: float = 0.5):
        self.topology = NetworkTopology(nodes)
        self.cost_model = CostModel(self.topology)
        self.memory_limit_fraction = memory_limit_fraction
        self.sqrt_calc = SqrtNCalculator()

    def optimize_shuffle(self, task: ShuffleTask) -> ShufflePlan:
        """Generate optimized shuffle plan"""
        # Choose strategy based on task characteristics
        strategy = self._choose_strategy(task)
        # Calculate buffer sizes using √n principle
        buffer_sizes = self._calculate_buffer_sizes(task)
        # Determine spill thresholds
        spill_thresholds = self._calculate_spill_thresholds(task, buffer_sizes)
        # Build aggregation tree if needed
        aggregation_tree = None
        if strategy == ShuffleStrategy.TREE_AGGREGATE:
            aggregation_tree = self._build_aggregation_tree()
        # Choose compression
        compression = self._choose_compression(task)
        # Assign partitions to nodes
        partition_assignment = self._assign_partitions(task, strategy)
        # Assemble the plan, then fill in performance estimates
        plan = ShufflePlan(
            strategy=strategy,
            buffer_sizes=buffer_sizes,
            spill_thresholds=spill_thresholds,
            aggregation_tree=aggregation_tree,
            compression=compression,
            partition_assignment=partition_assignment,
            estimated_time=0.0,
            estimated_network_usage=0.0,
            memory_usage={},
            explanation=""
        )
        plan.estimated_time = self.cost_model.estimate_shuffle_time(task, plan)
        plan.estimated_network_usage = self._estimate_network_usage(task, plan)
        plan.memory_usage = self._estimate_memory_usage(task, plan)
        # Generate explanation
        plan.explanation = self._generate_explanation(task, plan)
        return plan

    def _choose_strategy(self, task: ShuffleTask) -> ShuffleStrategy:
        """Choose shuffle strategy based on task characteristics"""
        # Small data: all-to-all is fine
        if task.data_size_gb < 1:
            return ShuffleStrategy.ALL_TO_ALL
        # Has combiner: use combining strategy
        if task.combiner_function:
            return ShuffleStrategy.COMBINER_BASED
        # Many nodes: use tree aggregation
        if len(self.topology.nodes) > 10:
            return ShuffleStrategy.TREE_AGGREGATE
        # Skewed data: use range partitioning
        if task.key_distribution == 'skewed':
            return ShuffleStrategy.RANGE_PARTITION
        # Default: hash partitioning
        return ShuffleStrategy.HASH_PARTITION

    def _calculate_buffer_sizes(self, task: ShuffleTask) -> Dict[str, int]:
        """Calculate optimal buffer sizes using √n principle"""
        buffer_sizes = {}
        for node_id, node in self.topology.nodes.items():
            # Available memory for shuffle
            available_memory = node.memory_gb * 1e9 * self.memory_limit_fraction
            # Data size per node
            data_per_node = task.data_size_gb * 1e9 / len(self.topology.nodes)
            if data_per_node <= available_memory:
                # Everything fits in memory: buffer the full partition
                buffer_size = int(data_per_node)
            else:
                # Otherwise buffer √n records (times the average value size)
                sqrt_buffer = self.sqrt_calc.calculate_interval(
                    int(data_per_node / task.value_size_avg)
                ) * task.value_size_avg
                buffer_size = min(int(sqrt_buffer), int(available_memory))
            buffer_sizes[node_id] = buffer_size
        return buffer_sizes
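
    # Worked example of the √n sizing above (illustrative numbers, assuming
    # calculate_interval(n) returns ≈ √n): 16 GB landing on one node as
    # 100-byte records is n = 160M records, so the buffer holds
    # √(160e6) ≈ 12,650 records ≈ 1.3 MB instead of 16 GB, trading extra
    # spill passes for a buffer roughly four orders of magnitude smaller.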

    def _calculate_spill_thresholds(self, task: ShuffleTask,
                                    buffer_sizes: Dict[str, int]) -> Dict[str, float]:
        """Calculate memory thresholds for spilling"""
        thresholds = {}
        for node_id, buffer_size in buffer_sizes.items():
            # Spill at 80% of buffer to leave headroom
            thresholds[node_id] = buffer_size * 0.8
        return thresholds

    def _build_aggregation_tree(self) -> Dict[str, List[str]]:
        """Build √n-height aggregation tree"""
        nodes = list(self.topology.nodes.keys())
        n = len(nodes)
        # Pick a branching factor so the tree is about √n levels tall
        height = max(1, int(np.sqrt(n)))
        branching_factor = int(np.ceil(n ** (1 / height)))
        tree = {}
        # Build tree level by level
        current_level = nodes[:]
        while len(current_level) > 1:
            next_level = []
            for i in range(0, len(current_level), branching_factor):
                # Group nodes
                group = current_level[i:i + branching_factor]
                if len(group) > 1:
                    parent = group[0]  # First node acts as parent
                    # A node can be chosen as parent on several levels, so
                    # append rather than overwrite its child list
                    tree.setdefault(parent, []).extend(group[1:])
                    next_level.append(parent)
                elif group:
                    next_level.append(group[0])
            current_level = next_level
        return tree
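
    # Example of the construction above for a 16-node cluster: height =
    # √16 = 4 and branching_factor = 16^(1/4) = 2, so each round pairs up
    # survivors (16 -> 8 -> 4 -> 2 -> 1) and node0 ends up with children
    # [node1, node2, node4, node8]: four pairwise merge rounds instead of a
    # single 15-way fan-in at one root.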

    def _choose_compression(self, task: ShuffleTask) -> CompressionType:
        """Choose compression based on data characteristics and network"""
        # Average network bandwidth
        avg_bandwidth = np.mean([
            n.network_bandwidth_gbps for n in self.topology.nodes.values()
        ])
        # High bandwidth (10+ Gbps): network is not the bottleneck, skip compression
        if avg_bandwidth > 10:
            return CompressionType.NONE
        # Large values compress well: use the higher-ratio codec
        if task.value_size_avg > 1000:
            return CompressionType.ZLIB
        # Medium bandwidth (1-10 Gbps): balanced speed and ratio
        if avg_bandwidth > 1:
            return CompressionType.SNAPPY
        # Low bandwidth (<1 Gbps): the network dominates, so spend CPU on
        # the best compression ratio
        return CompressionType.ZLIB

    def _assign_partitions(self, task: ShuffleTask,
                           strategy: ShuffleStrategy) -> Dict[int, str]:
        """Assign partitions to nodes"""
        nodes = list(self.topology.nodes.keys())
        assignment = {}
        if strategy == ShuffleStrategy.HASH_PARTITION:
            # Round-robin assignment
            for i in range(task.output_partitions):
                assignment[i] = nodes[i % len(nodes)]
        elif strategy == ShuffleStrategy.RANGE_PARTITION:
            # Assign contiguous partition ranges to nodes
            partitions_per_node = task.output_partitions // len(nodes)
            for i, node in enumerate(nodes):
                start = i * partitions_per_node
                end = start + partitions_per_node
                if i == len(nodes) - 1:
                    end = task.output_partitions  # Last node takes the remainder
                for p in range(start, end):
                    assignment[p] = node
        else:
            # Default: even round-robin distribution
            for i in range(task.output_partitions):
                assignment[i] = nodes[i % len(nodes)]
        return assignment

    def _estimate_network_usage(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate total network bytes"""
        base_bytes = task.data_size_gb * 1e9
        # Apply compression ratio
        if plan.compression == CompressionType.ZLIB:
            base_bytes *= 0.3  # ~70% reduction
        elif plan.compression == CompressionType.SNAPPY:
            base_bytes *= 0.5  # ~50% reduction
        elif plan.compression == CompressionType.LZ4:
            base_bytes *= 0.7  # ~30% reduction
        # Apply strategy multiplier
        if plan.strategy == ShuffleStrategy.ALL_TO_ALL:
            n = len(self.topology.nodes)
            base_bytes *= (n - 1) / n  # Each node sends to the other n-1 nodes
        elif plan.strategy == ShuffleStrategy.TREE_AGGREGATE:
            # Data crosses the network once per tree level, ~log2(n) times
            base_bytes *= np.log2(len(self.topology.nodes))
        return base_bytes

    def _estimate_memory_usage(self, task: ShuffleTask, plan: ShufflePlan) -> Dict[str, float]:
        """Estimate memory usage per node"""
        memory_usage = {}
        for node_id in self.topology.nodes:
            # Buffer memory
            buffer_mem = plan.buffer_sizes[node_id]
            # Overhead (metadata, indices)
            overhead = buffer_mem * 0.1
            # Compression buffers if used
            compress_mem = 0
            if plan.compression != CompressionType.NONE:
                compress_mem = min(buffer_mem * 0.1, 100 * 1024 * 1024)  # Cap at 100MB
            memory_usage[node_id] = buffer_mem + overhead + compress_mem
        return memory_usage

    def _generate_explanation(self, task: ShuffleTask, plan: ShufflePlan) -> str:
        """Generate human-readable explanation"""
        explanations = []
        # Strategy explanation
        strategy_reasons = {
            ShuffleStrategy.ALL_TO_ALL: "small data size allows full exchange",
            ShuffleStrategy.TREE_AGGREGATE: f"√n-height tree reduces network hops to {int(np.sqrt(len(self.topology.nodes)))}",
            ShuffleStrategy.HASH_PARTITION: "uniform data distribution suits hash partitioning",
            ShuffleStrategy.RANGE_PARTITION: "skewed data benefits from range partitioning",
            ShuffleStrategy.COMBINER_BASED: "combiner function enables local aggregation"
        }
        explanations.append(
            f"Using {plan.strategy.value} strategy because {strategy_reasons[plan.strategy]}."
        )
        # Buffer sizing
        avg_buffer_mb = np.mean(list(plan.buffer_sizes.values())) / 1e6
        explanations.append(
            f"Allocated {avg_buffer_mb:.0f}MB buffers per node using √n principle "
            f"to balance memory usage and I/O."
        )
        # Compression (reduction factors match _estimate_network_usage, so the
        # figure is not distorted by the strategy multiplier)
        if plan.compression != CompressionType.NONE:
            reduction = {
                CompressionType.ZLIB: 0.7,
                CompressionType.SNAPPY: 0.5,
                CompressionType.LZ4: 0.3,
            }[plan.compression]
            explanations.append(
                f"Applied {plan.compression.value} compression to reduce network "
                f"traffic by ~{reduction * 100:.0f}%."
            )
        # Performance estimate
        explanations.append(
            f"Estimated completion time: {plan.estimated_time:.1f}s with "
            f"{plan.estimated_network_usage / 1e9:.1f}GB network transfer."
        )
        return " ".join(explanations)

    def execute_shuffle(self, task: ShuffleTask, plan: ShufflePlan) -> ShuffleMetrics:
        """Simulate shuffle execution (for testing)"""
        start_time = time.time()
        # Simulate execution
        time.sleep(0.1)  # Stand-in for real work
        # Calculate metrics
        metrics = ShuffleMetrics(
            total_time=time.time() - start_time,
            network_bytes=int(plan.estimated_network_usage),
            disk_spills=sum(1 for b in plan.buffer_sizes.values()
                            if b < task.data_size_gb * 1e9 / len(self.topology.nodes)),
            memory_peak=int(max(plan.memory_usage.values())),
            compression_ratio=1.0,
            skew_factor=1.0
        )
        if plan.compression == CompressionType.ZLIB:
            metrics.compression_ratio = 3.3
        elif plan.compression == CompressionType.SNAPPY:
            metrics.compression_ratio = 2.0
        elif plan.compression == CompressionType.LZ4:
            metrics.compression_ratio = 1.4
        return metrics
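

# Minimal end-to-end sketch (hypothetical task parameters) showing the intended
# call sequence: build a cluster, plan the shuffle, then simulate execution.
# It mirrors the __main__ examples below.
def _demo_optimizer() -> None:
    task = ShuffleTask(
        task_id="demo",
        input_partitions=64,
        output_partitions=64,
        data_size_gb=8.0,
        key_distribution='uniform',
        value_size_avg=256,
    )
    optimizer = ShuffleOptimizer(create_test_cluster(4))
    plan = optimizer.optimize_shuffle(task)
    metrics = optimizer.execute_shuffle(task, plan)
    print(plan.explanation)
    print(f"Simulated run: {metrics.total_time:.3f}s, "
          f"{metrics.network_bytes / 1e6:.1f}MB over the network")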


def create_test_cluster(num_nodes: int = 4) -> List[NodeInfo]:
    """Create a test cluster configuration"""
    nodes = []
    for i in range(num_nodes):
        node = NodeInfo(
            node_id=f"node{i}",
            hostname=f"worker{i}.cluster.local",
            cpu_cores=16,
            memory_gb=64,
            network_bandwidth_gbps=10.0,
            storage_type='ssd',
            rack_id=f"rack{i // 2}"  # 2 nodes per rack
        )
        nodes.append(node)
    return nodes


# Example usage
if __name__ == "__main__":
    print("Distributed Shuffle Optimizer Example")
    print("=" * 60)

    # Create test cluster
    nodes = create_test_cluster(4)
    optimizer = ShuffleOptimizer(nodes)

    # Example 1: Small uniform shuffle
    print("\nExample 1: Small uniform shuffle")
    task1 = ShuffleTask(
        task_id="shuffle_1",
        input_partitions=100,
        output_partitions=100,
        data_size_gb=0.5,
        key_distribution='uniform',
        value_size_avg=100
    )
    plan1 = optimizer.optimize_shuffle(task1)
    print(f"Strategy: {plan1.strategy.value}")
    print(f"Compression: {plan1.compression.value}")
    print(f"Estimated time: {plan1.estimated_time:.2f}s")
    print(f"Explanation: {plan1.explanation}")

    # Example 2: Large skewed shuffle
    print("\n\nExample 2: Large skewed shuffle")
    task2 = ShuffleTask(
        task_id="shuffle_2",
        input_partitions=1000,
        output_partitions=500,
        data_size_gb=100,
        key_distribution='skewed',
        value_size_avg=1000,
        combiner_function='sum'
    )
    plan2 = optimizer.optimize_shuffle(task2)
    print(f"Strategy: {plan2.strategy.value}")
    print(f"Buffer sizes: {list(plan2.buffer_sizes.values())[0] / 1e9:.1f}GB per node")
    print(f"Network usage: {plan2.estimated_network_usage / 1e9:.1f}GB")
    print(f"Explanation: {plan2.explanation}")

    # Example 3: Many nodes with aggregation
    print("\n\nExample 3: Many nodes with tree aggregation")
    large_cluster = create_test_cluster(16)
    large_optimizer = ShuffleOptimizer(large_cluster)
    task3 = ShuffleTask(
        task_id="shuffle_3",
        input_partitions=10000,
        output_partitions=16,
        data_size_gb=50,
        key_distribution='uniform',
        value_size_avg=200,
        combiner_function='collect'
    )
    plan3 = large_optimizer.optimize_shuffle(task3)
    print(f"Strategy: {plan3.strategy.value}")
    if plan3.aggregation_tree:
        print(f"Tree height: {int(np.sqrt(len(large_cluster)))}")
        print(f"Tree structure sample: {list(plan3.aggregation_tree.items())[:3]}")
    print(f"Explanation: {plan3.explanation}")

    # Simulate execution
    print("\n\nSimulating shuffle execution...")
    metrics = optimizer.execute_shuffle(task1, plan1)
    print(f"Execution time: {metrics.total_time:.3f}s")
    print(f"Network bytes: {metrics.network_bytes / 1e6:.1f}MB")
    print(f"Compression ratio: {metrics.compression_ratio:.1f}x")