sqrtspace-dotnet/src/SqrtSpace.SpaceTime.Pipeline/SpaceTimePipeline.cs
2025-07-20 03:41:39 -04:00

491 lines
16 KiB
C#

using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Threading;
using System.Threading.Channels;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using SqrtSpace.SpaceTime.Core;
namespace SqrtSpace.SpaceTime.Pipeline;
/// <summary>
/// Memory-efficient data pipeline with √n buffering.
/// </summary>
/// <remarks>
/// Stages are connected by bounded channels. A stage whose configured buffer size is 0
/// gets a capacity computed by <c>SpaceTimeCalculator.CalculateSqrtInterval</c> from
/// <see cref="PipelineConfiguration.ExpectedItemCount"/>. Executions are serialized by an
/// internal semaphore, so at most one run is active at a time.
/// </remarks>
/// <typeparam name="TInput">Type of the items fed into the first channel.</typeparam>
/// <typeparam name="TOutput">Type the items read from the last channel are cast to.</typeparam>
public class SpaceTimePipeline<TInput, TOutput> : ISpaceTimePipeline<TInput, TOutput>
{
    private readonly List<IPipelineStage> _stages;
    private readonly ILogger<SpaceTimePipeline<TInput, TOutput>> _logger;
    private readonly PipelineConfiguration _configuration;
    private readonly CancellationTokenSource _cancellationTokenSource;
    private readonly SemaphoreSlim _executionLock;
    private PipelineState _state;
    private bool _disposed;

    /// <summary>Name given to the pipeline at construction.</summary>
    public string Name { get; }

    /// <summary>Current lifecycle state of the pipeline.</summary>
    public PipelineState State => _state;

    /// <summary>
    /// Creates an empty pipeline in the <see cref="PipelineState.Created"/> state.
    /// </summary>
    /// <param name="name">Pipeline name.</param>
    /// <param name="logger">Logger used by the pipeline and handed to every stage it creates.</param>
    /// <param name="configuration">Optional settings; a default instance is used when null.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="name"/> or <paramref name="logger"/> is null.
    /// </exception>
    public SpaceTimePipeline(
        string name,
        ILogger<SpaceTimePipeline<TInput, TOutput>> logger,
        PipelineConfiguration? configuration = null)
    {
        Name = name ?? throw new ArgumentNullException(nameof(name));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _configuration = configuration ?? new PipelineConfiguration();
        _stages = new List<IPipelineStage>();
        _cancellationTokenSource = new CancellationTokenSource();
        _executionLock = new SemaphoreSlim(1, 1);
        _state = PipelineState.Created;
    }

    /// <summary>Appends a stage that transforms items one at a time.</summary>
    /// <param name="stageName">Name of the stage.</param>
    /// <param name="transform">Asynchronous per-item transform.</param>
    /// <param name="configuration">Optional stage settings; defaults are used when null.</param>
    /// <returns>This pipeline, to allow chaining.</returns>
    /// <exception cref="InvalidOperationException">The pipeline has already been started.</exception>
    public ISpaceTimePipeline<TInput, TOutput> AddStage<TStageInput, TStageOutput>(
        string stageName,
        Func<TStageInput, CancellationToken, Task<TStageOutput>> transform,
        StageConfiguration? configuration = null)
    {
        EnsureNotStarted();
        _stages.Add(new TransformStage<TStageInput, TStageOutput>(
            stageName,
            transform,
            configuration ?? new StageConfiguration(),
            _logger));
        return this;
    }

    /// <summary>Appends a stage that transforms items in batches.</summary>
    /// <param name="stageName">Name of the stage.</param>
    /// <param name="batchTransform">Asynchronous transform applied to a batch of items.</param>
    /// <param name="configuration">Optional stage settings; defaults are used when null.</param>
    /// <returns>This pipeline, to allow chaining.</returns>
    /// <exception cref="InvalidOperationException">The pipeline has already been started.</exception>
    public ISpaceTimePipeline<TInput, TOutput> AddBatchStage<TStageInput, TStageOutput>(
        string stageName,
        Func<IReadOnlyList<TStageInput>, CancellationToken, Task<IEnumerable<TStageOutput>>> batchTransform,
        StageConfiguration? configuration = null)
    {
        EnsureNotStarted();
        _stages.Add(new BatchTransformStage<TStageInput, TStageOutput>(
            stageName,
            batchTransform,
            configuration ?? new StageConfiguration(),
            _logger));
        return this;
    }

    /// <summary>Appends a stage built from a predicate over the items.</summary>
    /// <param name="stageName">Name of the stage.</param>
    /// <param name="predicate">Predicate handed to the filter stage.</param>
    /// <param name="configuration">Optional stage settings; defaults are used when null.</param>
    /// <returns>This pipeline, to allow chaining.</returns>
    /// <exception cref="InvalidOperationException">The pipeline has already been started.</exception>
    public ISpaceTimePipeline<TInput, TOutput> AddFilterStage<T>(
        string stageName,
        Func<T, bool> predicate,
        StageConfiguration? configuration = null)
    {
        EnsureNotStarted();
        _stages.Add(new FilterStage<T>(
            stageName,
            predicate,
            configuration ?? new StageConfiguration(),
            _logger));
        return this;
    }

    /// <summary>Appends a checkpoint stage backed by the given checkpoint manager.</summary>
    /// <param name="stageName">Name of the stage.</param>
    /// <param name="checkpointManager">Checkpoint manager handed to the stage.</param>
    /// <param name="configuration">Optional stage settings; defaults are used when null.</param>
    /// <returns>This pipeline, to allow chaining.</returns>
    /// <exception cref="InvalidOperationException">The pipeline has already been started.</exception>
    public ISpaceTimePipeline<TInput, TOutput> AddCheckpointStage<T>(
        string stageName,
        ICheckpointManager checkpointManager,
        StageConfiguration? configuration = null)
    {
        EnsureNotStarted();
        _stages.Add(new CheckpointStage<T>(
            stageName,
            checkpointManager,
            configuration ?? new StageConfiguration(),
            _logger));
        return this;
    }

    /// <summary>Runs the pipeline over a single input item.</summary>
    /// <param name="input">The item to process.</param>
    /// <param name="cancellationToken">Token that cancels the run.</param>
    /// <returns>The result of the run; failures are reported through the result object.</returns>
    public async Task<PipelineResult<TOutput>> ExecuteAsync(
        TInput input,
        CancellationToken cancellationToken = default)
    {
        return await ExecuteAsync(new[] { input }, cancellationToken);
    }

    /// <summary>
    /// Runs the pipeline over a sequence of inputs and gathers every output in memory.
    /// </summary>
    /// <remarks>
    /// Inputs are fed, stages run, and outputs are drained concurrently. Every channel is
    /// bounded, so draining only after the stages finish would block forever as soon as the
    /// outputs no longer fit in the buffers. When any participant fails, the run is cancelled
    /// so writers blocked on a full channel are released, and the first failure is reported.
    /// Exceptions raised during the run (including cancellation) are captured in the returned
    /// result rather than thrown; only waiting for the execution lock can throw.
    /// </remarks>
    /// <param name="inputs">Items to process.</param>
    /// <param name="cancellationToken">Token that cancels the run.</param>
    /// <returns>Outputs, success flag, duration and, on failure, the captured exception.</returns>
    public async Task<PipelineResult<TOutput>> ExecuteAsync(
        IEnumerable<TInput> inputs,
        CancellationToken cancellationToken = default)
    {
        await _executionLock.WaitAsync(cancellationToken);
        try
        {
            _state = PipelineState.Running;
            var startTime = DateTime.UtcNow;
            var result = new PipelineResult<TOutput>();

            // The run stops when either the caller's token or the pipeline's own token fires.
            using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(
                cancellationToken,
                _cancellationTokenSource.Token);

            var context = CreateExecutionContext(startTime, linkedCts.Token);

            try
            {
                var channels = BuildStageChannels();
                var pipelineTasks = StartStageProcessors(channels, context);

                // Start draining before feeding so the bounded output channel can never
                // stall the last stage while nobody is reading from it.
                var collectorTask = CollectOutputsAsync(channels[^1].Reader, linkedCts.Token);
                var feederTask = FeedInputsAsync(inputs, channels[0].Writer, context);
                pipelineTasks.Add(feederTask);
                pipelineTasks.Add(collectorTask);

                await AwaitAllCancellingOnFailureAsync(pipelineTasks, linkedCts);

                var outputs = await collectorTask;
                result.Outputs = outputs;
                result.Success = true;
                result.ProcessedCount = outputs.Count;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Pipeline execution failed");
                result.Success = false;
                result.Error = ex;
            }

            result.Duration = DateTime.UtcNow - startTime;
            _state = result.Success ? PipelineState.Completed : PipelineState.Failed;
            return result;
        }
        finally
        {
            _executionLock.Release();
        }
    }

    /// <summary>
    /// Runs the pipeline over an asynchronous input stream and yields outputs as they arrive.
    /// </summary>
    /// <remarks>
    /// If enumeration ends without the run completing (an exception, cancellation, or the
    /// consumer abandoning the enumerator), the state becomes
    /// <see cref="PipelineState.Failed"/> and the run's token is cancelled so background
    /// stage and feeder tasks are signalled to stop instead of waiting on full channels.
    /// </remarks>
    /// <param name="inputs">Asynchronous source of input items.</param>
    /// <param name="cancellationToken">Token that cancels the run.</param>
    /// <returns>An asynchronous stream of outputs.</returns>
    public async IAsyncEnumerable<TOutput> ExecuteStreamingAsync(
        IAsyncEnumerable<TInput> inputs,
        [EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        await _executionLock.WaitAsync(cancellationToken);
        try
        {
            _state = PipelineState.Running;

            using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(
                cancellationToken,
                _cancellationTokenSource.Token);

            var completed = false;
            try
            {
                var context = CreateExecutionContext(DateTime.UtcNow, linkedCts.Token);
                var channels = BuildStageChannels();
                var backgroundTasks = StartStageProcessors(channels, context);
                var inputWriter = channels[0].Writer;

                // No token is passed to Task.Run: the delegate must always run so that its
                // finally block completes the first channel even when cancellation wins the race.
                backgroundTasks.Add(Task.Run(async () =>
                {
                    try
                    {
                        await foreach (var input in inputs.WithCancellation(linkedCts.Token))
                        {
                            await inputWriter.WriteAsync(input!, linkedCts.Token);
                        }
                    }
                    finally
                    {
                        inputWriter.TryComplete();
                    }
                }));

                await foreach (var output in channels[^1].Reader.ReadAllAsync(linkedCts.Token))
                {
                    yield return (TOutput)(object)output;
                }

                // The output channel is closed; surface any stage or feeder failure and make
                // sure nothing stays blocked upstream of a failed stage.
                await AwaitAllCancellingOnFailureAsync(backgroundTasks, linkedCts);
                completed = true;
            }
            finally
            {
                if (completed)
                {
                    _state = PipelineState.Completed;
                }
                else
                {
                    _state = PipelineState.Failed;
                    linkedCts.Cancel();
                }
            }
        }
        finally
        {
            _executionLock.Release();
        }
    }

    /// <summary>
    /// Collects statistics from every stage and aggregates them.
    /// </summary>
    /// <returns>
    /// Per-stage statistics plus totals of processed items and errors; the average latency is
    /// the mean of the per-stage average latencies, or zero when there are no stages.
    /// </returns>
    public async Task<PipelineStatistics> GetStatisticsAsync()
    {
        var stats = new PipelineStatistics
        {
            PipelineName = Name,
            State = _state,
            StageCount = _stages.Count,
            StageStatistics = new List<StageStatistics>(_stages.Count)
        };

        foreach (var stage in _stages)
        {
            stats.StageStatistics.Add(await stage.GetStatisticsAsync());
        }

        stats.TotalItemsProcessed = stats.StageStatistics.Sum(s => s.ItemsProcessed);
        stats.TotalErrors = stats.StageStatistics.Sum(s => s.Errors);
        stats.AverageLatency = stats.StageStatistics.Count > 0
            ? TimeSpan.FromMilliseconds(stats.StageStatistics.Average(s => s.AverageLatency.TotalMilliseconds))
            : TimeSpan.Zero;

        return stats;
    }

    /// <summary>Throws when stages can no longer be added because a run has started.</summary>
    private void EnsureNotStarted()
    {
        if (_state != PipelineState.Created)
        {
            throw new InvalidOperationException("Cannot add stages after pipeline has started");
        }
    }

    /// <summary>Builds the execution context shared by all stages of one run.</summary>
    private PipelineExecutionContext CreateExecutionContext(DateTime startTime, CancellationToken token)
    {
        return new PipelineExecutionContext
        {
            PipelineName = Name,
            ExecutionId = Guid.NewGuid().ToString(),
            StartTime = startTime,
            Configuration = _configuration,
            CancellationToken = token
        };
    }

    /// <summary>
    /// Creates one bounded channel in front of every stage plus a final output channel.
    /// </summary>
    private List<Channel<object>> BuildStageChannels()
    {
        var channels = new List<Channel<object>>(_stages.Count + 1);

        for (int i = 0; i <= _stages.Count; i++)
        {
            var bufferSize = i < _stages.Count
                ? _stages[i].Configuration.BufferSize
                : _configuration.OutputBufferSize;

            // Use √n buffering if not specified. A bounded channel needs a capacity of at
            // least 1, so guard against a computed interval of 0.
            if (bufferSize == 0)
            {
                bufferSize = Math.Max(
                    1,
                    SpaceTimeCalculator.CalculateSqrtInterval(_configuration.ExpectedItemCount));
            }

            channels.Add(Channel.CreateBounded<object>(new BoundedChannelOptions(bufferSize)
            {
                FullMode = BoundedChannelFullMode.Wait,
                SingleWriter = false,
                SingleReader = false
            }));
        }

        return channels;
    }

    /// <summary>
    /// Starts one background task per stage, wiring channel <c>i</c> as its input and
    /// channel <c>i + 1</c> as its output.
    /// </summary>
    private List<Task> StartStageProcessors(
        List<Channel<object>> channels,
        PipelineExecutionContext context)
    {
        var tasks = new List<Task>(_stages.Count + 2);

        for (int i = 0; i < _stages.Count; i++)
        {
            var stage = _stages[i];
            var inputChannel = channels[i];
            var outputChannel = channels[i + 1];

            // No token is passed to Task.Run: if the token were already cancelled the
            // delegate would never run and the output channel would never be completed.
            tasks.Add(Task.Run(async () =>
            {
                try
                {
                    await stage.ProcessAsync(
                        inputChannel.Reader,
                        outputChannel.Writer,
                        context);
                }
                catch (Exception ex) when (
                    !(ex is OperationCanceledException && context.CancellationToken.IsCancellationRequested))
                {
                    // Cancellation of the run is not a stage failure, so it is not logged as one.
                    _logger.LogError(ex, "Stage {StageName} failed", stage.Name);
                    throw;
                }
                finally
                {
                    // TryComplete never throws, so it cannot mask the exception in flight.
                    outputChannel.Writer.TryComplete();
                }
            }));
        }

        return tasks;
    }

    /// <summary>
    /// Writes every input into <paramref name="writer"/> and completes it afterwards,
    /// stopping early once cancellation is requested.
    /// </summary>
    private async Task FeedInputsAsync<T>(
        IEnumerable<T> inputs,
        ChannelWriter<object> writer,
        PipelineExecutionContext context)
    {
        try
        {
            foreach (var input in inputs)
            {
                if (context.CancellationToken.IsCancellationRequested)
                {
                    break;
                }

                await writer.WriteAsync(input!, context.CancellationToken);
            }
        }
        finally
        {
            writer.TryComplete();
        }
    }

    /// <summary>Drains <paramref name="reader"/> into a list, casting each item to the output type.</summary>
    private static async Task<List<TOutput>> CollectOutputsAsync(
        ChannelReader<object> reader,
        CancellationToken token)
    {
        var outputs = new List<TOutput>();
        await foreach (var item in reader.ReadAllAsync(token))
        {
            outputs.Add((TOutput)(object)item);
        }

        return outputs;
    }

    /// <summary>
    /// Waits for every task to finish. On the first faulted or cancelled task the run is
    /// cancelled, and that first failure is rethrown once all tasks have stopped.
    /// </summary>
    private static async Task AwaitAllCancellingOnFailureAsync(
        IReadOnlyCollection<Task> tasks,
        CancellationTokenSource cancellation)
    {
        var pending = new List<Task>(tasks);
        Task? firstFailure = null;

        while (pending.Count > 0)
        {
            var finished = await Task.WhenAny(pending);
            pending.Remove(finished);

            if (firstFailure is null && (finished.IsFaulted || finished.IsCanceled))
            {
                firstFailure = finished;
                cancellation.Cancel();
            }
        }

        if (firstFailure is not null)
        {
            // Awaiting the task rethrows its original exception.
            await firstFailure;
        }
    }

    /// <summary>
    /// Cancels any run in progress and releases the pipeline's resources, including stages
    /// that implement <see cref="IDisposable"/>. Calling it more than once is a no-op.
    /// </summary>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;

        _cancellationTokenSource.Cancel();
        _cancellationTokenSource.Dispose();
        _executionLock.Dispose();

        foreach (var stage in _stages.OfType<IDisposable>())
        {
            stage.Dispose();
        }
    }
}
// Interfaces and supporting classes
/// <summary>
/// Contract for a staged pipeline that turns <typeparamref name="TInput"/> items into
/// <typeparamref name="TOutput"/> items.
/// </summary>
public interface ISpaceTimePipeline<TInput, TOutput> : IDisposable
{
    /// <summary>Name of the pipeline.</summary>
    string Name { get; }

    /// <summary>Current lifecycle state.</summary>
    PipelineState State { get; }

    /// <summary>Appends a per-item asynchronous transform stage and returns the pipeline.</summary>
    ISpaceTimePipeline<TInput, TOutput> AddStage<TStageInput, TStageOutput>(
        string stageName,
        Func<TStageInput, CancellationToken, Task<TStageOutput>> transform,
        StageConfiguration? configuration = null);

    /// <summary>Appends a batch transform stage and returns the pipeline.</summary>
    ISpaceTimePipeline<TInput, TOutput> AddBatchStage<TStageInput, TStageOutput>(
        string stageName,
        Func<IReadOnlyList<TStageInput>, CancellationToken, Task<IEnumerable<TStageOutput>>> batchTransform,
        StageConfiguration? configuration = null);

    /// <summary>Appends a predicate-based filter stage and returns the pipeline.</summary>
    ISpaceTimePipeline<TInput, TOutput> AddFilterStage<T>(
        string stageName, Func<T, bool> predicate, StageConfiguration? configuration = null);

    /// <summary>Appends a checkpoint stage backed by a checkpoint manager and returns the pipeline.</summary>
    ISpaceTimePipeline<TInput, TOutput> AddCheckpointStage<T>(
        string stageName, ICheckpointManager checkpointManager, StageConfiguration? configuration = null);

    /// <summary>Runs the pipeline for a single input item.</summary>
    Task<PipelineResult<TOutput>> ExecuteAsync(
        TInput input, CancellationToken cancellationToken = default);

    /// <summary>Runs the pipeline for a sequence of input items.</summary>
    Task<PipelineResult<TOutput>> ExecuteAsync(
        IEnumerable<TInput> inputs, CancellationToken cancellationToken = default);

    /// <summary>Runs the pipeline over an asynchronous input stream, yielding an output stream.</summary>
    IAsyncEnumerable<TOutput> ExecuteStreamingAsync(
        IAsyncEnumerable<TInput> inputs, CancellationToken cancellationToken = default);

    /// <summary>Returns statistics for the pipeline and its stages.</summary>
    Task<PipelineStatistics> GetStatisticsAsync();
}
/// <summary>
/// Lifecycle states a pipeline can report.
/// </summary>
public enum PipelineState
{
    /// <summary>Initial state; also the default value of the enum.</summary>
    Created = 0,

    /// <summary>A run is in progress.</summary>
    Running = 1,

    /// <summary>The last run finished successfully.</summary>
    Completed = 2,

    /// <summary>The last run did not succeed.</summary>
    Failed = 3,

    /// <summary>Cancelled state.</summary>
    Cancelled = 4
}
/// <summary>
/// Pipeline-wide settings.
/// </summary>
public class PipelineConfiguration
{
    /// <summary>
    /// Expected number of items; the input to the automatic (√n) buffer size calculation.
    /// </summary>
    public int ExpectedItemCount { get; set; } = 10_000;

    /// <summary>Capacity of the final output buffer; 0 = auto (√n).</summary>
    public int OutputBufferSize { get; set; } // defaults to 0 = auto (√n)

    /// <summary>Default timeout (30 minutes). NOTE(review): no consumer is visible in this file.</summary>
    public TimeSpan DefaultTimeout { get; set; } = new TimeSpan(0, 30, 0);

    /// <summary>Checkpointing switch, on by default. NOTE(review): no consumer is visible in this file.</summary>
    public bool EnableCheckpointing { get; set; } = true;

    /// <summary>Metrics switch, on by default. NOTE(review): no consumer is visible in this file.</summary>
    public bool EnableMetrics { get; set; } = true;
}
/// <summary>
/// Per-stage settings.
/// </summary>
public class StageConfiguration
{
    /// <summary>Capacity of the buffer in front of the stage; 0 = auto (√n).</summary>
    public int BufferSize { get; set; } // defaults to 0 = auto (√n)

    /// <summary>Concurrency limit; defaults to the processor count.</summary>
    public int MaxConcurrency { get; set; } = Environment.ProcessorCount;

    /// <summary>Stage timeout; defaults to five minutes.</summary>
    public TimeSpan Timeout { get; set; } = new TimeSpan(0, 5, 0);

    /// <summary>Retry switch, on by default.</summary>
    public bool EnableRetry { get; set; } = true;

    /// <summary>Retry limit; defaults to 3.</summary>
    public int MaxRetries { get; set; } = 3;
}
/// <summary>
/// Outcome of one pipeline run.
/// </summary>
/// <typeparam name="T">Type of the produced outputs.</typeparam>
public class PipelineResult<T>
{
    /// <summary>True when the run finished without an exception.</summary>
    public bool Success { get; set; }

    /// <summary>Outputs gathered by the run; starts as an empty list.</summary>
    public List<T> Outputs { get; set; } = new List<T>();

    /// <summary>Number of outputs gathered.</summary>
    public int ProcessedCount { get; set; }

    /// <summary>Elapsed time of the run.</summary>
    public TimeSpan Duration { get; set; }

    /// <summary>Exception captured when the run failed; null otherwise.</summary>
    public Exception? Error { get; set; }
}
/// <summary>
/// Aggregated statistics for a pipeline and its stages.
/// </summary>
public class PipelineStatistics
{
    /// <summary>Name of the pipeline the statistics belong to.</summary>
    public string PipelineName { get; set; } = string.Empty;

    /// <summary>Pipeline state at the time the statistics were taken.</summary>
    public PipelineState State { get; set; }

    /// <summary>Number of stages in the pipeline.</summary>
    public int StageCount { get; set; }

    /// <summary>Total number of items processed.</summary>
    public long TotalItemsProcessed { get; set; }

    /// <summary>Total number of errors.</summary>
    public long TotalErrors { get; set; }

    /// <summary>Average latency.</summary>
    public TimeSpan AverageLatency { get; set; }

    /// <summary>Statistics of the individual stages; starts as an empty list.</summary>
    public List<StageStatistics> StageStatistics { get; set; } = new();
}
/// <summary>
/// Statistics reported by a single pipeline stage.
/// </summary>
public class StageStatistics
{
    /// <summary>Name of the stage.</summary>
    public string StageName { get; set; } = string.Empty;

    /// <summary>Number of items the stage processed.</summary>
    public long ItemsProcessed { get; set; }

    /// <summary>Number of items the stage filtered.</summary>
    public long ItemsFiltered { get; set; }

    /// <summary>Number of errors the stage recorded.</summary>
    public long Errors { get; set; }

    /// <summary>Average latency of the stage.</summary>
    public TimeSpan AverageLatency { get; set; }

    /// <summary>Memory usage figure reported by the stage. NOTE(review): unit is not visible here.</summary>
    public long MemoryUsage { get; set; }
}
/// <summary>
/// Internal contract every pipeline stage implements.
/// </summary>
internal interface IPipelineStage
{
    /// <summary>Name of the stage.</summary>
    string Name { get; }

    /// <summary>Settings of the stage.</summary>
    StageConfiguration Configuration { get; }

    /// <summary>
    /// Processes items read from <paramref name="input"/> and writes results to
    /// <paramref name="output"/> within the given execution context.
    /// </summary>
    Task ProcessAsync(
        ChannelReader<object> input,
        ChannelWriter<object> output,
        PipelineExecutionContext context);

    /// <summary>Returns the statistics of the stage.</summary>
    Task<StageStatistics> GetStatisticsAsync();
}
/// <summary>
/// Per-run data handed to every stage.
/// </summary>
internal class PipelineExecutionContext
{
    /// <summary>Name of the pipeline being run.</summary>
    public string PipelineName { get; set; } = string.Empty;

    /// <summary>Identifier of the run.</summary>
    public string ExecutionId { get; set; } = string.Empty;

    /// <summary>Time the run started.</summary>
    public DateTime StartTime { get; set; }

    /// <summary>Pipeline settings; must be assigned by whoever creates the context.</summary>
    public PipelineConfiguration Configuration { get; set; } = null!;

    /// <summary>Token that signals cancellation of the run.</summary>
    public CancellationToken CancellationToken { get; set; }
}