491 lines
16 KiB
C#
491 lines
16 KiB
C#
using System;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Runtime.CompilerServices;
|
|
using System.Threading;
|
|
using System.Threading.Channels;
|
|
using System.Threading.Tasks;
|
|
using Microsoft.Extensions.Logging;
|
|
using SqrtSpace.SpaceTime.Core;
|
|
|
|
namespace SqrtSpace.SpaceTime.Pipeline;
|
|
|
|
/// <summary>
/// Memory-efficient data pipeline with √n buffering.
/// </summary>
/// <remarks>
/// Stages are connected by bounded channels. A channel whose configured capacity is 0 is
/// sized by <c>SpaceTimeCalculator.CalculateSqrtInterval</c> from
/// <see cref="PipelineConfiguration.ExpectedItemCount"/>. Executions are serialized through
/// an internal semaphore, so at most one run is in flight at a time.
/// </remarks>
/// <typeparam name="TInput">Type of the items written to the first channel.</typeparam>
/// <typeparam name="TOutput">Type the items read from the last channel are cast to.</typeparam>
public class SpaceTimePipeline<TInput, TOutput> : ISpaceTimePipeline<TInput, TOutput>
{
    private const string StagesLockedMessage = "Cannot add stages after pipeline has started";

    private readonly List<IPipelineStage> _stages;
    private readonly ILogger<SpaceTimePipeline<TInput, TOutput>> _logger;
    private readonly PipelineConfiguration _configuration;
    private readonly CancellationTokenSource _cancellationTokenSource;
    private readonly SemaphoreSlim _executionLock;
    private PipelineState _state;
    private bool _disposed;

    /// <summary>Name given to the pipeline at construction.</summary>
    public string Name { get; }

    /// <summary>Current lifecycle state of the pipeline.</summary>
    public PipelineState State => _state;

    /// <summary>
    /// Creates an empty pipeline in the <see cref="PipelineState.Created"/> state.
    /// </summary>
    /// <param name="name">Pipeline name.</param>
    /// <param name="logger">Logger used by the pipeline and handed to every stage it creates.</param>
    /// <param name="configuration">Optional settings; a default instance is used when null.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="name"/> or <paramref name="logger"/> is null.
    /// </exception>
    public SpaceTimePipeline(
        string name,
        ILogger<SpaceTimePipeline<TInput, TOutput>> logger,
        PipelineConfiguration? configuration = null)
    {
        Name = name ?? throw new ArgumentNullException(nameof(name));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _configuration = configuration ?? new PipelineConfiguration();
        _stages = new List<IPipelineStage>();
        _cancellationTokenSource = new CancellationTokenSource();
        _executionLock = new SemaphoreSlim(1, 1);
        _state = PipelineState.Created;
    }

    /// <summary>Appends a <c>TransformStage</c> built from <paramref name="transform"/>.</summary>
    /// <param name="stageName">Name passed to the stage.</param>
    /// <param name="transform">Per-item asynchronous transform passed to the stage.</param>
    /// <param name="configuration">Stage settings; a default instance is used when null.</param>
    /// <returns>This pipeline, to allow chaining.</returns>
    /// <exception cref="InvalidOperationException">The pipeline is no longer in the Created state.</exception>
    public ISpaceTimePipeline<TInput, TOutput> AddStage<TStageInput, TStageOutput>(
        string stageName,
        Func<TStageInput, CancellationToken, Task<TStageOutput>> transform,
        StageConfiguration? configuration = null)
    {
        EnsureNotStarted();
        return Append(new TransformStage<TStageInput, TStageOutput>(
            stageName,
            transform,
            configuration ?? new StageConfiguration(),
            _logger));
    }

    /// <summary>Appends a <c>BatchTransformStage</c> built from <paramref name="batchTransform"/>.</summary>
    /// <param name="stageName">Name passed to the stage.</param>
    /// <param name="batchTransform">Batch transform passed to the stage.</param>
    /// <param name="configuration">Stage settings; a default instance is used when null.</param>
    /// <returns>This pipeline, to allow chaining.</returns>
    /// <exception cref="InvalidOperationException">The pipeline is no longer in the Created state.</exception>
    public ISpaceTimePipeline<TInput, TOutput> AddBatchStage<TStageInput, TStageOutput>(
        string stageName,
        Func<IReadOnlyList<TStageInput>, CancellationToken, Task<IEnumerable<TStageOutput>>> batchTransform,
        StageConfiguration? configuration = null)
    {
        EnsureNotStarted();
        return Append(new BatchTransformStage<TStageInput, TStageOutput>(
            stageName,
            batchTransform,
            configuration ?? new StageConfiguration(),
            _logger));
    }

    /// <summary>Appends a <c>FilterStage</c> built from <paramref name="predicate"/>.</summary>
    /// <param name="stageName">Name passed to the stage.</param>
    /// <param name="predicate">Predicate passed to the stage.</param>
    /// <param name="configuration">Stage settings; a default instance is used when null.</param>
    /// <returns>This pipeline, to allow chaining.</returns>
    /// <exception cref="InvalidOperationException">The pipeline is no longer in the Created state.</exception>
    public ISpaceTimePipeline<TInput, TOutput> AddFilterStage<T>(
        string stageName,
        Func<T, bool> predicate,
        StageConfiguration? configuration = null)
    {
        EnsureNotStarted();
        return Append(new FilterStage<T>(
            stageName,
            predicate,
            configuration ?? new StageConfiguration(),
            _logger));
    }

    /// <summary>Appends a <c>CheckpointStage</c> backed by <paramref name="checkpointManager"/>.</summary>
    /// <param name="stageName">Name passed to the stage.</param>
    /// <param name="checkpointManager">Checkpoint manager passed to the stage.</param>
    /// <param name="configuration">Stage settings; a default instance is used when null.</param>
    /// <returns>This pipeline, to allow chaining.</returns>
    /// <exception cref="InvalidOperationException">The pipeline is no longer in the Created state.</exception>
    public ISpaceTimePipeline<TInput, TOutput> AddCheckpointStage<T>(
        string stageName,
        ICheckpointManager checkpointManager,
        StageConfiguration? configuration = null)
    {
        EnsureNotStarted();
        return Append(new CheckpointStage<T>(
            stageName,
            checkpointManager,
            configuration ?? new StageConfiguration(),
            _logger));
    }

    /// <summary>Runs the pipeline over a single input item.</summary>
    /// <param name="input">The item to process.</param>
    /// <param name="cancellationToken">Token that cancels the run.</param>
    /// <returns>The result of the run.</returns>
    public async Task<PipelineResult<TOutput>> ExecuteAsync(
        TInput input,
        CancellationToken cancellationToken = default)
    {
        return await ExecuteAsync(new[] { input }, cancellationToken);
    }

    /// <summary>
    /// Runs the pipeline over <paramref name="inputs"/> and collects every output in memory.
    /// </summary>
    /// <param name="inputs">Items to feed into the first stage.</param>
    /// <param name="cancellationToken">Token that cancels the run.</param>
    /// <returns>
    /// A result whose <see cref="PipelineResult{T}.Success"/> is false and whose
    /// <see cref="PipelineResult{T}.Error"/> is set when the run fails or is cancelled;
    /// failures inside the run are reported through the result, not thrown.
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was cancelled while waiting for the execution lock.
    /// </exception>
    public async Task<PipelineResult<TOutput>> ExecuteAsync(
        IEnumerable<TInput> inputs,
        CancellationToken cancellationToken = default)
    {
        await _executionLock.WaitAsync(cancellationToken);
        try
        {
            _state = PipelineState.Running;
            var startTime = DateTime.UtcNow;
            var result = new PipelineResult<TOutput>();
            var finalState = PipelineState.Failed;

            // Cancel the run when either the caller's token or the pipeline's own token fires.
            using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(
                cancellationToken,
                _cancellationTokenSource.Token);

            var context = CreateContext(startTime, linkedCts.Token);

            try
            {
                var channels = BuildStageChannels();
                var workers = StartStageProcessors(channels, context);

                // The output channel is bounded, so it must be drained while the stages run;
                // draining only after they finish deadlocks once the channel fills up.
                var collector = CollectOutputsAsync(channels[^1].Reader, linkedCts.Token);
                workers.Add(collector);
                workers.Add(FeedInputsAsync(inputs, channels[0].Writer, context));

                await AwaitAllFailFastAsync(workers, linkedCts);

                var outputs = await collector;
                result.Outputs = outputs;
                result.Success = true;
                result.ProcessedCount = outputs.Count;
                finalState = PipelineState.Completed;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Pipeline execution failed");
                result.Success = false;
                result.Error = ex;

                // A pure cancellation (no faulted worker) surfaces as OperationCanceledException.
                finalState = ex is OperationCanceledException && linkedCts.IsCancellationRequested
                    ? PipelineState.Cancelled
                    : PipelineState.Failed;
            }

            result.Duration = DateTime.UtcNow - startTime;
            _state = finalState;

            return result;
        }
        finally
        {
            _executionLock.Release();
        }
    }

    /// <summary>
    /// Runs the pipeline over an asynchronous input sequence and yields outputs as they
    /// become available.
    /// </summary>
    /// <param name="inputs">Asynchronous sequence of items to feed into the first stage.</param>
    /// <param name="cancellationToken">Token that cancels the run.</param>
    /// <returns>The items read from the last channel, cast to <typeparamref name="TOutput"/>.</returns>
    /// <remarks>
    /// If enumeration ends before the run completes (failure, cancellation, or the consumer
    /// abandoning the sequence) the run's token is cancelled so the feeder and stage tasks
    /// can unwind, and <see cref="State"/> becomes Failed or Cancelled instead of staying Running.
    /// </remarks>
    public async IAsyncEnumerable<TOutput> ExecuteStreamingAsync(
        IAsyncEnumerable<TInput> inputs,
        [EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        await _executionLock.WaitAsync(cancellationToken);
        try
        {
            _state = PipelineState.Running;

            using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(
                cancellationToken,
                _cancellationTokenSource.Token);
            var runToken = linkedCts.Token;

            var context = CreateContext(DateTime.UtcNow, runToken);
            var channels = BuildStageChannels();
            var workers = StartStageProcessors(channels, context);

            // No token is passed to Task.Run: a pre-cancelled token would skip the delegate
            // entirely, and with it the finally block that completes the input channel.
            var inputWriter = channels[0].Writer;
            workers.Add(Task.Run(() => FeedInputStreamAsync(inputs, inputWriter, runToken)));

            var completed = false;
            try
            {
                await foreach (var output in channels[^1].Reader.ReadAllAsync(runToken))
                {
                    yield return (TOutput)output;
                }

                await AwaitAllFailFastAsync(workers, linkedCts);
                completed = true;
            }
            finally
            {
                if (completed)
                {
                    _state = PipelineState.Completed;
                }
                else
                {
                    var faulted = workers.Any(worker => worker.IsFaulted);

                    // Release anything still blocked on a channel read or write.
                    linkedCts.Cancel();
                    _state = faulted ? PipelineState.Failed : PipelineState.Cancelled;
                }
            }
        }
        finally
        {
            _executionLock.Release();
        }
    }

    /// <summary>Collects per-stage statistics and aggregates them.</summary>
    /// <returns>
    /// Statistics whose totals are sums over all stages and whose average latency is the
    /// unweighted mean of the stage averages (zero when there are no stages).
    /// </returns>
    public async Task<PipelineStatistics> GetStatisticsAsync()
    {
        var stats = new PipelineStatistics
        {
            PipelineName = Name,
            State = _state,
            StageCount = _stages.Count,
            StageStatistics = new List<StageStatistics>(_stages.Count)
        };

        foreach (var stage in _stages)
        {
            stats.StageStatistics.Add(await stage.GetStatisticsAsync());
        }

        stats.TotalItemsProcessed = stats.StageStatistics.Sum(s => s.ItemsProcessed);
        stats.TotalErrors = stats.StageStatistics.Sum(s => s.Errors);
        stats.AverageLatency = stats.StageStatistics.Count > 0
            ? TimeSpan.FromMilliseconds(stats.StageStatistics.Average(s => s.AverageLatency.TotalMilliseconds))
            : TimeSpan.Zero;

        return stats;
    }

    /// <summary>
    /// Cancels the pipeline's own token and disposes the resources it owns, including any
    /// stage that implements <see cref="IDisposable"/>. Safe to call more than once.
    /// </summary>
    public void Dispose()
    {
        // Cancel() throws on an already-disposed source, so guard against repeated calls.
        if (_disposed)
        {
            return;
        }

        _disposed = true;

        _cancellationTokenSource.Cancel();
        _cancellationTokenSource.Dispose();
        _executionLock.Dispose();

        foreach (var stage in _stages.OfType<IDisposable>())
        {
            stage.Dispose();
        }
    }

    /// <summary>Throws when stages can no longer be added.</summary>
    private void EnsureNotStarted()
    {
        if (_state != PipelineState.Created)
        {
            throw new InvalidOperationException(StagesLockedMessage);
        }
    }

    /// <summary>Adds <paramref name="stage"/> to the end of the pipeline and returns the pipeline.</summary>
    private ISpaceTimePipeline<TInput, TOutput> Append(IPipelineStage stage)
    {
        _stages.Add(stage);
        return this;
    }

    /// <summary>Builds the per-run context handed to every stage.</summary>
    private PipelineExecutionContext CreateContext(DateTime startTime, CancellationToken token)
    {
        return new PipelineExecutionContext
        {
            PipelineName = Name,
            ExecutionId = Guid.NewGuid().ToString(),
            StartTime = startTime,
            Configuration = _configuration,
            CancellationToken = token
        };
    }

    /// <summary>
    /// Creates one bounded channel in front of every stage plus a final output channel.
    /// </summary>
    private List<Channel<object>> BuildStageChannels()
    {
        var channels = new List<Channel<object>>(_stages.Count + 1);

        for (var i = 0; i <= _stages.Count; i++)
        {
            var capacity = i < _stages.Count
                ? _stages[i].Configuration.BufferSize
                : _configuration.OutputBufferSize;

            // Use √n buffering if not specified.
            if (capacity == 0)
            {
                int computed = SpaceTimeCalculator.CalculateSqrtInterval(_configuration.ExpectedItemCount);

                // Channel.CreateBounded rejects capacities below 1.
                capacity = Math.Max(1, computed);
            }

            channels.Add(Channel.CreateBounded<object>(new BoundedChannelOptions(capacity)
            {
                FullMode = BoundedChannelFullMode.Wait,
                SingleWriter = false,
                SingleReader = false
            }));
        }

        return channels;
    }

    /// <summary>
    /// Starts one task per stage; stage <c>i</c> reads <c>channels[i]</c> and writes
    /// <c>channels[i + 1]</c>. Each task completes its output channel when it ends.
    /// </summary>
    private List<Task> StartStageProcessors(
        List<Channel<object>> channels,
        PipelineExecutionContext context)
    {
        var tasks = new List<Task>(_stages.Count + 2);

        for (var i = 0; i < _stages.Count; i++)
        {
            var stage = _stages[i];
            var inputChannel = channels[i];
            var outputChannel = channels[i + 1];

            // No token is passed to Task.Run so the finally block always runs and the
            // downstream reader is never left waiting on a channel nobody will complete.
            tasks.Add(Task.Run(async () =>
            {
                try
                {
                    await stage.ProcessAsync(
                        inputChannel.Reader,
                        outputChannel.Writer,
                        context);
                }
                catch (Exception ex) when (
                    ex is not OperationCanceledException || !context.CancellationToken.IsCancellationRequested)
                {
                    _logger.LogError(ex, "Stage {StageName} failed", stage.Name);
                    throw;
                }
                finally
                {
                    // TryComplete: the stage may already have completed its writer.
                    outputChannel.Writer.TryComplete();
                }
            }));
        }

        return tasks;
    }

    /// <summary>
    /// Writes every item of <paramref name="inputs"/> to <paramref name="writer"/> and then
    /// completes the writer, even on failure.
    /// </summary>
    /// <exception cref="OperationCanceledException">The run was cancelled before all items were written.</exception>
    private async Task FeedInputsAsync<T>(
        IEnumerable<T> inputs,
        ChannelWriter<object> writer,
        PipelineExecutionContext context)
    {
        try
        {
            foreach (var input in inputs)
            {
                // Throw rather than stop quietly so a truncated run is not reported as a success.
                context.CancellationToken.ThrowIfCancellationRequested();

                await writer.WriteAsync(input!, context.CancellationToken);
            }
        }
        finally
        {
            writer.TryComplete();
        }
    }

    /// <summary>Asynchronous-sequence counterpart of <c>FeedInputsAsync</c>.</summary>
    private static async Task FeedInputStreamAsync(
        IAsyncEnumerable<TInput> inputs,
        ChannelWriter<object> writer,
        CancellationToken cancellationToken)
    {
        try
        {
            await foreach (var input in inputs.WithCancellation(cancellationToken))
            {
                await writer.WriteAsync(input!, cancellationToken);
            }
        }
        finally
        {
            writer.TryComplete();
        }
    }

    /// <summary>Reads <paramref name="reader"/> to completion, casting each item to the output type.</summary>
    private static async Task<List<TOutput>> CollectOutputsAsync(
        ChannelReader<object> reader,
        CancellationToken cancellationToken)
    {
        var outputs = new List<TOutput>();

        await foreach (var item in reader.ReadAllAsync(cancellationToken))
        {
            outputs.Add((TOutput)item);
        }

        return outputs;
    }

    /// <summary>
    /// Waits for all <paramref name="tasks"/>; as soon as one faults or is cancelled,
    /// cancels <paramref name="cts"/> so the others stop waiting on channels that will never
    /// be drained or filled. Rethrows the failure once every task has ended.
    /// </summary>
    private static async Task AwaitAllFailFastAsync(
        IReadOnlyCollection<Task> tasks,
        CancellationTokenSource cts)
    {
        var pending = new HashSet<Task>(tasks);

        while (pending.Count > 0)
        {
            var finished = await Task.WhenAny(pending);
            pending.Remove(finished);

            if (finished.IsFaulted || finished.IsCanceled)
            {
                cts.Cancel();
                break;
            }
        }

        await Task.WhenAll(tasks);
    }
}
|
|
|
|
// Interfaces and supporting classes
|
|
/// <summary>
/// Contract for a named, disposable pipeline that is assembled from stages and then
/// executed over one item, a batch of items, or an asynchronous stream of items.
/// </summary>
/// <typeparam name="TInput">Type of the items accepted by the pipeline.</typeparam>
/// <typeparam name="TOutput">Type of the items produced by the pipeline.</typeparam>
public interface ISpaceTimePipeline<TInput, TOutput> : IDisposable
{
    /// <summary>Name of the pipeline.</summary>
    string Name { get; }

    /// <summary>Current lifecycle state.</summary>
    PipelineState State { get; }

    /// <summary>Adds a per-item transform stage and returns the pipeline for chaining.</summary>
    ISpaceTimePipeline<TInput, TOutput> AddStage<TStageInput, TStageOutput>(
        string stageName,
        Func<TStageInput, CancellationToken, Task<TStageOutput>> transform,
        StageConfiguration? configuration = null);

    /// <summary>Adds a batch transform stage and returns the pipeline for chaining.</summary>
    ISpaceTimePipeline<TInput, TOutput> AddBatchStage<TStageInput, TStageOutput>(
        string stageName,
        Func<IReadOnlyList<TStageInput>, CancellationToken, Task<IEnumerable<TStageOutput>>> batchTransform,
        StageConfiguration? configuration = null);

    /// <summary>Adds a predicate-based filter stage and returns the pipeline for chaining.</summary>
    ISpaceTimePipeline<TInput, TOutput> AddFilterStage<T>(
        string stageName,
        Func<T, bool> predicate,
        StageConfiguration? configuration = null);

    /// <summary>Adds a checkpoint stage and returns the pipeline for chaining.</summary>
    ISpaceTimePipeline<TInput, TOutput> AddCheckpointStage<T>(
        string stageName,
        ICheckpointManager checkpointManager,
        StageConfiguration? configuration = null);

    /// <summary>Executes the pipeline for a single input item.</summary>
    Task<PipelineResult<TOutput>> ExecuteAsync(
        TInput input,
        CancellationToken cancellationToken = default);

    /// <summary>Executes the pipeline for a sequence of input items.</summary>
    Task<PipelineResult<TOutput>> ExecuteAsync(
        IEnumerable<TInput> inputs,
        CancellationToken cancellationToken = default);

    /// <summary>Executes the pipeline over an asynchronous input sequence, streaming the outputs.</summary>
    IAsyncEnumerable<TOutput> ExecuteStreamingAsync(
        IAsyncEnumerable<TInput> inputs,
        CancellationToken cancellationToken = default);

    /// <summary>Returns statistics for the pipeline and its stages.</summary>
    Task<PipelineStatistics> GetStatisticsAsync();
}
|
|
|
|
/// <summary>Lifecycle states reported by a pipeline.</summary>
public enum PipelineState
{
    /// <summary>Initial state; stages may still be added.</summary>
    Created = 0,

    /// <summary>An execution is in progress.</summary>
    Running = 1,

    /// <summary>The last execution finished successfully.</summary>
    Completed = 2,

    /// <summary>The last execution failed.</summary>
    Failed = 3,

    /// <summary>The execution was cancelled.</summary>
    Cancelled = 4
}
|
|
|
|
/// <summary>Pipeline-wide settings.</summary>
public class PipelineConfiguration
{
    /// <summary>
    /// Expected number of items per run; the input to the √n buffer-size calculation
    /// used for channels whose capacity is left at 0. Defaults to 10000.
    /// </summary>
    public int ExpectedItemCount { get; set; } = 10000;

    /// <summary>Capacity of the final output channel; 0 (the default) means auto (√n).</summary>
    public int OutputBufferSize { get; set; }

    /// <summary>Default timeout setting. Defaults to 30 minutes.</summary>
    public TimeSpan DefaultTimeout { get; set; } = TimeSpan.FromMinutes(30);

    /// <summary>Checkpointing switch. Defaults to true.</summary>
    public bool EnableCheckpointing { get; set; } = true;

    /// <summary>Metrics switch. Defaults to true.</summary>
    public bool EnableMetrics { get; set; } = true;
}
|
|
|
|
/// <summary>Per-stage settings.</summary>
public class StageConfiguration
{
    /// <summary>Capacity of the channel feeding the stage; 0 (the default) means auto (√n).</summary>
    public int BufferSize { get; set; }

    /// <summary>Maximum concurrency setting. Defaults to <see cref="Environment.ProcessorCount"/>.</summary>
    public int MaxConcurrency { get; set; } = Environment.ProcessorCount;

    /// <summary>Timeout setting. Defaults to 5 minutes.</summary>
    public TimeSpan Timeout { get; set; } = TimeSpan.FromMinutes(5);

    /// <summary>Retry switch. Defaults to true.</summary>
    public bool EnableRetry { get; set; } = true;

    /// <summary>Maximum number of retries. Defaults to 3.</summary>
    public int MaxRetries { get; set; } = 3;
}
|
|
|
|
/// <summary>Outcome of a non-streaming pipeline execution.</summary>
/// <typeparam name="T">Type of the collected outputs.</typeparam>
public class PipelineResult<T>
{
    /// <summary>Whether the execution finished without error.</summary>
    public bool Success { get; set; }

    /// <summary>Collected outputs; an empty list until assigned.</summary>
    public List<T> Outputs { get; set; } = new List<T>();

    /// <summary>Number of outputs collected.</summary>
    public int ProcessedCount { get; set; }

    /// <summary>Wall-clock duration of the execution.</summary>
    public TimeSpan Duration { get; set; }

    /// <summary>The exception that ended the execution, or null.</summary>
    public Exception? Error { get; set; }
}
|
|
|
|
/// <summary>Aggregated statistics for a pipeline and its stages.</summary>
public class PipelineStatistics
{
    /// <summary>Name of the pipeline the statistics belong to.</summary>
    public string PipelineName { get; set; } = string.Empty;

    /// <summary>Pipeline state at the time the statistics were taken.</summary>
    public PipelineState State { get; set; }

    /// <summary>Number of stages in the pipeline.</summary>
    public int StageCount { get; set; }

    /// <summary>Sum of the items processed by all stages.</summary>
    public long TotalItemsProcessed { get; set; }

    /// <summary>Sum of the errors reported by all stages.</summary>
    public long TotalErrors { get; set; }

    /// <summary>Mean of the per-stage average latencies.</summary>
    public TimeSpan AverageLatency { get; set; }

    /// <summary>Statistics of the individual stages.</summary>
    public List<StageStatistics> StageStatistics { get; set; } = new();
}
|
|
|
|
/// <summary>Statistics reported by a single stage.</summary>
public class StageStatistics
{
    /// <summary>Name of the stage.</summary>
    public string StageName { get; set; } = string.Empty;

    /// <summary>Number of items processed.</summary>
    public long ItemsProcessed { get; set; }

    /// <summary>Number of items filtered.</summary>
    public long ItemsFiltered { get; set; }

    /// <summary>Number of errors.</summary>
    public long Errors { get; set; }

    /// <summary>Average latency.</summary>
    public TimeSpan AverageLatency { get; set; }

    /// <summary>Memory usage figure reported by the stage.</summary>
    public long MemoryUsage { get; set; }
}
|
|
|
|
/// <summary>
/// Internal contract for a pipeline stage: reads items from an input channel and writes
/// results to an output channel.
/// </summary>
internal interface IPipelineStage
{
    /// <summary>Name of the stage.</summary>
    string Name { get; }

    /// <summary>Settings of the stage.</summary>
    StageConfiguration Configuration { get; }

    /// <summary>Processes items from <paramref name="input"/> into <paramref name="output"/>.</summary>
    /// <param name="input">Reader of the channel feeding the stage.</param>
    /// <param name="output">Writer of the channel receiving the stage's results.</param>
    /// <param name="context">Per-run execution context.</param>
    Task ProcessAsync(
        ChannelReader<object> input,
        ChannelWriter<object> output,
        PipelineExecutionContext context);

    /// <summary>Returns the statistics of the stage.</summary>
    Task<StageStatistics> GetStatisticsAsync();
}
|
|
|
|
/// <summary>Per-run information handed to every stage.</summary>
internal class PipelineExecutionContext
{
    /// <summary>Name of the executing pipeline.</summary>
    public string PipelineName { get; set; } = string.Empty;

    /// <summary>Identifier of this execution.</summary>
    public string ExecutionId { get; set; } = string.Empty;

    /// <summary>Time the execution started.</summary>
    public DateTime StartTime { get; set; }

    /// <summary>Pipeline settings; must be assigned by the creator of the context.</summary>
    public PipelineConfiguration Configuration { get; set; } = null!;

    /// <summary>Token that signals cancellation of the run.</summary>
    public CancellationToken CancellationToken { get; set; }
}