sqrtspace-dotnet/src/SqrtSpace.SpaceTime.Linq/ExternalOrderedEnumerable.cs
2025-07-20 03:41:39 -04:00

196 lines
6.5 KiB
C#

using System.Collections;
using SqrtSpace.SpaceTime.Core;
namespace SqrtSpace.SpaceTime.Linq;
/// <summary>
/// External merge sort implementation for large datasets.
/// </summary>
/// <remarks>
/// Enumeration buffers up to <c>_bufferSize</c> elements at a time, sorts each buffer in memory,
/// spills it through <c>ExternalStorage</c>, and finally performs a k-way merge over the spilled
/// chunks. The asynchronous storage API is consumed synchronously because
/// <see cref="IEnumerable{T}"/> offers no asynchronous enumeration.
/// </remarks>
/// <typeparam name="TSource">Element type of the sequence being sorted.</typeparam>
/// <typeparam name="TKey">Type of the sort key extracted from each element.</typeparam>
internal sealed class ExternalOrderedEnumerable<TSource, TKey> : IOrderedEnumerable<TSource> where TKey : notnull
{
    /// <summary>
    /// Element count assumed when the source cannot report its size without being enumerated.
    /// </summary>
    private const int FallbackElementCount = 100_000;

    private readonly IEnumerable<TSource> _source;
    private readonly Func<TSource, TKey> _keySelector;
    private readonly IComparer<TKey> _comparer;
    private readonly int _bufferSize;

    /// <summary>
    /// Creates a lazily evaluated external sort over <paramref name="source"/>.
    /// </summary>
    /// <param name="source">Sequence to sort; it is enumerated once per call to <see cref="GetEnumerator"/>.</param>
    /// <param name="keySelector">Extracts the sort key from an element.</param>
    /// <param name="comparer">Key comparer; <see cref="Comparer{T}.Default"/> is used when <c>null</c>.</param>
    /// <param name="bufferSize">
    /// Maximum number of elements held in memory per chunk. When <c>null</c> the size is obtained from
    /// <c>SpaceTimeCalculator.CalculateSqrtInterval</c> using the source's count (or a fallback estimate).
    /// Values below 1 are raised to 1.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="source"/> or <paramref name="keySelector"/> is <c>null</c>.
    /// </exception>
    public ExternalOrderedEnumerable(
        IEnumerable<TSource> source,
        Func<TSource, TKey> keySelector,
        IComparer<TKey>? comparer,
        int? bufferSize)
    {
        ArgumentNullException.ThrowIfNull(source);
        ArgumentNullException.ThrowIfNull(keySelector);

        _source = source;
        _keySelector = keySelector;
        _comparer = comparer ?? Comparer<TKey>.Default;

        var count = source.TryGetNonEnumeratedCount(out var c) ? c : FallbackElementCount;

        // A non-positive buffer would either throw when the chunk list is allocated (negative)
        // or degenerate silently (zero), so keep at least one element per chunk.
        _bufferSize = Math.Max(1, bufferSize ?? SpaceTimeCalculator.CalculateSqrtInterval(count));
    }

    /// <summary>
    /// Adds a subsequent (ThenBy-style) ordering on top of this one.
    /// </summary>
    /// <remarks>
    /// The secondary key is folded into a composite (primary, secondary) key and the result is another
    /// external sort over the same source. The secondary key therefore only decides the order of
    /// elements whose primary keys compare equal, and the sort still runs with the same buffer size.
    /// </remarks>
    /// <param name="keySelector">Extracts the secondary key from an element.</param>
    /// <param name="comparer">Secondary key comparer; the default comparer is used when <c>null</c>.</param>
    /// <param name="descending"><c>true</c> to order the secondary key in descending order.</param>
    /// <returns>An ordered sequence sorted by the existing key first and the new key second.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="keySelector"/> is <c>null</c>.</exception>
    public IOrderedEnumerable<TSource> CreateOrderedEnumerable<TNewKey>(
        Func<TSource, TNewKey> keySelector,
        IComparer<TNewKey>? comparer,
        bool descending)
    {
        ArgumentNullException.ThrowIfNull(keySelector);

        // Copy to a local so the lambda below does not capture this instance.
        var primarySelector = _keySelector;
        var compositeComparer = new CompositeKeyComparer<TNewKey>(
            _comparer,
            comparer ?? Comparer<TNewKey>.Default,
            descending);

        return new ExternalOrderedEnumerable<TSource, (TKey Primary, TNewKey Secondary)>(
            _source,
            item => (primarySelector(item), keySelector(item)),
            compositeComparer,
            _bufferSize);
    }

    /// <summary>
    /// Sorts the source with an external merge sort and streams the result.
    /// </summary>
    /// <remarks>
    /// All work is deferred until the first <c>MoveNext</c>. The storage instance and every chunk
    /// reader are disposed when enumeration completes or the enumerator is disposed early.
    /// </remarks>
    public IEnumerator<TSource> GetEnumerator()
    {
        using var storage = new ExternalStorage<TSource>();
        var spillFiles = new List<string>();
        var buffer = new List<TSource>(_bufferSize);

        // Phase 1: sort buffer-sized chunks in memory and spill each one.
        foreach (var element in _source)
        {
            buffer.Add(element);
            if (buffer.Count >= _bufferSize)
            {
                SpillSortedChunk(storage, buffer, spillFiles);
                buffer.Clear();
            }
        }

        // Spill whatever is left over after the last full chunk.
        if (buffer.Count > 0)
        {
            SpillSortedChunk(storage, buffer, spillFiles);
        }

        // Phase 2: read the sorted chunks back.
        if (spillFiles.Count == 0)
        {
            yield break;
        }

        if (spillFiles.Count == 1)
        {
            // A single chunk is already fully sorted; no merge is required.
            foreach (var element in storage.ReadFromDiskAsync(spillFiles[0]).ToBlockingEnumerable())
            {
                yield return element;
            }

            yield break;
        }

        // Multi-way merge. The heap holds at most one pending element per chunk; its priority is the
        // element's key paired with the chunk index, so equal keys are emitted in chunk order.
        var readers = new List<IEnumerator<TSource>>(spillFiles.Count);
        var heap = new PriorityQueue<TSource, (TKey key, int index)>(new MergeComparer<TKey>(_comparer));
        try
        {
            for (var i = 0; i < spillFiles.Count; i++)
            {
                var reader = storage.ReadFromDiskAsync(spillFiles[i]).ToBlockingEnumerable().GetEnumerator();
                readers.Add(reader);
                if (reader.MoveNext())
                {
                    var head = reader.Current;
                    heap.Enqueue(head, (_keySelector(head), i));
                }
            }

            while (heap.TryDequeue(out var smallest, out var priority))
            {
                yield return smallest;

                // Refill the heap from the chunk that just produced the smallest element.
                var reader = readers[priority.index];
                if (reader.MoveNext())
                {
                    var next = reader.Current;
                    heap.Enqueue(next, (_keySelector(next), priority.index));
                }
            }
        }
        finally
        {
            foreach (var reader in readers)
            {
                reader.Dispose();
            }
        }
    }

    IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();

    /// <summary>
    /// Sorts <paramref name="buffer"/> by the configured key, spills the sorted copy through
    /// <paramref name="storage"/>, and records the returned chunk identifier in <paramref name="spillFiles"/>.
    /// </summary>
    private void SpillSortedChunk(ExternalStorage<TSource> storage, List<TSource> buffer, List<string> spillFiles)
    {
        // OrderBy is a stable sort, so elements with equal keys keep their buffered order.
        var sortedChunk = buffer.OrderBy(_keySelector, _comparer).ToList();

        // Blocking on the async spill is deliberate: this runs inside a synchronous iterator.
        spillFiles.Add(storage.SpillToDiskAsync(sortedChunk).GetAwaiter().GetResult());
    }

    /// <summary>
    /// Orders merge entries by key first and by chunk index second.
    /// </summary>
    private sealed class MergeComparer<T> : IComparer<(T key, int index)>
    {
        private readonly IComparer<T> _keyComparer;

        public MergeComparer(IComparer<T> keyComparer)
        {
            _keyComparer = keyComparer;
        }

        public int Compare((T key, int index) x, (T key, int index) y)
        {
            var keyComparison = _keyComparer.Compare(x.key, y.key);
            return keyComparison != 0 ? keyComparison : x.index.CompareTo(y.index);
        }
    }

    /// <summary>
    /// Compares composite keys: the primary component decides, and the secondary component
    /// (optionally reversed) breaks ties.
    /// </summary>
    private sealed class CompositeKeyComparer<TSecondary> : IComparer<(TKey Primary, TSecondary Secondary)>
    {
        private readonly IComparer<TKey> _primary;
        private readonly IComparer<TSecondary> _secondary;
        private readonly bool _descending;

        public CompositeKeyComparer(IComparer<TKey> primary, IComparer<TSecondary> secondary, bool descending)
        {
            _primary = primary;
            _secondary = secondary;
            _descending = descending;
        }

        public int Compare((TKey Primary, TSecondary Secondary) x, (TKey Primary, TSecondary Secondary) y)
        {
            var primaryComparison = _primary.Compare(x.Primary, y.Primary);
            if (primaryComparison != 0)
            {
                return primaryComparison;
            }

            // Swap the operands instead of negating the result: negating int.MinValue overflows.
            return _descending
                ? _secondary.Compare(y.Secondary, x.Secondary)
                : _secondary.Compare(x.Secondary, y.Secondary);
        }
    }
}
/// <summary>
/// Secondary ordering for ThenBy operations.
/// </summary>
/// <remarks>
/// Enumeration materializes the wrapped ordered sequence in full and then sorts that list by the
/// secondary key alone. Since LINQ's ordering operators are stable, the incoming order survives only
/// among elements whose secondary keys compare equal.
/// NOTE(review): that is not the usual ThenBy contract, where the secondary key breaks ties of the
/// primary key - confirm whether callers rely on the current behavior.
/// <typeparamref name="TPrimaryKey"/> is not used by any member of this class.
/// </remarks>
internal sealed class ThenByOrderedEnumerable<TSource, TPrimaryKey, TSecondaryKey> : IOrderedEnumerable<TSource>
{
    private readonly IOrderedEnumerable<TSource> _primary;
    private readonly Func<TSource, TSecondaryKey> _keySelector;
    private readonly IComparer<TSecondaryKey> _comparer;
    private readonly bool _descending;

    /// <summary>
    /// Wraps an already ordered sequence with an additional sort key.
    /// </summary>
    /// <param name="primary">The ordered sequence to read from.</param>
    /// <param name="keySelector">Extracts the secondary key from an element.</param>
    /// <param name="comparer">Secondary key comparer; the default comparer is used when <c>null</c>.</param>
    /// <param name="descending"><c>true</c> to sort the secondary key in descending order.</param>
    public ThenByOrderedEnumerable(
        IOrderedEnumerable<TSource> primary,
        Func<TSource, TSecondaryKey> keySelector,
        IComparer<TSecondaryKey>? comparer,
        bool descending)
    {
        _primary = primary;
        _keySelector = keySelector;
        _comparer = comparer ?? Comparer<TSecondaryKey>.Default;
        _descending = descending;
    }

    /// <summary>
    /// Stacks one more ordering on top of this one by wrapping this instance.
    /// </summary>
    public IOrderedEnumerable<TSource> CreateOrderedEnumerable<TNewKey>(
        Func<TSource, TNewKey> keySelector,
        IComparer<TNewKey>? comparer,
        bool descending)
        => new ThenByOrderedEnumerable<TSource, TSecondaryKey, TNewKey>(this, keySelector, comparer, descending);

    /// <summary>
    /// Buffers the wrapped sequence, sorts the buffer by the secondary key, and streams the result.
    /// Nothing is read from the wrapped sequence until the first <c>MoveNext</c>.
    /// </summary>
    public IEnumerator<TSource> GetEnumerator()
    {
        // Materialize first: the sort below runs over an in-memory snapshot of the wrapped sequence.
        List<TSource> snapshot = _primary.ToList();

        IEnumerable<TSource> resorted;
        if (_descending)
        {
            resorted = snapshot.OrderByDescending(_keySelector, _comparer);
        }
        else
        {
            resorted = snapshot.OrderBy(_keySelector, _comparer);
        }

        using IEnumerator<TSource> cursor = resorted.GetEnumerator();
        while (cursor.MoveNext())
        {
            yield return cursor.Current;
        }
    }

    IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
}