#!/usr/bin/env python3
"""Visualize Ollama experiment results."""

import json

import matplotlib.pyplot as plt
import numpy as np


def create_visualizations():
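    """Render the experiment summary figures.

    Assumes ollama_experiment_results.json has the layout implied by the
    keys read below: a top-level "model" string and an "experiments" object
    with "context_chunking", "streaming", and "checkpointing" results.
    """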
    # Load results
    with open("ollama_experiment_results.json", "r") as f:
        results = json.load(f)

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    fig.suptitle(f"LLM Space-Time Tradeoffs with {results['model']}", fontsize=16)

    # 1. Context chunking performance
    ax1 = axes[0, 0]
    context = results["experiments"]["context_chunking"]
    methods = ["Full Context\n(O(n) memory)", "Chunked √n\n(O(√n) memory)"]
    times = [context["full_context"]["time"], context["chunked_context"]["time"]]
    memory = [context["full_context"]["memory_delta"], context["chunked_context"]["memory_delta"]]

    x = np.arange(len(methods))
    width = 0.35

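    # Seconds and megabytes need separate scales, so the memory bars go on a
    # twin y-axis; offsetting each pair by half a bar width keeps them side by side.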
    ax1_mem = ax1.twinx()
    bars1 = ax1.bar(x - width / 2, times, width, label='Time (s)', color='skyblue')
    bars2 = ax1_mem.bar(x + width / 2, memory, width, label='Memory (MB)', color='lightcoral')

    ax1.set_ylabel('Time (seconds)', color='skyblue')
    ax1_mem.set_ylabel('Memory Delta (MB)', color='lightcoral')
    ax1.set_title('Context Processing: Time vs Memory')
    ax1.set_xticks(x)
    ax1.set_xticklabels(methods)

    # Add value labels above each bar
    for bar in bars1:
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width() / 2., height,
                 f'{height:.1f}s', ha='center', va='bottom')
    for bar in bars2:
        height = bar.get_height()
        ax1_mem.text(bar.get_x() + bar.get_width() / 2., height,
                     f'{height:.2f}MB', ha='center', va='bottom')

    # 2. Streaming performance
    ax2 = axes[0, 1]
    streaming = results["experiments"]["streaming"]
    methods = ["Full Generation", "Streaming"]
    times = [streaming["full_generation"]["time"], streaming["streaming_generation"]["time"]]
    tokens = [streaming["full_generation"]["estimated_tokens"],
              streaming["streaming_generation"]["estimated_tokens"]]

    ax2.bar(methods, times, color=['#ff9999', '#66b3ff'])
    ax2.set_ylabel('Time (seconds)')
    ax2.set_title('Streaming vs Full Generation')

    for i, (t, tok) in enumerate(zip(times, tokens)):
        ax2.text(i, t, f'{t:.2f}s\n({tok} tokens)', ha='center', va='bottom')

    # 3. Checkpointing overhead
    ax3 = axes[1, 0]
    checkpoint = results["experiments"]["checkpointing"]
    methods = ["No Checkpoint",
               f"Checkpoint every {checkpoint['with_checkpoint']['checkpoint_interval']}"]
    times = [checkpoint["no_checkpoint"]["time"], checkpoint["with_checkpoint"]["time"]]

    bars = ax3.bar(methods, times, color=['#90ee90', '#ffd700'])
    ax3.set_ylabel('Time (seconds)')
    ax3.set_title('Checkpointing Time Overhead')

    # Relative slowdown from checkpointing, as a percentage
    overhead = (times[1] / times[0] - 1) * 100
    # transform=ax3.transAxes interprets coordinates as axes fractions (0-1),
    # so the y position must be a fraction, not a data value like max(times) * 0.9.
    ax3.text(0.5, 0.9, f'Overhead: {overhead:.1f}%',
             ha='center', transform=ax3.transAxes, fontsize=12,
             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    for bar, t in zip(bars, times):
        ax3.text(bar.get_x() + bar.get_width() / 2., bar.get_height(),
                 f'{t:.1f}s', ha='center', va='bottom')

    # 4. Summary statistics
    ax4 = axes[1, 1]
    ax4.axis('off')

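    # Compute the chunking slowdown once; it feeds both the bullet list and
    # the conclusion below (the original run reported ~18x).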
    slowdown = context['chunked_context']['time'] / context['full_context']['time']
    summary_text = f"""
Key Findings:

1. Context Chunking (√n chunks):
   • Slowdown: {slowdown:.1f}x
   • Chunks processed: {context['chunked_context']['num_chunks']}
   • Chunk size: {context['chunked_context']['chunk_size']} chars

2. Streaming vs Full:
   • Time difference: {abs(streaming['streaming_generation']['time'] - streaming['full_generation']['time']):.2f}s
   • Tokens generated: ~{streaming['full_generation']['estimated_tokens']}

3. Checkpointing:
   • Time overhead: {overhead:.1f}%
   • Checkpoints created: {checkpoint['with_checkpoint']['num_checkpoints']}
   • Interval: Every {checkpoint['with_checkpoint']['checkpoint_interval']} prompts

Conclusion: Real LLM inference shows significant
time overhead ({slowdown:.0f}x here) for √n memory reduction,
validating theoretical space-time tradeoffs.
"""

    ax4.text(0.1, 0.9, summary_text, transform=ax4.transAxes,
             fontsize=11, verticalalignment='top', family='monospace',
             bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.3))

    # Adjust layout to prevent overlapping
    plt.subplots_adjust(hspace=0.3, wspace=0.3)
    plt.savefig('ollama_spacetime_results.png', dpi=150, bbox_inches='tight')
    plt.close()  # Close the figure to free memory
    print("Visualization saved to: ollama_spacetime_results.png")

    # Create a second figure for detailed chunk analysis
    fig2, ax = plt.subplots(1, 1, figsize=(10, 6))

    # Show the √n relationship
    n_values = np.logspace(2, 6, 50)  # 100 to 1M characters
    sqrt_n = np.sqrt(n_values)

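    # On log-log axes both complexity curves plot as straight lines, which
    # makes the widening gap between O(n) and O(√n) memory easy to read off.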
    ax.loglog(n_values, n_values, 'b-', label='O(n) - Full context', linewidth=2)
    ax.loglog(n_values, sqrt_n, 'r--', label='O(√n) - Chunked', linewidth=2)

    # Add our experimental point. With √n chunking, num_chunks ≈ chunk_size ≈ √n,
    # so the measured point should land near the O(√n) curve.
    text_size = 14750  # Total context length (characters) from the experiment
    chunked = results["experiments"]["context_chunking"]["chunked_context"]
    chunk_count = chunked["num_chunks"]
    chunk_size = chunked["chunk_size"]
    ax.scatter([text_size], [chunk_count], color='green', s=100, zorder=5,
               label=f'Our experiment: {chunk_count} chunks of {chunk_size} chars')

    ax.set_xlabel('Context Size (characters)')
    ax.set_ylabel('Memory/Processing Units')
    ax.set_title('Space Complexity: Full vs Chunked Processing')
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('ollama_sqrt_n_relationship.png', dpi=150, bbox_inches='tight')
    plt.close()  # Close the figure to free memory
    print("√n relationship saved to: ollama_sqrt_n_relationship.png")


if __name__ == "__main__":
    create_visualizations()