using MarketAlly.GitCommitEditor.Models;
using MarketAlly.GitCommitEditor.Models.HistoryHealth;
using MarketAlly.GitCommitEditor.Resources;
using MarketAlly.LibGit2Sharp;
namespace MarketAlly.GitCommitEditor.Services;
/// <summary>
/// Analyzes git repository history health.
/// </summary>
public sealed class HistoryHealthAnalyzer : IHistoryHealthAnalyzer
{
private readonly ICommitAnalyzer _commitAnalyzer;
private readonly HealthScoringWeights _weights;
// Patterns that indicate merge fix commits
private static readonly string[] MergeFixPatterns =
[
"fix merge",
"merge fix",
"resolve conflict",
"fix conflict",
"meerge", // Common typo
"megre", // Common typo
"fixed merge"
];
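/// <summary>
/// Creates the analyzer with the supplied commit-message analyzer and optional scoring weights
/// (falls back to <see cref="HealthScoringWeights.Default"/> when <paramref name="weights"/> is null).
/// </summary>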
public HistoryHealthAnalyzer(ICommitAnalyzer commitAnalyzer, HealthScoringWeights? weights = null)
{
_commitAnalyzer = commitAnalyzer;
_weights = weights ?? HealthScoringWeights.Default;
}
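/// <summary>
/// Convenience overload that wraps <paramref name="repoPath"/> in a <see cref="ManagedRepo"/>
/// and delegates to the <see cref="ManagedRepo"/> overload.
/// </summary>
/// <example>
/// A minimal usage sketch; <c>myCommitAnalyzer</c> stands in for whatever
/// <see cref="ICommitAnalyzer"/> implementation the host application provides:
/// <code>
/// var analyzer = new HistoryHealthAnalyzer(myCommitAnalyzer);
/// var analysis = await analyzer.AnalyzeAsync(@"C:\repos\example-repo");
/// Console.WriteLine($"{analysis.CommitsAnalyzed} commits analyzed on {analysis.CurrentBranch}");
/// </code>
/// </example>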
public Task<HistoryHealthAnalysis> AnalyzeAsync(
string repoPath,
HistoryAnalysisOptions? options = null,
IProgress<AnalysisProgress>? progress = null,
CancellationToken ct = default)
{
var repo = new ManagedRepo
{
Path = repoPath,
Name = Path.GetFileName(Path.TrimEndingDirectorySeparator(repoPath))
};
return AnalyzeAsync(repo, options, progress, ct);
}
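/// <summary>
/// Runs the full history health analysis: loads commits, then gathers duplicate, merge,
/// branch, message-quality, and authorship metrics, reporting progress at each stage.
/// </summary>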
public async Task<HistoryHealthAnalysis> AnalyzeAsync(
ManagedRepo managedRepo,
HistoryAnalysisOptions? options = null,
IProgress<AnalysisProgress>? progress = null,
CancellationToken ct = default)
{
options ??= new HistoryAnalysisOptions();
using var repo = new Repository(managedRepo.Path);
// Get commits to analyze
progress?.Report(new AnalysisProgress { CurrentStage = Str.Health_LoadingCommits, PercentComplete = 0 });
var commits = GetCommitsToAnalyze(repo, options).ToList();
var totalCommits = commits.Count;
progress?.Report(new AnalysisProgress
{
CurrentStage = Str.Health_LoadingCommits,
PercentComplete = 10,
TotalCommits = totalCommits
});
// Analyze duplicates
progress?.Report(new AnalysisProgress
{
CurrentStage = Str.Health_DetectingDuplicates,
PercentComplete = 20,
TotalCommits = totalCommits
});
var duplicateMetrics = options.IncludeDuplicateDetection
? await Task.Run(() => AnalyzeDuplicates(commits), ct)
: CreateEmptyDuplicateMetrics(totalCommits);
ct.ThrowIfCancellationRequested();
// Analyze merges
progress?.Report(new AnalysisProgress
{
CurrentStage = Str.Health_AnalyzingMerges,
PercentComplete = 40,
TotalCommits = totalCommits
});
var mergeMetrics = await Task.Run(() => AnalyzeMerges(commits), ct);
ct.ThrowIfCancellationRequested();
// Analyze branches
progress?.Report(new AnalysisProgress
{
CurrentStage = Str.Health_AnalyzingBranches,
PercentComplete = 50,
TotalCommits = totalCommits
});
var branchMetrics = options.IncludeBranchAnalysis
? await Task.Run(() => AnalyzeBranches(repo, commits), ct)
: CreateEmptyBranchMetrics();
ct.ThrowIfCancellationRequested();
// Analyze message quality
progress?.Report(new AnalysisProgress
{
CurrentStage = Str.Health_AnalyzingMessages,
PercentComplete = 60,
TotalCommits = totalCommits
});
var messageDistribution = options.IncludeMessageDistribution
? await Task.Run(() => AnalyzeMessageQuality(commits, progress, totalCommits), ct)
: CreateEmptyMessageDistribution(totalCommits);
ct.ThrowIfCancellationRequested();
// Analyze authorship
progress?.Report(new AnalysisProgress
{
CurrentStage = Str.Health_AnalyzingAuthorship,
PercentComplete = 90,
TotalCommits = totalCommits
});
var authorshipMetrics = await Task.Run(() => AnalyzeAuthorship(commits), ct);
progress?.Report(new AnalysisProgress
{
CurrentStage = Str.Health_Complete,
PercentComplete = 100,
TotalCommits = totalCommits,
CommitsProcessed = totalCommits
});
return new HistoryHealthAnalysis
{
RepoPath = managedRepo.Path,
RepoName = managedRepo.Name,
CurrentBranch = repo.Head.FriendlyName,
CommitsAnalyzed = totalCommits,
OldestCommitDate = commits.LastOrDefault()?.Author.When,
NewestCommitDate = commits.FirstOrDefault()?.Author.When,
Duplicates = duplicateMetrics,
MergeMetrics = mergeMetrics,
BranchMetrics = branchMetrics,
MessageDistribution = messageDistribution,
AuthorshipMetrics = authorshipMetrics
};
}
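/// <summary>
/// Enumerates commits in topological/time order (newest first), optionally filtered by
/// <see cref="HistoryAnalysisOptions.AnalyzeSince"/> and capped at the effective maximum.
/// </summary>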
private IEnumerable<Commit> GetCommitsToAnalyze(Repository repo, HistoryAnalysisOptions options)
{
var filter = new CommitFilter
{
SortBy = CommitSortStrategies.Topological | CommitSortStrategies.Time
};
IEnumerable<Commit> commits = repo.Commits.QueryBy(filter);
if (options.AnalyzeSince.HasValue)
{
commits = commits.Where(c => c.Author.When >= options.AnalyzeSince.Value);
}
return commits.Take(options.EffectiveMaxCommits);
}
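/// <summary>
/// Groups non-merge commits by identical tree SHA and by normalized short message
/// to find exact-tree and exact-message duplicate groups.
/// </summary>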
private DuplicateCommitMetrics AnalyzeDuplicates(List<Commit> commits)
{
var duplicateGroups = new List<DuplicateCommitGroup>();
var messageGroups = new Dictionary<string, List<Commit>>();
var treeGroups = new Dictionary<string, List<Commit>>();
foreach (var commit in commits)
{
// Skip merge commits for duplicate detection
if (commit.Parents.Count() > 1) continue;
var message = NormalizeMessage(commit.MessageShort);
var treeSha = commit.Tree.Sha;
// Group by normalized message (single lookup via TryGetValue)
if (!messageGroups.TryGetValue(message, out var byMessage))
{
byMessage = [];
messageGroups[message] = byMessage;
}
byMessage.Add(commit);
// Group by tree SHA
if (!treeGroups.TryGetValue(treeSha, out var byTree))
{
byTree = [];
treeGroups[treeSha] = byTree;
}
byTree.Add(commit);
}
// Find exact tree duplicates
foreach (var group in treeGroups.Where(g => g.Value.Count > 1))
{
duplicateGroups.Add(new DuplicateCommitGroup
{
CanonicalMessage = group.Value.First().MessageShort,
CommitHashes = group.Value.Select(c => c.Sha).ToList(),
Type = DuplicateType.ExactTree
});
}
// Find exact message duplicates (not already in tree duplicates)
var treeDuplicateHashes = new HashSet<string>(
duplicateGroups.SelectMany(g => g.CommitHashes));
foreach (var group in messageGroups.Where(g => g.Value.Count > 1))
{
var nonTreeDuplicates = group.Value
.Where(c => !treeDuplicateHashes.Contains(c.Sha))
.ToList();
if (nonTreeDuplicates.Count > 1)
{
duplicateGroups.Add(new DuplicateCommitGroup
{
CanonicalMessage = group.Key,
CommitHashes = nonTreeDuplicates.Select(c => c.Sha).ToList(),
Type = DuplicateType.ExactMessage
});
}
}
var exactDuplicates = duplicateGroups
.Where(g => g.Type == DuplicateType.ExactTree)
.Sum(g => g.InstanceCount - 1);
return new DuplicateCommitMetrics
{
TotalCommitsAnalyzed = commits.Count,
TotalDuplicateGroups = duplicateGroups.Count,
TotalDuplicateInstances = duplicateGroups.Sum(g => g.InstanceCount - 1),
ExactDuplicates = exactDuplicates,
CherryPicks = 0, // Would need patch-id comparison
FuzzyMatches = duplicateGroups.Count(g => g.Type == DuplicateType.FuzzyMessage),
DuplicateGroups = duplicateGroups
};
}
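/// <summary>
/// Counts merge commits, null merges whose tree matches their first parent's,
/// and non-merge commits whose messages match known merge-fix patterns.
/// </summary>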
private MergeCommitMetrics AnalyzeMerges(List<Commit> commits)
{
var mergeCommits = commits.Where(c => c.Parents.Count() > 1).ToList();
var nonMergeCommits = commits.Where(c => c.Parents.Count() <= 1).ToList();
var mergeFixCommits = new List<string>();
var messyPatterns = new List<string>();
foreach (var commit in commits)
{
var msgLower = commit.MessageShort.ToLowerInvariant();
foreach (var pattern in MergeFixPatterns)
{
if (msgLower.Contains(pattern))
{
if (commit.Parents.Count() <= 1)
{
mergeFixCommits.Add(commit.Sha);
}
if (!messyPatterns.Contains(pattern))
{
messyPatterns.Add(pattern);
}
break;
}
}
}
// Count null merges: two-parent merges whose tree is identical to the first parent's tree
var nullMerges = mergeCommits.Count(m =>
{
var parents = m.Parents.ToList();
return parents.Count == 2 && m.Tree.Sha == parents[0].Tree.Sha;
});
return new MergeCommitMetrics
{
TotalCommits = commits.Count,
TotalMerges = mergeCommits.Count,
TrivialMerges = mergeCommits.Count - nullMerges,
ConflictMerges = 0, // Not computed; commit metadata does not record whether a merge had conflicts
MergeFixCommits = mergeFixCommits.Count,
NullMerges = nullMerges,
AverageMergeComplexity = 0, // Not computed; would require diffing each merge against its parents
MessyMergePatterns = messyPatterns,
MergeFixCommitHashes = mergeFixCommits
};
}
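/// <summary>
/// Classifies local branches as active or stale (no commits in the last 30 days) and uses
/// merge-message heuristics to estimate cross-merges and the overall branch topology.
/// </summary>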
private BranchComplexityMetrics AnalyzeBranches(Repository repo, List<Commit> commits)
{
var branches = repo.Branches.Where(b => !b.IsRemote).ToList();
var remoteBranches = repo.Branches.Where(b => b.IsRemote).ToList();
var now = DateTimeOffset.UtcNow;
var staleDays = 30;
var staleBranches = new List<string>();
var activeBranches = 0;
foreach (var branch in branches)
{
if (branch.Tip == null) continue;
var age = now - branch.Tip.Author.When;
if (age.TotalDays > staleDays)
{
staleBranches.Add(branch.FriendlyName);
}
else
{
activeBranches++;
}
}
// Count merge commits to determine cross-merges
var mergeCommits = commits.Where(c => c.Parents.Count() > 1).ToList();
var mainBranchNames = new[] { "main", "master", "develop", "dev" };
// Simplified cross-merge detection
var crossMerges = 0;
foreach (var merge in mergeCommits)
{
var msg = merge.MessageShort.ToLowerInvariant();
// If the merge message doesn't mention any main branch name, count it as a likely cross-merge
if (!mainBranchNames.Any(b => msg.Contains(b)))
{
crossMerges++;
}
}
// Determine topology
var mergeRatio = commits.Count > 0 ? (double)mergeCommits.Count / commits.Count : 0;
var crossMergeRatio = mergeCommits.Count > 0 ? (double)crossMerges / mergeCommits.Count : 0;
var topology = DetermineTopology(mergeRatio, crossMergeRatio, staleBranches.Count);
return new BranchComplexityMetrics
{
TotalBranches = branches.Count, // Only count local branches
ActiveBranches = activeBranches,
StaleBranches = staleBranches.Count,
CrossMerges = crossMerges,
AverageBranchAge = 0, // Not computed; would require walking each branch's full history
AverageBranchLength = 0, // Not computed
LongLivedBranches = 0, // Not computed
Topology = topology,
StaleBranchNames = staleBranches
};
}
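/// <summary>
/// Maps the merge ratio, cross-merge ratio, and stale-branch count to a <see cref="BranchTopologyType"/>.
/// </summary>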
private BranchTopologyType DetermineTopology(double mergeRatio, double crossMergeRatio, int staleBranches)
{
if (mergeRatio < 0.1)
return BranchTopologyType.Linear;
if (crossMergeRatio > 0.5 || (mergeRatio > 0.5 && staleBranches > 5))
return BranchTopologyType.Spaghetti;
if (crossMergeRatio > 0.2 || mergeRatio > 0.4)
return BranchTopologyType.Tangled;
if (mergeRatio > 0.2)
return BranchTopologyType.GitFlow;
return BranchTopologyType.Balanced;
}
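/// <summary>
/// Scores every non-merge commit message with the commit analyzer and summarizes the
/// distribution: quality buckets, average, median, standard deviation, and trend.
/// </summary>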
private MessageQualityDistribution AnalyzeMessageQuality(
List<Commit> commits,
IProgress<AnalysisProgress>? progress,
int totalCommits)
{
var scores = new List<(string Hash, int Score, DateTimeOffset Date)>();
var poorCommits = new List<string>();
var processed = 0;
foreach (var commit in commits)
{
// Skip merge commits for message quality
if (commit.Parents.Count() > 1) continue;
var analysis = _commitAnalyzer.Analyze(commit.Message);
scores.Add((commit.Sha, analysis.OverallScore, commit.Author.When));
if (analysis.OverallScore < 50)
{
poorCommits.Add(commit.Sha);
}
processed++;
if (processed % 100 == 0)
{
progress?.Report(new AnalysisProgress
{
CurrentStage = Str.Health_AnalyzingMessages,
PercentComplete = 60 + (int)(30.0 * processed / totalCommits),
CommitsProcessed = processed,
TotalCommits = totalCommits
});
}
}
if (scores.Count == 0)
{
return CreateEmptyMessageDistribution(totalCommits);
}
var sortedScores = scores.Select(s => s.Score).OrderBy(s => s).ToList();
var avg = sortedScores.Average();
var median = sortedScores[sortedScores.Count / 2];
// Calculate standard deviation
var sumSquaredDiff = sortedScores.Sum(s => Math.Pow(s - avg, 2));
var stdDev = Math.Sqrt(sumSquaredDiff / sortedScores.Count);
// Determine trend by comparing the newer half against the older half (commits are newest-first)
var trend = TrendDirection.Stable;
if (scores.Count >= 2)
{
var midpoint = scores.Count / 2;
var olderAvg = scores.Skip(midpoint).Average(s => s.Score);
var newerAvg = scores.Take(midpoint).Average(s => s.Score);
trend = newerAvg > olderAvg + 5 ? TrendDirection.Improving
: newerAvg < olderAvg - 5 ? TrendDirection.Declining
: TrendDirection.Stable;
}
return new MessageQualityDistribution
{
TotalCommits = scores.Count,
Excellent = scores.Count(s => s.Score >= 90),
Good = scores.Count(s => s.Score >= 70 && s.Score < 90),
Fair = scores.Count(s => s.Score >= 50 && s.Score < 70),
Poor = scores.Count(s => s.Score < 50),
AverageScore = avg,
MedianScore = median,
StandardDeviation = stdDev,
Trend = trend,
Clusters = [], // Could add cluster detection
PoorCommitHashes = poorCommits
};
}
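/// <summary>
/// Aggregates per-author commit counts, average message quality, and merge counts,
/// consolidating identities that share a normalized email or name, and tallies
/// missing/invalid emails and bot-authored commits.
/// </summary>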
private AuthorshipMetrics AnalyzeAuthorship(List<Commit> commits)
{
// Track running totals for calculating averages
var authorData = new Dictionary<string, (string Name, string Email, int CommitCount, double TotalQuality, int MergeCount)>();
var emailToCanonical = new Dictionary<string, string>(); // Maps raw email to canonical key
var nameToCanonical = new Dictionary<string, string>(); // Maps normalized name to canonical key
var missingEmail = 0;
var invalidEmail = 0;
var botCommits = 0;
var botPatterns = new[] { "[bot]", "dependabot", "renovate", "github-actions", "noreply" };
foreach (var commit in commits)
{
var email = commit.Author.Email ?? "";
var name = commit.Author.Name ?? "Unknown";
var normalizedEmail = NormalizeEmail(email);
var normalizedName = NormalizeName(name);
if (string.IsNullOrWhiteSpace(email))
{
missingEmail++;
}
else if (!email.Contains('@'))
{
invalidEmail++;
}
var isMerge = commit.Parents.Count() > 1;
var isBot = botPatterns.Any(p =>
name.Contains(p, StringComparison.OrdinalIgnoreCase) ||
email.Contains(p, StringComparison.OrdinalIgnoreCase));
if (isBot) botCommits++;
// Analyze message quality for this commit
var quality = _commitAnalyzer.Analyze(commit.Message);
var qualityScore = quality.OverallScore;
// Find or create canonical key for this author
var canonicalKey = FindOrCreateCanonicalKey(
normalizedEmail, normalizedName, name, email,
emailToCanonical, nameToCanonical, authorData);
if (!authorData.TryGetValue(canonicalKey, out var data))
{
data = (name, email, 0, 0, 0);
}
// Update running totals
authorData[canonicalKey] = (
data.Name,
data.Email,
data.CommitCount + 1,
data.TotalQuality + qualityScore,
data.MergeCount + (isMerge ? 1 : 0)
);
}
// Convert to AuthorStats with calculated averages
var authorStats = authorData.ToDictionary(
kvp => kvp.Key,
kvp => new AuthorStats
{
Name = kvp.Value.Name,
Email = kvp.Value.Email,
CommitCount = kvp.Value.CommitCount,
AverageMessageQuality = kvp.Value.CommitCount > 0
? kvp.Value.TotalQuality / kvp.Value.CommitCount
: 0,
MergeCommitCount = kvp.Value.MergeCount
});
return new AuthorshipMetrics
{
TotalAuthors = authorStats.Count,
TotalCommits = commits.Count,
MissingEmailCount = missingEmail,
InvalidEmailCount = invalidEmail,
BotCommits = botCommits,
AuthorBreakdown = authorStats
};
}
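/// <summary>
/// Resolves an author to a canonical key: reuses an existing key when the normalized
/// email or name has been seen before, otherwise registers a new key (email preferred over name).
/// </summary>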
private static string FindOrCreateCanonicalKey<T>(
string normalizedEmail,
string normalizedName,
string rawName,
string rawEmail,
Dictionary<string, string> emailToCanonical,
Dictionary<string, string> nameToCanonical,
Dictionary<string, T> authorData)
{
// First, check if we've seen this exact email before
if (!string.IsNullOrEmpty(normalizedEmail) && emailToCanonical.TryGetValue(normalizedEmail, out var existingKey))
{
return existingKey;
}
// Next, check if we've seen this name before (for matching when emails differ)
if (!string.IsNullOrEmpty(normalizedName) && nameToCanonical.TryGetValue(normalizedName, out existingKey))
{
// Also map this email to the same canonical key
if (!string.IsNullOrEmpty(normalizedEmail))
{
emailToCanonical[normalizedEmail] = existingKey;
}
return existingKey;
}
// Create new canonical key - prefer email if valid, otherwise use name
var canonicalKey = !string.IsNullOrEmpty(normalizedEmail) ? normalizedEmail : normalizedName;
// Register mappings
if (!string.IsNullOrEmpty(normalizedEmail))
{
emailToCanonical[normalizedEmail] = canonicalKey;
}
if (!string.IsNullOrEmpty(normalizedName))
{
nameToCanonical[normalizedName] = canonicalKey;
}
return canonicalKey;
}
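/// <summary>
/// Lowercases the email, strips "+" aliases, and collapses GitHub noreply addresses
/// (e.g. 12345678+user@users.noreply.github.com) to "user@github".
/// </summary>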
private static string NormalizeEmail(string email)
{
if (string.IsNullOrWhiteSpace(email)) return "";
email = email.ToLowerInvariant().Trim();
// Remove + aliases (e.g., john+test@gmail.com -> john@gmail.com)
var atIndex = email.IndexOf('@');
if (atIndex > 0)
{
var plusIndex = email.IndexOf('+');
if (plusIndex > 0 && plusIndex < atIndex)
{
email = email[..plusIndex] + email[atIndex..];
}
}
// Normalize common noreply patterns
if (email.Contains("noreply") || email.Contains("no-reply"))
{
// Extract username from GitHub noreply format: 12345678+username@users.noreply.github.com
var match = System.Text.RegularExpressions.Regex.Match(email, @"\d+\+([^@]+)@users\.noreply\.github\.com");
if (match.Success)
{
return match.Groups[1].Value + "@github";
}
}
return email;
}
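/// <summary>
/// Lowercases and trims the name, strips a "[bot]" marker, and collapses runs of whitespace.
/// </summary>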
private static string NormalizeName(string name)
{
if (string.IsNullOrWhiteSpace(name)) return "";
// Lowercase and trim
name = name.ToLowerInvariant().Trim();
// Remove common suffixes/prefixes
name = name.Replace("[bot]", "").Trim();
// Normalize whitespace
name = System.Text.RegularExpressions.Regex.Replace(name, @"\s+", " ");
return name;
}
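/// <summary>
/// Lowercases, trims, and flattens line breaks so short messages compare consistently.
/// </summary>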
private static string NormalizeMessage(string message)
{
return message
.ToLowerInvariant()
.Trim()
.Replace("\r", "")
.Replace("\n", " ");
}
private static DuplicateCommitMetrics CreateEmptyDuplicateMetrics(int totalCommits) => new()
{
TotalCommitsAnalyzed = totalCommits,
TotalDuplicateGroups = 0,
TotalDuplicateInstances = 0,
ExactDuplicates = 0,
CherryPicks = 0,
FuzzyMatches = 0,
DuplicateGroups = []
};
private static BranchComplexityMetrics CreateEmptyBranchMetrics() => new()
{
TotalBranches = 0,
ActiveBranches = 0,
StaleBranches = 0,
CrossMerges = 0,
AverageBranchAge = 0,
AverageBranchLength = 0,
LongLivedBranches = 0,
Topology = BranchTopologyType.Linear,
StaleBranchNames = []
};
private static MessageQualityDistribution CreateEmptyMessageDistribution(int totalCommits) => new()
{
TotalCommits = totalCommits,
Excellent = 0,
Good = 0,
Fair = 0,
Poor = 0,
AverageScore = 0,
MedianScore = 0,
StandardDeviation = 0,
Trend = TrendDirection.Stable,
Clusters = [],
PoorCommitHashes = []
};
}