using Microsoft.SharePoint.Client;
using Microsoft.SharePoint.Client.Search.Query;
using SharepointToolbox.Core.Helpers;
using SharepointToolbox.Core.Models;

namespace SharepointToolbox.Services;

/// <summary>
/// Duplicate file and folder detection.
/// Files: Search API (same KQL engine as SearchService) + client-side composite key grouping.
/// Folders: CSOM CAML FSObjType=1 via SharePointPaginationHelper + composite key grouping.
/// Port of PS Find-DuplicateFiles / Find-DuplicateFolders (PS lines 4942-5036).
/// </summary>
public class DuplicatesService : IDuplicatesService
{
    // Search API page size (500 is the service-side maximum RowLimit per request).
    private const int BatchSize = 500;

    // Hard pagination ceiling so a misbehaving index can never loop indefinitely.
    private const int MaxStartRow = 50_000;

    /// <summary>
    /// Collects candidate items (folders when <c>options.Mode == "Folders"</c>, files otherwise),
    /// groups them by a composite duplicate key, and returns only groups with two or more
    /// members, ordered by group size descending, then name.
    /// </summary>
    /// <param name="ctx">Connected client context for the target site.</param>
    /// <param name="options">Scan mode, optional library scope, and which key parts to match.</param>
    /// <param name="progress">Receives human-readable progress updates.</param>
    /// <param name="ct">Cancellation token, honored between batches.</param>
    public async Task<List<DuplicateGroup>> ScanDuplicatesAsync(
        ClientContext ctx,
        DuplicateScanOptions options,
        IProgress<OperationProgress> progress,
        CancellationToken ct)
    {
        ct.ThrowIfCancellationRequested();

        List<DuplicateItem> allItems;

        if (options.Mode == "Folders")
            allItems = await CollectFolderItemsAsync(ctx, options, progress, ct);
        else
            allItems = await CollectFileItemsAsync(ctx, options, progress, ct);

        progress.Report(OperationProgress.Indeterminate($"Grouping {allItems.Count:N0} items by duplicate key\u2026"));

        var groups = allItems
            .GroupBy(item => MakeKey(item, options))
            .Where(g => g.Count() >= 2)                 // only actual duplicates
            .Select(g => new DuplicateGroup
            {
                GroupKey = g.Key,
                Name = g.First().Name,
                Items = g.ToList()
            })
            .OrderByDescending(g => g.Items.Count)
            .ThenBy(g => g.Name)
            .ToList();

        return groups;
    }

    // ── File collection via Search API ────────────────────────────────────────

    /// <summary>
    /// Pages through the Search API (KQL <c>ContentType:Document</c>, optionally scoped to one
    /// library path) and maps each relevant-results row to a <see cref="DuplicateItem"/>.
    /// Version-history hits (<c>/_vti_history/</c>) are skipped.
    /// </summary>
    private static async Task<List<DuplicateItem>> CollectFileItemsAsync(
        ClientContext ctx,
        DuplicateScanOptions options,
        IProgress<OperationProgress> progress,
        CancellationToken ct)
    {
        // KQL: all documents, optionally scoped to a library
        var kqlParts = new List<string> { "ContentType:Document" };
        if (!string.IsNullOrEmpty(options.Library))
            kqlParts.Add($"Path:\"{ctx.Url.TrimEnd('/')}/{options.Library.TrimStart('/')}*\"");
        string kql = string.Join(" AND ", kqlParts);

        var allItems = new List<DuplicateItem>();
        int startRow = 0;

        do
        {
            ct.ThrowIfCancellationRequested();

            var kq = new KeywordQuery(ctx)
            {
                QueryText = kql,
                StartRow = startRow,
                RowLimit = BatchSize,
                TrimDuplicates = false  // we do our own grouping; keep every hit
            };
            foreach (var prop in new[] { "Title", "Path", "FileExtension", "Created",
                                         "LastModifiedTime", "Size", "ParentLink" })
                kq.SelectProperties.Add(prop);

            var executor = new SearchExecutor(ctx);
            ClientResult<ResultTableCollection> clientResult = executor.ExecuteQuery(kq);
            await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

            var table = clientResult.Value
                .FirstOrDefault(t => t.TableType == KnownTableTypes.RelevantResults);
            if (table == null || table.RowCount == 0) break;

            // NOTE(review): assumes ResultRows yields Hashtable instances — confirm against
            // the CSOM package version in use (newer versions expose IDictionary<string, object>).
            foreach (System.Collections.Hashtable row in table.ResultRows)
            {
                var dict = row.Cast<System.Collections.DictionaryEntry>()
                    .ToDictionary(e => e.Key.ToString()!, e => e.Value ?? (object)string.Empty);

                string path = GetStr(dict, "Path");
                if (path.Contains("/_vti_history/", StringComparison.OrdinalIgnoreCase))
                    continue;  // version-history copies are not user-facing duplicates

                // Path.GetFileName handles '/' separators, so it works on URLs too.
                string name = System.IO.Path.GetFileName(path);
                if (string.IsNullOrEmpty(name))
                    name = GetStr(dict, "Title");

                // Size may arrive with grouping separators or other noise; keep digits only.
                string raw = GetStr(dict, "Size");
                string digits = System.Text.RegularExpressions.Regex.Replace(raw, "[^0-9]", "");
                long size = long.TryParse(digits, out var sv) ? sv : 0L;

                DateTime? created = ParseDate(GetStr(dict, "Created"));
                DateTime? modified = ParseDate(GetStr(dict, "LastModifiedTime"));

                // Derive library from path segments
                string library = ExtractLibraryFromPath(path, ctx.Url);

                allItems.Add(new DuplicateItem
                {
                    Name = name,
                    Path = path,
                    Library = library,
                    SizeBytes = size,
                    Created = created,
                    Modified = modified
                });
            }

            progress.Report(new OperationProgress(allItems.Count, MaxStartRow,
                $"Collected {allItems.Count:N0} files\u2026"));

            // A short page means the index is drained — skip the guaranteed-empty extra round-trip.
            if (table.RowCount < BatchSize) break;

            startRow += BatchSize;
        }
        while (startRow <= MaxStartRow);

        return allItems;
    }

    // ── Folder collection via CAML ────────────────────────────────────────────

    /// <summary>
    /// Enumerates every visible document library (or just <c>options.Library</c> when set) and
    /// collects all folders (FSObjType=1) via paged CAML, mapping each to a
    /// <see cref="DuplicateItem"/> with subfolder/file child counts.
    /// </summary>
    private static async Task<List<DuplicateItem>> CollectFolderItemsAsync(
        ClientContext ctx,
        DuplicateScanOptions options,
        IProgress<OperationProgress> progress,
        CancellationToken ct)
    {
        // Load all document libraries on the site
        ctx.Load(ctx.Web,
            w => w.Lists.Include(
                l => l.Title, l => l.Hidden, l => l.BaseType));
        await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

        var libs = ctx.Web.Lists
            .Where(l => !l.Hidden && l.BaseType == BaseType.DocumentLibrary)
            .ToList();

        // Filter to specific library if requested
        if (!string.IsNullOrEmpty(options.Library))
        {
            libs = libs
                .Where(l => l.Title.Equals(options.Library, StringComparison.OrdinalIgnoreCase))
                .ToList();
        }

        // FSObjType=1 selects folders only; RecursiveAll walks the whole library tree.
        var camlQuery = new CamlQuery
        {
            ViewXml = """
                <View Scope="RecursiveAll">
                    <Query>
                        <Where>
                            <Eq>
                                <FieldRef Name="FSObjType" />
                                <Value Type="Integer">1</Value>
                            </Eq>
                        </Where>
                    </Query>
                    <RowLimit Paged="TRUE">2000</RowLimit>
                </View>
                """
        };

        var allItems = new List<DuplicateItem>();

        foreach (var lib in libs)
        {
            ct.ThrowIfCancellationRequested();
            progress.Report(OperationProgress.Indeterminate($"Scanning folders in {lib.Title}\u2026"));

            await foreach (var item in SharePointPaginationHelper.GetAllItemsAsync(ctx, lib, camlQuery, ct))
            {
                ct.ThrowIfCancellationRequested();

                var fv = item.FieldValues;
                string name = fv["FileLeafRef"]?.ToString() ?? string.Empty;
                string fileRef = fv["FileRef"]?.ToString() ?? string.Empty;
                // NOTE(review): Convert.ToInt32 throws on lookup-formatted values ("1;#…") —
                // presumed plain integers here; verify against the tenant's field schema.
                int subCount = Convert.ToInt32(fv["FolderChildCount"] ?? 0);
                int childCount = Convert.ToInt32(fv["ItemChildCount"] ?? 0);
                int fileCount = Math.Max(0, childCount - subCount);
                DateTime? created = fv["Created"] is DateTime cr ? cr : (DateTime?)null;
                DateTime? modified = fv["Modified"] is DateTime md ? md : (DateTime?)null;

                allItems.Add(new DuplicateItem
                {
                    Name = name,
                    Path = fileRef,
                    Library = lib.Title,
                    FolderCount = subCount,
                    FileCount = fileCount,
                    Created = created,
                    Modified = modified
                });
            }
        }

        return allItems;
    }

    // ── Composite key builder (matches test scaffold in DuplicatesServiceTests) ──

    /// <summary>
    /// Builds the '|'-joined grouping key: lower-cased name plus whichever optional
    /// attributes the options enable (size, created/modified date, subfolder/file counts).
    /// Attributes enabled but absent on the item are simply omitted from the key.
    /// </summary>
    internal static string MakeKey(DuplicateItem item, DuplicateScanOptions opts)
    {
        var parts = new List<string> { item.Name.ToLowerInvariant() };
        if (opts.MatchSize && item.SizeBytes.HasValue) parts.Add(item.SizeBytes.Value.ToString());
        if (opts.MatchCreated && item.Created.HasValue) parts.Add(item.Created.Value.Date.ToString("yyyy-MM-dd"));
        if (opts.MatchModified && item.Modified.HasValue) parts.Add(item.Modified.Value.Date.ToString("yyyy-MM-dd"));
        if (opts.MatchSubfolderCount && item.FolderCount.HasValue) parts.Add(item.FolderCount.Value.ToString());
        if (opts.MatchFileCount && item.FileCount.HasValue) parts.Add(item.FileCount.Value.ToString());
        return string.Join("|", parts);
    }

    // ── Private utilities ─────────────────────────────────────────────────────

    /// <summary>Safe dictionary lookup; missing keys and null values become "".</summary>
    private static string GetStr(IDictionary<string, object> r, string key) =>
        r.TryGetValue(key, out var v) ? v?.ToString() ?? string.Empty : string.Empty;

    // NOTE(review): culture-sensitive parse — search dates are presumably ISO-8601, in which
    // case this is fine, but confirm; otherwise pass CultureInfo.InvariantCulture.
    private static DateTime? ParseDate(string s) =>
        DateTime.TryParse(s, out var dt) ? dt : (DateTime?)null;

    /// <summary>
    /// Extracts the first path segment after the site URL as the library name, e.g.
    /// https://tenant.sharepoint.com/sites/MySite/Shared Documents/file.docx -> "Shared Documents".
    /// If <paramref name="path"/> does not start with <paramref name="siteUrl"/>, the first
    /// segment of the raw path is returned as a best-effort fallback.
    /// </summary>
    private static string ExtractLibraryFromPath(string path, string siteUrl)
    {
        if (string.IsNullOrEmpty(path) || string.IsNullOrEmpty(siteUrl))
            return string.Empty;

        string relative = path.StartsWith(siteUrl.TrimEnd('/'), StringComparison.OrdinalIgnoreCase)
            ? path.Substring(siteUrl.TrimEnd('/').Length).TrimStart('/')
            : path;

        int slash = relative.IndexOf('/');
        return slash > 0 ? relative.Substring(0, slash) : relative;
    }
}