using System.Diagnostics;
using Microsoft.SharePoint.Client;
using Microsoft.SharePoint.Client.Search.Query;
using SharepointToolbox.Core.Helpers;
using SharepointToolbox.Core.Models;
using SharepointToolbox.Services.Export;

namespace SharepointToolbox.Services;

/// <summary>
/// Duplicate file and folder detection.
/// Files: Search API (same KQL engine as SearchService) + client-side composite key grouping.
/// Folders: CSOM CAML FSObjType=1 via SharePointPaginationHelper + composite key grouping.
/// Port of PS Find-DuplicateFiles / Find-DuplicateFolders (PS lines 4942-5036).
/// </summary>
public class DuplicatesService : IDuplicatesService
{
    // SharePoint Search REST API caps RowLimit at 500 per request; larger values are silently clamped.
    private const int BatchSize = 500;

    // SharePoint Search hard ceiling — StartRow > 50,000 returns an error regardless of pagination state.
    // See https://learn.microsoft.com/sharepoint/dev/general-development/customizing-search-results-in-sharepoint
    private const int MaxStartRow = 50_000;

    /// <summary>
    /// Scans a site for duplicate files or folders and groups matches by the
    /// composite key configured in <see cref="DuplicateScanOptions"/> (name plus any
    /// of size / created / modified / subfolder-count / file-count).
    /// File mode uses the SharePoint Search API — it is fast but capped at
    /// 50,000 rows (see <see cref="MaxStartRow"/>). Folder mode uses paginated
    /// CSOM CAML over every document library on the site. Groups with fewer
    /// than two items are dropped before return.
    /// </summary>
    /// <param name="ctx">Authenticated <see cref="ClientContext"/> for the target site.</param>
    /// <param name="options">Scope (Files/Folders), optional library filter, and match-key toggles.</param>
    /// <param name="progress">Receives row-count progress during collection.</param>
    /// <param name="ct">Cancellation token — honoured between paged requests.</param>
    /// <returns>Duplicate groups ordered by descending size, then name.</returns>
public async Task<List<DuplicateGroup>> ScanDuplicatesAsync(
    ClientContext ctx, DuplicateScanOptions options, IProgress<OperationProgress> progress, CancellationToken ct)
{
    ct.ThrowIfCancellationRequested();

    // Two collection strategies share one grouping pipeline below.
    List<DuplicateItem> allItems;
    if (options.Mode == "Folders")
        allItems = await CollectFolderItemsAsync(ctx, options, progress, ct);
    else
        allItems = await CollectFileItemsAsync(ctx, options, progress, ct);

    progress.Report(OperationProgress.Indeterminate($"Grouping {allItems.Count:N0} items by duplicate key\u2026"));

    var groups = allItems
        .GroupBy(item => MakeKey(item, options))
        .Where(g => g.Count() >= 2)   // singletons are not duplicates
        .Select(g =>
        {
            var items = g.ToList();
            var name = items[0].Name;
            // Distinct library names are appended to the group label so the user
            // can see cross-library duplicates at a glance.
            var libraries = items
                .Select(i => i.Library)
                .Where(l => !string.IsNullOrEmpty(l))
                .Distinct(StringComparer.OrdinalIgnoreCase)
                .OrderBy(l => l, StringComparer.OrdinalIgnoreCase)
                .ToList();
            return new DuplicateGroup
            {
                GroupKey = g.Key,
                Name = libraries.Count > 0 ? $"{name} ({string.Join(", ", libraries)})" : name,
                Items = items
            };
        })
        .OrderByDescending(g => g.Items.Count)   // biggest duplicate groups first
        .ThenBy(g => g.Name)
        .ToList();

    return groups;
}

// ── File collection via Search API ────────────────────────────────────────

/// <summary>
/// Collects candidate files via the SharePoint Search API, paging in
/// <see cref="BatchSize"/> chunks up to the <see cref="MaxStartRow"/> ceiling.
/// Version-history paths (/_vti_history/) are skipped. Returns one
/// <see cref="DuplicateItem"/> per result row.
/// </summary>
private static async Task<List<DuplicateItem>> CollectFileItemsAsync(
    ClientContext ctx, DuplicateScanOptions options, IProgress<OperationProgress> progress, CancellationToken ct)
{
    var (siteUrl, siteTitle) = await LoadSiteIdentityAsync(ctx, progress, ct);

    // KQL: all documents, optionally scoped to a library
    var kqlParts = new List<string> { "ContentType:Document" };
    if (!string.IsNullOrEmpty(options.Library))
        kqlParts.Add($"Path:\"{ctx.Url.TrimEnd('/')}/{options.Library.TrimStart('/')}*\"");
    string kql = string.Join(" AND ", kqlParts);

    var allItems = new List<DuplicateItem>();
    int startRow = 0;

    do
    {
        ct.ThrowIfCancellationRequested();

        var kq = new KeywordQuery(ctx)
        {
            QueryText = kql,
            StartRow = startRow,
            RowLimit = BatchSize,
            // Search would otherwise collapse near-identical documents — the
            // exact opposite of what a duplicate scan needs.
            TrimDuplicates = false
        };
        foreach (var prop in new[] { "Title", "Path", "FileExtension", "Created", "LastModifiedTime", "Size", "ParentLink" })
            kq.SelectProperties.Add(prop);

        var executor = new SearchExecutor(ctx);
        ClientResult<ResultTableCollection> clientResult = executor.ExecuteQuery(kq);
        await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

        var table = clientResult.Value
            .FirstOrDefault(t => t.TableType == KnownTableTypes.RelevantResults);
        if (table == null || table.RowCount == 0)
            break;

        foreach (var rawRow in table.ResultRows)
        {
            // CSOM has returned ResultRows as either Hashtable or
            // Dictionary<string, object> across versions — accept both.
            IDictionary<string, object> dict;
            if (rawRow is IDictionary<string, object> generic)
            {
                dict = generic;
            }
            else if (rawRow is System.Collections.IDictionary legacy)
            {
                dict = new Dictionary<string, object>();
                foreach (System.Collections.DictionaryEntry e in legacy)
                    dict[e.Key.ToString()!] = e.Value ?? string.Empty;
            }
            else
            {
                continue;
            }

            string path = GetStr(dict, "Path");
            // Old version copies live under /_vti_history/ — not real duplicates.
            if (path.Contains("/_vti_history/", StringComparison.OrdinalIgnoreCase))
                continue;

            string name = System.IO.Path.GetFileName(path);
            if (string.IsNullOrEmpty(name)) name = GetStr(dict, "Title");

            // Search "Size" is sometimes formatted with separators — keep digits only.
            string raw = GetStr(dict, "Size");
            string digits = System.Text.RegularExpressions.Regex.Replace(raw, "[^0-9]", "");
            long size = long.TryParse(digits, out var sv) ? sv : 0L;

            DateTime? created = ParseDate(GetStr(dict, "Created"));
            DateTime? modified = ParseDate(GetStr(dict, "LastModifiedTime"));

            // Derive library from path segments
            string library = ExtractLibraryFromPath(path, ctx.Url);

            allItems.Add(new DuplicateItem
            {
                Name = name,
                Path = path,
                Library = library,
                SizeBytes = size,
                Created = created,
                Modified = modified,
                SiteUrl = siteUrl,
                SiteTitle = siteTitle
            });
        }

        progress.Report(new OperationProgress(allItems.Count, MaxStartRow, $"Collected {allItems.Count:N0} files\u2026"));
        startRow += BatchSize;
    } while (startRow <= MaxStartRow);

    return allItems;
}

// ── Folder collection via CAML ────────────────────────────────────────────

/// <summary>
/// Collects every folder in every (or one requested) document library via
/// paginated CAML, filtering to folders client-side. Returns one
/// <see cref="DuplicateItem"/> per folder with child-count metadata.
/// </summary>
private static async Task<List<DuplicateItem>> CollectFolderItemsAsync(
    ClientContext ctx, DuplicateScanOptions options, IProgress<OperationProgress> progress, CancellationToken ct)
{
    // Load all document libraries on the site
    ctx.Load(ctx.Web, w => w.Title, w => w.Lists.Include(
        l => l.Title,
        l => l.Hidden,
        l => l.BaseType));
    await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

    var siteUrl = ctx.Url;
    var siteTitle = string.IsNullOrWhiteSpace(ctx.Web.Title)
        ? ReportSplitHelper.DeriveSiteLabel(siteUrl)
        : ctx.Web.Title;

    var libs = ctx.Web.Lists
        .Where(l => !l.Hidden && l.BaseType == BaseType.DocumentLibrary)
        .ToList();

    // Filter to specific library if requested
    if (!string.IsNullOrEmpty(options.Library))
    {
        libs = libs
            .Where(l => l.Title.Equals(options.Library, StringComparison.OrdinalIgnoreCase))
            .ToList();
    }

    // No WHERE clause — a WHERE on non-indexed fields (FSObjType) throws the
    // list-view threshold on libraries > 5,000 items even with pagination.
    // Filter for folders client-side via FileSystemObjectType below.
var camlQuery = new CamlQuery { ViewXml = """ 5000 """ }; var allItems = new List(); foreach (var lib in libs) { ct.ThrowIfCancellationRequested(); progress.Report(OperationProgress.Indeterminate($"Scanning folders in {lib.Title}\u2026")); await foreach (var item in SharePointPaginationHelper.GetAllItemsAsync(ctx, lib, camlQuery, ct)) { ct.ThrowIfCancellationRequested(); if (item.FileSystemObjectType != FileSystemObjectType.Folder) continue; var fv = item.FieldValues; string name = fv["FileLeafRef"]?.ToString() ?? string.Empty; string fileRef = fv["FileRef"]?.ToString() ?? string.Empty; int subCount = Convert.ToInt32(fv["FolderChildCount"] ?? 0); int childCount = Convert.ToInt32(fv["ItemChildCount"] ?? 0); int fileCount = Math.Max(0, childCount - subCount); DateTime? created = fv["Created"] is DateTime cr ? cr : (DateTime?)null; DateTime? modified = fv["Modified"] is DateTime md ? md : (DateTime?)null; allItems.Add(new DuplicateItem { Name = name, Path = fileRef, Library = lib.Title, FolderCount = subCount, FileCount = fileCount, Created = created, Modified = modified, SiteUrl = siteUrl, SiteTitle = siteTitle }); } } return allItems; } // ── Composite key builder (matches test scaffold in DuplicatesServiceTests) ── internal static string MakeKey(DuplicateItem item, DuplicateScanOptions opts) { var parts = new List { item.Name.ToLowerInvariant() }; if (opts.MatchSize && item.SizeBytes.HasValue) parts.Add(item.SizeBytes.Value.ToString()); if (opts.MatchCreated && item.Created.HasValue) parts.Add(item.Created.Value.Date.ToString("yyyy-MM-dd")); if (opts.MatchModified && item.Modified.HasValue) parts.Add(item.Modified.Value.Date.ToString("yyyy-MM-dd")); if (opts.MatchSubfolderCount && item.FolderCount.HasValue) parts.Add(item.FolderCount.Value.ToString()); if (opts.MatchFileCount && item.FileCount.HasValue) parts.Add(item.FileCount.Value.ToString()); return string.Join("|", parts); } // ── Private utilities ───────────────────────────────────────────────────── private 
static string GetStr(IDictionary r, string key) => r.TryGetValue(key, out var v) ? v?.ToString() ?? string.Empty : string.Empty; private static DateTime? ParseDate(string s) => DateTime.TryParse(s, out var dt) ? dt : (DateTime?)null; private static async Task<(string Url, string Title)> LoadSiteIdentityAsync( ClientContext ctx, IProgress progress, CancellationToken ct) { try { ctx.Load(ctx.Web, w => w.Title); await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct); } catch (OperationCanceledException) { throw; } catch (Exception ex) { // Best-effort — fall back to URL-derived label Debug.WriteLine($"[DuplicatesService] LoadSiteIdentityAsync: failed to load Web.Title: {ex.GetType().Name}: {ex.Message}"); } var url = ctx.Url ?? string.Empty; string title; try { title = ctx.Web.Title; } catch (Exception ex) { Debug.WriteLine($"[DuplicatesService] LoadSiteIdentityAsync: Web.Title getter threw: {ex.GetType().Name}: {ex.Message}"); title = string.Empty; } if (string.IsNullOrWhiteSpace(title)) title = ReportSplitHelper.DeriveSiteLabel(url); return (url, title); } private static string ExtractLibraryFromPath(string path, string siteUrl) { // Extract first path segment after the site URL as library name // e.g. https://tenant.sharepoint.com/sites/MySite/Shared Documents/file.docx -> "Shared Documents" if (string.IsNullOrEmpty(path) || string.IsNullOrEmpty(siteUrl)) return string.Empty; string relative = path.StartsWith(siteUrl.TrimEnd('/'), StringComparison.OrdinalIgnoreCase) ? path.Substring(siteUrl.TrimEnd('/').Length).TrimStart('/') : path; int slash = relative.IndexOf('/'); return slash > 0 ? relative.Substring(0, slash) : relative; } }