--- phase: 03 plan: 04 title: SearchService and DuplicatesService — KQL Pagination and Duplicate Grouping status: pending wave: 2 depends_on: - 03-01 files_modified: - SharepointToolbox/Services/SearchService.cs - SharepointToolbox/Services/DuplicatesService.cs autonomous: true requirements: - SRCH-01 - SRCH-02 - DUPL-01 - DUPL-02 must_haves: truths: - "SearchService implements ISearchService and builds KQL from all SearchOptions fields (extension, dates, creator, editor, library)" - "SearchService paginates StartRow += 500 and stops when StartRow > 50,000 (platform cap) or MaxResults reached" - "SearchService filters out _vti_history/ paths from results" - "SearchService applies client-side Regex filter when SearchOptions.Regex is non-empty" - "DuplicatesService implements IDuplicatesService for both Mode=Files (Search API) and Mode=Folders (CAML FSObjType=1)" - "DuplicatesService groups items by MakeKey composite key and returns only groups with count >= 2" - "All CSOM round-trips use ExecuteQueryRetryHelper.ExecuteQueryRetryAsync" - "Folder enumeration uses SharePointPaginationHelper.GetAllItemsAsync with FSObjType=1 CAML" artifacts: - path: "SharepointToolbox/Services/SearchService.cs" provides: "KQL search engine with pagination (SRCH-01/02)" exports: ["SearchService"] - path: "SharepointToolbox/Services/DuplicatesService.cs" provides: "Duplicate detection for files and folders (DUPL-01/02)" exports: ["DuplicatesService"] key_links: - from: "SearchService.cs" to: "KeywordQuery + SearchExecutor" via: "Microsoft.SharePoint.Client.Search.Query" pattern: "KeywordQuery" - from: "DuplicatesService.cs" to: "SharePointPaginationHelper.GetAllItemsAsync" via: "folder enumeration" pattern: "SharePointPaginationHelper\\.GetAllItemsAsync" - from: "DuplicatesService.cs" to: "MakeKey" via: "composite key grouping" pattern: "MakeKey" --- # Plan 03-04: SearchService and DuplicatesService — KQL Pagination and Duplicate Grouping ## Goal Implement `SearchService` (KQL-based file 
search with 500-row pagination and 50,000 hard cap) and `DuplicatesService` (file duplicates via Search API + folder duplicates via CAML `FSObjType=1`). Both services are wave 2 — they depend only on the models and interfaces from Plan 03-01, not on StorageService. ## Context `Microsoft.SharePoint.Client.Search.dll` is available as a transitive dependency of PnP.Framework 1.18.0. The namespace is `Microsoft.SharePoint.Client.Search.Query`. The search pattern requires calling `executor.ExecuteQuery(kq)` to register the query, then `ExecuteQueryRetryHelper.ExecuteQueryRetryAsync` to execute it — calling `ctx.ExecuteQuery()` directly afterward is incorrect and must be avoided. `DuplicatesService` for folders uses `SharePointPaginationHelper.GetAllItemsAsync` with `FSObjType=1` CAML. The CAML field name is `FSObjType` (not `FileSystemObjectType`) — using the wrong name returns zero results silently. The `MakeKey` composite key logic tested in Plan 03-01 `DuplicatesServiceTests` must match exactly what `DuplicatesService` implements. ## Tasks ### Task 1: Implement SearchService **File:** `SharepointToolbox/Services/SearchService.cs` **Action:** Create **Why:** SRCH-01 (multi-criteria search) and SRCH-02 (configurable max results up to 50,000). ```csharp using Microsoft.SharePoint.Client; using Microsoft.SharePoint.Client.Search.Query; using SharepointToolbox.Core.Helpers; using SharepointToolbox.Core.Models; using System.Text.RegularExpressions; namespace SharepointToolbox.Services; /// /// File search using SharePoint KQL Search API. /// Port of PS Search-SPOFiles pattern (PS lines 4747-4987). /// Pagination: 500 rows per batch, hard cap StartRow=50,000 (SharePoint Search boundary). 
/// </summary>
public class SearchService : ISearchService
{
    private const int BatchSize = 500;
    private const int MaxStartRow = 50_000;

    public async Task<List<SearchResult>> SearchFilesAsync(
        ClientContext ctx,
        SearchOptions options,
        IProgress<OperationProgress> progress,
        CancellationToken ct)
    {
        ct.ThrowIfCancellationRequested();

        string kql = BuildKql(options);
        ValidateKqlLength(kql);

        Regex? regexFilter = null;
        if (!string.IsNullOrWhiteSpace(options.Regex))
        {
            regexFilter = new Regex(options.Regex,
                RegexOptions.IgnoreCase | RegexOptions.Compiled,
                TimeSpan.FromSeconds(2));
        }

        var allResults = new List<SearchResult>();
        int startRow = 0;
        int maxResults = Math.Min(options.MaxResults, MaxStartRow);

        do
        {
            ct.ThrowIfCancellationRequested();

            var kq = new KeywordQuery(ctx)
            {
                QueryText = kql,
                StartRow = startRow,
                RowLimit = BatchSize,
                TrimDuplicates = false
            };
            kq.SelectProperties.AddRange(new[]
            {
                "Title", "Path", "Author", "LastModifiedTime",
                "FileExtension", "Created", "ModifiedBy", "Size"
            });

            var executor = new SearchExecutor(ctx);
            ClientResult<ResultTableCollection> clientResult = executor.ExecuteQuery(kq);
            await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

            // ResultTable.TableType is a string — compare against the enum's name
            var table = clientResult.Value
                .FirstOrDefault(t => t.TableType == KnownTableTypes.RelevantResults.ToString());
            if (table == null || table.RowCount == 0) break;

            foreach (System.Collections.Hashtable row in table.ResultRows)
            {
                var dict = row.Cast<DictionaryEntry>()
                    .ToDictionary(e => e.Key.ToString()!, e => e.Value ??
(object)string.Empty); // Skip SharePoint version history paths string path = Str(dict, "Path"); if (path.Contains("/_vti_history/", StringComparison.OrdinalIgnoreCase)) continue; var result = ParseRow(dict); // Client-side Regex filter on file name if (regexFilter != null) { string fileName = System.IO.Path.GetFileName(result.Path); if (!regexFilter.IsMatch(fileName) && !regexFilter.IsMatch(result.Title)) continue; } allResults.Add(result); if (allResults.Count >= maxResults) goto done; } progress.Report(new OperationProgress(allResults.Count, maxResults, $"Retrieved {allResults.Count:N0} results…")); startRow += BatchSize; } while (startRow <= MaxStartRow && allResults.Count < maxResults); done: return allResults; } // ── Extension point: bypassing the 50,000-item cap ─────────────────────── // // The StartRow approach has a hard ceiling at 50,000 (SharePoint Search boundary). // To go beyond it, replace the StartRow loop with a DocId cursor: // // 1. Add "DocId" to SelectProperties. // 2. Add query.SortList.Add("DocId", SortDirection.Ascending). // 3. First page KQL: unchanged. // Subsequent pages: append "AND DocId>{lastDocId}" to the KQL (StartRow stays 0). // 4. Track lastDocId = Convert.ToInt64(lastRow["DocId"]) after each batch. // 5. Stop when batch.RowCount < BatchSize. // // Caveats: // - DocId is per-site-collection; for multi-site searches, maintain a separate // cursor per ClientContext (site URL). // - The search index can shift between batches (new items indexed mid-scan); // the DocId cursor is safer than StartRow but cannot guarantee zero drift. // - DocId is not returned by default — it must be in SelectProperties. // // This is deliberately not implemented here because SRCH-02 caps results at 50,000, // which the StartRow approach already covers exactly (100 pages × 500 rows). // Implement the DocId cursor if the cap needs to be lifted in a future version. 
    // ── KQL builder ───────────────────────────────────────────────────────────

    internal static string BuildKql(SearchOptions opts)
    {
        var parts = new List<string> { "ContentType:Document" };

        if (opts.Extensions.Length > 0)
        {
            var extParts = opts.Extensions
                .Select(e => $"FileExtension:{e.TrimStart('.').ToLowerInvariant()}");
            parts.Add($"({string.Join(" OR ", extParts)})");
        }

        if (opts.CreatedAfter.HasValue)
            parts.Add($"Created>={opts.CreatedAfter.Value:yyyy-MM-dd}");
        if (opts.CreatedBefore.HasValue)
            parts.Add($"Created<={opts.CreatedBefore.Value:yyyy-MM-dd}");
        if (opts.ModifiedAfter.HasValue)
            parts.Add($"Write>={opts.ModifiedAfter.Value:yyyy-MM-dd}");
        if (opts.ModifiedBefore.HasValue)
            parts.Add($"Write<={opts.ModifiedBefore.Value:yyyy-MM-dd}");
        if (!string.IsNullOrEmpty(opts.CreatedBy))
            parts.Add($"Author:\"{opts.CreatedBy}\"");
        if (!string.IsNullOrEmpty(opts.ModifiedBy))
            parts.Add($"ModifiedBy:\"{opts.ModifiedBy}\"");
        if (!string.IsNullOrEmpty(opts.Library) && !string.IsNullOrEmpty(opts.SiteUrl))
            parts.Add($"Path:\"{opts.SiteUrl.TrimEnd('/')}/{opts.Library.TrimStart('/')}*\"");

        return string.Join(" AND ", parts);
    }

    private static void ValidateKqlLength(string kql)
    {
        // SharePoint Search KQL text hard cap is 4096 characters
        if (kql.Length > 4096)
            throw new InvalidOperationException(
                $"KQL query exceeds 4096-character SharePoint Search limit ({kql.Length} chars). " +
                "Reduce the number of extension filters.");
    }

    // ── Row parser ────────────────────────────────────────────────────────────

    private static SearchResult ParseRow(IDictionary<string, object> row)
    {
        static string Str(IDictionary<string, object> r, string key) =>
            r.TryGetValue(key, out var v) ? v?.ToString() ?? string.Empty : string.Empty;

        static DateTime? Date(IDictionary<string, object> r, string key)
        {
            var s = Str(r, key);
            return DateTime.TryParse(s, out var dt) ? dt : (DateTime?)null;
        }

        static long ParseSize(IDictionary<string, object> r, string key)
        {
            var raw = Str(r, key);
            var digits = Regex.Replace(raw, "[^0-9]", "");
            return long.TryParse(digits, out var v) ?
v : 0L; } return new SearchResult { Title = Str(row, "Title"), Path = Str(row, "Path"), FileExtension = Str(row, "FileExtension"), Created = Date(row, "Created"), LastModified = Date(row, "LastModifiedTime"), Author = Str(row, "Author"), ModifiedBy = Str(row, "ModifiedBy"), SizeBytes = ParseSize(row, "Size") }; } private static string Str(IDictionary r, string key) => r.TryGetValue(key, out var v) ? v?.ToString() ?? string.Empty : string.Empty; } ``` **Verification:** ```bash dotnet build C:/Users/dev/Documents/projets/Sharepoint/SharepointToolbox.slnx dotnet test C:/Users/dev/Documents/projets/Sharepoint/SharepointToolbox.Tests/SharepointToolbox.Tests.csproj --filter "FullyQualifiedName~SearchServiceTests" -x ``` Expected: 0 build errors; CSOM tests skip, no compile errors ### Task 2: Implement DuplicatesService **File:** `SharepointToolbox/Services/DuplicatesService.cs` **Action:** Create **Why:** DUPL-01 (file duplicates via Search API) and DUPL-02 (folder duplicates via CAML pagination). ```csharp using Microsoft.SharePoint.Client; using Microsoft.SharePoint.Client.Search.Query; using SharepointToolbox.Core.Helpers; using SharepointToolbox.Core.Models; namespace SharepointToolbox.Services; /// /// Duplicate file and folder detection. /// Files: Search API (same KQL engine as SearchService) + client-side composite key grouping. /// Folders: CSOM CAML FSObjType=1 via SharePointPaginationHelper + composite key grouping. /// Port of PS Find-DuplicateFiles / Find-DuplicateFolders (PS lines 4942-5036). 
/// </summary>
public class DuplicatesService : IDuplicatesService
{
    private const int BatchSize = 500;
    private const int MaxStartRow = 50_000;

    public async Task<List<DuplicateGroup>> ScanDuplicatesAsync(
        ClientContext ctx,
        DuplicateScanOptions options,
        IProgress<OperationProgress> progress,
        CancellationToken ct)
    {
        ct.ThrowIfCancellationRequested();

        List<DuplicateItem> allItems;
        if (options.Mode == "Folders")
            allItems = await CollectFolderItemsAsync(ctx, options, progress, ct);
        else
            allItems = await CollectFileItemsAsync(ctx, options, progress, ct);

        progress.Report(OperationProgress.Indeterminate(
            $"Grouping {allItems.Count:N0} items by duplicate key…"));

        var groups = allItems
            .GroupBy(item => MakeKey(item, options))
            .Where(g => g.Count() >= 2)
            .Select(g => new DuplicateGroup
            {
                GroupKey = g.Key,
                Name = g.First().Name,
                Items = g.ToList()
            })
            .OrderByDescending(g => g.Items.Count)
            .ThenBy(g => g.Name)
            .ToList();

        return groups;
    }

    // ── File collection via Search API ────────────────────────────────────────

    private static async Task<List<DuplicateItem>> CollectFileItemsAsync(
        ClientContext ctx,
        DuplicateScanOptions options,
        IProgress<OperationProgress> progress,
        CancellationToken ct)
    {
        // KQL: all documents, optionally scoped to a library
        var kqlParts = new List<string> { "ContentType:Document" };
        if (!string.IsNullOrEmpty(options.Library))
            kqlParts.Add($"Path:\"{ctx.Url.TrimEnd('/')}/{options.Library.TrimStart('/')}*\"");
        string kql = string.Join(" AND ", kqlParts);

        var allItems = new List<DuplicateItem>();
        int startRow = 0;

        do
        {
            ct.ThrowIfCancellationRequested();

            var kq = new KeywordQuery(ctx)
            {
                QueryText = kql,
                StartRow = startRow,
                RowLimit = BatchSize,
                TrimDuplicates = false
            };
            kq.SelectProperties.AddRange(new[]
            {
                "Title", "Path", "FileExtension", "Created",
                "LastModifiedTime", "Size", "ParentLink"
            });

            var executor = new SearchExecutor(ctx);
            ClientResult<ResultTableCollection> clientResult = executor.ExecuteQuery(kq);
            await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

            // ResultTable.TableType is a string — compare against the enum's name
            var table = clientResult.Value
                .FirstOrDefault(t => t.TableType == KnownTableTypes.RelevantResults.ToString());
            if (table == null ||
                table.RowCount == 0) break;

            foreach (System.Collections.Hashtable row in table.ResultRows)
            {
                var dict = row.Cast<DictionaryEntry>()
                    .ToDictionary(e => e.Key.ToString()!, e => e.Value ?? (object)string.Empty);

                string path = GetStr(dict, "Path");
                if (path.Contains("/_vti_history/", StringComparison.OrdinalIgnoreCase))
                    continue;

                string name = System.IO.Path.GetFileName(path);
                if (string.IsNullOrEmpty(name)) name = GetStr(dict, "Title");

                string raw = GetStr(dict, "Size");
                string digits = System.Text.RegularExpressions.Regex.Replace(raw, "[^0-9]", "");
                long size = long.TryParse(digits, out var sv) ? sv : 0L;

                DateTime? created = ParseDate(GetStr(dict, "Created"));
                DateTime? modified = ParseDate(GetStr(dict, "LastModifiedTime"));

                // Derive library from ParentLink or path segments
                string parentLink = GetStr(dict, "ParentLink");
                string library = ExtractLibraryFromPath(path, ctx.Url);

                allItems.Add(new DuplicateItem
                {
                    Name = name,
                    Path = path,
                    Library = library,
                    SizeBytes = size,
                    Created = created,
                    Modified = modified
                });
            }

            progress.Report(new OperationProgress(allItems.Count, MaxStartRow,
                $"Collected {allItems.Count:N0} files…"));
            startRow += BatchSize;
        } while (startRow <= MaxStartRow);

        return allItems;
    }

    // ── Folder collection via CAML ────────────────────────────────────────────

    private static async Task<List<DuplicateItem>> CollectFolderItemsAsync(
        ClientContext ctx,
        DuplicateScanOptions options,
        IProgress<OperationProgress> progress,
        CancellationToken ct)
    {
        // Load all document libraries on the site
        ctx.Load(ctx.Web, w => w.Lists.Include(
            l => l.Title, l => l.Hidden, l => l.BaseType));
        await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

        var libs = ctx.Web.Lists
            .Where(l => !l.Hidden && l.BaseType == BaseType.DocumentLibrary)
            .ToList();

        // Filter to specific library if requested
        if (!string.IsNullOrEmpty(options.Library))
        {
            libs = libs
                .Where(l => l.Title.Equals(options.Library, StringComparison.OrdinalIgnoreCase))
                .ToList();
        }

        // CAML field name is FSObjType (NOT FileSystemObjectType) — the wrong name
        // returns zero results silently. Paged RowLimit is required for the
        // ListItemCollectionPosition cursor used by SharePointPaginationHelper.
        var camlQuery = new CamlQuery
        {
            ViewXml = """
                <View Scope="RecursiveAll">
                  <Query>
                    <Where>
                      <Eq>
                        <FieldRef Name="FSObjType" />
                        <Value Type="Integer">1</Value>
                      </Eq>
                    </Where>
                  </Query>
                  <RowLimit Paged="TRUE">2000</RowLimit>
                </View>
                """
        };

        var allItems = new List<DuplicateItem>();

        foreach (var lib in libs)
        {
            ct.ThrowIfCancellationRequested();
            progress.Report(OperationProgress.Indeterminate($"Scanning folders in {lib.Title}…"));

            await foreach (var item in SharePointPaginationHelper.GetAllItemsAsync(ctx, lib, camlQuery, ct))
            {
                ct.ThrowIfCancellationRequested();
                var fv = item.FieldValues;

                string name = fv["FileLeafRef"]?.ToString() ?? string.Empty;
                string fileRef = fv["FileRef"]?.ToString() ?? string.Empty;
                int subCount = Convert.ToInt32(fv["FolderChildCount"] ?? 0);
                int childCount = Convert.ToInt32(fv["ItemChildCount"] ?? 0);
                int fileCount = Math.Max(0, childCount - subCount);
                DateTime? created = fv["Created"] is DateTime cr ? cr : (DateTime?)null;
                DateTime? modified = fv["Modified"] is DateTime md ? md : (DateTime?)null;

                allItems.Add(new DuplicateItem
                {
                    Name = name,
                    Path = fileRef,
                    Library = lib.Title,
                    FolderCount = subCount,
                    FileCount = fileCount,
                    Created = created,
                    Modified = modified
                });
            }
        }

        return allItems;
    }

    // ── Composite key builder (matches test scaffold in DuplicatesServiceTests) ──

    internal static string MakeKey(DuplicateItem item, DuplicateScanOptions opts)
    {
        var parts = new List<string> { item.Name.ToLowerInvariant() };

        if (opts.MatchSize && item.SizeBytes.HasValue)
            parts.Add(item.SizeBytes.Value.ToString());
        if (opts.MatchCreated && item.Created.HasValue)
            parts.Add(item.Created.Value.Date.ToString("yyyy-MM-dd"));
        if (opts.MatchModified && item.Modified.HasValue)
            parts.Add(item.Modified.Value.Date.ToString("yyyy-MM-dd"));
        if (opts.MatchSubfolderCount && item.FolderCount.HasValue)
            parts.Add(item.FolderCount.Value.ToString());
        if (opts.MatchFileCount && item.FileCount.HasValue)
            parts.Add(item.FileCount.Value.ToString());

        return string.Join("|", parts);
    }

    // ── Private utilities ─────────────────────────────────────────────────────

    private static string GetStr(IDictionary<string, object> r, string key) =>
        r.TryGetValue(key, out var v) ? v?.ToString() ?? string.Empty : string.Empty;

    private static DateTime?
ParseDate(string s) => DateTime.TryParse(s, out var dt) ? dt : (DateTime?)null; private static string ExtractLibraryFromPath(string path, string siteUrl) { // Extract first path segment after the site URL as library name // e.g. https://tenant.sharepoint.com/sites/MySite/Shared Documents/file.docx -> "Shared Documents" if (string.IsNullOrEmpty(path) || string.IsNullOrEmpty(siteUrl)) return string.Empty; string relative = path.StartsWith(siteUrl.TrimEnd('/'), StringComparison.OrdinalIgnoreCase) ? path.Substring(siteUrl.TrimEnd('/').Length).TrimStart('/') : path; int slash = relative.IndexOf('/'); return slash > 0 ? relative.Substring(0, slash) : relative; } } ``` **Verification:** ```bash dotnet test C:/Users/dev/Documents/projets/Sharepoint/SharepointToolbox.Tests/SharepointToolbox.Tests.csproj --filter "FullyQualifiedName~DuplicatesServiceTests" -x ``` Expected: 5 pure-logic tests pass (MakeKey), 2 CSOM stubs skip ## Verification ```bash dotnet build C:/Users/dev/Documents/projets/Sharepoint/SharepointToolbox.slnx dotnet test C:/Users/dev/Documents/projets/Sharepoint/SharepointToolbox.Tests/SharepointToolbox.Tests.csproj --filter "FullyQualifiedName~SearchServiceTests|FullyQualifiedName~DuplicatesServiceTests" -x ``` Expected: 0 build errors; 5 MakeKey tests pass; CSOM stub tests skip; no compile errors ## Commit Message feat(03-04): implement SearchService KQL pagination and DuplicatesService composite key grouping ## Output After completion, create `.planning/phases/03-storage/03-04-SUMMARY.md`