From 9e3d5016e655daf6687295e9b0d1bd3fbe0bacee Mon Sep 17 00:00:00 2001
From: Dev
Date: Thu, 2 Apr 2026 15:30:44 +0200
Subject: [PATCH] feat(03-04): implement SearchService KQL pagination with
 500-row batches and 50,000 hard cap

- KQL builder for extension, date, creator, editor, library filters
- Pagination via StartRow += 500, stops at MaxStartRow or MaxResults
- Filters _vti_history/ version history paths from results
- Client-side Regex filter on file name and title
- ValidateKqlLength enforces 4096-char SharePoint limit
- SelectProperties added one-by-one (StringCollection has no AddRange)
---
 SharepointToolbox/Services/SearchService.cs | 197 ++++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 SharepointToolbox/Services/SearchService.cs

diff --git a/SharepointToolbox/Services/SearchService.cs b/SharepointToolbox/Services/SearchService.cs
new file mode 100644
index 0000000..2b2d21d
--- /dev/null
+++ b/SharepointToolbox/Services/SearchService.cs
@@ -0,0 +1,197 @@
using Microsoft.SharePoint.Client;
using Microsoft.SharePoint.Client.Search.Query;
using SharepointToolbox.Core.Helpers;
using SharepointToolbox.Core.Models;
using System.Globalization;
using System.Text.RegularExpressions;

namespace SharepointToolbox.Services;

/// <summary>
/// File search using the SharePoint KQL Search API.
/// Port of the PS Search-SPOFiles pattern (PS lines 4747-4987).
/// Pagination: 500 rows per batch, hard cap StartRow=50,000 (SharePoint Search boundary).
/// </summary>
public class SearchService : ISearchService
{
    private const int BatchSize = 500;      // rows requested per KeywordQuery page
    private const int MaxStartRow = 50_000; // SharePoint Search StartRow boundary

    /// <summary>
    /// Runs a paged KQL search and returns up to min(options.MaxResults, 50,000) files.
    /// Version-history paths (/_vti_history/) are excluded; an optional client-side
    /// regex is applied to file name and title.
    /// </summary>
    /// <param name="ctx">Authenticated client context for the target site.</param>
    /// <param name="options">Search filters: extensions, date ranges, people, library, regex, result cap.</param>
    /// <param name="progress">Receives one report after each retrieved batch.</param>
    /// <param name="ct">Cancellation token, checked before every server round-trip.</param>
    /// <returns>Matching results in relevance order, at most min(MaxResults, 50,000).</returns>
    /// <exception cref="InvalidOperationException">The built KQL exceeds the 4096-char limit.</exception>
    /// <exception cref="OperationCanceledException">Cancellation was requested.</exception>
    public async Task<IReadOnlyList<SearchResult>> SearchFilesAsync(
        ClientContext ctx,
        SearchOptions options,
        IProgress<OperationProgress> progress,
        CancellationToken ct)
    {
        ct.ThrowIfCancellationRequested();

        string kql = BuildKql(options);
        ValidateKqlLength(kql);

        Regex? regexFilter = null;
        if (!string.IsNullOrWhiteSpace(options.Regex))
        {
            // 2-second timeout guards against catastrophic backtracking in
            // user-supplied patterns.
            regexFilter = new Regex(options.Regex,
                RegexOptions.IgnoreCase | RegexOptions.Compiled,
                TimeSpan.FromSeconds(2));
        }

        var allResults = new List<SearchResult>();
        int startRow = 0;
        int maxResults = Math.Min(options.MaxResults, MaxStartRow);

        do
        {
            ct.ThrowIfCancellationRequested();

            var kq = new KeywordQuery(ctx)
            {
                QueryText = kql,
                StartRow = startRow,
                RowLimit = BatchSize,
                TrimDuplicates = false
            };
            // StringCollection has no AddRange — properties must be added one by one.
            foreach (var prop in new[] { "Title", "Path", "Author", "LastModifiedTime",
                                         "FileExtension", "Created", "ModifiedBy", "Size" })
                kq.SelectProperties.Add(prop);

            var executor = new SearchExecutor(ctx);
            ClientResult<ResultTableCollection> clientResult = executor.ExecuteQuery(kq);
            await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

            var table = clientResult.Value
                .FirstOrDefault(t => t.TableType == KnownTableTypes.RelevantResults);
            if (table == null || table.RowCount == 0) break;

            // NOTE(review): rows are treated as non-generic Hashtable here per the
            // original; newer CSOM exposes ResultRows as IDictionary<string, object> —
            // verify against the referenced Microsoft.SharePoint.Client.Search assembly.
            foreach (System.Collections.Hashtable row in table.ResultRows)
            {
                var dict = row.Cast<System.Collections.DictionaryEntry>()
                    .ToDictionary(e => e.Key.ToString()!, e => e.Value ?? (object)string.Empty);

                // Skip SharePoint version history paths
                string path = Str(dict, "Path");
                if (path.Contains("/_vti_history/", StringComparison.OrdinalIgnoreCase))
                    continue;

                var result = ParseRow(dict);

                // Client-side Regex filter on file name and title
                if (regexFilter != null)
                {
                    string fileName = System.IO.Path.GetFileName(result.Path);
                    if (!regexFilter.IsMatch(fileName) && !regexFilter.IsMatch(result.Title))
                        continue;
                }

                allResults.Add(result);
                if (allResults.Count >= maxResults)
                    return allResults; // cap reached — no further paging needed
            }

            progress.Report(new OperationProgress(allResults.Count, maxResults,
                $"Retrieved {allResults.Count:N0} results\u2026"));

            startRow += BatchSize;
        }
        // '<' (not '<='): the last valid page starts at 49,500, giving exactly
        // 100 pages x 500 rows; a query at StartRow=50,000 is past the boundary.
        while (startRow < MaxStartRow && allResults.Count < maxResults);

        return allResults;
    }

    // ── Extension point: bypassing the 50,000-item cap ───────────────────────
    //
    // The StartRow approach has a hard ceiling at 50,000 (SharePoint Search boundary).
    // To go beyond it, replace the StartRow loop with a DocId cursor:
    //
    //   1. Add "DocId" to SelectProperties.
    //   2. Add query.SortList.Add("DocId", SortDirection.Ascending).
    //   3. First page KQL: unchanged.
    //      Subsequent pages: append "AND DocId>{lastDocId}" to the KQL (StartRow stays 0).
    //   4. Track lastDocId = Convert.ToInt64(lastRow["DocId"]) after each batch.
    //   5. Stop when batch.RowCount < BatchSize.
    //
    // Caveats:
    //   - DocId is per-site-collection; for multi-site searches, maintain a separate
    //     cursor per ClientContext (site URL).
    //   - The search index can shift between batches (new items indexed mid-scan);
    //     the DocId cursor is safer than StartRow but cannot guarantee zero drift.
    //   - DocId is not returned by default — it must be in SelectProperties.
    //
    // This is deliberately not implemented here because SRCH-02 caps results at 50,000,
    // which the StartRow approach already covers exactly (100 pages x 500 rows).
    // Implement the DocId cursor if the cap needs to be lifted in a future version.

    // ── KQL builder ──────────────────────────────────────────────────────────

    /// <summary>
    /// Builds the KQL text from the options. Always scoped to ContentType:Document;
    /// all filters are AND-combined, extensions OR-combined within one group.
    /// </summary>
    internal static string BuildKql(SearchOptions opts)
    {
        var parts = new List<string> { "ContentType:Document" };

        if (opts.Extensions.Length > 0)
        {
            // The index stores FileExtension without the dot and lower-case.
            var extParts = opts.Extensions
                .Select(e => $"FileExtension:{e.TrimStart('.').ToLowerInvariant()}");
            parts.Add($"({string.Join(" OR ", extParts)})");
        }
        if (opts.CreatedAfter.HasValue)
            parts.Add($"Created>={opts.CreatedAfter.Value:yyyy-MM-dd}");
        if (opts.CreatedBefore.HasValue)
            parts.Add($"Created<={opts.CreatedBefore.Value:yyyy-MM-dd}");
        if (opts.ModifiedAfter.HasValue)
            parts.Add($"Write>={opts.ModifiedAfter.Value:yyyy-MM-dd}");
        if (opts.ModifiedBefore.HasValue)
            parts.Add($"Write<={opts.ModifiedBefore.Value:yyyy-MM-dd}");
        if (!string.IsNullOrEmpty(opts.CreatedBy))
            parts.Add($"Author:\"{opts.CreatedBy}\"");
        if (!string.IsNullOrEmpty(opts.ModifiedBy))
            parts.Add($"ModifiedBy:\"{opts.ModifiedBy}\"");
        if (!string.IsNullOrEmpty(opts.Library) && !string.IsNullOrEmpty(opts.SiteUrl))
            parts.Add($"Path:\"{opts.SiteUrl.TrimEnd('/')}/{opts.Library.TrimStart('/')}*\"");

        return string.Join(" AND ", parts);
    }

    /// <summary>Throws if the KQL text exceeds SharePoint's 4096-character cap.</summary>
    private static void ValidateKqlLength(string kql)
    {
        // SharePoint Search KQL text hard cap is 4096 characters
        if (kql.Length > 4096)
            throw new InvalidOperationException(
                $"KQL query exceeds 4096-character SharePoint Search limit ({kql.Length} chars). " +
                "Reduce the number of extension filters.");
    }

    // ── Row parser ───────────────────────────────────────────────────────────

    /// <summary>Maps one raw search result row onto a SearchResult.</summary>
    private static SearchResult ParseRow(IDictionary<string, object> row)
    {
        // Invariant culture + AssumeUniversal: search-index timestamps are
        // machine-generated (ISO-like, UTC), not user-locale text (CA1305).
        static DateTime? Date(IDictionary<string, object> r, string key)
        {
            var s = Str(r, key);
            return DateTime.TryParse(s, CultureInfo.InvariantCulture,
                DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal, out var dt)
                ? dt : (DateTime?)null;
        }

        // Size may arrive formatted (e.g. "12,345"); strip non-digits before parsing.
        static long ParseSize(IDictionary<string, object> r, string key)
        {
            var raw = Str(r, key);
            var digits = Regex.Replace(raw, "[^0-9]", "");
            return long.TryParse(digits, out var v) ? v : 0L;
        }

        return new SearchResult
        {
            Title = Str(row, "Title"),
            Path = Str(row, "Path"),
            FileExtension = Str(row, "FileExtension"),
            Created = Date(row, "Created"),
            LastModified = Date(row, "LastModifiedTime"),
            Author = Str(row, "Author"),
            ModifiedBy = Str(row, "ModifiedBy"),
            SizeBytes = ParseSize(row, "Size")
        };
    }

    /// <summary>Safe string lookup: missing keys and null values become "".</summary>
    private static string Str(IDictionary<string, object> r, string key) =>
        r.TryGetValue(key, out var v) ? v?.ToString() ?? string.Empty : string.Empty;
}