using Microsoft.SharePoint.Client;
using Microsoft.SharePoint.Client.Search.Query;
using SharepointToolbox.Core.Helpers;
using SharepointToolbox.Core.Models;
using System.Text.RegularExpressions;

namespace SharepointToolbox.Services;

/// <summary>
/// File search using SharePoint KQL Search API.
/// Port of PS Search-SPOFiles pattern (PS lines 4747-4987).
/// Pagination: 500 rows per batch, hard cap StartRow=50,000 (SharePoint Search boundary).
/// </summary>
public class SearchService : ISearchService
{
    private const int BatchSize = 500;
    private const int MaxStartRow = 50_000;

    /// <summary>Managed properties requested for every result row.</summary>
    private static readonly string[] SelectedProperties =
    {
        "Title", "Path", "Author", "LastModifiedTime",
        "FileExtension", "Created", "ModifiedBy", "Size"
    };

    /// <summary>
    /// Runs the KQL query built from <paramref name="options"/> page by page and returns
    /// the matching files, optionally filtered client-side by a regular expression.
    /// </summary>
    /// <param name="ctx">Client context the keyword query is executed against.</param>
    /// <param name="options">Search criteria; also supplies the optional regex and the result cap.</param>
    /// <param name="progress">Receives one report per retrieved batch.</param>
    /// <param name="ct">Checked before the first query and before every subsequent page.</param>
    /// <returns>
    /// At most <c>Math.Min(options.MaxResults, 50,000)</c> results. A non-positive
    /// <c>MaxResults</c> yields an empty list without querying the server.
    /// </returns>
    /// <exception cref="InvalidOperationException">The generated KQL exceeds 4096 characters.</exception>
    /// <exception cref="ArgumentException"><c>options.Regex</c> is not a valid pattern.</exception>
    /// <exception cref="RegexMatchTimeoutException">A single regex match took longer than 2 seconds.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
    // NOTE(review): generic type arguments in this file were reconstructed from usage
    // (return type, IProgress<OperationProgress>, ClientResult<ResultTableCollection>) —
    // confirm the signature against ISearchService.
    public async Task<IReadOnlyList<SearchResult>> SearchFilesAsync(
        ClientContext ctx,
        SearchOptions options,
        IProgress<OperationProgress> progress,
        CancellationToken ct)
    {
        ct.ThrowIfCancellationRequested();

        string kql = BuildKql(options);
        ValidateKqlLength(kql);

        // The regex is applied client-side; the 2 s timeout bounds pathological patterns.
        Regex? regexFilter = string.IsNullOrWhiteSpace(options.Regex)
            ? null
            : new Regex(
                options.Regex,
                RegexOptions.IgnoreCase | RegexOptions.Compiled,
                TimeSpan.FromSeconds(2));

        var allResults = new List<SearchResult>();
        int maxResults = Math.Min(options.MaxResults, MaxStartRow);

        for (int startRow = 0;
             startRow <= MaxStartRow && allResults.Count < maxResults;
             startRow += BatchSize)
        {
            ct.ThrowIfCancellationRequested();

            var kq = new KeywordQuery(ctx)
            {
                QueryText = kql,
                StartRow = startRow,
                RowLimit = BatchSize,
                TrimDuplicates = false
            };
            foreach (string prop in SelectedProperties)
            {
                kq.SelectProperties.Add(prop);
            }

            // ExecuteQuery only queues the request; the retry helper performs the round trip.
            var executor = new SearchExecutor(ctx);
            ClientResult<ResultTableCollection> clientResult = executor.ExecuteQuery(kq);
            await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

            var table = clientResult.Value
                .FirstOrDefault(t => t.TableType == KnownTableTypes.RelevantResults);
            if (table == null || table.RowCount == 0)
            {
                break;
            }

            foreach (object rawRow in table.ResultRows)
            {
                var row = ToRowDictionary(rawRow);
                if (row is null)
                {
                    continue;
                }

                // Skip SharePoint version history paths
                string path = Str(row, "Path");
                if (path.Contains("/_vti_history/", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                var result = ParseRow(row);

                // Client-side Regex filter on file name (falls back to the title)
                if (regexFilter != null)
                {
                    string fileName = System.IO.Path.GetFileName(result.Path);
                    if (!regexFilter.IsMatch(fileName) && !regexFilter.IsMatch(result.Title))
                    {
                        continue;
                    }
                }

                allResults.Add(result);
                if (allResults.Count >= maxResults)
                {
                    break;
                }
            }

            // Reported after every batch, including the one that reaches the cap.
            progress.Report(new OperationProgress(
                allResults.Count,
                maxResults,
                $"Retrieved {allResults.Count:N0} results\u2026"));
        }

        return allResults;
    }

    // ── Extension point: bypassing the 50,000-item cap ───────────────────────
    //
    // The StartRow approach has a hard ceiling at 50,000 (SharePoint Search boundary).
    // To go beyond it, replace the StartRow loop with a DocId cursor:
    //
    //   1. Add "DocId" to SelectProperties.
    //   2. Add query.SortList.Add("DocId", SortDirection.Ascending).
    //   3. First page KQL: unchanged.
    //      Subsequent pages: append "AND DocId>{lastDocId}" to the KQL (StartRow stays 0).
    //   4. Track lastDocId = Convert.ToInt64(lastRow["DocId"]) after each batch.
    //   5. Stop when batch.RowCount < BatchSize.
    //
    // Caveats:
    //   - DocId is per-site-collection; for multi-site searches, maintain a separate
    //     cursor per ClientContext (site URL).
    //   - The search index can shift between batches (new items indexed mid-scan);
    //     the DocId cursor is safer than StartRow but cannot guarantee zero drift.
    //   - DocId is not returned by default — it must be in SelectProperties.
    //
    // This is deliberately not implemented here because SRCH-02 caps results at 50,000,
    // which the StartRow approach already covers exactly (100 pages × 500 rows).
    // Implement the DocId cursor if the cap needs to be lifted in a future version.

    // ── KQL builder ───────────────────────────────────────────────────────────

    /// <summary>
    /// Builds the KQL query text: <c>ContentType:Document</c> AND-ed with one clause per
    /// populated filter in <paramref name="opts"/>.
    /// </summary>
    /// <param name="opts">Search criteria; unset or empty filters contribute no clause.</param>
    /// <returns>The clauses joined with <c> AND </c>.</returns>
    internal static string BuildKql(SearchOptions opts)
    {
        var parts = new List<string> { "ContentType:Document" };

        if (opts.Extensions is { Length: > 0 })
        {
            // Normalise first, then drop blanks so "" or "." cannot emit a dangling "FileExtension:".
            var extParts = opts.Extensions
                .Select(e => (e ?? string.Empty).Trim().TrimStart('.').ToLowerInvariant())
                .Where(e => e.Length > 0)
                .Select(e => $"FileExtension:{e}")
                .ToList();
            if (extParts.Count > 0)
            {
                parts.Add($"({string.Join(" OR ", extParts)})");
            }
        }

        // Dates are formatted with the invariant culture so the current culture's
        // calendar cannot change the year/month/day written into the query.
        if (opts.CreatedAfter.HasValue)
        {
            parts.Add(FormattableString.Invariant($"Created>={opts.CreatedAfter.Value:yyyy-MM-dd}"));
        }
        if (opts.CreatedBefore.HasValue)
        {
            parts.Add(FormattableString.Invariant($"Created<={opts.CreatedBefore.Value:yyyy-MM-dd}"));
        }
        if (opts.ModifiedAfter.HasValue)
        {
            parts.Add(FormattableString.Invariant($"Write>={opts.ModifiedAfter.Value:yyyy-MM-dd}"));
        }
        if (opts.ModifiedBefore.HasValue)
        {
            parts.Add(FormattableString.Invariant($"Write<={opts.ModifiedBefore.Value:yyyy-MM-dd}"));
        }

        if (!string.IsNullOrEmpty(opts.CreatedBy))
        {
            parts.Add($"Author:{KqlPhrase(opts.CreatedBy)}");
        }
        if (!string.IsNullOrEmpty(opts.ModifiedBy))
        {
            parts.Add($"ModifiedBy:{KqlPhrase(opts.ModifiedBy)}");
        }

        if (!string.IsNullOrEmpty(opts.Library) && !string.IsNullOrEmpty(opts.SiteUrl))
        {
            string libraryPrefix = $"{opts.SiteUrl.TrimEnd('/')}/{opts.Library.TrimStart('/')}*";
            parts.Add($"Path:{KqlPhrase(libraryPrefix)}");
        }

        return string.Join(" AND ", parts);
    }

    /// <summary>
    /// Wraps <paramref name="value"/> in double quotes after removing any double quotes it
    /// contains, so user-supplied text cannot terminate the phrase and inject extra KQL.
    /// </summary>
    private static string KqlPhrase(string value) =>
        $"\"{value.Replace("\"", string.Empty)}\"";

    /// <summary>Throws when <paramref name="kql"/> is longer than 4096 characters.</summary>
    /// <exception cref="InvalidOperationException">The query text exceeds the limit.</exception>
    private static void ValidateKqlLength(string kql)
    {
        // SharePoint Search KQL text hard cap is 4096 characters
        if (kql.Length > 4096)
        {
            throw new InvalidOperationException(
                $"KQL query exceeds 4096-character SharePoint Search limit ({kql.Length} chars). " +
                "Reduce the number of extension filters.");
        }
    }

    // ── Row parser ────────────────────────────────────────────────────────────

    /// <summary>
    /// Normalises one search result row to a string-keyed dictionary.
    /// </summary>
    /// <returns>The row as a dictionary, or <c>null</c> when the row has an unsupported shape.</returns>
    // NOTE(review): the original code cast every row to System.Collections.Hashtable; CSOM
    // documents ResultRows as IDictionary<string, object> rows, so both shapes are accepted.
    private static IDictionary<string, object>? ToRowDictionary(object? rawRow)
    {
        switch (rawRow)
        {
            case IDictionary<string, object> typed:
                return typed;

            case System.Collections.IDictionary untyped:
                var copy = new Dictionary<string, object>(untyped.Count);
                foreach (System.Collections.DictionaryEntry entry in untyped)
                {
                    copy[entry.Key.ToString() ?? string.Empty] = entry.Value ?? string.Empty;
                }
                return copy;

            default:
                return null;
        }
    }

    /// <summary>
    /// Maps one result row to a <see cref="SearchResult"/>. Missing or null cells become
    /// empty strings, unparseable dates become <c>null</c>, and an unparseable size becomes 0.
    /// </summary>
    private static SearchResult ParseRow(IDictionary<string, object> row)
    {
        static DateTime? Date(IDictionary<string, object> r, string key)
        {
            return DateTime.TryParse(Str(r, key), out var parsed) ? parsed : (DateTime?)null;
        }

        static long ParseSize(IDictionary<string, object> r, string key)
        {
            // Keep only ASCII digits (drops separators, signs and units) before parsing.
            string digits = Regex.Replace(Str(r, key), "[^0-9]", "");
            return long.TryParse(digits, out var size) ? size : 0L;
        }

        return new SearchResult
        {
            Title = Str(row, "Title"),
            Path = Str(row, "Path"),
            FileExtension = Str(row, "FileExtension"),
            Created = Date(row, "Created"),
            LastModified = Date(row, "LastModifiedTime"),
            Author = Str(row, "Author"),
            ModifiedBy = Str(row, "ModifiedBy"),
            SizeBytes = ParseSize(row, "Size")
        };
    }

    /// <summary>
    /// Returns the cell stored under <paramref name="key"/> as a string, or an empty string
    /// when the key is absent or its value is null.
    /// </summary>
    private static string Str(IDictionary<string, object> r, string key) =>
        r.TryGetValue(key, out var v) ? v?.ToString() ?? string.Empty : string.Empty;
}