feat(03-04): implement SearchService KQL pagination with 500-row batches and 50,000 hard cap
- KQL builder for extension, date, creator, editor, and library filters
- Pagination via StartRow += 500; stops at MaxStartRow or MaxResults
- Filters _vti_history/ version-history paths from results
- Client-side Regex filter on file name and title
- ValidateKqlLength enforces the 4096-character SharePoint limit
- SelectProperties added one by one (StringCollection has no AddRange)
This commit is contained in:
197
SharepointToolbox/Services/SearchService.cs
Normal file
197
SharepointToolbox/Services/SearchService.cs
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
using Microsoft.SharePoint.Client;
|
||||||
|
using Microsoft.SharePoint.Client.Search.Query;
|
||||||
|
using SharepointToolbox.Core.Helpers;
|
||||||
|
using SharepointToolbox.Core.Models;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
|
||||||
|
namespace SharepointToolbox.Services;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// File search using SharePoint KQL Search API.
|
||||||
|
/// Port of PS Search-SPOFiles pattern (PS lines 4747-4987).
|
||||||
|
/// Pagination: 500 rows per batch, hard cap StartRow=50,000 (SharePoint Search boundary).
|
||||||
|
/// </summary>
|
||||||
|
public class SearchService : ISearchService
|
||||||
|
{
|
||||||
|
private const int BatchSize = 500;
|
||||||
|
private const int MaxStartRow = 50_000;
|
||||||
|
|
||||||
|
/// <summary>
/// Searches for files through the SharePoint Search API using the KQL query built from
/// <paramref name="options"/>, requesting results in pages of <c>BatchSize</c> rows.
/// </summary>
/// <param name="ctx">Client context the keyword queries are executed against.</param>
/// <param name="options">Search filters, the optional client-side regex and the result limit.</param>
/// <param name="progress">Receives the running result count after each fully processed page.</param>
/// <param name="ct">Cancellation token; checked on entry and before every page request.</param>
/// <returns>
/// The matching files, at most <c>Math.Min(options.MaxResults, MaxStartRow)</c> entries.
/// The list is empty when that limit is zero or negative.
/// </returns>
/// <exception cref="InvalidOperationException">The generated KQL text is too long.</exception>
/// <exception cref="ArgumentException"><c>options.Regex</c> is not a valid pattern.</exception>
/// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
public async Task<IReadOnlyList<SearchResult>> SearchFilesAsync(
    ClientContext ctx,
    SearchOptions options,
    IProgress<OperationProgress> progress,
    CancellationToken ct)
{
    ct.ThrowIfCancellationRequested();

    string kql = BuildKql(options);
    ValidateKqlLength(kql);

    // The pattern comes from the caller, so each match is bounded by a 2-second timeout.
    Regex? regexFilter = null;
    if (!string.IsNullOrWhiteSpace(options.Regex))
    {
        regexFilter = new Regex(options.Regex,
            RegexOptions.IgnoreCase | RegexOptions.Compiled,
            TimeSpan.FromSeconds(2));
    }

    var allResults = new List<SearchResult>();
    int maxResults = Math.Min(options.MaxResults, MaxStartRow);

    // A non-positive limit means nothing can be returned; previously the loop still ran
    // one query and handed back a single result before noticing the limit.
    if (maxResults <= 0)
    {
        return allResults;
    }

    // Loop-invariant, so built once instead of once per page.
    string[] selectProperties =
    {
        "Title", "Path", "Author", "LastModifiedTime",
        "FileExtension", "Created", "ModifiedBy", "Size"
    };

    for (int startRow = 0;
         startRow <= MaxStartRow && allResults.Count < maxResults;
         startRow += BatchSize)
    {
        ct.ThrowIfCancellationRequested();

        var kq = new KeywordQuery(ctx)
        {
            QueryText = kql,
            StartRow = startRow,
            RowLimit = BatchSize,
            TrimDuplicates = false
        };

        // Added one by one, as in the original implementation.
        foreach (string prop in selectProperties)
        {
            kq.SelectProperties.Add(prop);
        }

        var executor = new SearchExecutor(ctx);
        ClientResult<ResultTableCollection> clientResult = executor.ExecuteQuery(kq);
        await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

        var table = clientResult.Value
            .FirstOrDefault(t => t.TableType == KnownTableTypes.RelevantResults);
        if (table == null || table.RowCount == 0)
        {
            break;
        }

        // Rows are enumerated as plain objects and normalized by ToRowDictionary, so the
        // loop does not depend on the concrete dictionary type the client library returns.
        foreach (object rawRow in table.ResultRows)
        {
            IDictionary<string, object> row = ToRowDictionary(rawRow);

            // Skip SharePoint version history paths.
            string path = Str(row, "Path");
            if (path.Contains("/_vti_history/", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            var result = ParseRow(row);

            // Client-side regex filter: keep the row when the file name or the title matches.
            if (regexFilter != null)
            {
                string fileName = System.IO.Path.GetFileName(result.Path);
                if (!regexFilter.IsMatch(fileName) && !regexFilter.IsMatch(result.Title))
                {
                    continue;
                }
            }

            allResults.Add(result);
            if (allResults.Count >= maxResults)
            {
                return allResults;
            }
        }

        progress.Report(new OperationProgress(allResults.Count, maxResults,
            $"Retrieved {allResults.Count:N0} results\u2026"));
    }

    return allResults;
}

/// <summary>
/// Normalizes one search result row to a string-keyed dictionary.
/// </summary>
/// <remarks>
/// NOTE(review): the CSOM reference documents <c>ResultTable.ResultRows</c> as
/// <c>IEnumerable&lt;IDictionary&lt;string, object&gt;&gt;</c>, which a hard cast to
/// <c>Hashtable</c> would reject at run time. Both shapes are accepted here.
/// </remarks>
/// <exception cref="InvalidCastException">The row is neither kind of dictionary.</exception>
private static IDictionary<string, object> ToRowDictionary(object? rawRow)
{
    switch (rawRow)
    {
        case IDictionary<string, object> typed:
            return typed;

        case System.Collections.IDictionary untyped:
        {
            // Same conversion the original code applied to Hashtable rows:
            // keys via ToString(), null values replaced by an empty string.
            var copy = new Dictionary<string, object>(untyped.Count);
            foreach (System.Collections.DictionaryEntry entry in untyped)
            {
                copy[entry.Key.ToString()!] = entry.Value ?? string.Empty;
            }
            return copy;
        }

        default:
            throw new InvalidCastException(
                $"Unsupported search result row type: {rawRow?.GetType().FullName ?? "null"}.");
    }
}
|
||||||
|
|
||||||
|
// ── Extension point: bypassing the 50,000-item cap ───────────────────────
|
||||||
|
//
|
||||||
|
// The StartRow approach has a hard ceiling at 50,000 (SharePoint Search boundary).
|
||||||
|
// To go beyond it, replace the StartRow loop with a DocId cursor:
|
||||||
|
//
|
||||||
|
// 1. Add "DocId" to SelectProperties.
|
||||||
|
// 2. Add query.SortList.Add("DocId", SortDirection.Ascending).
|
||||||
|
// 3. First page KQL: unchanged.
|
||||||
|
// Subsequent pages: append "AND DocId>{lastDocId}" to the KQL (StartRow stays 0).
|
||||||
|
// 4. Track lastDocId = Convert.ToInt64(lastRow["DocId"]) after each batch.
|
||||||
|
// 5. Stop when batch.RowCount < BatchSize.
|
||||||
|
//
|
||||||
|
// Caveats:
|
||||||
|
// - DocId is per-site-collection; for multi-site searches, maintain a separate
|
||||||
|
// cursor per ClientContext (site URL).
|
||||||
|
// - The search index can shift between batches (new items indexed mid-scan);
|
||||||
|
// the DocId cursor is safer than StartRow but cannot guarantee zero drift.
|
||||||
|
// - DocId is not returned by default — it must be in SelectProperties.
|
||||||
|
//
|
||||||
|
// This is deliberately not implemented here because SRCH-02 caps results at 50,000,
|
||||||
|
// which the StartRow approach already covers exactly (100 pages × 500 rows).
|
||||||
|
// Implement the DocId cursor if the cap needs to be lifted in a future version.
|
||||||
|
|
||||||
|
// ── KQL builder ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// <summary>
/// Builds the KQL query text for the given search options.
/// </summary>
/// <remarks>
/// The query always contains <c>ContentType:Document</c>. Every populated filter
/// (extensions, created/modified date bounds, creator, editor, library path) adds one
/// clause, and all clauses are joined with <c>AND</c>. The library clause is only added
/// when both <c>Library</c> and <c>SiteUrl</c> are set.
/// </remarks>
/// <param name="opts">Search options to translate into KQL.</param>
/// <returns>The KQL query text.</returns>
internal static string BuildKql(SearchOptions opts)
{
    // Dates are emitted with the invariant culture. With the current culture a
    // non-Gregorian default calendar would write a different year into the query.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    const string DayFormat = "yyyy-MM-dd";

    // Wraps a value in double quotes. An embedded double quote would end the phrase
    // early and change the meaning of the query, so it is dropped.
    static string Phrase(string value)
    {
        string cleaned = value.Replace("\"", string.Empty);
        return "\"" + cleaned + "\"";
    }

    var parts = new List<string> { "ContentType:Document" };

    if (opts.Extensions.Length > 0)
    {
        // Blank entries are skipped; they would produce a dangling "FileExtension:" term.
        var extParts = opts.Extensions
            .Select(e => (e ?? string.Empty).Trim().TrimStart('.').ToLowerInvariant())
            .Where(e => e.Length > 0)
            .Select(e => $"FileExtension:{e}")
            .ToList();

        if (extParts.Count > 0)
        {
            parts.Add($"({string.Join(" OR ", extParts)})");
        }
    }

    if (opts.CreatedAfter.HasValue)
    {
        parts.Add("Created>=" + opts.CreatedAfter.Value.ToString(DayFormat, invariant));
    }
    if (opts.CreatedBefore.HasValue)
    {
        parts.Add("Created<=" + opts.CreatedBefore.Value.ToString(DayFormat, invariant));
    }
    if (opts.ModifiedAfter.HasValue)
    {
        parts.Add("Write>=" + opts.ModifiedAfter.Value.ToString(DayFormat, invariant));
    }
    if (opts.ModifiedBefore.HasValue)
    {
        parts.Add("Write<=" + opts.ModifiedBefore.Value.ToString(DayFormat, invariant));
    }

    if (!string.IsNullOrEmpty(opts.CreatedBy))
    {
        parts.Add("Author:" + Phrase(opts.CreatedBy));
    }
    if (!string.IsNullOrEmpty(opts.ModifiedBy))
    {
        parts.Add("ModifiedBy:" + Phrase(opts.ModifiedBy));
    }

    if (!string.IsNullOrEmpty(opts.Library) && !string.IsNullOrEmpty(opts.SiteUrl))
    {
        // Prefix match on "<site>/<library>", with exactly one slash between the two.
        string libraryPath = $"{opts.SiteUrl.TrimEnd('/')}/{opts.Library.TrimStart('/')}*";
        parts.Add("Path:" + Phrase(libraryPath));
    }

    return string.Join(" AND ", parts);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Rejects KQL text that is longer than the 4096-character SharePoint Search limit.
/// </summary>
/// <param name="kql">The KQL query text to check.</param>
/// <exception cref="InvalidOperationException">
/// <paramref name="kql"/> is longer than 4096 characters.
/// </exception>
private static void ValidateKqlLength(string kql)
{
    const int MaxKqlLength = 4096;

    int length = kql.Length;
    if (length <= MaxKqlLength)
    {
        return;
    }

    string message = string.Concat(
        $"KQL query exceeds 4096-character SharePoint Search limit ({length} chars). ",
        "Reduce the number of extension filters.");
    throw new InvalidOperationException(message);
}
|
||||||
|
|
||||||
|
// ── Row parser ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// <summary>
/// Converts one search result row into a <see cref="SearchResult"/>.
/// </summary>
/// <remarks>
/// Missing or null values become an empty string (text fields), <c>null</c> (dates)
/// or <c>0</c> (size).
/// </remarks>
/// <param name="row">Row values keyed by property name.</param>
/// <returns>The populated result.</returns>
private static SearchResult ParseRow(IDictionary<string, object> row)
{
    // Text value of a property, or an empty string when the key is missing or null.
    static string Str(IDictionary<string, object> r, string key) =>
        r.TryGetValue(key, out var v) ? v?.ToString() ?? string.Empty : string.Empty;

    // Date value of a property, or null when it is missing or cannot be parsed.
    static DateTime? Date(IDictionary<string, object> r, string key)
    {
        if (!r.TryGetValue(key, out var raw) || raw is null)
        {
            return null;
        }

        // A value that already is a DateTime is used as-is. Formatting it to text and
        // parsing it back would drop sub-second precision and the Kind.
        if (raw is DateTime typed)
        {
            return typed;
        }

        string text = raw.ToString() ?? string.Empty;

        // Try the invariant culture first, then fall back to the current culture
        // (the only parse the original code attempted).
        if (DateTime.TryParse(text, System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.None, out var parsed))
        {
            return parsed;
        }

        return DateTime.TryParse(text, out parsed) ? parsed : (DateTime?)null;
    }

    // Size in bytes: every non-digit character is removed before parsing; 0 when nothing parses.
    static long ParseSize(IDictionary<string, object> r, string key)
    {
        var raw = Str(r, key);
        var digits = Regex.Replace(raw, "[^0-9]", "");
        return long.TryParse(digits, out var v) ? v : 0L;
    }

    return new SearchResult
    {
        Title = Str(row, "Title"),
        Path = Str(row, "Path"),
        FileExtension = Str(row, "FileExtension"),
        Created = Date(row, "Created"),
        LastModified = Date(row, "LastModifiedTime"),
        Author = Str(row, "Author"),
        ModifiedBy = Str(row, "ModifiedBy"),
        SizeBytes = ParseSize(row, "Size")
    };
}
|
||||||
|
|
||||||
|
/// <summary>
/// Returns the text form of a row value.
/// </summary>
/// <param name="r">Row values keyed by property name.</param>
/// <param name="key">Property name to look up.</param>
/// <returns>
/// The value's <c>ToString()</c> result, or an empty string when the key is missing,
/// the value is null, or <c>ToString()</c> returns null.
/// </returns>
private static string Str(IDictionary<string, object> r, string key)
{
    if (!r.TryGetValue(key, out var value))
    {
        return string.Empty;
    }

    string? text = value?.ToString();
    return text ?? string.Empty;
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user