Files
Sharepoint-Toolbox/SharepointToolbox/Services/SearchService.cs
Dev 9e3d5016e6 feat(03-04): implement SearchService KQL pagination with 500-row batches and 50,000 hard cap
- KQL builder for extension, date, creator, editor, library filters
- Pagination via StartRow += 500, stops at MaxStartRow or MaxResults
- Filters _vti_history/ version history paths from results
- Client-side Regex filter on file name and title
- ValidateKqlLength enforces 4096-char SharePoint limit
- SelectProperties added one-by-one (StringCollection has no AddRange)
2026-04-02 15:30:44 +02:00

198 lines
8.1 KiB
C#
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using Microsoft.SharePoint.Client;
using Microsoft.SharePoint.Client.Search.Query;
using SharepointToolbox.Core.Helpers;
using SharepointToolbox.Core.Models;
using System.Text.RegularExpressions;
namespace SharepointToolbox.Services;
/// <summary>
/// File search using SharePoint KQL Search API.
/// Port of PS Search-SPOFiles pattern (PS lines 4747-4987).
/// Pagination: 500 rows per batch, hard cap StartRow=50,000 (SharePoint Search boundary).
/// </summary>
public class SearchService : ISearchService
{
private const int BatchSize = 500;
private const int MaxStartRow = 50_000;
/// <summary>
/// Runs a paged KQL file search through <paramref name="ctx"/> and returns the matching rows.
/// </summary>
/// <remarks>
/// Pages of <see cref="BatchSize"/> rows are requested with an increasing StartRow until a page
/// has no relevant results, StartRow would exceed <see cref="MaxStartRow"/>, or the result cap
/// (the smaller of <c>options.MaxResults</c> and <see cref="MaxStartRow"/>) is reached.
/// Rows whose path contains <c>/_vti_history/</c> are dropped. When <c>options.Regex</c> is set,
/// a row is kept only if the pattern matches its file name or its title (case-insensitive).
/// </remarks>
/// <param name="ctx">Client context the search queries are executed against.</param>
/// <param name="options">Filters, optional regex and result cap for the search.</param>
/// <param name="progress">Receives one progress report after each fully processed page.</param>
/// <param name="ct">Checked before the search starts and before every page request.</param>
/// <returns>The accepted results, in the order the search returned them.</returns>
/// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
/// <exception cref="InvalidOperationException">The generated KQL exceeds the length limit.</exception>
/// <exception cref="ArgumentException"><c>options.Regex</c> is not a valid pattern.</exception>
/// <exception cref="RegexMatchTimeoutException">A single regex match took longer than two seconds.</exception>
public async Task<IReadOnlyList<SearchResult>> SearchFilesAsync(
    ClientContext ctx,
    SearchOptions options,
    IProgress<OperationProgress> progress,
    CancellationToken ct)
{
    ct.ThrowIfCancellationRequested();

    string kql = BuildKql(options);
    ValidateKqlLength(kql);

    Regex? regexFilter = null;
    if (!string.IsNullOrWhiteSpace(options.Regex))
    {
        regexFilter = new Regex(options.Regex,
            RegexOptions.IgnoreCase | RegexOptions.Compiled,
            TimeSpan.FromSeconds(2));
    }

    var allResults = new List<SearchResult>();
    int maxResults = Math.Min(options.MaxResults, MaxStartRow);

    // A non-positive cap can never be satisfied; without this guard the loop below
    // would still issue a query and hand back the first row it sees.
    if (maxResults <= 0)
    {
        return allResults;
    }

    // Loop-invariant: the same managed properties are requested for every page.
    string[] selectedProperties =
    {
        "Title", "Path", "Author", "LastModifiedTime",
        "FileExtension", "Created", "ModifiedBy", "Size"
    };

    for (int startRow = 0; startRow <= MaxStartRow; startRow += BatchSize)
    {
        ct.ThrowIfCancellationRequested();

        var kq = new KeywordQuery(ctx)
        {
            QueryText = kql,
            StartRow = startRow,
            RowLimit = BatchSize,
            TrimDuplicates = false
        };
        foreach (var prop in selectedProperties)
        {
            kq.SelectProperties.Add(prop);
        }

        // ExecuteQuery only queues the search; the result is populated once the
        // pending request has been sent through the retry helper.
        var executor = new SearchExecutor(ctx);
        ClientResult<ResultTableCollection> clientResult = executor.ExecuteQuery(kq);
        await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

        var table = clientResult.Value
            .FirstOrDefault(t => t.TableType == KnownTableTypes.RelevantResults);
        if (table == null || table.RowCount == 0)
        {
            break;
        }

        // Rows are enumerated as plain objects instead of being cast to Hashtable:
        // the CSOM reference declares ResultRows as IEnumerable<IDictionary<string, object>>,
        // so a hard Hashtable cast can fail with InvalidCastException.
        // NOTE(review): confirm the concrete row type against the CSOM version in use.
        foreach (object rawRow in table.ResultRows)
        {
            Dictionary<string, object> dict = ToRowDictionary(rawRow);

            // Skip SharePoint version history paths
            string path = Str(dict, "Path");
            if (path.Contains("/_vti_history/", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            var result = ParseRow(dict);

            // Client-side regex filter: keep the row if either the file name or the title matches
            if (regexFilter != null)
            {
                string fileName = System.IO.Path.GetFileName(result.Path);
                if (!regexFilter.IsMatch(fileName) && !regexFilter.IsMatch(result.Title))
                {
                    continue;
                }
            }

            allResults.Add(result);
            if (allResults.Count >= maxResults)
            {
                // Cap reached mid-page: stop immediately (no progress report for the partial page).
                return allResults;
            }
        }

        progress.Report(new OperationProgress(allResults.Count, maxResults,
            $"Retrieved {allResults.Count:N0} results\u2026"));
    }

    return allResults;
}

/// <summary>
/// Copies one search result row into a string-keyed dictionary, replacing null values
/// with empty strings. Accepts both generic dictionaries and non-generic ones such as Hashtable.
/// </summary>
/// <exception cref="InvalidCastException">The row is null or not a dictionary.</exception>
private static Dictionary<string, object> ToRowDictionary(object? rawRow)
{
    var normalized = new Dictionary<string, object>();
    switch (rawRow)
    {
        case IEnumerable<KeyValuePair<string, object>> typedRow:
            foreach (var pair in typedRow)
            {
                normalized[pair.Key] = pair.Value ?? (object)string.Empty;
            }
            break;

        case System.Collections.IDictionary legacyRow:
            foreach (System.Collections.DictionaryEntry entry in legacyRow)
            {
                normalized[entry.Key.ToString()!] = entry.Value ?? (object)string.Empty;
            }
            break;

        default:
            throw new InvalidCastException(
                $"Unexpected search result row type: {rawRow?.GetType().FullName ?? "null"}.");
    }

    return normalized;
}
// ── Extension point: bypassing the 50,000-item cap ───────────────────────
//
// The StartRow approach has a hard ceiling at 50,000 (SharePoint Search boundary).
// To go beyond it, replace the StartRow loop with a DocId cursor:
//
// 1. Add "DocId" to SelectProperties.
// 2. Sort ascending by DocId: call SortList.Add("DocId", SortDirection.Ascending) on the KeywordQuery.
// 3. First page KQL: unchanged.
// Subsequent pages: append "AND DocId>{lastDocId}" to the KQL (StartRow stays 0).
// 4. Track lastDocId = Convert.ToInt64(lastRow["DocId"]) after each batch.
// 5. Stop when batch.RowCount < BatchSize.
//
// Caveats:
// - DocId is per-site-collection; for multi-site searches, maintain a separate
// cursor per ClientContext (site URL).
// - The search index can shift between batches (new items indexed mid-scan);
// the DocId cursor is safer than StartRow but cannot guarantee zero drift.
// - DocId is not returned by default — it must be in SelectProperties.
//
// This is deliberately not implemented here because SRCH-02 caps results at 50,000,
// which the StartRow approach already covers exactly (100 pages × 500 rows).
// Implement the DocId cursor if the cap needs to be lifted in a future version.
// ── KQL builder ───────────────────────────────────────────────────────────
/// <summary>
/// Builds the KQL text for a document search from the filters set on <paramref name="opts"/>.
/// </summary>
/// <remarks>
/// The query always starts with <c>ContentType:Document</c>. Each filter that is present adds
/// one clause (extensions are OR-ed inside parentheses) and all clauses are joined with
/// <c>AND</c>. The library path clause is only added when both the library and the site URL are set.
/// </remarks>
/// <param name="opts">Search filters; unset filters contribute nothing to the query.</param>
/// <returns>The KQL query text.</returns>
internal static string BuildKql(SearchOptions opts)
{
    // Wraps a user-supplied value in double quotes. Embedded double quotes are removed
    // because they would close the phrase early and corrupt the rest of the query.
    // NOTE(review): assumes KQL offers no escape for a quote inside a phrase — confirm.
    static string Quoted(string value) => "\"" + value.Replace("\"", string.Empty) + "\"";

    var parts = new List<string> { "ContentType:Document" };

    if (opts.Extensions is { Length: > 0 } extensions)
    {
        // Normalise each extension (" .PDF" -> "pdf") and drop blanks so that no
        // empty "FileExtension:" term reaches the query.
        var extParts = extensions
            .Select(e => (e ?? string.Empty).Trim().TrimStart('.').ToLowerInvariant())
            .Where(e => e.Length > 0)
            .Select(e => $"FileExtension:{e}")
            .ToList();
        if (extParts.Count > 0)
        {
            parts.Add($"({string.Join(" OR ", extParts)})");
        }
    }

    // Dates are formatted with the invariant culture: under the current culture a
    // non-Gregorian calendar (e.g. Thai Buddhist) would emit a different year.
    if (opts.CreatedAfter.HasValue)
    {
        parts.Add(FormattableString.Invariant($"Created>={opts.CreatedAfter.Value:yyyy-MM-dd}"));
    }
    if (opts.CreatedBefore.HasValue)
    {
        parts.Add(FormattableString.Invariant($"Created<={opts.CreatedBefore.Value:yyyy-MM-dd}"));
    }
    if (opts.ModifiedAfter.HasValue)
    {
        parts.Add(FormattableString.Invariant($"Write>={opts.ModifiedAfter.Value:yyyy-MM-dd}"));
    }
    if (opts.ModifiedBefore.HasValue)
    {
        parts.Add(FormattableString.Invariant($"Write<={opts.ModifiedBefore.Value:yyyy-MM-dd}"));
    }

    if (!string.IsNullOrWhiteSpace(opts.CreatedBy))
    {
        parts.Add("Author:" + Quoted(opts.CreatedBy));
    }
    if (!string.IsNullOrWhiteSpace(opts.ModifiedBy))
    {
        parts.Add("ModifiedBy:" + Quoted(opts.ModifiedBy));
    }

    if (!string.IsNullOrWhiteSpace(opts.Library) && !string.IsNullOrWhiteSpace(opts.SiteUrl))
    {
        // Join site and library with exactly one slash; the trailing * makes it a prefix match.
        string site = opts.SiteUrl.TrimEnd('/');
        string library = opts.Library.TrimStart('/');
        parts.Add("Path:" + Quoted($"{site}/{library}*"));
    }

    return string.Join(" AND ", parts);
}
/// <summary>
/// Throws when <paramref name="kql"/> is longer than 4096 characters; otherwise does nothing.
/// </summary>
/// <param name="kql">The KQL query text to check.</param>
/// <exception cref="InvalidOperationException">The query is longer than 4096 characters.</exception>
private static void ValidateKqlLength(string kql)
{
    // SharePoint Search KQL text hard cap is 4096 characters
    const int limit = 4096;

    int length = kql.Length;
    if (length <= limit)
    {
        return;
    }

    string message =
        $"KQL query exceeds 4096-character SharePoint Search limit ({length} chars). " +
        "Reduce the number of extension filters.";
    throw new InvalidOperationException(message);
}
// ── Row parser ────────────────────────────────────────────────────────────
/// <summary>
/// Maps one search result row onto a <see cref="SearchResult"/>.
/// </summary>
/// <param name="row">Row values keyed by managed property name (Title, Path, Size, ...).</param>
/// <returns>
/// The mapped result. Missing text values become empty strings, missing or unparsable dates
/// become <c>null</c>, and a missing or unparsable size becomes 0.
/// </returns>
private static SearchResult ParseRow(IDictionary<string, object> row)
{
    // Reads a date. A value that already is a DateTime is used as-is, which avoids a
    // lossy round trip through a culture-formatted string.
    static DateTime? Date(IDictionary<string, object> r, string key)
    {
        if (r.TryGetValue(key, out var raw) && raw is DateTime typed)
        {
            return typed;
        }

        var text = Str(r, key);

        // Server-supplied text is machine-formatted, so try the invariant culture first;
        // the current culture is kept as a fallback for anything that used to parse.
        if (DateTime.TryParse(text, System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.None, out var parsed))
        {
            return parsed;
        }
        return DateTime.TryParse(text, out parsed) ? parsed : (DateTime?)null;
    }

    // Reads a size in bytes. Integral values are used directly; anything else is
    // reduced to its digits before parsing, and yields 0 when no number remains.
    static long ParseSize(IDictionary<string, object> r, string key)
    {
        if (r.TryGetValue(key, out var raw))
        {
            switch (raw)
            {
                case long asLong:
                    return asLong;
                case int asInt:
                    return asInt;
            }
        }

        var digits = Regex.Replace(Str(r, key), "[^0-9]", "");
        return long.TryParse(digits, out var v) ? v : 0L;
    }

    return new SearchResult
    {
        Title = Str(row, "Title"),
        Path = Str(row, "Path"),
        FileExtension = Str(row, "FileExtension"),
        Created = Date(row, "Created"),
        LastModified = Date(row, "LastModifiedTime"),
        Author = Str(row, "Author"),
        ModifiedBy = Str(row, "ModifiedBy"),
        SizeBytes = ParseSize(row, "Size")
    };
}
/// <summary>
/// Reads <paramref name="key"/> from <paramref name="r"/> as text.
/// </summary>
/// <returns>
/// The value's <c>ToString()</c> result, or an empty string when the key is missing,
/// the value is null, or <c>ToString()</c> returns null.
/// </returns>
private static string Str(IDictionary<string, object> r, string key)
{
    if (!r.TryGetValue(key, out var value))
    {
        return string.Empty;
    }

    string? text = value?.ToString();
    return text ?? string.Empty;
}
}