// C# source file (140 lines, 8.6 KiB)
using System.Diagnostics;
|
|
using Microsoft.SharePoint.Client;
|
|
using Microsoft.SharePoint.Client.Search.Query;
|
|
using SharepointToolbox.Web.Core.Helpers;
|
|
using SharepointToolbox.Web.Core.Models;
|
|
using SharepointToolbox.Web.Services.Export;
|
|
|
|
namespace SharepointToolbox.Web.Services;
|
|
|
|
/// <summary>
/// Scans a SharePoint site for likely duplicates. Files are collected through the search API
/// (<see cref="KeywordQuery"/>), folders by enumerating the items of the site's document
/// libraries. Collected items are then grouped by a key built from the criteria enabled in
/// <see cref="DuplicateScanOptions"/>.
/// </summary>
public class DuplicatesService : IDuplicatesService
{
    /// <summary>Rows requested per search page; also the increment applied to the start row.</summary>
    private const int BatchSize = 500;

    /// <summary>
    /// Largest start-row offset the paging loop will request; also reported as the progress total.
    /// NOTE(review): presumably mirrors the search service paging limit — confirm.
    /// </summary>
    private const int MaxStartRow = 50_000;

    /// <summary>Managed properties requested for every search row.</summary>
    private static readonly string[] SearchProperties =
    {
        "Title", "Path", "FileExtension", "Created", "LastModifiedTime", "Size", "ParentLink"
    };

    /// <summary>Matches every character that is not an ASCII digit; built once instead of per row.</summary>
    private static readonly System.Text.RegularExpressions.Regex NonDigitPattern =
        new System.Text.RegularExpressions.Regex("[^0-9]", System.Text.RegularExpressions.RegexOptions.Compiled);

    /// <summary>
    /// Collects files or folders (folders when <c>options.Mode</c> is exactly <c>"Folders"</c>,
    /// files otherwise) and returns the groups that contain at least two items sharing the same key.
    /// </summary>
    /// <param name="ctx">Client context of the site to scan.</param>
    /// <param name="options">Scan mode, optional library filter and matching criteria.</param>
    /// <param name="progress">Receives progress notifications while collecting and grouping.</param>
    /// <param name="ct">Token checked before the scan starts and between batches/libraries.</param>
    /// <returns>Duplicate groups ordered by descending item count, then by group name.</returns>
    /// <exception cref="ArgumentNullException">An object argument is <c>null</c>.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
    public async Task<IReadOnlyList<DuplicateGroup>> ScanDuplicatesAsync(
        ClientContext ctx, DuplicateScanOptions options,
        IProgress<OperationProgress> progress, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(ctx);
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(progress);
        ct.ThrowIfCancellationRequested();

        List<DuplicateItem> allItems = options.Mode == "Folders"
            ? await CollectFolderItemsAsync(ctx, options, progress, ct)
            : await CollectFileItemsAsync(ctx, options, progress, ct);

        progress.Report(OperationProgress.Indeterminate($"Grouping {allItems.Count:N0} items…"));

        return allItems
            .GroupBy(item => MakeKey(item, options))
            // Materialise each group once so it is not enumerated for the count and again for the list.
            .Select(g => (g.Key, Items: g.ToList()))
            .Where(g => g.Items.Count >= 2)
            .Select(g => new DuplicateGroup { GroupKey = g.Key, Name = BuildGroupName(g.Items), Items = g.Items })
            .OrderByDescending(g => g.Items.Count)
            .ThenBy(g => g.Name)
            .ToList();
    }

    /// <summary>
    /// Builds the display name of a group: the first item's name, followed by the distinct
    /// (case-insensitive), sorted library names in parentheses when any item has one.
    /// </summary>
    private static string BuildGroupName(List<DuplicateItem> items)
    {
        var libraries = items
            .Select(i => i.Library)
            .Where(l => !string.IsNullOrEmpty(l))
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .OrderBy(l => l)
            .ToList();

        return libraries.Count > 0
            ? $"{items[0].Name} ({string.Join(", ", libraries)})"
            : items[0].Name;
    }

    /// <summary>
    /// Pages through search results for <c>ContentType:Document</c> and converts each row into a
    /// <see cref="DuplicateItem"/>. Rows whose path contains <c>/_vti_history/</c> are skipped.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the query is only restricted by <c>Path</c> when <c>options.Library</c> is set;
    /// confirm whether results should also be scoped to the current site.
    /// </remarks>
    private static async Task<List<DuplicateItem>> CollectFileItemsAsync(
        ClientContext ctx, DuplicateScanOptions options,
        IProgress<OperationProgress> progress, CancellationToken ct)
    {
        var (siteUrl, siteTitle) = await LoadSiteIdentityAsync(ctx, progress, ct);

        var kqlParts = new List<string> { "ContentType:Document" };
        if (!string.IsNullOrEmpty(options.Library))
        {
            // The value is embedded in a quoted KQL phrase; drop embedded double quotes so the
            // library name cannot terminate the phrase and alter the query.
            string librarySegment = options.Library.TrimStart('/').Replace("\"", string.Empty);
            kqlParts.Add($"Path:\"{ctx.Url.TrimEnd('/')}/{librarySegment}*\"");
        }
        string kql = string.Join(" AND ", kqlParts);

        var allItems = new List<DuplicateItem>();
        int startRow = 0;
        do
        {
            ct.ThrowIfCancellationRequested();

            var query = new KeywordQuery(ctx)
            {
                QueryText = kql,
                StartRow = startRow,
                RowLimit = BatchSize,
                TrimDuplicates = false
            };
            foreach (var property in SearchProperties)
            {
                query.SelectProperties.Add(property);
            }

            var executor = new SearchExecutor(ctx);
            var clientResult = executor.ExecuteQuery(query);
            await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

            var table = clientResult.Value.FirstOrDefault(t => t.TableType == KnownTableTypes.RelevantResults);
            if (table == null || table.RowCount == 0)
            {
                break;
            }

            foreach (var rawRow in table.ResultRows)
            {
                IDictionary<string, object> row;
                if (rawRow is IDictionary<string, object> generic)
                {
                    row = generic;
                }
                else if (rawRow is System.Collections.IDictionary legacy)
                {
                    // Copy a non-generic dictionary into the generic shape used below.
                    row = new Dictionary<string, object>();
                    foreach (System.Collections.DictionaryEntry entry in legacy)
                    {
                        row[entry.Key.ToString()!] = entry.Value ?? string.Empty;
                    }
                }
                else
                {
                    continue;
                }

                string path = GetStr(row, "Path");
                if (path.Contains("/_vti_history/", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Prefer the last path segment as the name; fall back to the Title property.
                string name = System.IO.Path.GetFileName(path);
                if (string.IsNullOrEmpty(name))
                {
                    name = GetStr(row, "Title");
                }

                // Keep only the ASCII digits of the Size value before parsing; anything unparsable counts as 0.
                string digits = NonDigitPattern.Replace(GetStr(row, "Size"), string.Empty);
                long size = long.TryParse(
                    digits,
                    System.Globalization.NumberStyles.None,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out var parsedSize)
                    ? parsedSize
                    : 0L;

                allItems.Add(new DuplicateItem
                {
                    Name = name,
                    Path = path,
                    Library = ExtractLibraryFromPath(path, ctx.Url),
                    SizeBytes = size,
                    Created = ParseDate(GetStr(row, "Created")),
                    Modified = ParseDate(GetStr(row, "LastModifiedTime")),
                    SiteUrl = siteUrl,
                    SiteTitle = siteTitle
                });
            }

            progress.Report(new OperationProgress(allItems.Count, MaxStartRow, $"Collected {allItems.Count:N0} files…"));
            startRow += BatchSize;
        }
        while (startRow <= MaxStartRow);

        return allItems;
    }

    /// <summary>
    /// Enumerates every visible document library of the web (or only the one whose title equals
    /// <c>options.Library</c>, case-insensitively) and returns one <see cref="DuplicateItem"/>
    /// per folder found.
    /// </summary>
    private static async Task<List<DuplicateItem>> CollectFolderItemsAsync(
        ClientContext ctx, DuplicateScanOptions options,
        IProgress<OperationProgress> progress, CancellationToken ct)
    {
        ctx.Load(ctx.Web, w => w.Title, w => w.Lists.Include(l => l.Title, l => l.Hidden, l => l.BaseType));
        await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);

        string siteUrl = ctx.Url;
        string siteTitle = string.IsNullOrWhiteSpace(ctx.Web.Title)
            ? ReportSplitHelper.DeriveSiteLabel(siteUrl)
            : ctx.Web.Title;

        var libs = ctx.Web.Lists.Where(l => !l.Hidden && l.BaseType == BaseType.DocumentLibrary).ToList();
        if (!string.IsNullOrEmpty(options.Library))
        {
            libs = libs.Where(l => l.Title.Equals(options.Library, StringComparison.OrdinalIgnoreCase)).ToList();
        }

        var camlQuery = new CamlQuery
        {
            ViewXml = "<View Scope='RecursiveAll'><Query></Query><RowLimit Paged='TRUE'>5000</RowLimit></View>"
        };

        var allItems = new List<DuplicateItem>();
        foreach (var lib in libs)
        {
            ct.ThrowIfCancellationRequested();
            progress.Report(OperationProgress.Indeterminate($"Scanning folders in {lib.Title}…"));

            await foreach (var item in SharePointPaginationHelper.GetAllItemsAsync(ctx, lib, camlQuery, ct))
            {
                if (item.FileSystemObjectType != FileSystemObjectType.Folder)
                {
                    continue;
                }

                // Read fields defensively: a field missing from the result must not abort the scan.
                IDictionary<string, object> fields = item.FieldValues;
                int subfolderCount = GetFieldInt(fields, "FolderChildCount");
                int childCount = GetFieldInt(fields, "ItemChildCount");

                allItems.Add(new DuplicateItem
                {
                    Name = GetStr(fields, "FileLeafRef"),
                    Path = GetStr(fields, "FileRef"),
                    Library = lib.Title,
                    FolderCount = subfolderCount,
                    // File count is derived as ItemChildCount minus FolderChildCount, floored at zero.
                    FileCount = Math.Max(0, childCount - subfolderCount),
                    Created = GetFieldDate(fields, "Created"),
                    Modified = GetFieldDate(fields, "Modified"),
                    SiteUrl = siteUrl,
                    SiteTitle = siteTitle
                });
            }
        }

        return allItems;
    }

    /// <summary>
    /// Builds the grouping key for an item: the lower-cased name followed by one <c>|</c>-separated
    /// slot for every enabled criterion, in the fixed order size, created date, modified date,
    /// subfolder count, file count.
    /// </summary>
    /// <remarks>
    /// An enabled criterion whose value is missing contributes an empty slot rather than being
    /// skipped. Skipping made keys positionally ambiguous: an item with only a modified date and an
    /// item with only a created date on the same day produced the same key. Values are formatted
    /// with the invariant culture so the key does not depend on the current culture.
    /// </remarks>
    /// <param name="item">Item to build the key for.</param>
    /// <param name="opts">Options selecting which criteria take part in the key.</param>
    /// <returns>The key used to group items as duplicates.</returns>
    internal static string MakeKey(DuplicateItem item, DuplicateScanOptions opts)
    {
        var invariant = System.Globalization.CultureInfo.InvariantCulture;
        var parts = new List<string> { item.Name.ToLowerInvariant() };

        if (opts.MatchSize)
        {
            parts.Add(item.SizeBytes.HasValue ? item.SizeBytes.Value.ToString(invariant) : string.Empty);
        }
        if (opts.MatchCreated)
        {
            parts.Add(item.Created.HasValue ? item.Created.Value.Date.ToString("yyyy-MM-dd", invariant) : string.Empty);
        }
        if (opts.MatchModified)
        {
            parts.Add(item.Modified.HasValue ? item.Modified.Value.Date.ToString("yyyy-MM-dd", invariant) : string.Empty);
        }
        if (opts.MatchSubfolderCount)
        {
            parts.Add(item.FolderCount.HasValue ? item.FolderCount.Value.ToString(invariant) : string.Empty);
        }
        if (opts.MatchFileCount)
        {
            parts.Add(item.FileCount.HasValue ? item.FileCount.Value.ToString(invariant) : string.Empty);
        }

        return string.Join("|", parts);
    }

    /// <summary>Returns the value stored under <paramref name="key"/> as a string, or an empty string when the key is absent or the value is null.</summary>
    private static string GetStr(IDictionary<string, object> r, string key) =>
        r.TryGetValue(key, out var v) ? v?.ToString() ?? string.Empty : string.Empty;

    /// <summary>
    /// Returns the value stored under <paramref name="key"/> as an integer. Missing or null values
    /// and unparsable strings yield 0; other values are converted with <see cref="Convert.ToInt32(object, IFormatProvider)"/>.
    /// </summary>
    private static int GetFieldInt(IDictionary<string, object> values, string key)
    {
        if (!values.TryGetValue(key, out var raw) || raw is null)
        {
            return 0;
        }

        if (raw is string text)
        {
            return int.TryParse(
                text,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out var parsed)
                ? parsed
                : 0;
        }

        return Convert.ToInt32(raw, System.Globalization.CultureInfo.InvariantCulture);
    }

    /// <summary>Returns the value stored under <paramref name="key"/> when it is a <see cref="DateTime"/>; otherwise null.</summary>
    private static DateTime? GetFieldDate(IDictionary<string, object> values, string key) =>
        values.TryGetValue(key, out var raw) && raw is DateTime date ? date : (DateTime?)null;

    /// <summary>
    /// Parses <paramref name="s"/> as a date, returning null when it cannot be parsed.
    /// NOTE(review): parsing uses the current culture, which pairs with the culture-sensitive
    /// <c>ToString()</c> in <see cref="GetStr"/>; confirm the type the search API returns for dates
    /// before switching to a fixed culture.
    /// </summary>
    private static DateTime? ParseDate(string s) =>
        DateTime.TryParse(s, out var dt) ? dt : (DateTime?)null;

    /// <summary>
    /// Returns the first path segment below <paramref name="siteUrl"/>, used as the library name.
    /// </summary>
    /// <remarks>
    /// The site prefix only matches on a segment boundary, so a site URL ending in <c>/foo</c> no
    /// longer matches a path under <c>/foobar</c>. An absolute URL that is not under the site yields
    /// an empty string instead of its scheme. Paths without a scheme that do not start with the site
    /// URL are still treated as already relative.
    /// </remarks>
    /// <param name="path">Item path, typically an absolute URL.</param>
    /// <param name="siteUrl">URL of the site the path is resolved against.</param>
    /// <returns>The library segment, or an empty string when it cannot be determined.</returns>
    private static string ExtractLibraryFromPath(string path, string siteUrl)
    {
        if (string.IsNullOrEmpty(path) || string.IsNullOrEmpty(siteUrl))
        {
            return string.Empty;
        }

        string root = siteUrl.TrimEnd('/');
        bool underSite = path.StartsWith(root, StringComparison.OrdinalIgnoreCase)
            && (path.Length == root.Length || path[root.Length] == '/');

        string relative;
        if (underSite)
        {
            relative = path[root.Length..].TrimStart('/');
        }
        else if (path.Contains("://", StringComparison.Ordinal))
        {
            // Absolute URL belonging to another site (or a sibling sharing the prefix).
            return string.Empty;
        }
        else
        {
            relative = path;
        }

        int slash = relative.IndexOf('/');
        return slash > 0 ? relative[..slash] : relative;
    }

    /// <summary>
    /// Best-effort lookup of the site URL and title. A failure to load the title is logged and
    /// the title falls back to <c>ReportSplitHelper.DeriveSiteLabel</c>; cancellation is propagated.
    /// </summary>
    private static async Task<(string Url, string Title)> LoadSiteIdentityAsync(
        ClientContext ctx, IProgress<OperationProgress> progress, CancellationToken ct)
    {
        try
        {
            ctx.Load(ctx.Web, w => w.Title);
            await ExecuteQueryRetryHelper.ExecuteQueryRetryAsync(ctx, progress, ct);
        }
        catch (OperationCanceledException)
        {
            throw;
        }
        catch (Exception ex)
        {
            Debug.WriteLine($"LoadSiteIdentityAsync: {ex.Message}");
        }

        var url = ctx.Url ?? string.Empty;

        string title;
        try
        {
            // Reading the property can throw when the load above did not succeed.
            title = ctx.Web.Title;
        }
        catch (Exception ex)
        {
            Debug.WriteLine($"LoadSiteIdentityAsync: title unavailable: {ex.Message}");
            title = string.Empty;
        }

        if (string.IsNullOrWhiteSpace(title))
        {
            title = ReportSplitHelper.DeriveSiteLabel(url);
        }

        return (url, title);
    }
}
|