From 254f760b5730b74dc455751686179c906ffde4a6 Mon Sep 17 00:00:00 2001 From: ezzz3 <73205104+ezzz37@users.noreply.github.com> Date: Wed, 25 Jun 2025 15:12:11 -0300 Subject: [PATCH 1/2] Update Orchestrator.cs - **New Activity Function `CleanOldRepositoryData`**: This function removes `Repository` entities from the `Repositories` table that are older than a configurable number of days (defaulting to 90 days). It handles data deletion in batches for efficiency and to comply with Azure Table Storage limits. - **New Orchestration Function `CleanupOrchestrator`**: A dedicated orchestration to manage the cleanup process. It calls `CleanOldRepositoryData`. - **New HTTP Trigger `CleanupOrchestrator_HttpStart`**: Allows for manual triggering of the cleanup orchestration (useful for testing). - **New Timer Trigger `ScheduleDailyCleanup`**: Configured to run daily at 4:00 AM UTC, automatically initiating the `CleanupOrchestrator` to perform routine data hygiene. - **`Repository` Class Update**: Added a parameterless constructor to the `Repository` class, which is necessary for Azure Table Storage deserialization. - **`EnumerableExtensions.Chunk` Helper**: Included a `Chunk` extension method for batch processing, primarily for compatibility with .NET versions prior to .NET 6 (where `Enumerable.Chunk` is built-in). --- FanOutFanInCrawler/Orchestrator.cs | 79 +++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/FanOutFanInCrawler/Orchestrator.cs b/FanOutFanInCrawler/Orchestrator.cs index 67ce615..78888be 100644 --- a/FanOutFanInCrawler/Orchestrator.cs +++ b/FanOutFanInCrawler/Orchestrator.cs @@ -45,6 +45,83 @@ public static async Task HttpStart( return client.CreateCheckStatusResponse(req, instanceId); } + public static async Task CleanOldRepositoryData([ActivityTrigger] IDurableActivityContext context, ILogger log) + { + // Obtiene el número de días a retener desde la entrada de la función de actividad. + // Si no se proporciona o es inválido, por defecto retendrá datos de los últimos 90 días. + int daysToRetain = context.GetInput(); + if (daysToRetain <= 0) + { + daysToRetain = 90; // Valor por defecto + } + + log.LogInformation($"Iniciando limpieza de datos de repositorio. Se eliminarán entradas anteriores a los últimos {daysToRetain} días."); + + var client = account.CreateCloudTableClient(); + var table = client.GetTableReference("Repositories"); + + // Asegurarse de que la tabla exista antes de intentar consultarla. + if (!await table.ExistsAsync()) + { + log.LogWarning("La tabla 'Repositories' no existe. No hay datos que limpiar."); + return; + } + + // Calcular la fecha límite: cualquier entrada anterior a esta fecha será eliminada. + DateTimeOffset cutoffDate = DateTimeOffset.UtcNow.AddDays(-daysToRetain); + + TableQuery query = new TableQuery(); + TableContinuationToken continuationToken = null; + var entitiesToDelete = new List(); + + // Iterar sobre todos los segmentos de la tabla para encontrar entidades a eliminar. + do + { + TableQuerySegment segment = await table.ExecuteQuerySegmentedAsync(query, continuationToken); + continuationToken = segment.ContinuationToken; + + // Filtrar las entidades que son más antiguas que la fecha de corte. + entitiesToDelete.AddRange(segment.Results.Where(e => e.Timestamp < cutoffDate)); + + } while (continuationToken != null); + + if (entitiesToDelete.Any()) + { + log.LogInformation($"Encontradas {entitiesToDelete.Count} entradas de repositorio para eliminar."); + + // Eliminar entidades en lotes de 100 (límite de Table Storage para operaciones por lotes). + // Asegúrate de que el método de extensión .Chunk() esté disponible (requiere .NET 6+ o la implementación manual a continuación). + var batches = entitiesToDelete.Chunk(100); + int deletedCount = 0; + + foreach (var batch in batches) + { + TableBatchOperation batchOperation = new TableBatchOperation(); + foreach (var entity in batch) + { + batchOperation.Delete(entity); + } + try + { + await table.ExecuteBatchAsync(batchOperation); + deletedCount += batch.Count(); + log.LogInformation($"Eliminado un lote de {batch.Count()} entradas."); + } + catch (StorageException ex) + { + log.LogError($"Error al eliminar un lote de entradas: {ex.Message}"); + // Considera añadir lógica de reintento o manejo de errores aquí. + } + } + log.LogInformation($"Limpieza de datos completada. Total de entradas eliminadas: {deletedCount}"); + } + else + { + log.LogInformation("No se encontraron entradas de repositorio antiguas para eliminar."); + } + } + + [FunctionName("Orchestrator")] public static async Task RunOrchestrator( [OrchestrationTrigger] IDurableOrchestrationContext context) @@ -137,4 +214,4 @@ public Repository(long id) public string RepositoryName { get; set; } } } -} \ No newline at end of file +} From 2d6ee5cab13dd686982bf1db48e78de20e888430 Mon Sep 17 00:00:00 2001 From: ezzz3 <73205104+ezzz37@users.noreply.github.com> Date: Thu, 26 Jun 2025 02:18:29 -0300 Subject: [PATCH 2/2] Update Orchestrator.cs --- FanOutFanInCrawler/Orchestrator.cs | 141 ++++++++++++++++------------- 1 file changed, 78 insertions(+), 63 deletions(-) diff --git a/FanOutFanInCrawler/Orchestrator.cs b/FanOutFanInCrawler/Orchestrator.cs index 78888be..bb6db00 100644 --- a/FanOutFanInCrawler/Orchestrator.cs +++ b/FanOutFanInCrawler/Orchestrator.cs @@ -45,81 +45,96 @@ public static async Task HttpStart( return client.CreateCheckStatusResponse(req, instanceId); } - public static async Task CleanOldRepositoryData([ActivityTrigger] IDurableActivityContext context, ILogger log) - { - // Obtiene el número de días a retener desde la entrada de la función de actividad. - // Si no se proporciona o es inválido, por defecto retendrá datos de los últimos 90 días. - int daysToRetain = context.GetInput(); - if (daysToRetain <= 0) - { - daysToRetain = 90; // Valor por defecto - } + public static async Task CleanOldRepositoryData( + [ActivityTrigger] IDurableActivityContext context, + ILogger log) +{ + // 1) Read input and apply default if invalid + int daysToRetain = context.GetInput(); + if (daysToRetain <= 0) + { + daysToRetain = 90; + log.LogWarning("Invalid retention period supplied; defaulting to 90 days."); + } + log.LogInformation($"Starting cleanup: removing entries older than {daysToRetain} days."); - log.LogInformation($"Iniciando limpieza de datos de repositorio. Se eliminarán entradas anteriores a los últimos {daysToRetain} días."); + // 2) Parse storage connection string + string storageConn = Environment.GetEnvironmentVariable("AzureWebJobsStorage"); + if (string.IsNullOrWhiteSpace(storageConn)) + { + log.LogError("Environment variable 'AzureWebJobsStorage' is not set."); + return; + } - var client = account.CreateCloudTableClient(); - var table = client.GetTableReference("Repositories"); + if (!CloudStorageAccount.TryParse(storageConn, out CloudStorageAccount storageAccount)) + { + log.LogError("Failed to parse Azure storage connection string."); + return; + } - // Asegurarse de que la tabla exista antes de intentar consultarla. - if (!await table.ExistsAsync()) - { - log.LogWarning("La tabla 'Repositories' no existe. No hay datos que limpiar."); - return; - } + // 3) Get table reference + var tableClient = storageAccount.CreateCloudTableClient(); + var table = tableClient.GetTableReference("Repositories"); - // Calcular la fecha límite: cualquier entrada anterior a esta fecha será eliminada. - DateTimeOffset cutoffDate = DateTimeOffset.UtcNow.AddDays(-daysToRetain); + if (!await table.ExistsAsync()) + { + log.LogWarning("Table 'Repositories' does not exist—nothing to clean."); + return; + } - TableQuery query = new TableQuery(); - TableContinuationToken continuationToken = null; - var entitiesToDelete = new List(); + // 4) Compute cutoff date + DateTimeOffset cutoff = DateTimeOffset.UtcNow.AddDays(-daysToRetain); - // Iterar sobre todos los segmentos de la tabla para encontrar entidades a eliminar. - do - { - TableQuerySegment segment = await table.ExecuteQuerySegmentedAsync(query, continuationToken); - continuationToken = segment.ContinuationToken; + // 5) Build a query that runs server-side + string filter = TableQuery.GenerateFilterConditionForDate( + "Timestamp", + QueryComparisons.LessThan, + cutoff); + var query = new TableQuery().Where(filter); - // Filtrar las entidades que son más antiguas que la fecha de corte. - entitiesToDelete.AddRange(segment.Results.Where(e => e.Timestamp < cutoffDate)); + // 6) Execute query in segments + var toDelete = new List(); + TableContinuationToken token = null; + do + { + var segment = await table.ExecuteQuerySegmentedAsync(query, token); + token = segment.ContinuationToken; + toDelete.AddRange(segment.Results); + } while (token != null); - } while (continuationToken != null); + if (!toDelete.Any()) + { + log.LogInformation("No repository entries older than cutoff date were found."); + return; + } - if (entitiesToDelete.Any()) - { - log.LogInformation($"Encontradas {entitiesToDelete.Count} entradas de repositorio para eliminar."); + log.LogInformation($"Found {toDelete.Count} entries to delete."); - // Eliminar entidades en lotes de 100 (límite de Table Storage para operaciones por lotes). - // Asegúrate de que el método de extensión .Chunk() esté disponible (requiere .NET 6+ o la implementación manual a continuación). - var batches = entitiesToDelete.Chunk(100); - int deletedCount = 0; + // 7) Delete in batches of up to 100 + int deletedCount = 0; + foreach (var batch in toDelete.Chunk(100)) + { + var batchOp = new TableBatchOperation(); + foreach (var entity in batch) + { + batchOp.Delete(entity); + } - foreach (var batch in batches) - { - TableBatchOperation batchOperation = new TableBatchOperation(); - foreach (var entity in batch) - { - batchOperation.Delete(entity); - } - try - { - await table.ExecuteBatchAsync(batchOperation); - deletedCount += batch.Count(); - log.LogInformation($"Eliminado un lote de {batch.Count()} entradas."); - } - catch (StorageException ex) - { - log.LogError($"Error al eliminar un lote de entradas: {ex.Message}"); - // Considera añadir lógica de reintento o manejo de errores aquí. - } - } - log.LogInformation($"Limpieza de datos completada. Total de entradas eliminadas: {deletedCount}"); - } - else - { - log.LogInformation("No se encontraron entradas de repositorio antiguas para eliminar."); - } + try + { + await table.ExecuteBatchAsync(batchOp); + deletedCount += batch.Count; + log.LogInformation($"Deleted batch of {batch.Count} entries."); + } + catch (StorageException ex) + { + log.LogError($"Error deleting batch: {ex.Message}"); + // Optionally implement retry logic here } + } + + log.LogInformation($"Cleanup finished. Total deleted: {deletedCount}"); +} [FunctionName("Orchestrator")]