Revert to existing implementation: ack and commitSync

Shekharrajak · Shekharrajak · commit c3811238794e · 2025-11-15T22:50:53.000+05:30
diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/KafkaShareGroupSourceReader.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/KafkaShareGroupSourceReader.java
@@ -352,46 +352,41 @@ protected ShareGroupSubscriptionState toSplitType(
 
     @Override
     public List<ShareGroupSubscriptionState> snapshotState(long checkpointId) {
-        // Update current checkpoint ID for record association
         currentCheckpointId.set(checkpointId);
 
-        // Ensure share consumer is set in transaction manager
         ShareConsumer<byte[], byte[]> consumer = getShareConsumer();
         if (consumer != null && transactionManager != null) {
             transactionManager.setShareConsumer(consumer);
         }
 
-        // Get records for this checkpoint (checkpoint subsuming)
         Set<RecordMetadata> recordsToAck = acknowledgmentBuffer.getRecordsUpTo(checkpointId);
 
-        // Phase 1 of 2PC: Prepare acknowledgments
+        // Phase 1: Mark records ready (DO NOT send to broker yet)
         if (!recordsToAck.isEmpty()) {
             try {
-                transactionManager.prepareAcknowledgments(checkpointId, recordsToAck);
+                transactionManager.markReadyForAcknowledgment(checkpointId, recordsToAck);
                 LOG.info(
-                        "Share group '{}': CHECKPOINT {} PREPARED - {} records marked for acknowledgment",
+                        "Share group '{}': CHECKPOINT {} READY - {} records marked (not sent to broker)",
                         shareGroupId,
                         checkpointId,
                         recordsToAck.size());
             } catch (Exception e) {
                 LOG.error(
-                        "Share group '{}': CHECKPOINT {} PREPARE FAILED - transaction will be aborted",
+                        "Share group '{}': CHECKPOINT {} MARK FAILED",
                         shareGroupId,
                         checkpointId,
                         e);
-                throw new RuntimeException("Failed to prepare checkpoint " + checkpointId, e);
+                throw new RuntimeException("Failed to mark checkpoint " + checkpointId, e);
             }
         } else {
             LOG.debug(
-                    "Share group '{}': CHECKPOINT {} SNAPSHOT - No records to prepare",
+                    "Share group '{}': CHECKPOINT {} SNAPSHOT - No records to mark",
                     shareGroupId,
                     checkpointId);
         }
 
-        // Get the current subscription state from parent
         List<ShareGroupSubscriptionState> states = super.snapshotState(checkpointId);
 
-        // Log checkpoint snapshot statistics
         AcknowledgmentBuffer.BufferStatistics stats = acknowledgmentBuffer.getStatistics();
         LOG.info(
                 "Share group '{}': CHECKPOINT {} SNAPSHOT - {} records buffered across {} checkpoints (memory: {} bytes)",
@@ -401,16 +396,14 @@ public List<ShareGroupSubscriptionState> snapshotState(long checkpointId) {
                 stats.getCheckpointCount(),
                 stats.getMemoryUsageBytes());
 
-        // Return minimal subscription state - no offset tracking needed
         return states;
     }
 
     /**
      * Callback when a checkpoint completes successfully.
      *
-     * Phase 2 of 2PC: Commit transaction.
-     * The broker applies acknowledgments atomically when checkpoint completes.
-     * This ensures no data loss - records remain locked until checkpoint succeeds.
+     * Phase 2 of 2PC: NOW send acknowledgments to broker.
+     * Records stay locked until this method succeeds - ensuring no data loss.
      *
      * @param checkpointId the ID of the checkpoint that completed
      * @throws Exception if commit fails
@@ -419,7 +412,6 @@ public List<ShareGroupSubscriptionState> snapshotState(long checkpointId) {
     public void notifyCheckpointComplete(long checkpointId) throws Exception {
         final long startTime = System.currentTimeMillis();
 
-        // Get all records up to this checkpoint for statistics
         Set<RecordMetadata> processedRecords = acknowledgmentBuffer.getRecordsUpTo(checkpointId);
 
         if (processedRecords.isEmpty()) {
@@ -432,17 +424,15 @@ public void notifyCheckpointComplete(long checkpointId) throws Exception {
         }
 
         LOG.info(
-                "Share group '{}': CHECKPOINT {} COMPLETE - Committing transaction for {} records",
+                "Share group '{}': CHECKPOINT {} COMPLETE - NOW sending {} acknowledgments to broker",
                 shareGroupId,
                 checkpointId,
                 processedRecords.size());
 
         try {
-            // Phase 2 of 2PC: Commit transaction
-            // Broker applies prepared acknowledgments atomically
+            // Phase 2: Send acknowledgments to broker (ONLY when checkpoint completes)
             transactionManager.commitTransaction(checkpointId);
 
-            // Update metrics
             final long duration = System.currentTimeMillis() - startTime;
             if (shareGroupMetrics != null) {
                 shareGroupMetrics.recordSuccessfulCommit();
@@ -452,11 +442,10 @@ public void notifyCheckpointComplete(long checkpointId) throws Exception {
                 }
             }
 
-            // Clean up buffer - remove processed record metadata
             int removedCount = acknowledgmentBuffer.removeUpTo(checkpointId);
 
             LOG.info(
-                    "Share group '{}': CHECKPOINT {} SUCCESS - Committed {} records, cleaned up {} metadata entries in {}ms",
+                    "Share group '{}': CHECKPOINT {} SUCCESS - Committed {} records to broker, cleaned up {} metadata entries in {}ms",
                     shareGroupId,
                     checkpointId,
                     processedRecords.size(),
@@ -465,7 +454,7 @@ public void notifyCheckpointComplete(long checkpointId) throws Exception {
 
         } catch (Exception e) {
             LOG.error(
-                    "Share group '{}': CHECKPOINT {} COMMIT FAILED",
+                    "Share group '{}': CHECKPOINT {} COMMIT FAILED - Records remain locked at broker",
                     shareGroupId,
                     checkpointId,
                     e);
@@ -475,7 +464,6 @@ public void notifyCheckpointComplete(long checkpointId) throws Exception {
             throw e;
         }
 
-        // Call parent implementation
         super.notifyCheckpointComplete(checkpointId);
     }
 
diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/transaction/FlinkTransactionManager.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/transaction/FlinkTransactionManager.java
@@ -32,16 +32,19 @@
 import java.util.concurrent.ConcurrentHashMap;
 
 /**
- * Manages transactional acknowledgments for Flink share group source.
+ * Coordinates acknowledgments with Flink checkpoint lifecycle for at-least-once semantics.
  *
- * Implements two-phase commit (2PC) to ensure no data loss:
- * - Phase 1 (Prepare): Send acks to broker on snapshotState
- * - Phase 2 (Commit): Broker applies acks on notifyCheckpointComplete
+ * Two-phase commit coordinated with Flink checkpoints:
+ * - Phase 1 (snapshotState): Buffer acks locally, records stay locked at broker
+ * - Phase 2 (notifyCheckpointComplete): Send acks to broker via acknowledge() + commitSync()
  *
- * Recovery logic:
- * - On restore, query broker for transaction state
- * - If PREPARED → commit (checkpoint was written)
- * - If ACTIVE → abort (checkpoint incomplete)
+ * At-least-once guarantee:
+ * - Records stay IN_FLIGHT (locked) at broker until checkpoint completes
+ * - If checkpoint fails: locks time out → records are automatically redelivered
+ * - If checkpoint succeeds: commitSync() sends the pending acknowledgments to the broker
+ *
+ * Note: acknowledge() only stages an ack in the consumer; commitSync() delivers staged acks.
+ * This manager coordinates the TIMING of commitSync() with Flink's checkpoint lifecycle.
  */
 @Internal
 public class FlinkTransactionManager {
@@ -50,11 +53,13 @@ public class FlinkTransactionManager {
     private final String shareGroupId;
     private ShareConsumer<?, ?> shareConsumer;
     private final Map<Long, TransactionState> checkpointTransactions;
+    private final Map<Long, Set<RecordMetadata>> readyForAcknowledgment;
 
     public FlinkTransactionManager(String shareGroupId, ShareConsumer<?, ?> shareConsumer) {
         this.shareGroupId = shareGroupId;
         this.shareConsumer = shareConsumer;
         this.checkpointTransactions = new ConcurrentHashMap<>();
+        this.readyForAcknowledgment = new ConcurrentHashMap<>();
     }
 
     /**
@@ -65,29 +70,57 @@ public void setShareConsumer(ShareConsumer<?, ?> shareConsumer) {
     }
 
     /**
-     * Prepare acknowledgments (Phase 1 of 2PC).
-     * Called during snapshotState before checkpoint barrier.
+     * Mark acknowledgments ready (Phase 1).
+     * Stores records locally - does NOT send to broker yet.
+     * Records remain locked (IN_FLIGHT) at broker until commitTransaction().
      */
-    public void prepareAcknowledgments(long checkpointId, Set<RecordMetadata> records) throws Exception {
+    public void markReadyForAcknowledgment(long checkpointId, Set<RecordMetadata> records) {
         if (records.isEmpty()) {
-            LOG.debug("Share group '{}': No records to prepare for checkpoint {}",
+            LOG.debug("Share group '{}': No records to mark for checkpoint {}",
+                shareGroupId, checkpointId);
+            return;
+        }
+
+        LOG.info("Share group '{}': Marking {} records ready for checkpoint {} (NOT sending to broker yet)",
+            shareGroupId, records.size(), checkpointId);
+
+        readyForAcknowledgment.put(checkpointId, records);
+        checkpointTransactions.put(checkpointId, TransactionState.READY);
+    }
+
+    /**
+     * Commit transaction (Phase 2).
+     * Acknowledges every record marked for this checkpoint, then calls commitSync().
+     * acknowledge() only stages the ack in the consumer; commitSync() sends it to the broker.
+     */
+    public void commitTransaction(long checkpointId) throws Exception {
+        Set<RecordMetadata> records = readyForAcknowledgment.remove(checkpointId);
+
+        if (records == null || records.isEmpty()) {
+            LOG.debug("Share group '{}': No records to commit for checkpoint {}",
                 shareGroupId, checkpointId);
-            return
-;
+            checkpointTransactions.remove(checkpointId);
+            return;
+        }
+
+        TransactionState state = checkpointTransactions.get(checkpointId);
+        if (state != TransactionState.READY) {
+            LOG.warn("Share group '{}': Cannot commit checkpoint {} in state {}",
+                shareGroupId, checkpointId, state);
+            return;
         }
 
-        LOG.info("Share group '{}': Preparing {} records for checkpoint {}",
+        LOG.info("Share group '{}': Committing {} records for checkpoint {}",
             shareGroupId, records.size(), checkpointId);
 
         try {
-            // Group by partition for efficient acknowledgment
+            // Send acknowledgments using Kafka's built-in atomic commit
             Map<TopicPartition, java.util.List<RecordMetadata>> byPartition = new ConcurrentHashMap<>();
             for (RecordMetadata meta : records) {
                 TopicPartition tp = new TopicPartition(meta.getTopic(), meta.getPartition());
                 byPartition.computeIfAbsent(tp, k -> new java.util.ArrayList<>()).add(meta);
             }
 
-            // Acknowledge records (marks them as prepared in broker)
             for (Map.Entry<TopicPartition, java.util.List<RecordMetadata>> entry : byPartition.entrySet()) {
                 for (RecordMetadata meta : entry.getValue()) {
                     shareConsumer.acknowledge(
@@ -97,59 +130,30 @@ public void prepareAcknowledgments(long checkpointId, Set<RecordMetadata> record
                 }
             }
 
-            // Sync to ensure broker received acknowledgments
+            // commitSync() atomically applies all acknowledgments at broker
             shareConsumer.commitSync(Duration.ofSeconds(30));
 
-            // Track transaction state
-            checkpointTransactions.put(checkpointId, TransactionState.PREPARED);
+            checkpointTransactions.put(checkpointId, TransactionState.COMMITTED);
+            cleanupOldTransactions(checkpointId);
 
-            LOG.info("Share group '{}': Prepared checkpoint {} successfully",
+            LOG.info("Share group '{}': Successfully committed checkpoint {}",
                 shareGroupId, checkpointId);
 
         } catch (Exception e) {
-            LOG.error("Share group '{}': Failed to prepare checkpoint {}",
+            LOG.error("Share group '{}': Failed to commit checkpoint {}",
                 shareGroupId, checkpointId, e);
             checkpointTransactions.put(checkpointId, TransactionState.FAILED);
             throw e;
         }
     }
 
     /**
-     * Commit transaction (Phase 2 of 2PC).
-     * Called on notifyCheckpointComplete - broker applies acknowledgments atomically.
-     */
-    public void commitTransaction(long checkpointId) {
-        TransactionState state = checkpointTransactions.get(checkpointId);
-        if (state == null) {
-            LOG.debug("Share group '{}': No transaction for checkpoint {}",
-                shareGroupId, checkpointId);
-            return;
-        }
-
-        if (state != TransactionState.PREPARED) {
-            LOG.warn("Share group '{}': Cannot commit checkpoint {} in state {}",
-                shareGroupId, checkpointId, state);
-            return;
-        }
-
-        LOG.info("Share group '{}': Committing checkpoint {}", shareGroupId, checkpointId);
-
-        // Broker automatically applies prepared acknowledgments on checkpoint complete
-        // No additional action needed - this is handled by Kafka coordinator
-
-        checkpointTransactions.put(checkpointId, TransactionState.COMMITTED);
-        cleanupOldTransactions(checkpointId);
-    }
-
-    /**
-     * Abort transaction.
-     * Called on notifyCheckpointAborted - releases record locks.
+     * Abort transaction - releases record locks for redelivery.
      */
     public void abortTransaction(long checkpointId, Set<RecordMetadata> records) {
         LOG.info("Share group '{}': Aborting checkpoint {}", shareGroupId, checkpointId);
 
         try {
-            // Release records back for redelivery
             for (RecordMetadata meta : records) {
                 shareConsumer.acknowledge(
                     meta.getConsumerRecord(),
@@ -158,32 +162,23 @@ public void abortTransaction(long checkpointId, Set<RecordMetadata> records) {
             }
 
             shareConsumer.commitSync(Duration.ofSeconds(10));
-
             checkpointTransactions.put(checkpointId, TransactionState.ABORTED);
 
         } catch (Exception e) {
             LOG.error("Share group '{}': Failed to abort checkpoint {}",
                 shareGroupId, checkpointId, e);
-            // Records will timeout and be redelivered automatically
         }
 
         cleanupOldTransactions(checkpointId);
     }
 
     /**
-     * Handle recovery after task restart.
-     * Queries broker for transaction state and makes recovery decision.
+     * Logs the restore only; recovery relies on Kafka's acquisition lock timeout.
+     * If the task fails, locks expire and records are redelivered - no explicit action needed.
      */
     public void recoverFromCheckpoint(long restoredCheckpointId) {
-        LOG.info("Share group '{}': Recovering from checkpoint {}",
+        LOG.info("Share group '{}': Recovering from checkpoint {} - relying on Kafka lock timeout for redelivery",
             shareGroupId, restoredCheckpointId);
-
-        // Query broker for transaction state
-        // In actual implementation, this would use admin client to query broker
-        // For now, conservative approach: assume need to restart
-
-        LOG.info("Share group '{}': Recovery complete - ready for new checkpoints",
-            shareGroupId);
     }
 
     private void cleanupOldTransactions(long completedCheckpointId) {
@@ -193,7 +188,7 @@ private void cleanupOldTransactions(long completedCheckpointId) {
     }
 
     private enum TransactionState {
-        PREPARED,
+        READY,
         COMMITTED,
         ABORTED,
         FAILED