Add timeout to work pool enqueuing

acogoluegnes · acogoluegnes · commit 4e223b205cf2 · 2018-02-12T11:51:17.000+01:00
Making the work pool fail after it didn't manage to enqueue work for a given time makes the client more reactive to broker overload. Note this usually happens to clients that do not set QoS properly. Nevertheless, making the client fail as early as possible can avoid hard-to-debug connection failures. This complements the triggering of connection recovery on failed write operations. Work pool enqueueing timeout is useful for NIO, where the same thread is used for both reading and writing (if the thread is stuck waiting on work pool enqueueing, no write operation can occur, and the TCP connection failure is never detected). [#154263515] Fixes #341
diff --git a/src/main/java/com/rabbitmq/client/ConnectionFactory.java b/src/main/java/com/rabbitmq/client/ConnectionFactory.java
@@ -81,6 +81,9 @@ public class ConnectionFactory implements Cloneable {
     /** The default network recovery interval: 5000 millis */
     public static final long   DEFAULT_NETWORK_RECOVERY_INTERVAL = 5000;
 
+    /** The default timeout for work pool enqueueing: no timeout */
+    public static final int    DEFAULT_WORK_POOL_TIMEOUT = -1;
+
     private static final String PREFERRED_TLS_PROTOCOL = "TLSv1.2";
 
     private static final String FALLBACK_TLS_PROTOCOL = "TLSv1";
@@ -138,6 +141,12 @@ public class ConnectionFactory implements Cloneable {
      */
     private boolean channelShouldCheckRpcResponseType = false;
 
+    /**
+     * Timeout in ms for work pool enqueuing.
+     * @since 4.5.0
+     */
+    private int workPoolTimeout = DEFAULT_WORK_POOL_TIMEOUT;
+
     /** @return the default host to use for connections */
     public String getHost() {
         return host;
@@ -974,6 +983,7 @@ public ConnectionParams params(ExecutorService consumerWorkServiceExecutor) {
         result.setHeartbeatExecutor(heartbeatExecutor);
         result.setChannelRpcTimeout(channelRpcTimeout);
         result.setChannelShouldCheckRpcResponseType(channelShouldCheckRpcResponseType);
+        result.setWorkPoolTimeout(workPoolTimeout);
         return result;
     }
 
@@ -1270,4 +1280,25 @@ public void setChannelShouldCheckRpcResponseType(boolean channelShouldCheckRpcRe
     public boolean isChannelShouldCheckRpcResponseType() {
         return channelShouldCheckRpcResponseType;
     }
+
+    /**
+     * Timeout (in ms) for work pool enqueueing.
+     * The {@link WorkPool} dispatches several types of responses
+     * from the broker (e.g. deliveries). A high-traffic
+     * client with slow consumers can exhaust the work pool and
+     * compromise the whole connection (by e.g. letting the broker
+     * saturate the receive TCP buffers). Setting a timeout
+     * would make the connection fail early and avoid hard-to-diagnose
+     * TCP connection failure. Note this shouldn't happen
+     * with clients that set appropriate QoS values.
+     * Default is no timeout.
+     * @param workPoolTimeout timeout in ms
+     */
+    public void setWorkPoolTimeout(int workPoolTimeout) {
+        this.workPoolTimeout = workPoolTimeout;
+    }
+
+    public int getWorkPoolTimeout() {
+        return workPoolTimeout;
+    }
 }
diff --git a/src/main/java/com/rabbitmq/client/impl/AMQConnection.java b/src/main/java/com/rabbitmq/client/impl/AMQConnection.java
@@ -64,6 +64,8 @@ public class AMQConnection extends ShutdownNotifierComponent implements Connecti
 
     private final ErrorOnWriteListener errorOnWriteListener;
 
+    private final int workPoolTimeout;
+
     private final AtomicBoolean finalShutdownStarted = new AtomicBoolean(false);
 
     /**
@@ -255,12 +257,12 @@ public AMQConnection(ConnectionParams params, FrameHandler frameHandler, Metrics
             new ErrorOnWriteListener() {
                 @Override
                 public void handle(Connection connection, IOException exception) { }
-            };
-
+        };
+        this.workPoolTimeout = params.getWorkPoolTimeout();
     }
 
     private void initializeConsumerWorkService() {
-        this._workService  = new ConsumerWorkService(consumerWorkServiceExecutor, threadFactory, shutdownTimeout);
+        this._workService  = new ConsumerWorkService(consumerWorkServiceExecutor, threadFactory, workPoolTimeout, shutdownTimeout);
     }
 
     private void initializeHeartbeatSender() {
@@ -619,6 +621,9 @@ public boolean handleReadFrame(Frame frame) {
             try {
                 readFrame(frame);
                 return true;
+            } catch (WorkPoolFullException e) {
+                // work pool is full, we propagate this one.
+                throw e;
             } catch (Throwable ex) {
                 try {
                     handleFailure(ex);
diff --git a/src/main/java/com/rabbitmq/client/impl/ChannelN.java b/src/main/java/com/rabbitmq/client/impl/ChannelN.java
@@ -387,6 +387,9 @@ private void releaseChannel() {
                 if (callback != null) {
                     try {
                         this.dispatcher.handleCancel(callback, consumerTag);
+                    } catch (WorkPoolFullException e) {
+                        // couldn't enqueue in work pool, propagating
+                        throw e;
                     } catch (Throwable ex) {
                         getConnection().getExceptionHandler().handleConsumerException(this,
                                                                                       ex,
@@ -445,10 +448,13 @@ protected void processDelivery(Command command, Basic.Deliver method) {
             // in case a manual ack in the callback, the stats will be able to record the ack
             metricsCollector.consumedMessage(this, m.getDeliveryTag(), m.getConsumerTag());
             this.dispatcher.handleDelivery(callback,
-                                           m.getConsumerTag(),
-                                           envelope,
-                                           (BasicProperties) command.getContentHeader(),
-                                           command.getContentBody());
+                m.getConsumerTag(),
+                envelope,
+                (BasicProperties) command.getContentHeader(),
+                command.getContentBody());
+        } catch (WorkPoolFullException e) {
+            // couldn't enqueue in work pool, propagating
+            throw e;
         } catch (Throwable ex) {
             getConnection().getExceptionHandler().handleConsumerException(this,
                 ex,
diff --git a/src/main/java/com/rabbitmq/client/impl/ConnectionParams.java b/src/main/java/com/rabbitmq/client/impl/ConnectionParams.java
@@ -45,6 +45,7 @@ public class ConnectionParams {
     private int channelRpcTimeout;
     private boolean channelShouldCheckRpcResponseType;
     private ErrorOnWriteListener errorOnWriteListener;
+    private int workPoolTimeout = -1;
 
     private ExceptionHandler exceptionHandler;
     private ThreadFactory threadFactory;
@@ -222,4 +223,12 @@ public void setErrorOnWriteListener(ErrorOnWriteListener errorOnWriteListener) {
     public ErrorOnWriteListener getErrorOnWriteListener() {
         return errorOnWriteListener;
     }
+
+    public void setWorkPoolTimeout(int workPoolTimeout) {
+        this.workPoolTimeout = workPoolTimeout;
+    }
+
+    public int getWorkPoolTimeout() {
+        return workPoolTimeout;
+    }
 }
diff --git a/src/main/java/com/rabbitmq/client/impl/ConsumerWorkService.java b/src/main/java/com/rabbitmq/client/impl/ConsumerWorkService.java
@@ -31,14 +31,18 @@ final public class ConsumerWorkService {
     private final WorkPool<Channel, Runnable> workPool;
     private final int shutdownTimeout;
 
-    public ConsumerWorkService(ExecutorService executor, ThreadFactory threadFactory, int shutdownTimeout) {
+    public ConsumerWorkService(ExecutorService executor, ThreadFactory threadFactory, int queueingTimeout, int shutdownTimeout) {
         this.privateExecutor = (executor == null);
         this.executor = (executor == null) ? Executors.newFixedThreadPool(DEFAULT_NUM_THREADS, threadFactory)
-                                           : executor;
-        this.workPool = new WorkPool<Channel, Runnable>();
+            : executor;
+        this.workPool = new WorkPool<Channel, Runnable>(queueingTimeout);
         this.shutdownTimeout = shutdownTimeout;
     }
 
+    public ConsumerWorkService(ExecutorService executor, ThreadFactory threadFactory, int shutdownTimeout) {
+        this(executor, threadFactory, -1, shutdownTimeout);
+    }
+
     public int getShutdownTimeout() {
         return shutdownTimeout;
     }
diff --git a/src/main/java/com/rabbitmq/client/impl/WorkPool.java b/src/main/java/com/rabbitmq/client/impl/WorkPool.java
@@ -21,6 +21,9 @@
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.TimeUnit;
 
 /**
  * <p>This is a generic implementation of the channels specification
@@ -61,6 +64,37 @@ public class WorkPool<K, W> {
     private final Map<K, VariableLinkedBlockingQueue<W>> pool = new HashMap<K, VariableLinkedBlockingQueue<W>>();
     /** Those keys which want limits to be removed. We do not limit queue size if this is non-empty. */
     private final Set<K> unlimited = new HashSet<K>();
+    private final EnqueueingCallback<W> enqueueingCallback;
+
+    public WorkPool(final int queueingTimeout) {
+        if (queueingTimeout > 0) {
+            this.enqueueingCallback = new EnqueueingCallback<W>() {
+                @Override
+                public void enqueue(BlockingQueue<W> queue, W item) { // timed variant, used when queueingTimeout > 0
+                    try {
+                        boolean offered = queue.offer(item, queueingTimeout, TimeUnit.MILLISECONDS); // bounded wait instead of a blocking put()
+                        if (!offered) {
+                            throw new WorkPoolFullException("Could not enqueue in work pool after " + queueingTimeout + " ms.");
+                        }
+                    } catch (InterruptedException e) {
+                        Thread.currentThread().interrupt(); // restore the interrupt flag so callers can observe the interruption
+                    }
+                }
+            };
+        } else {
+            this.enqueueingCallback = new EnqueueingCallback<W>() {
+
+                @Override
+                public void enqueue(BlockingQueue<W> queue, W item) {
+                    try {
+                        queue.put(item);
+                    } catch (InterruptedException e) {
+                        Thread.currentThread().interrupt();
+                    }
+                }
+            };
+        }
+    }
 
     /**
      * Add client <code><b>key</b></code> to pool of item queues, with an empty queue.
@@ -178,11 +212,7 @@ public boolean addWorkItem(K key, W item) {
         }
         // The put operation may block. We need to make sure we are not holding the lock while that happens.
         if (queue != null) {
-            try {
-                queue.put(item);
-            } catch (InterruptedException e) {
-                Thread.currentThread().interrupt();
-            }
+            enqueueingCallback.enqueue(queue, item);
 
             synchronized (this) {
                 if (isDormant(key)) {
@@ -243,4 +273,10 @@ private K readyToInProgress() {
         }
         return key;
     }
+
+    private interface EnqueueingCallback<W> {
+
+        void enqueue(BlockingQueue<W> queue, W item);
+
+    }
 }
diff --git a/src/main/java/com/rabbitmq/client/impl/WorkPoolFullException.java b/src/main/java/com/rabbitmq/client/impl/WorkPoolFullException.java
@@ -1,8 +1,26 @@
+// Copyright (c) 2018-Present Pivotal Software, Inc.  All rights reserved.
+//
+// This software, the RabbitMQ Java client library, is triple-licensed under the
+// Mozilla Public License 1.1 ("MPL"), the GNU General Public License version 2
+// ("GPL") and the Apache License version 2 ("ASL"). For the MPL, please see
+// LICENSE-MPL-RabbitMQ. For the GPL, please see LICENSE-GPL2.  For the ASL,
+// please see LICENSE-APACHE2.
+//
+// This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND,
+// either express or implied. See the LICENSE file for specific language governing
+// rights and limitations of this software.
+//
+// If you have any questions regarding licensing, please contact us at
+// info@rabbitmq.com.
+
 package com.rabbitmq.client.impl;
 
 /**
- *
+ * Exception thrown when {@link WorkPool} enqueueing times out.
  */
-public class WorkPoolFullException {
+public class WorkPoolFullException extends RuntimeException {
 
+    public WorkPoolFullException(String msg) {
+        super(msg);
+    }
 }
diff --git a/src/test/java/com/rabbitmq/client/impl/WorkPoolTests.java b/src/test/java/com/rabbitmq/client/impl/WorkPoolTests.java
@@ -30,7 +30,7 @@
  */
 public class WorkPoolTests {
 
-    private final WorkPool<String, Object> pool = new WorkPool<String, Object>();
+    private final WorkPool<String, Object> pool = new WorkPool<String, Object>(-1);
 
     /**
      * Test unknown key tolerated silently
diff --git a/src/test/java/com/rabbitmq/client/test/NoAutoRecoveryWhenTcpWindowIsFullTest.java b/src/test/java/com/rabbitmq/client/test/NoAutoRecoveryWhenTcpWindowIsFullTest.java
@@ -24,6 +24,7 @@
 import com.rabbitmq.client.Envelope;
 import com.rabbitmq.client.Recoverable;
 import com.rabbitmq.client.RecoveryListener;
+import com.rabbitmq.client.impl.nio.NioParams;
 import com.rabbitmq.client.impl.recovery.AutorecoveringChannel;
 import com.rabbitmq.client.impl.recovery.AutorecoveringConnection;
 import org.junit.After;
@@ -79,6 +80,7 @@ public void setUp() throws Exception {
         final ConnectionFactory factory = TestUtils.connectionFactory();
         factory.setSocketConfigurator(new DefaultSocketConfigurator() {
 
+            /* default value on a Linux platform */
             int DEFAULT_RECEIVE_BUFFER_SIZE = 43690;
 
             @Override
@@ -94,10 +96,15 @@ public void configure(Socket socket) throws IOException {
         factory.setRequestedHeartbeat(5);
         factory.setSharedExecutor(executorService);
         // we need the shutdown executor: channel shutting down depends on the work pool,
-        // which is full. Channel shutting down will time out with the shutdown executor
+        // which is full. Channel shutting down will time out with the shutdown executor.
         factory.setShutdownExecutor(executorService);
         factory.setNetworkRecoveryInterval(2000);
 
+        if (TestUtils.USE_NIO) {
+            factory.setWorkPoolTimeout(10 * 1000);
+            factory.setNioParams(new NioParams().setWriteQueueCapacity(10 * 1000 * 1000).setNbIoThreads(4));
+        }
+
         producingConnection = (AutorecoveringConnection) factory.newConnection("Producer Connection");
         producingChannel = (AutorecoveringChannel) producingConnection.createChannel();
         consumingConnection = (AutorecoveringConnection) factory.newConnection("Consuming Connection");
@@ -116,9 +123,6 @@ public void tearDown() throws IOException {
 
     @Test
     public void failureAndRecovery() throws IOException, InterruptedException {
-        if (TestUtils.USE_NIO) {
-            return;
-        }
         final String queue = UUID.randomUUID().toString();
 
         final CountDownLatch recoveryLatch = new CountDownLatch(1);