Skip to content

Commit 80dc43a

Browse files
authored
[Inference API] Remove worst-case additional 50ms latency for non-rate limited requests (#136167)
1 parent 20e30c9 commit 80dc43a

File tree

8 files changed

+444
-145
lines changed

8 files changed

+444
-145
lines changed

docs/changelog/136167.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 136167
2+
summary: "[Inference API] Remove worst-case additional 50ms latency for non-rate limited\
3+
\ requests"
4+
area: Machine Learning
5+
type: bug
6+
issues: []

server/src/main/java/org/elasticsearch/inference/InputType.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ public static boolean isInternalTypeOrUnspecified(InputType inputType) {
6161
return inputType == InputType.INTERNAL_INGEST || inputType == InputType.INTERNAL_SEARCH || inputType == InputType.UNSPECIFIED;
6262
}
6363

64+
public static boolean isIngest(InputType inputType) {
65+
return inputType == InputType.INGEST || inputType == InputType.INTERNAL_INGEST;
66+
}
67+
6468
public static boolean isSpecified(InputType inputType) {
6569
return inputType != null && inputType != InputType.UNSPECIFIED;
6670
}

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/external/http/sender/RequestExecutorService.java

Lines changed: 268 additions & 66 deletions
Large diffs are not rendered by default.

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/settings/RateLimitSettings.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ public RateLimitSettings(long requestsPerTimeUnit, TimeUnit timeUnit) {
125125
}
126126

127127
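// Intended for tests only: it is public so that tests in other packages can build settings with rate limiting explicitly enabled or disabled.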
128-
RateLimitSettings(long requestsPerTimeUnit, TimeUnit timeUnit, boolean enabled) {
128+
public RateLimitSettings(long requestsPerTimeUnit, TimeUnit timeUnit, boolean enabled) {
129129
if (requestsPerTimeUnit <= 0) {
130130
throw new IllegalArgumentException("requests per minute must be positive");
131131
}

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/InputTypeTests.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ public static InputType randomWithInternalAndUnspecified() {
6464
return randomFrom(InputType.INTERNAL_SEARCH, InputType.INTERNAL_INGEST, InputType.UNSPECIFIED);
6565
}
6666

67+
public static InputType randomIngest() {
68+
return randomFrom(InputType.INGEST, InputType.INTERNAL_INGEST);
69+
}
70+
6771
public void testFromRestString_ValidInputType() {
6872
for (String internal : List.of("search", "ingest", "classification", "clustering", "unspecified")) {
6973
assertEquals(InputType.fromRestString(internal), InputType.fromString(internal));
@@ -211,4 +215,8 @@ public void testValidateInputTypeTranslationValues_ThrowsAnException_WhenValueIs
211215
)
212216
);
213217
}
218+
219+
public void testIsIngest() {
220+
assertTrue(InputType.isIngest(randomFrom(InputType.INGEST, InputType.INTERNAL_INGEST)));
221+
}
214222
}

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/external/http/sender/HttpRequestSenderTests.java

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,12 @@ public void testHttpRequestSender_Throws_WhenCallingSendBeforeStart() throws Exc
354354
PlainActionFuture<InferenceServiceResults> listener = new PlainActionFuture<>();
355355
var thrownException = expectThrows(
356356
AssertionError.class,
357-
() -> sender.send(RequestManagerTests.createMock(), new EmbeddingsInput(List.of(), null), null, listener)
357+
() -> sender.send(
358+
RequestManagerTests.createMockWithRateLimitingEnabled(),
359+
new EmbeddingsInput(List.of(), null),
360+
null,
361+
listener
362+
)
358363
);
359364
assertThat(thrownException.getMessage(), is("call start() before sending a request"));
360365
}
@@ -375,7 +380,12 @@ public void testHttpRequestSender_Throws_WhenATimeoutOccurs() throws Exception {
375380
sender.startSynchronously();
376381

377382
PlainActionFuture<InferenceServiceResults> listener = new PlainActionFuture<>();
378-
sender.send(RequestManagerTests.createMock(), new EmbeddingsInput(List.of(), null), TimeValue.timeValueNanos(1), listener);
383+
sender.send(
384+
RequestManagerTests.createMockWithRateLimitingEnabled(),
385+
new EmbeddingsInput(List.of(), null),
386+
TimeValue.timeValueNanos(1),
387+
listener
388+
);
379389

380390
var thrownException = expectThrows(ElasticsearchStatusException.class, () -> listener.actionGet(TIMEOUT));
381391

@@ -397,7 +407,12 @@ public void testHttpRequestSenderWithTimeout_Throws_WhenATimeoutOccurs() throws
397407
sender.startSynchronously();
398408

399409
PlainActionFuture<InferenceServiceResults> listener = new PlainActionFuture<>();
400-
sender.send(RequestManagerTests.createMock(), new EmbeddingsInput(List.of(), null), TimeValue.timeValueNanos(1), listener);
410+
sender.send(
411+
RequestManagerTests.createMockWithRateLimitingEnabled(),
412+
new EmbeddingsInput(List.of(), null),
413+
TimeValue.timeValueNanos(1),
414+
listener
415+
);
401416

402417
var thrownException = expectThrows(ElasticsearchStatusException.class, () -> listener.actionGet(TIMEOUT));
403418

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/external/http/sender/RequestExecutorServiceTests.java

Lines changed: 121 additions & 67 deletions
Large diffs are not rendered by default.

x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/external/http/sender/RequestManagerTests.java

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,26 +15,36 @@
1515
import org.elasticsearch.xpack.inference.external.request.RequestTests;
1616
import org.elasticsearch.xpack.inference.services.settings.RateLimitSettings;
1717

18+
import java.util.concurrent.TimeUnit;
19+
1820
import static org.mockito.ArgumentMatchers.any;
1921
import static org.mockito.Mockito.doAnswer;
2022
import static org.mockito.Mockito.mock;
2123
import static org.mockito.Mockito.when;
2224

2325
public class RequestManagerTests {
24-
public static RequestManager createMock() {
25-
return createMock(mock(RequestSender.class));
26+
public static RequestManager createMockWithRateLimitingDisabled(RequestSender requestSender, String inferenceEntityId) {
27+
return createMock(requestSender, inferenceEntityId, new RateLimitSettings(1, TimeUnit.MINUTES, false));
28+
}
29+
30+
public static RequestManager createMockWithRateLimitingDisabled(String inferenceEntityId) {
31+
return createMock(mock(RequestSender.class), inferenceEntityId, new RateLimitSettings(1, TimeUnit.MINUTES, false));
32+
}
33+
34+
public static RequestManager createMockWithRateLimitingEnabled() {
35+
return createMockWithRateLimitingEnabled(mock(RequestSender.class));
2636
}
2737

28-
public static RequestManager createMock(String inferenceEntityId) {
29-
return createMock(mock(RequestSender.class), inferenceEntityId);
38+
public static RequestManager createMockWithRateLimitingEnabled(String inferenceEntityId) {
39+
return createMockWithRateLimitingEnabled(mock(RequestSender.class), inferenceEntityId);
3040
}
3141

32-
public static RequestManager createMock(RequestSender requestSender) {
33-
return createMock(requestSender, "id", new RateLimitSettings(1));
42+
public static RequestManager createMockWithRateLimitingEnabled(RequestSender requestSender) {
43+
return createMock(requestSender, "id", new RateLimitSettings(1, TimeUnit.MINUTES, true));
3444
}
3545

36-
public static RequestManager createMock(RequestSender requestSender, String inferenceEntityId) {
37-
return createMock(requestSender, inferenceEntityId, new RateLimitSettings(1));
46+
public static RequestManager createMockWithRateLimitingEnabled(RequestSender requestSender, String inferenceEntityId) {
47+
return createMock(requestSender, inferenceEntityId, new RateLimitSettings(1, TimeUnit.MINUTES, true));
3848
}
3949

4050
public static RequestManager createMock(RequestSender requestSender, String inferenceEntityId, RateLimitSettings settings) {

0 commit comments

Comments
 (0)