Skip to content

Commit 5d1c49b

Browse files
authored
Bail out from barriers when barriers hit 410 Lease Not Found. (#47232)
* Enable 410-1022 on Head requests to bail out. * Bail fast on barrier on reads. * Enhance tests to ensure primary is contacted and barrier request count doesn't exceed a certain threshold. * Enhance tests to ensure barrier post the QuorumSelected phase is invoked. * Code comments * Adding a way to run tests against a multi-region Strong account. * Adding a way to run tests against a multi-region Strong account. * Validate sub-status code too. * Code cleanup. * Add CHANGELOG.md entry. * Modify barrier hit criteria. * Modify barrier hit criteria. * Add tests for barrier bail out in Bounded Staleness consistency. * Add tests for barrier bail out in Bounded Staleness consistency. * Refactoring * Addressing code comments. * Verify write barrier criteria. * Verify barrier bail out criteria. * Verify barrier bail out criteria. * Managing merge. * Fix compilation errors. * Fix tests. * Fix tests. * Addressing comments. * Addressing comments. * Addressing review comments. * Refactoring. * Addressing review comments. * Addressing review comments. * Addressing review comments.
1 parent 793d43a commit 5d1c49b

File tree

13 files changed

+1067
-58
lines changed

13 files changed

+1067
-58
lines changed

sdk/cosmos/azure-cosmos-tests/pom.xml

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -906,10 +906,10 @@ Licensed under the MIT License.
906906
</build>
907907
</profile>
908908
<profile>
909-
<!-- integration tests, requires Cosmos DB endpoint -->
910-
<id>fault-injection-barrier</id>
909+
<!-- tests which target a multi-region strong Cosmos DB account -->
910+
<id>multi-region-strong</id>
911911
<properties>
912-
<test.groups>fault-injection-barrier</test.groups>
912+
<test.groups>multi-region-strong</test.groups>
913913
</properties>
914914
<build>
915915
<plugins>
@@ -919,9 +919,14 @@ Licensed under the MIT License.
919919
<version>3.5.3</version> <!-- {x-version-update;org.apache.maven.plugins:maven-failsafe-plugin;external_dependency} -->
920920
<configuration>
921921
<suiteXmlFiles>
922-
<suiteXmlFile>src/test/resources/fault-injection-barrier-testng.xml</suiteXmlFile>
922+
<suiteXmlFile>src/test/resources/multi-region-strong.xml</suiteXmlFile>
923923
</suiteXmlFiles>
924-
924+
<systemPropertyVariables>
925+
<COSMOS.CLIENT_LEAK_DETECTION_ENABLED>true</COSMOS.CLIENT_LEAK_DETECTION_ENABLED>
926+
<io.netty.leakDetection.samplingInterval>1</io.netty.leakDetection.samplingInterval>
927+
<io.netty.leakDetection.targetRecords>256</io.netty.leakDetection.targetRecords>
928+
<io.netty.leakDetection.level>paranoid</io.netty.leakDetection.level>
929+
</systemPropertyVariables>
925930
</configuration>
926931
</plugin>
927932
</plugins>

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/BailOutFromBarrierE2ETests.java

Lines changed: 543 additions & 0 deletions
Large diffs are not rendered by default.

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/faultinjection/FaultInjectionServerErrorRuleOnDirectTests.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ public FaultInjectionServerErrorRuleOnDirectTests(CosmosClientBuilder clientBuil
8484
this.subscriberValidationTimeout = TIMEOUT;
8585
}
8686

87-
@BeforeClass(groups = {"multi-region", "long", "fast", "fi-multi-master", "fault-injection-barrier"}, timeOut = TIMEOUT)
87+
@BeforeClass(groups = {"multi-region", "long", "fast", "fi-multi-master", "multi-region-strong"}, timeOut = TIMEOUT)
8888
public void beforeClass() {
8989
clientWithoutPreferredRegions = getClientBuilder()
9090
.preferredRegions(new ArrayList<>())
@@ -1057,7 +1057,7 @@ public void faultInjectionServerErrorRuleTests_HitLimit() throws JsonProcessingE
10571057
}
10581058
}
10591059

1060-
@AfterClass(groups = {"multi-region", "long", "fast", "fi-multi-master", "fault-injection-barrier"}, timeOut = SHUTDOWN_TIMEOUT, alwaysRun = true)
1060+
@AfterClass(groups = {"multi-region", "long", "fast", "fi-multi-master", "multi-region-strong"}, timeOut = SHUTDOWN_TIMEOUT, alwaysRun = true)
10611061
public void afterClass() {
10621062
safeClose(clientWithoutPreferredRegions);
10631063
}
@@ -1475,7 +1475,7 @@ public void faultInjectionInjectTcpResponseDelay() throws JsonProcessingExceptio
14751475
}
14761476
}
14771477

1478-
@Test(groups = {"fault-injection-barrier"}, dataProvider = "barrierRequestServerErrorResponseProvider", timeOut = 2 * TIMEOUT)
1478+
@Test(groups = {"multi-region-strong"}, dataProvider = "barrierRequestServerErrorResponseProvider", timeOut = 2 * TIMEOUT)
14791479
public void faultInjection_serverError_barrierRequest(
14801480
OperationType operationType,
14811481
FaultInjectionServerErrorType serverErrorType,

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/directconnectivity/ReflectionUtils.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,10 @@ public static StoreReader getStoreReader(ConsistencyReader consistencyReader) {
309309
return get(StoreReader.class, consistencyReader, "storeReader");
310310
}
311311

312+
public static StoreReader getStoreReader(ConsistencyWriter consistencyWriter) {
313+
return get(StoreReader.class, consistencyWriter, "storeReader");
314+
}
315+
312316
public static void setStoreReader(ConsistencyReader consistencyReader, StoreReader storeReader) {
313317
set(consistencyReader, storeReader, "storeReader");
314318
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
package com.azure.cosmos.implementation.directconnectivity;
5+
6+
import com.azure.cosmos.ConsistencyLevel;
7+
import com.azure.cosmos.implementation.OperationType;
8+
import com.azure.cosmos.implementation.RxDocumentServiceRequest;
9+
import com.azure.cosmos.implementation.Utils;
10+
11+
import java.util.concurrent.atomic.AtomicInteger;
12+
import java.util.function.BiFunction;
13+
14+
public class StoreResponseInterceptorUtils {
15+
16+
public static BiFunction<RxDocumentServiceRequest, StoreResponse, StoreResponse> forceBarrierFollowedByBarrierFailure(
17+
ConsistencyLevel operationConsistencyLevel,
18+
String regionName,
19+
int maxAllowedFailureCount,
20+
AtomicInteger failureCount,
21+
int statusCode,
22+
int subStatusCode) {
23+
24+
return (request, storeResponse) -> {
25+
26+
if (OperationType.Create.equals(request.getOperationType()) && regionName.equals(request.requestContext.regionalRoutingContextToRoute.getGatewayRegionalEndpoint().toString())) {
27+
28+
long localLsn = Long.parseLong(storeResponse.getHeaderValue(WFConstants.BackendHeaders.LOCAL_LSN));
29+
long manipulatedGCLSN = localLsn - 1;
30+
31+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.GLOBAL_COMMITTED_LSN, String.valueOf(manipulatedGCLSN));
32+
33+
return storeResponse;
34+
}
35+
36+
if (OperationType.Read.equals(request.getOperationType()) && regionName.equals(request.requestContext.regionalRoutingContextToRoute.getGatewayRegionalEndpoint().toString())) {
37+
38+
if (ConsistencyLevel.STRONG.equals(operationConsistencyLevel)) {
39+
40+
long globalLsn = Long.parseLong(storeResponse.getHeaderValue(WFConstants.BackendHeaders.LSN));
41+
long itemLsn = Long.parseLong(storeResponse.getHeaderValue(WFConstants.BackendHeaders.ITEM_LSN));
42+
long manipulatedGlobalCommittedLSN = Math.min(globalLsn, itemLsn) - 1;
43+
44+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.GLOBAL_COMMITTED_LSN, String.valueOf(manipulatedGlobalCommittedLSN));
45+
46+
return storeResponse;
47+
} else if (ConsistencyLevel.BOUNDED_STALENESS.equals(operationConsistencyLevel)) {
48+
49+
long manipulatedItemLSN = -1;
50+
long manipulatedGlobalLSN = 0;
51+
52+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.LSN, String.valueOf(manipulatedGlobalLSN));
53+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.LOCAL_LSN, String.valueOf(manipulatedGlobalLSN));
54+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.ITEM_LSN, String.valueOf(manipulatedItemLSN));
55+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.ITEM_LOCAL_LSN, String.valueOf(manipulatedItemLSN));
56+
57+
return storeResponse;
58+
}
59+
60+
return storeResponse;
61+
}
62+
63+
if (OperationType.Head.equals(request.getOperationType()) && regionName.equals(request.requestContext.regionalRoutingContextToRoute.getGatewayRegionalEndpoint().toString())) {
64+
if (failureCount.incrementAndGet() <= maxAllowedFailureCount) {
65+
throw Utils.createCosmosException(statusCode, subStatusCode, new Exception("An intercepted exception occurred. Check status and substatus code for details."), null);
66+
}
67+
}
68+
69+
return storeResponse;
70+
};
71+
}
72+
73+
public static BiFunction<RxDocumentServiceRequest, StoreResponse, StoreResponse> forceSuccessfulBarriersOnReadUntilQuorumSelectionThenForceBarrierFailures(
74+
ConsistencyLevel operationConsistencyLevel,
75+
String regionName,
76+
int allowedSuccessfulHeadRequestsWithoutBarrierBeingMet,
77+
AtomicInteger successfulHeadRequestCount,
78+
int maxAllowedFailureCount,
79+
AtomicInteger failureCount,
80+
int statusCode,
81+
int subStatusCode) {
82+
83+
return (request, storeResponse) -> {
84+
85+
if (OperationType.Read.equals(request.getOperationType()) && regionName.equals(request.requestContext.regionalRoutingContextToRoute.getGatewayRegionalEndpoint().toString())) {
86+
87+
if (ConsistencyLevel.STRONG.equals(operationConsistencyLevel)) {
88+
89+
long globalLsn = Long.parseLong(storeResponse.getHeaderValue(WFConstants.BackendHeaders.LSN));
90+
long itemLsn = Long.parseLong(storeResponse.getHeaderValue(WFConstants.BackendHeaders.ITEM_LSN));
91+
long manipulatedGlobalCommittedLSN = Math.min(globalLsn, itemLsn) - 1;
92+
93+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.GLOBAL_COMMITTED_LSN, String.valueOf(manipulatedGlobalCommittedLSN));
94+
95+
return storeResponse;
96+
} else if (ConsistencyLevel.BOUNDED_STALENESS.equals(operationConsistencyLevel)) {
97+
98+
long manipulatedItemLSN = -1;
99+
long manipulatedGlobalLSN = 0;
100+
101+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.LSN, String.valueOf(manipulatedGlobalLSN));
102+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.LOCAL_LSN, String.valueOf(manipulatedGlobalLSN));
103+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.ITEM_LSN, String.valueOf(manipulatedItemLSN));
104+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.ITEM_LOCAL_LSN, String.valueOf(manipulatedItemLSN));
105+
106+
return storeResponse;
107+
}
108+
109+
return storeResponse;
110+
}
111+
112+
if (OperationType.Head.equals(request.getOperationType()) && regionName.equals(request.requestContext.regionalRoutingContextToRoute.getGatewayRegionalEndpoint().toString())) {
113+
114+
if (successfulHeadRequestCount.incrementAndGet() <= allowedSuccessfulHeadRequestsWithoutBarrierBeingMet) {
115+
116+
if (ConsistencyLevel.STRONG.equals(operationConsistencyLevel)) {
117+
118+
long localLsn = Long.parseLong(storeResponse.getHeaderValue(WFConstants.BackendHeaders.LOCAL_LSN));
119+
long itemLsn = Long.parseLong(storeResponse.getHeaderValue(WFConstants.BackendHeaders.ITEM_LSN));
120+
long manipulatedGCLSN = Math.min(localLsn, itemLsn) - 1;
121+
122+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.GLOBAL_COMMITTED_LSN, String.valueOf(manipulatedGCLSN));
123+
124+
return storeResponse;
125+
} else if (ConsistencyLevel.BOUNDED_STALENESS.equals(operationConsistencyLevel)) {
126+
127+
long manipulatedItemLSN = -1;
128+
long manipulatedGlobalLSN = -1;
129+
130+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.LSN, String.valueOf(manipulatedGlobalLSN));
131+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.LOCAL_LSN, String.valueOf(manipulatedGlobalLSN));
132+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.ITEM_LSN, String.valueOf(manipulatedItemLSN));
133+
storeResponse.setHeaderValue(WFConstants.BackendHeaders.ITEM_LOCAL_LSN, String.valueOf(manipulatedItemLSN));
134+
135+
return storeResponse;
136+
}
137+
138+
return storeResponse;
139+
}
140+
141+
if (failureCount.incrementAndGet() <= maxAllowedFailureCount) {
142+
throw Utils.createCosmosException(statusCode, subStatusCode, new Exception("An intercepted exception occurred. Check status and substatus code for details."), null);
143+
}
144+
}
145+
146+
return storeResponse;
147+
};
148+
}
149+
}

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/rx/TestSuiteBase.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ public CosmosAsyncDatabase getDatabase(String id) {
205205

206206
@BeforeSuite(groups = {"thinclient", "fast", "long", "direct", "multi-region", "multi-master", "flaky-multi-master", "emulator",
207207
"emulator-vnext", "split", "query", "cfp-split", "circuit-breaker-misc-gateway", "circuit-breaker-misc-direct",
208-
"circuit-breaker-read-all-read-many", "fi-multi-master", "long-emulator", "fi-thinclient-multi-region", "fi-thinclient-multi-master", "fault-injection-barrier"}, timeOut = SUITE_SETUP_TIMEOUT)
208+
"circuit-breaker-read-all-read-many", "fi-multi-master", "long-emulator", "fi-thinclient-multi-region", "fi-thinclient-multi-master", "multi-region-strong"}, timeOut = SUITE_SETUP_TIMEOUT)
209209
public void beforeSuite() {
210210

211211
logger.info("beforeSuite Started");
@@ -223,7 +223,7 @@ public void beforeSuite() {
223223

224224
@AfterSuite(groups = {"thinclient", "fast", "long", "direct", "multi-region", "multi-master", "flaky-multi-master",
225225
"emulator", "split", "query", "cfp-split", "circuit-breaker-misc-gateway", "circuit-breaker-misc-direct",
226-
"circuit-breaker-read-all-read-many", "fi-multi-master", "long-emulator", "fi-thinclient-multi-region", "fi-thinclient-multi-master", "fault-injection-barrier"}, timeOut = SUITE_SHUTDOWN_TIMEOUT)
226+
"circuit-breaker-read-all-read-many", "fi-multi-master", "long-emulator", "fi-thinclient-multi-region", "fi-thinclient-multi-master", "multi-region-strong"}, timeOut = SUITE_SHUTDOWN_TIMEOUT)
227227
public void afterSuite() {
228228

229229
logger.info("afterSuite Started");
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,14 @@
2121
~ SOFTWARE.
2222
-->
2323
<!DOCTYPE suite SYSTEM "https://testng.org/testng-1.0.dtd">
24-
<suite name="fault-injection-barrier">
24+
<suite name="multi-region-strong">
2525
<listeners>
2626
<listener class-name="com.azure.cosmos.CosmosNettyLeakDetectorFactory"/>
2727
</listeners>
28-
<test name="fault-injection-barrier" group-by-instances="true">
28+
<test name="multi-region-strong" group-by-instances="true">
2929
<groups>
3030
<run>
31-
<include name="fault-injection-barrier"/>
31+
<include name="multi-region-strong"/>
3232
</run>
3333
</groups>
3434
<packages>

sdk/cosmos/azure-cosmos/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#### Other Changes
1313
* Enabled hostname validation for RNTBD connections to backend - [PR 47111](https://github.com/Azure/azure-sdk-for-java/pull/47111)
1414
* Changed to use incremental change feed to get partition key ranges. - [46810](https://github.com/Azure/azure-sdk-for-java/pull/46810)
15+
* Optimized 410 `Lease Not Found` handling for Strong Consistency account by avoiding unnecessary retries in the barrier attainment flow. - [PR 47232](https://github.com/Azure/azure-sdk-for-java/pull/47232)
1516

1617
### 4.75.0 (2025-10-21)
1718
> [!IMPORTANT]

0 commit comments

Comments
 (0)