Skip to content

Commit 12dc3d0

Browse files
ankediasankarpnrussgold
authored
Owls 91212 - Merge fixes for the introspector retry behavior after the job times out and to capture WDT logs. (#2613)
* Merge fixes for the introspector retry behavior after the job times out and to capture WDT logs. Co-authored-by: sankar <sankarpn@gmail.com> Co-authored-by: Russell Gold <russ@russgold.net>
1 parent 264af44 commit 12dc3d0

File tree

18 files changed

+466
-149
lines changed

18 files changed

+466
-149
lines changed

integration-tests/src/test/java/oracle/weblogic/kubernetes/ItKubernetesEvents.java

Lines changed: 58 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import oracle.weblogic.kubernetes.annotations.IntegrationTest;
3939
import oracle.weblogic.kubernetes.annotations.Namespaces;
4040
import oracle.weblogic.kubernetes.logging.LoggingFacade;
41+
import org.awaitility.core.ConditionFactory;
4142
import org.junit.jupiter.api.AfterAll;
4243
import org.junit.jupiter.api.BeforeAll;
4344
import org.junit.jupiter.api.DisplayName;
@@ -48,6 +49,8 @@
4849
import org.junit.jupiter.params.ParameterizedTest;
4950
import org.junit.jupiter.params.provider.ValueSource;
5051

52+
import static java.util.concurrent.TimeUnit.MINUTES;
53+
import static java.util.concurrent.TimeUnit.SECONDS;
5154
import static oracle.weblogic.kubernetes.TestConstants.ADMIN_PASSWORD_DEFAULT;
5255
import static oracle.weblogic.kubernetes.TestConstants.ADMIN_USERNAME_DEFAULT;
5356
import static oracle.weblogic.kubernetes.TestConstants.BASE_IMAGES_REPO_SECRET;
@@ -109,6 +112,7 @@
109112
import static oracle.weblogic.kubernetes.utils.SecretUtils.createSecretWithUsernamePassword;
110113
import static oracle.weblogic.kubernetes.utils.ThreadSafeLogger.getLogger;
111114
import static oracle.weblogic.kubernetes.utils.WLSTUtils.executeWLSTScript;
115+
import static org.awaitility.Awaitility.with;
112116
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
113117
import static org.junit.jupiter.api.Assertions.assertEquals;
114118
import static org.junit.jupiter.api.Assertions.assertFalse;
@@ -155,6 +159,10 @@ class ItKubernetesEvents {
155159

156160
private static LoggingFacade logger = null;
157161

162+
public static ConditionFactory withLongRetryPolicy = with().pollDelay(2, SECONDS)
163+
.and().with().pollInterval(10, SECONDS)
164+
.atMost(10, MINUTES).await();
165+
158166
/**
159167
* Assigns unique namespaces for operator and domains.
160168
* Pull WebLogic image if running tests in Kind cluster.
@@ -188,7 +196,7 @@ public static void initAll(@Namespaces(6) List<String> namespaces) {
188196
externalRestHttpsPort = getServiceNodePort(opNamespace, "external-weblogic-operator-svc");
189197

190198
// This test uses the operator restAPI to scale the domain. To do this in OKD cluster,
191-
// we need to expose the external service as route and set tls termination to passthrough
199+
// we need to expose the external service as route and set tls termination to passthrough
192200
logger.info("Create a route for the operator external service - only for OKD");
193201
String opExternalSvc = createRouteForOKD("external-weblogic-operator-svc", opNamespace);
194202
// Patch the route just created to set tls termination to passthrough
@@ -292,54 +300,82 @@ void testDomainK8sEventsNonExistingCluster() {
292300
}
293301

294302
/**
295-
* Test the following domain events are logged when domain resource goes through various life cycle stages.
296-
* Patch the domain resource to remove the webLogicCredentialsSecret and verify DomainChanged is
297-
* logged when operator processes the domain resource changes.
298-
* Verifies DomainProcessingRetrying is logged when operator retries the failed domain resource
299-
* changes since webLogicCredentialsSecret is still missing.
303+
* Test that the following domain events are logged when the domain resource goes through an introspector failure.
304+
* Patch the domain resource to shut down the servers.
305+
* Patch the domain resource to point to a bad DOMAIN_HOME and update serverStartPolicy to IF_NEEDED.
306+
* Verifies DomainChanged event is logged when the operator processes the domain resource change.
300307
* Verifies DomainProcessingAborted is logged when operator exceeds the maximum retries and gives
301308
* up processing the domain resource.
309+
* Clean up by patching the domain resource with the original domainHome and the next introspectVersion to bring all servers up again.
302310
*/
303311
@Order(4)
304312
@Test
305313
@DisplayName("Test domain events for failed/retried domain life cycle changes")
306314
void testDomainK8SEventsFailed() {
307315
V1Patch patch;
308316
String patchStr;
317+
Domain domain = assertDoesNotThrow(() -> getDomainCustomResource(domainUid, domainNamespace1));
318+
String originalDomainHome = domain.getSpec().getDomainHome();
309319

310320
OffsetDateTime timestamp = now();
311321
try {
312-
logger.info("remove the webLogicCredentialsSecret to verify the following events"
313-
+ " DomainChanged, DomainProcessingRetrying and DomainProcessingAborted are logged");
314-
patchStr = "[{\"op\": \"remove\", \"path\": \"/spec/webLogicCredentialsSecret\"}]";
315-
logger.info("PatchStr for webLogicCredentialsSecret: {0}", patchStr);
322+
logger.info("Shutting down all servers in domain with serverStartPolicy : NEVER");
323+
patchStr = "[{\"op\": \"replace\", \"path\": \"/spec/serverStartPolicy\", \"value\": \"NEVER\"}]";
324+
patch = new V1Patch(patchStr);
325+
assertTrue(patchDomainCustomResource(domainUid, domainNamespace1, patch, V1Patch.PATCH_FORMAT_JSON_PATCH),
326+
"patchDomainCustomResource failed");
327+
328+
logger.info("Checking if the admin server {0} is shutdown in namespace {1}",
329+
adminServerPodName, domainNamespace1);
330+
checkPodDoesNotExist(adminServerPodName, domainUid, domainNamespace1);
331+
332+
for (int i = 1; i <= replicaCount; i++) {
333+
logger.info("Checking if the managed server {0} is shutdown in namespace {1}",
334+
managedServerPodNamePrefix + i, domainNamespace1);
335+
checkPodDoesNotExist(managedServerPodNamePrefix + i, domainUid, domainNamespace1);
336+
}
337+
338+
logger.info("Replace the domainHome to a nonexisting location to verify the following events"
339+
+ " DomainChanged, DomainProcessingRetrying and DomainProcessingAborted are logged");
340+
patchStr = "[{\"op\": \"replace\", "
341+
+ "\"path\": \"/spec/domainHome\", \"value\": \"" + originalDomainHome + "bad\"},"
342+
+ "{\"op\": \"replace\", \"path\": \"/spec/serverStartPolicy\", \"value\": \"IF_NEEDED\"}]";
343+
logger.info("PatchStr for domainHome: {0}", patchStr);
316344

317345
patch = new V1Patch(patchStr);
318346
assertTrue(patchDomainCustomResource(domainUid, domainNamespace1, patch, V1Patch.PATCH_FORMAT_JSON_PATCH),
319-
"patchDomainCustomResource failed");
347+
"patchDomainCustomResource failed");
320348

321349
logger.info("verify domain changed event is logged");
322350
checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_CHANGED, "Normal", timestamp);
323-
324-
// logger.info("verify domain processing retrying event");
325-
// checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_PROCESSING_RETRYING, "Normal", timestamp);
326-
327351
logger.info("verify domain processing aborted event");
328352
checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_PROCESSING_ABORTED, "Warning", timestamp);
329353
} finally {
354+
logger.info("Restoring the domain with valid location and bringing up all servers");
330355
timestamp = now();
331-
// add back the webLogicCredentialsSecret
332-
patchStr = "[{\"op\": \"add\", \"path\": \"/spec/webLogicCredentialsSecret\", "
333-
+ "\"value\" : {\"name\":\"" + wlSecretName + "\" , \"namespace\":\"" + domainNamespace1 + "\"}"
334-
+ "}]";
335-
logger.info("PatchStr for webLogicCredentialsSecret: {0}", patchStr);
356+
String introspectVersion = assertDoesNotThrow(() -> getNextIntrospectVersion(domainUid, domainNamespace1));
357+
// restore the original domain home and set the next introspectVersion
358+
patchStr = "["
359+
+ "{\"op\": \"replace\", \"path\": \"/spec/domainHome\", \"value\": \"" + originalDomainHome + "\"},"
360+
+ "{\"op\": \"add\", \"path\": \"/spec/introspectVersion\", \"value\": \"" + introspectVersion + "\"}"
361+
+ "]";
362+
logger.info("PatchStr for domainHome: {0}", patchStr);
336363

337364
patch = new V1Patch(patchStr);
338365
assertTrue(patchDomainCustomResource(domainUid, domainNamespace1, patch, V1Patch.PATCH_FORMAT_JSON_PATCH),
339-
"patchDomainCustomResource failed");
366+
"patchDomainCustomResource failed");
340367

341368
logger.info("verify domain changed event is logged");
342369
checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_CHANGED, "Normal", timestamp);
370+
logger.info("verifying the admin server is created and started");
371+
checkPodReadyAndServiceExists(adminServerPodName, domainUid, domainNamespace1);
372+
373+
// verify managed server pods are ready and their services exist
374+
for (int i = 1; i <= replicaCount; i++) {
375+
logger.info("Checking managed server service/pod {0} is created in namespace {1}",
376+
managedServerPodNamePrefix + i, domainNamespace1);
377+
checkPodReadyAndServiceExists(managedServerPodNamePrefix + i, domainUid, domainNamespace1);
378+
}
343379
}
344380
}
345381

@@ -889,7 +925,7 @@ public static void tearDown() {
889925
private static void checkEvent(
890926
String opNamespace, String domainNamespace, String domainUid,
891927
String reason, String type, OffsetDateTime timestamp) {
892-
testUntil(
928+
testUntil(withLongRetryPolicy,
893929
checkDomainEvent(opNamespace, domainNamespace, domainUid, reason, type, timestamp),
894930
logger,
895931
"domain event {0} to be logged in namespace {1}",

operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java

Lines changed: 25 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -841,7 +841,7 @@ private boolean shouldContinue() {
841841
return true;
842842
} else if (shouldReportAbortedEvent()) {
843843
return true;
844-
} else if (hasExceededRetryCount() && !isImgRestartIntrospectVerChanged(liveInfo, cachedInfo)) {
844+
} else if (hasExceededRetryCount(liveInfo) && !isImgRestartIntrospectVerChanged(liveInfo, cachedInfo)) {
845845
LOGGER.severe(ProcessingConstants.EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG);
846846
return false;
847847
} else if (isFatalIntrospectorError()) {
@@ -852,10 +852,7 @@ private boolean shouldContinue() {
852852
return false; // we have already cached this
853853
} else if (shouldRecheck(cachedInfo)) {
854854

855-
if (hasExceededRetryCount()) {
856-
resetIntrospectorJobFailureCount();
857-
}
858-
if (getCurrentIntrospectFailureRetryCount() > 0) {
855+
if (getCurrentIntrospectFailureRetryCount(liveInfo) > 0) {
859856
logRetryCount(cachedInfo);
860857
}
861858
LOGGER.fine("Continue the make-right domain presence, explicitRecheck -> " + explicitRecheck);
@@ -869,29 +866,9 @@ private boolean shouldReportAbortedEvent() {
869866
return Optional.ofNullable(eventData).map(EventData::getItem).orElse(null) == DOMAIN_PROCESSING_ABORTED;
870867
}
871868

872-
private void resetIntrospectorJobFailureCount() {
873-
Optional.ofNullable(liveInfo)
874-
.map(DomainPresenceInfo::getDomain)
875-
.map(Domain::getStatus)
876-
.map(DomainStatus::resetIntrospectJobFailureCount);
877-
}
878-
879-
private boolean hasExceededRetryCount() {
880-
return getCurrentIntrospectFailureRetryCount()
881-
>= DomainPresence.getDomainPresenceFailureRetryMaxCount();
882-
}
883-
884-
private Integer getCurrentIntrospectFailureRetryCount() {
885-
return Optional.ofNullable(liveInfo)
886-
.map(DomainPresenceInfo::getDomain)
887-
.map(Domain::getStatus)
888-
.map(DomainStatus::getIntrospectJobFailureCount)
889-
.orElse(0);
890-
}
891-
892869
private void logRetryCount(DomainPresenceInfo cachedInfo) {
893870
LOGGER.info(MessageKeys.INTROSPECT_JOB_FAILED_RETRY_COUNT, cachedInfo.getDomain().getDomainUid(),
894-
getCurrentIntrospectFailureRetryCount(),
871+
getCurrentIntrospectFailureRetryCount(liveInfo),
895872
DomainPresence.getDomainPresenceFailureRetryMaxCount());
896873
}
897874

@@ -1020,6 +997,19 @@ private static String getIntrospectVersion(DomainPresenceInfo info) {
1020997
.orElse(null);
1021998
}
1022999

1000+
private Integer getCurrentIntrospectFailureRetryCount(DomainPresenceInfo info) {
1001+
return Optional.ofNullable(info)
1002+
.map(DomainPresenceInfo::getDomain)
1003+
.map(Domain::getStatus)
1004+
.map(DomainStatus::getIntrospectJobFailureCount)
1005+
.orElse(0);
1006+
}
1007+
1008+
private boolean hasExceededRetryCount(DomainPresenceInfo info) {
1009+
return getCurrentIntrospectFailureRetryCount(info)
1010+
>= DomainPresence.getDomainPresenceFailureRetryMaxCount();
1011+
}
1012+
10231013
private static boolean isCachedInfoNewer(DomainPresenceInfo liveInfo, DomainPresenceInfo cachedInfo) {
10241014
return liveInfo.getDomain() != null
10251015
&& KubernetesUtils.isFirstNewer(cachedInfo.getDomain().getMetadata(), liveInfo.getDomain().getMetadata());
@@ -1048,7 +1038,8 @@ public void onThrowable(Packet packet, Throwable throwable) {
10481038
gate.startFiberIfLastFiberMatches(
10491039
domainUid,
10501040
Fiber.getCurrentIfSet(),
1051-
DomainStatusUpdater.createFailureRelatedSteps(throwable),
1041+
Step.chain(DomainStatusUpdater.createFailureCountStep(null),
1042+
DomainStatusUpdater.createFailureRelatedSteps(throwable)),
10521043
plan.packet,
10531044
new CompletionCallback() {
10541045
@Override
@@ -1071,7 +1062,7 @@ public void onThrowable(Packet packet, Throwable throwable) {
10711062
LoggingContext.setThreadContext().namespace(ns).domainUid(domainUid)) {
10721063
existing.setPopulated(false);
10731064
// proceed only if we have not already retried the maximum number of times
1074-
int retryCount = existing.incrementAndGetFailureCount();
1065+
int retryCount = getCurrentIntrospectFailureRetryCount(existing);
10751066
LOGGER.fine(
10761067
"Failure count for DomainPresenceInfo: "
10771068
+ existing
@@ -1128,6 +1119,11 @@ Step createDomainUpPlan(DomainPresenceInfo info) {
11281119
bringAdminServerUp(info, delegate.getPodAwaiterStepFactory(info.getNamespace())),
11291120
managedServerStrategy);
11301121

1122+
if (hasExceededRetryCount(info) && isImgRestartIntrospectVerChanged(info,
1123+
getExistingDomainPresenceInfo(info.getNamespace(), info.getDomainUid()))) {
1124+
domainUpStrategy = Step.chain(DomainStatusUpdater.createResetFailureCountStep(), domainUpStrategy);
1125+
}
1126+
11311127
return Step.chain(
11321128
createDomainUpInitialStep(info),
11331129
ConfigMapHelper.readExistingIntrospectorConfigMap(info.getNamespace(), info.getDomainUid()),
@@ -1174,8 +1170,7 @@ private static class TailStep extends Step {
11741170

11751171
@Override
11761172
public NextAction apply(Packet packet) {
1177-
packet.getSpi(DomainPresenceInfo.class).complete();
1178-
return doNext(packet);
1173+
return doNext(DomainStatusUpdater.createResetFailureCountStep(), packet);
11791174
}
11801175
}
11811176

operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ static class FailureCountStep extends DomainStatusUpdaterStep {
165165

166166
private final V1Job domainIntrospectorJob;
167167

168-
public FailureCountStep(@Nonnull V1Job domainIntrospectorJob) {
168+
public FailureCountStep(V1Job domainIntrospectorJob) {
169169
super(null);
170170
this.domainIntrospectorJob = domainIntrospectorJob;
171171
}
@@ -177,10 +177,22 @@ void modifyStatus(DomainStatus domainStatus) {
177177

178178
@Nullable
179179
private String getJobUid() {
180-
return Optional.of(domainIntrospectorJob).map(V1Job::getMetadata).map(V1ObjectMeta::getUid).orElse(null);
180+
return Optional.ofNullable(domainIntrospectorJob).map(V1Job::getMetadata).map(V1ObjectMeta::getUid).orElse(null);
181181
}
182182
}
183183

184+
static class ResetFailureCountStep extends DomainStatusUpdaterStep {
185+
186+
@Override
187+
void modifyStatus(DomainStatus domainStatus) {
188+
domainStatus.resetIntrospectJobFailureCount();
189+
}
190+
}
191+
192+
public static Step createResetFailureCountStep() {
193+
return new ResetFailureCountStep();
194+
}
195+
184196
private static String getEventMessage(@Nonnull DomainFailureReason reason, String message) {
185197
return !StringUtils.isBlank(message) ? message : reason.toString();
186198
}
@@ -736,7 +748,6 @@ private boolean isDeleting(String serverName) {
736748
return Optional.ofNullable(getInfo().getServerPod(serverName)).map(PodHelper::isDeleting).orElse(false);
737749
}
738750

739-
740751
private String getDesiredState(String serverName, String clusterName, boolean isAdminServer) {
741752
return isAdminServer | expectedRunningServers.contains(serverName)
742753
? getDomain().getServer(serverName, clusterName).getDesiredState()

operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,11 @@ public static boolean isComplete(V1Job job) {
137137
return false;
138138
}
139139

140-
static boolean isFailed(V1Job job) {
140+
/**
141+
* Returns true if the specified job has a failed status or condition.
142+
* @param job job to be tested
143+
*/
144+
public static boolean isFailed(V1Job job) {
141145
if (job == null) {
142146
return false;
143147
}
@@ -173,8 +177,12 @@ private static String getStatus(V1JobCondition jobCondition) {
173177
return Optional.ofNullable(jobCondition).map(V1JobCondition::getStatus).orElse("");
174178
}
175179

176-
177-
static String getFailedReason(V1Job job) {
180+
/**
181+
* Get the reason for job failure.
182+
* @param job the job whose failure reason is requested
183+
* @return Job failure reason.
184+
*/
185+
public static String getFailedReason(V1Job job) {
178186
V1JobStatus status = job.getStatus();
179187
if (status != null && status.getConditions() != null) {
180188
for (V1JobCondition cond : status.getConditions()) {
@@ -298,7 +306,7 @@ void updatePacket(Packet packet, V1Job job) {
298306
// be available for reading
299307
@Override
300308
boolean shouldTerminateFiber(V1Job job) {
301-
return isFailed(job) && ("DeadlineExceeded".equals(getFailedReason(job)));
309+
return isJobTimedOut(job);
302310
}
303311

304312
// create an exception to terminate the fiber
@@ -328,6 +336,10 @@ public NextAction onSuccess(Packet packet, CallResponse<V1Job> callResponse) {
328336
}
329337
}
330338

339+
public static boolean isJobTimedOut(V1Job job) {
340+
return isFailed(job) && ("DeadlineExceeded".equals(getFailedReason(job)));
341+
}
342+
331343
static class DeadlineExceededException extends Exception {
332344
final V1Job job;
333345

operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import java.util.concurrent.ConcurrentHashMap;
1717
import java.util.concurrent.ConcurrentMap;
1818
import java.util.concurrent.atomic.AtomicBoolean;
19-
import java.util.concurrent.atomic.AtomicInteger;
2019
import java.util.concurrent.atomic.AtomicReference;
2120
import java.util.concurrent.locks.ReadWriteLock;
2221
import java.util.concurrent.locks.ReentrantReadWriteLock;
@@ -62,7 +61,6 @@ public class DomainPresenceInfo implements PacketComponent {
6261
private final AtomicReference<Domain> domain;
6362
private final AtomicBoolean isDeleting = new AtomicBoolean(false);
6463
private final AtomicBoolean isPopulated = new AtomicBoolean(false);
65-
private final AtomicInteger retryCount = new AtomicInteger(0);
6664
private final AtomicReference<Collection<ServerStartupInfo>> serverStartupInfo;
6765
private final AtomicReference<Collection<ServerShutdownInfo>> serverShutdownInfo;
6866

@@ -570,23 +568,6 @@ public void setPopulated(boolean populated) {
570568
isPopulated.set(populated);
571569
}
572570

573-
private void resetFailureCount() {
574-
retryCount.set(0);
575-
}
576-
577-
public int incrementAndGetFailureCount() {
578-
return retryCount.incrementAndGet();
579-
}
580-
581-
int getRetryCount() {
582-
return retryCount.get();
583-
}
584-
585-
/** Sets the last completion time to now. */
586-
public void complete() {
587-
resetFailureCount();
588-
}
589-
590571
/**
591572
* Gets the domain. Except the instance to change frequently based on status updates.
592573
*

0 commit comments

Comments
 (0)