Skip to content

Commit 30a4b3b

Browse files
Set SSL Cert ConfigMap in KOTS and Improve Detection of Unhealthy Pods in Tests (#2239)
* re-add privateCAs value so that KOTS can use it to configure the SDK
* don't use the old value and instead just set the SSL_CERT_CONFIGMAP env var
* add additional handling to pod validation so it does not fail on job pods in an error state
* fix job completion validation
* add retry and timeout for pod validation
* allow pending pods in validation for the TestMultiNodeHAInstallation test
* fix unbound variable
* bypass pending pods for multi-node airgap HA
* fix extraEnv and check for unready pods
* fix dry run test
1 parent 8515c2d commit 30a4b3b

File tree

10 files changed

+131
-10
lines changed

10 files changed

+131
-10
lines changed

e2e/install_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1325,6 +1325,9 @@ func TestMultiNodeHAInstallation(t *testing.T) {
13251325

13261326
checkPostUpgradeStateWithOptions(t, tc, postUpgradeStateOptions{
13271327
node: 2,
1328+
withEnv: map[string]string{
1329+
"ALLOW_PENDING_PODS": "true",
1330+
},
13281331
})
13291332

13301333
t.Logf("%s: test complete", time.Now().Format(time.RFC3339))
@@ -1450,6 +1453,9 @@ func TestMultiNodeAirgapHAInstallation(t *testing.T) {
14501453

14511454
checkPostUpgradeStateWithOptions(t, tc, postUpgradeStateOptions{
14521455
node: 2,
1456+
withEnv: map[string]string{
1457+
"ALLOW_PENDING_PODS": "true",
1458+
},
14531459
})
14541460

14551461
t.Logf("%s: test complete", time.Now().Format(time.RFC3339))

e2e/scripts/check-airgap-installation-state.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ main() {
4343
exit 1
4444
fi
4545

46-
validate_no_pods_in_crashloop
46+
validate_all_pods_healthy
4747
}
4848

4949
main "$@"

e2e/scripts/check-airgap-post-ha-state.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ main() {
8181
# scale the second deployment back down so that they aren't restored in the DR test
8282
kubectl scale -n "$APP_NAMESPACE" deployment/second --replicas=0
8383

84-
validate_no_pods_in_crashloop
84+
validate_all_pods_healthy
8585
}
8686

8787
main "$@"

e2e/scripts/check-installation-state.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ main() {
4949
validate_data_dirs
5050
fi
5151

52-
validate_no_pods_in_crashloop
52+
validate_all_pods_healthy
5353
}
5454

5555
main "$@"

e2e/scripts/check-post-ha-state.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ main() {
6161
exit 1
6262
fi
6363

64-
validate_no_pods_in_crashloop
64+
validate_all_pods_healthy
6565
}
6666

6767
main "$@"

e2e/scripts/check-postupgrade-state.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ main() {
111111

112112
validate_data_dirs
113113

114-
validate_no_pods_in_crashloop
114+
validate_all_pods_healthy
115115
}
116116

117117
main "$@"

e2e/scripts/common.sh

Lines changed: 109 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -455,12 +455,116 @@ validate_data_dirs() {
455455
fi
456456
}
457457

458-
validate_no_pods_in_crashloop() {
459-
if kubectl get pods -A | grep CrashLoopBackOff -q ; then
460-
echo "found pods in CrashLoopBackOff state"
461-
kubectl get pods -A | grep CrashLoopBackOff
462-
exit 1
458+
# validate_non_job_pods_healthy inspects every pod in the cluster that is not
# owned by a Job and reports the ones that are in an unexpected phase or that
# are Running without all of their containers being ready.
#
# Environment:
#   ALLOW_PENDING_PODS  when set to "true", pods in the Pending phase are
#                       tolerated in addition to the default allowed phases.
#
# Returns 0 when every non-Job pod is healthy, 1 otherwise (including when the
# pod list cannot be retrieved). Findings are written to stdout.
validate_non_job_pods_healthy() {
    local pod_list
    local unhealthy_pods
    local unready_pods
    # "Completed" is kept from the original allow-list for compatibility.
    local allowed_phases="Running Completed Succeeded"
    local mode_suffix=""
    local has_issues=0

    # Check for environment variable override (used by specific tests)
    if [ "${ALLOW_PENDING_PODS:-}" = "true" ]; then
        allowed_phases="$allowed_phases Pending"
        mode_suffix=" (allowing Pending pods)"
    fi

    # Query the cluster once so that the phase check and the readiness check
    # look at the same snapshot, and so that a kubectl failure is reported
    # instead of being mistaken for "no unhealthy pods".
    if ! pod_list=$(kubectl get pods -A --no-headers -o custom-columns="NAMESPACE:.metadata.namespace,NAME:.metadata.name,STATUS:.status.phase,READY:.status.containerStatuses[*].ready,OWNER:.metadata.ownerReferences[0].kind"); then
        echo "failed to list pods"
        return 1
    fi

    # Columns: $1 namespace, $2 name, $3 phase, $4 ready flags, $5 owner kind.
    # NF >= 5 skips the blank line produced when there are no pods at all.
    unhealthy_pods=$(printf '%s\n' "$pod_list" | awk -v allowed="$allowed_phases" '
        BEGIN { n = split(allowed, names, " "); for (i = 1; i <= n; i++) ok[names[i]] = 1 }
        NF >= 5 && $5 != "Job" && !($3 in ok) { print $1 "/" $2 " (" $3 ")" }')

    # Check container readiness for Running pods only. kubectl joins the
    # per-container values with commas (e.g. "true,true") and prints "<none>"
    # when there are no container statuses, so anything that is not a
    # comma-separated list of "true" counts as not ready.
    unready_pods=$(printf '%s\n' "$pod_list" | awk '
        NF >= 5 && $5 != "Job" && $3 == "Running" && $4 !~ /^true(,true)*$/ { print $1 "/" $2 " (not ready)" }')

    if [ -n "$unhealthy_pods" ]; then
        echo "found non-Job pods in unhealthy state:"
        echo "$unhealthy_pods"
        has_issues=1
    fi

    if [ -n "$unready_pods" ]; then
        echo "found non-Job pods that are Running but not ready:"
        echo "$unready_pods"
        has_issues=1
    fi

    if [ "$has_issues" -eq 1 ]; then
        return 1
    fi

    # Only claim success after both checks have actually passed.
    echo "All non-Job pods are healthy${mode_suffix}"
    return 0
}
499+
500+
# validate_jobs_completed verifies that every Job in the cluster has finished
# successfully by comparing .status.succeeded with .spec.completions.
#
# Returns 0 when all Jobs have completed, 1 otherwise (including when the Job
# list cannot be retrieved). Findings are written to stdout.
validate_jobs_completed() {
    local job_list
    local incomplete_jobs

    # Fail loudly if kubectl itself fails; an empty result must not be read as
    # "no incomplete jobs".
    if ! job_list=$(kubectl get jobs -A --no-headers -o custom-columns="NAMESPACE:.metadata.namespace,NAME:.metadata.name,COMPLETIONS:.spec.completions,SUCCESSFUL:.status.succeeded"); then
        echo "failed to list jobs"
        return 1
    fi

    # Columns: $1 namespace, $2 name, $3 desired completions, $4 succeeded.
    # kubectl prints "<none>" for unset fields:
    #   - an unset succeeded count means nothing has succeeded yet (0);
    #   - an unset completions count is treated as "at least one success",
    #     so "<none>/<none>" is no longer mistaken for a completed Job.
    # NF >= 4 skips the blank line produced when there are no Jobs at all.
    incomplete_jobs=$(printf '%s\n' "$job_list" | awk '
        NF >= 4 {
            succeeded = ($4 == "<none>") ? 0 : $4 + 0
            if ($3 == "<none>") {
                complete = (succeeded >= 1)
            } else {
                complete = (succeeded == $3 + 0)
            }
            if (!complete) {
                print $1 "/" $2 " (succeeded: " $4 "/" $3 ")"
            }
        }')

    if [ -n "$incomplete_jobs" ]; then
        echo "found Jobs that have not completed successfully:"
        echo "$incomplete_jobs"
        echo ""
        echo "Job details:"
        # Best-effort extra context; the failure is already established.
        kubectl get jobs -A || true
        return 1
    fi

    echo "All Jobs have completed successfully"
    return 0
}
518+
519+
# validate_all_pods_healthy polls the cluster until validate_non_job_pods_healthy
# and validate_jobs_completed both succeed, or until the timeout expires.
#
# Arguments (both optional, so existing zero-argument callers are unaffected):
#   $1  timeout in seconds (default: 300)
#   $2  polling interval in seconds (default: 10)
#
# Environment:
#   ALLOW_PENDING_PODS  only read here to announce the mode in the log output.
#
# Returns 0 as soon as both checks pass, 1 on timeout. On timeout the output
# of the last failed checks is printed.
validate_all_pods_healthy() {
    local timeout="${1:-300}"
    local interval="${2:-10}"
    local start_time
    local elapsed_time
    local pods_report
    local jobs_report
    local pods_healthy
    local jobs_healthy
    start_time=$(date +%s)

    # Show what mode we're in
    if [ "${ALLOW_PENDING_PODS:-}" = "true" ]; then
        echo "Validating pod and job health (allowing Pending pods)..."
    else
        echo "Validating pod and job health (default: Running, Completed, Succeeded)..."
    fi

    while true; do
        pods_healthy=0
        jobs_healthy=0

        # Capture each report instead of discarding it, so that on timeout we
        # can show exactly the state that caused the failure.
        if pods_report=$(validate_non_job_pods_healthy 2>&1); then
            pods_healthy=1
        fi

        if jobs_report=$(validate_jobs_completed 2>&1); then
            jobs_healthy=1
        fi

        if [ "$pods_healthy" -eq 1 ] && [ "$jobs_healthy" -eq 1 ]; then
            echo "All pods and jobs are healthy"
            return 0
        fi

        # The timeout is evaluated after the check, so the cluster always gets
        # a final evaluation following the last sleep.
        elapsed_time=$(( $(date +%s) - start_time ))

        if [ "$elapsed_time" -ge "$timeout" ]; then
            echo "Timed out waiting for pods and jobs to be healthy after ${timeout} seconds"

            # Show detailed failure info from the last attempt
            echo "$pods_report"
            echo ""
            echo "$jobs_report"

            return 1
        fi

        echo "Waiting for pods and jobs to be healthy... (${elapsed_time}s elapsed)"
        sleep "$interval"
    done
}
465569

466570
validate_worker_profile() {

pkg/addons/adminconsole/static/values.tpl.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,6 @@ passwordSecretRef:
1919
name: kotsadm-password
2020
service:
2121
enabled: false
22+
extraEnv:
23+
- name: SSL_CERT_CONFIGMAP
24+
value: "kotsadm-private-cas"

pkg/addons/adminconsole/values.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ func (a *AdminConsole) GenerateHelmValues(ctx context.Context, kcli client.Clien
5757
"name": "ENABLE_IMPROVED_DR",
5858
"value": "true",
5959
},
60+
{
61+
"name": "SSL_CERT_CONFIGMAP",
62+
"value": "kotsadm-private-cas",
63+
},
6064
}
6165

6266
if a.Proxy != nil {

tests/dryrun/install_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -722,6 +722,10 @@ func TestHTTPProxyWithCABundleConfiguration(t *testing.T) {
722722
"name": "ENABLE_IMPROVED_DR",
723723
"value": "true",
724724
},
725+
{
726+
"name": "SSL_CERT_CONFIGMAP",
727+
"value": "kotsadm-private-cas",
728+
},
725729
{
726730
"name": "HTTP_PROXY",
727731
"value": "http://localhost:3128",

0 commit comments

Comments
 (0)