From 0ee72d3799e3cf9752520599a61ffd545c2e5991 Mon Sep 17 00:00:00 2001 From: ada mancini Date: Mon, 6 Jan 2025 15:54:39 -0500 Subject: [PATCH 01/10] limit velero backups & restores to most recent object --- pkg/analyze/velero.go | 44 +++++++++++++++++-- .../troubleshoot/v1beta2/analyzer_shared.go | 4 +- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/pkg/analyze/velero.go b/pkg/analyze/velero.go index 6be5327b4..583afbd77 100644 --- a/pkg/analyze/velero.go +++ b/pkg/analyze/velero.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "path/filepath" + "sort" "strings" appsV1 "k8s.io/api/apps/v1" @@ -65,6 +66,17 @@ func (a *AnalyzeVelero) veleroStatus(analyzer *troubleshootv1beta2.VeleroAnalyze oldVeleroRepoType = true } + // default to only the most recent Backup and Restore object if not specified in the analyzer spec + backupCount := analyzer.BackupsCount + if backupCount <= 0 { + backupCount = 1 + } + + restoreCount := analyzer.RestoresCount + if restoreCount <= 0 { + restoreCount = 1 + } + if oldVeleroRepoType == true { // old velero (v1.9.x) has a BackupRepositoryTypeRestic // get resticrepositories.velero.io @@ -276,12 +288,12 @@ func (a *AnalyzeVelero) veleroStatus(analyzer *troubleshootv1beta2.VeleroAnalyze results = append(results, analyzeLogs(nodeAgentlogs, "node-agent*")...) results = append(results, analyzeLogs(veleroLogs, "velero*")...) - results = append(results, analyzeBackups(backups)...) + results = append(results, analyzeBackups(backups, backupCount)...) results = append(results, analyzeBackupStorageLocations(backupStorageLocations)...) results = append(results, analyzeDeleteBackupRequests(deleteBackupRequests)...) results = append(results, analyzePodVolumeBackups(podVolumeBackups)...) results = append(results, analyzePodVolumeRestores(podVolumeRestores)...) - results = append(results, analyzeRestores(restores)...) + results = append(results, analyzeRestores(restores, restoreCount)...) results = append(results, analyzeSchedules(schedules)...) results = append(results, analyzeVolumeSnapshotLocations(volumeSnapshotLocations)...) @@ -357,9 +369,19 @@ func analyzeResticRepositories(resticRepositories []*restic_types.ResticReposito return results } -func analyzeBackups(backups []*velerov1.Backup) []*AnalyzeResult { +func analyzeBackups(backups []*velerov1.Backup, count int) []*AnalyzeResult { results := []*AnalyzeResult{} + // Sort backups by StartTimestamp in descending order + sort.SliceStable(backups, func(i, j int) bool { + return backups[i].Status.StartTimestamp.After(backups[j].Status.StartTimestamp.Time) + }) + + // Limit to the most recent backupCount items + if len(backups) > count { + backups = backups[:count] + } + failedPhases := map[velerov1.BackupPhase]bool{ velerov1.BackupPhaseFailed: true, velerov1.BackupPhasePartiallyFailed: true, @@ -501,10 +523,20 @@ func analyzePodVolumeRestores(podVolumeRestores []*velerov1.PodVolumeRestore) [] return results } -func analyzeRestores(restores []*velerov1.Restore) []*AnalyzeResult { +func analyzeRestores(restores []*velerov1.Restore, count int) []*AnalyzeResult { results := []*AnalyzeResult{} failures := 0 + // Sort restores by StartTimestamp in descending order + sort.SliceStable(restores, func(i, j int) bool { + return restores[i].Status.StartTimestamp.After(restores[j].Status.StartTimestamp.Time) + }) + + // Limit to the most recent restoreCount items + if len(restores) > count { + restores = restores[:count] + } + if len(restores) > 0 { failedPhases := map[velerov1.RestorePhase]bool{ @@ -513,6 +545,10 @@ func analyzeRestores(restores []*velerov1.Restore) []*AnalyzeResult { velerov1.RestorePhaseFailedValidation: true, } + failureReasons := []string{ + "found a restore with status \"InProgress\" during the server starting, mark it as \"Failed\"", + } + for _, restore := range restores { if failedPhases[restore.Status.Phase] { result := &AnalyzeResult{ diff --git a/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go b/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go index 55d1c5ac8..7efea27cb 100644 --- a/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go +++ b/pkg/apis/troubleshoot/v1beta2/analyzer_shared.go @@ -200,7 +200,9 @@ type CephStatusAnalyze struct { } type VeleroAnalyze struct { - AnalyzeMeta `json:",inline" yaml:",inline"` + AnalyzeMeta `json:",inline" yaml:",inline"` + BackupsCount int `json:"backupCount,omitempty" yaml:"backupCount,omitempty"` + RestoresCount int `json:"restoreCount,omitempty" yaml:"restoreCount,omitempty"` } type LonghornAnalyze struct { From c979ac2598d24773a4595d6f6ff7d9e5b9eac149 Mon Sep 17 00:00:00 2001 From: ada mancini Date: Mon, 6 Jan 2025 15:55:47 -0500 Subject: [PATCH 02/10] make schemas --- config/crds/troubleshoot.sh_analyzers.yaml | 4 ++++ config/crds/troubleshoot.sh_preflights.yaml | 4 ++++ config/crds/troubleshoot.sh_supportbundles.yaml | 4 ++++ schemas/analyzer-troubleshoot-v1beta2.json | 6 ++++++ schemas/preflight-troubleshoot-v1beta2.json | 6 ++++++ schemas/supportbundle-troubleshoot-v1beta2.json | 6 ++++++ 6 files changed, 30 insertions(+) diff --git a/config/crds/troubleshoot.sh_analyzers.yaml b/config/crds/troubleshoot.sh_analyzers.yaml index 527509e6c..e66ab7369 100644 --- a/config/crds/troubleshoot.sh_analyzers.yaml +++ b/config/crds/troubleshoot.sh_analyzers.yaml @@ -1809,10 +1809,14 @@ spec: additionalProperties: type: string type: object + backupCount: + type: integer checkName: type: string exclude: type: BoolString + restoreCount: + type: integer strict: type: BoolString type: object diff --git a/config/crds/troubleshoot.sh_preflights.yaml b/config/crds/troubleshoot.sh_preflights.yaml index ceb3429f6..f2d573f7f 100644 --- a/config/crds/troubleshoot.sh_preflights.yaml +++ b/config/crds/troubleshoot.sh_preflights.yaml @@ -1809,10 +1809,14 @@ spec: additionalProperties: type: string type: object + backupCount: + type: integer checkName: type: string exclude: type: BoolString + restoreCount: + type: integer strict: type: BoolString type: object diff --git a/config/crds/troubleshoot.sh_supportbundles.yaml b/config/crds/troubleshoot.sh_supportbundles.yaml index f6d3d6fcf..7e5e1aaa3 100644 --- a/config/crds/troubleshoot.sh_supportbundles.yaml +++ b/config/crds/troubleshoot.sh_supportbundles.yaml @@ -1840,10 +1840,14 @@ spec: additionalProperties: type: string type: object + backupCount: + type: integer checkName: type: string exclude: type: BoolString + restoreCount: + type: integer strict: type: BoolString type: object diff --git a/schemas/analyzer-troubleshoot-v1beta2.json b/schemas/analyzer-troubleshoot-v1beta2.json index fb1acc0ca..bb426c19d 100644 --- a/schemas/analyzer-troubleshoot-v1beta2.json +++ b/schemas/analyzer-troubleshoot-v1beta2.json @@ -2724,12 +2724,18 @@ "type": "string" } }, + "backupCount": { + "type": "integer" + }, "checkName": { "type": "string" }, "exclude": { "oneOf": [{"type": "string"},{"type": "boolean"}] }, + "restoreCount": { + "type": "integer" + }, "strict": { "oneOf": [{"type": "string"},{"type": "boolean"}] } diff --git a/schemas/preflight-troubleshoot-v1beta2.json b/schemas/preflight-troubleshoot-v1beta2.json index 7e2e40a28..20c0407cf 100644 --- a/schemas/preflight-troubleshoot-v1beta2.json +++ b/schemas/preflight-troubleshoot-v1beta2.json @@ -2724,12 +2724,18 @@ "type": "string" } }, + "backupCount": { + "type": "integer" + }, "checkName": { "type": "string" }, "exclude": { "oneOf": [{"type": "string"},{"type": "boolean"}] }, + "restoreCount": { + "type": "integer" + }, "strict": { "oneOf": [{"type": "string"},{"type": "boolean"}] } diff --git a/schemas/supportbundle-troubleshoot-v1beta2.json b/schemas/supportbundle-troubleshoot-v1beta2.json index cf79b9df6..b77e93146 100644 --- a/schemas/supportbundle-troubleshoot-v1beta2.json +++ b/schemas/supportbundle-troubleshoot-v1beta2.json @@ -2770,12 +2770,18 @@ "type": "string" } }, + "backupCount": { + "type": "integer" + }, "checkName": { "type": "string" }, "exclude": { "oneOf": [{"type": "string"},{"type": "boolean"}] }, + "restoreCount": { + "type": "integer" + }, "strict": { "oneOf": [{"type": "string"},{"type": "boolean"}] } From a379eb520e957acbc26e95f595906dd4a8a6c94e Mon Sep 17 00:00:00 2001 From: ada mancini Date: Mon, 6 Jan 2025 16:53:32 -0500 Subject: [PATCH 03/10] fix unused var --- pkg/analyze/velero.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/analyze/velero.go b/pkg/analyze/velero.go index 583afbd77..4b521f6e9 100644 --- a/pkg/analyze/velero.go +++ b/pkg/analyze/velero.go @@ -545,9 +545,9 @@ func analyzeRestores(restores []*velerov1.Restore, count int) []*AnalyzeResult { velerov1.RestorePhaseFailedValidation: true, } - failureReasons := []string{ - "found a restore with status \"InProgress\" during the server starting, mark it as \"Failed\"", - } + // failureReasons := []string{ + // "found a restore with status \"InProgress\" during the server starting, mark it as \"Failed\"", + // } for _, restore := range restores { if failedPhases[restore.Status.Phase] { From 70699ec9bb8a6fa54368b8b7b0ca85c0fe4e625a Mon Sep 17 00:00:00 2001 From: ada mancini Date: Tue, 7 Jan 2025 12:16:35 -0500 Subject: [PATCH 04/10] update tests for backups/restoresCount --- pkg/analyze/velero_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/analyze/velero_test.go b/pkg/analyze/velero_test.go index 995e6a3ec..79b6bd7d2 100644 --- a/pkg/analyze/velero_test.go +++ b/pkg/analyze/velero_test.go @@ -279,7 +279,7 @@ func TestAnalyzeVelero_Backups(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := analyzeBackups(tt.args.backups); !reflect.DeepEqual(got, tt.want) { + if got := analyzeBackups(tt.args.backups, 1); !reflect.DeepEqual(got, tt.want) { t.Errorf("analyzeBackups() = %v, want %v", got, tt.want) } }) @@ -629,7 +629,7 @@ func TestAnalyzeVelero_Restores(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := analyzeRestores(tt.args.restores); !reflect.DeepEqual(got, tt.want) { + if got := analyzeRestores(tt.args.restores, 1); !reflect.DeepEqual(got, tt.want) { t.Errorf("analyzeRestores() = %v, want %v", got, tt.want) } }) From e434b57e3918d9a559484310ca9847b4c039d0b3 Mon Sep 17 00:00:00 2001 From: ada mancini Date: Fri, 17 Jan 2025 15:39:25 -0500 Subject: [PATCH 05/10] failureReason - velero oom test --- pkg/analyze/velero_test.go | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pkg/analyze/velero_test.go b/pkg/analyze/velero_test.go index 79b6bd7d2..5a3d94e08 100644 --- a/pkg/analyze/velero_test.go +++ b/pkg/analyze/velero_test.go @@ -626,6 +626,33 @@ func TestAnalyzeVelero_Restores(t *testing.T) { }, }, }, + { + name: "restore failureReason - velero pod restarted", + args: args{ + restores: []*velerov1.Restore{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "instance-abcdef.kotsadm", + }, + Spec: velerov1.RestoreSpec{ + BackupName: "instance-abcdef", + }, + Status: velerov1.RestoreStatus{ + Phase: velerov1.RestorePhaseCompleted, + FailureReason: "found a restore with status \"InProgress\" during the server starting, mark it as \"Failed\"", + }, + }, + }, + }, + want: []*AnalyzeResult{ + { + Title: "Restore instance-abcdef.kotsadm", + Message: "Restore instance-abcdef.kotsadm reported a FailureReason: found a restore with status \"InProgress\" during the server starting, mark it as \"Failed\". Resolution: The Velero pod exited or restarted while a restore was already in progress, most likely due to running out of memory. Check the resource allocation of the velero pod and increase it or remove the memory limit.", + IsPass: false, + IsFail: true, + }, + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { From ef8d26796458e21d75e31019df0e3536f1baf8f5 Mon Sep 17 00:00:00 2001 From: ada mancini Date: Fri, 17 Jan 2025 15:40:58 -0500 Subject: [PATCH 06/10] style fixes --- pkg/analyze/velero.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/analyze/velero.go b/pkg/analyze/velero.go index 4b521f6e9..50d63832f 100644 --- a/pkg/analyze/velero.go +++ b/pkg/analyze/velero.go @@ -77,7 +77,7 @@ func (a *AnalyzeVelero) veleroStatus(analyzer *troubleshootv1beta2.VeleroAnalyze restoreCount = 1 } - if oldVeleroRepoType == true { + if oldVeleroRepoType { // old velero (v1.9.x) has a BackupRepositoryTypeRestic // get resticrepositories.velero.io resticRepositoriesDir := GetVeleroResticRepositoriesDirectory() @@ -284,7 +284,7 @@ func (a *AnalyzeVelero) veleroStatus(analyzer *troubleshootv1beta2.VeleroAnalyze } veleroLogsGlob := filepath.Join(logsDir, "velero*", "*.log") - veleroLogs, err := findFiles(veleroLogsGlob, excludeFiles) + veleroLogs, _ := findFiles(veleroLogsGlob, excludeFiles) results = append(results, analyzeLogs(nodeAgentlogs, "node-agent*")...) results = append(results, analyzeLogs(veleroLogs, "velero*")...) @@ -666,14 +666,14 @@ func aggregateResults(results []*AnalyzeResult) []*AnalyzeResult { out = append(out, result) } if len(results) > 0 { - if resultFailed == false { + if !resultFailed { out = append(out, &AnalyzeResult{ Title: "Velero Status", IsPass: true, Message: "Velero setup is healthy", }) } - if resultFailed == true { + if resultFailed { out = append(out, &AnalyzeResult{ Title: "Velero Status", IsWarn: true, From 7f993dbb9ac4c4a888c78982117d5cd8d50336e0 Mon Sep 17 00:00:00 2001 From: ada mancini Date: Fri, 17 Jan 2025 15:42:35 -0500 Subject: [PATCH 07/10] surface known failureReasons --- pkg/analyze/velero.go | 61 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/pkg/analyze/velero.go b/pkg/analyze/velero.go index 50d63832f..f1e57ea0a 100644 --- a/pkg/analyze/velero.go +++ b/pkg/analyze/velero.go @@ -388,16 +388,30 @@ func analyzeBackups(backups []*velerov1.Backup, count int) []*AnalyzeResult { velerov1.BackupPhaseFailedValidation: true, } - for _, backup := range backups { + // knownFailureReasons is a map of known failure messages to their resolutions + knownFailureReasons := map[string]string{ + "some known error message": "Resolution for the known error.", + } + for _, backup := range backups { if failedPhases[backup.Status.Phase] { result := &AnalyzeResult{ Title: fmt.Sprintf("Backup %s", backup.Name), } + + // Check if the backup has a failure reason and it's in the map + if backup.Status.FailureReason != "" { + if resolution, found := knownFailureReasons[backup.Status.FailureReason]; found { + result.Message = fmt.Sprintf("Backup %s phase is %s. Reason: %s. Resolution: %s", backup.Name, backup.Status.Phase, backup.Status.FailureReason, resolution) + } else { + result.Message = fmt.Sprintf("Backup %s phase is %s. Reason: %s", backup.Name, backup.Status.Phase, backup.Status.FailureReason) + } + } else { + result.Message = fmt.Sprintf("Backup %s phase is %s", backup.Name, backup.Status.Phase) + } + result.IsFail = true - result.Message = fmt.Sprintf("Backup %s phase is %s", backup.Name, backup.Status.Phase) results = append(results, result) - } } if len(backups) > 0 { @@ -471,14 +485,31 @@ func analyzeDeleteBackupRequests(deleteBackupRequests []*velerov1.DeleteBackupRe func analyzePodVolumeBackups(podVolumeBackups []*velerov1.PodVolumeBackup) []*AnalyzeResult { results := []*AnalyzeResult{} failures := 0 + + // knownFailureMessages is a map of known failure messages to their resolutions + knownFailureMessages := map[string]string{ + "example known error message": "Resolution for the known pod volume backup error.", + } + if len(podVolumeBackups) > 0 { for _, podVolumeBackup := range podVolumeBackups { if podVolumeBackup.Status.Phase == velerov1.PodVolumeBackupPhaseFailed { result := &AnalyzeResult{ Title: fmt.Sprintf("Pod Volume Backup %s", podVolumeBackup.Name), } + + // Check if the pod volume backup has a status message and it's in the map + if podVolumeBackup.Status.Message != "" { + if resolution, found := knownFailureMessages[podVolumeBackup.Status.Message]; found { + result.Message = fmt.Sprintf("Pod Volume Backup %s phase is %s. Message: %s. Resolution: %s", podVolumeBackup.Name, podVolumeBackup.Status.Phase, podVolumeBackup.Status.Message, resolution) + } else { + result.Message = fmt.Sprintf("Pod Volume Backup %s phase is %s. Message: %s", podVolumeBackup.Name, podVolumeBackup.Status.Phase, podVolumeBackup.Status.Message) + } + } else { + result.Message = fmt.Sprintf("Pod Volume Backup %s phase is %s", podVolumeBackup.Name, podVolumeBackup.Status.Phase) + } + result.IsFail = true - result.Message = fmt.Sprintf("Pod Volume Backup %s phase is %s", podVolumeBackup.Name, podVolumeBackup.Status.Phase) results = append(results, result) failures++ } @@ -545,17 +576,29 @@ func analyzeRestores(restores []*velerov1.Restore, count int) []*AnalyzeResult { velerov1.RestorePhaseFailedValidation: true, } - // failureReasons := []string{ - // "found a restore with status \"InProgress\" during the server starting, mark it as \"Failed\"", - // } + // knownFailureReasons is a map of strings to strings that are used to detect specific failure messages and return a resolution + knownFailureReasons := map[string]string{ + "found a restore with status \"InProgress\" during the server starting, mark it as \"Failed\"": "The Velero pod exited or restarted while a restore was already in progress, most likely due to running out of memory. Check the resource allocation of the velero pod and increase it or remove the memory limit.", + } for _, restore := range restores { - if failedPhases[restore.Status.Phase] { + if failedPhases[restore.Status.Phase] || restore.Status.FailureReason != "" { result := &AnalyzeResult{ Title: fmt.Sprintf("Restore %s", restore.Name), } + + // Check if the restore has a failure reason and it's in the map + if restore.Status.FailureReason != "" { + if resolution, found := knownFailureReasons[restore.Status.FailureReason]; found { + result.Message = fmt.Sprintf("Restore %s reported a FailureReason: %s. Resolution: %s", restore.Name, restore.Status.FailureReason, resolution) + } else { + result.Message = fmt.Sprintf("Restore %s phase is %s. Reason: %s", restore.Name, restore.Status.Phase, restore.Status.FailureReason) + } + } else { + result.Message = fmt.Sprintf("Restore %s phase is %s", restore.Name, restore.Status.Phase) + } + result.IsFail = true - result.Message = fmt.Sprintf("Restore %s phase is %s", restore.Name, restore.Status.Phase) results = append(results, result) failures++ } From 12eb3008284d076fb0e4359fe20ca4e3339f864f Mon Sep 17 00:00:00 2001 From: ada mancini Date: Fri, 17 Jan 2025 15:42:53 -0500 Subject: [PATCH 08/10] upgrade warnings to failures --- pkg/analyze/velero.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/analyze/velero.go b/pkg/analyze/velero.go index f1e57ea0a..505a92a83 100644 --- a/pkg/analyze/velero.go +++ b/pkg/analyze/velero.go @@ -684,7 +684,7 @@ func analyzeLogs(logs map[string][]byte, kind string) []*AnalyzeResult { } if strings.Contains(logContent, "error") || strings.Contains(logContent, "panic") || strings.Contains(logContent, "fatal") { - result.IsWarn = true + result.IsFail = true result.Message = fmt.Sprintf("Found error|panic|fatal in %s pod log file(s)", kind) results = append(results, result) } From 4276072cdf16e3d01edb8a8926a20b860cce2cb5 Mon Sep 17 00:00:00 2001 From: ada mancini Date: Fri, 17 Jan 2025 15:43:09 -0500 Subject: [PATCH 09/10] upgrade warnings to failures --- pkg/analyze/velero.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/analyze/velero.go b/pkg/analyze/velero.go index 505a92a83..954905b17 100644 --- a/pkg/analyze/velero.go +++ b/pkg/analyze/velero.go @@ -677,7 +677,7 @@ func analyzeLogs(logs map[string][]byte, kind string) []*AnalyzeResult { Title: fmt.Sprintf("Velero logs for pod [%s]", key), } if strings.Contains(logContent, "permission denied") { - result.IsWarn = true + result.IsFail = true result.Message = fmt.Sprintf("Found 'permission denied' in %s pod log file(s)", kind) results = append(results, result) continue From 5a7c58bb1f06c02d27b500b39147564cad073a84 Mon Sep 17 00:00:00 2001 From: ada mancini Date: Tue, 21 Jan 2025 15:22:14 -0500 Subject: [PATCH 10/10] upgrade warnings to errors --- pkg/analyze/velero_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/analyze/velero_test.go b/pkg/analyze/velero_test.go index 5a3d94e08..d899584c1 100644 --- a/pkg/analyze/velero_test.go +++ b/pkg/analyze/velero_test.go @@ -921,7 +921,7 @@ func TestAnalyzeVelero_Logs(t *testing.T) { { Title: "Velero logs for pod [node-agent-m6n9j]", Message: "Found error|panic|fatal in node-agent* pod log file(s)", - IsWarn: true, + IsFail: true, }, { Title: "Velero Logs analysis for kind [node-agent*]",