
Commit 7f993db

surface known failureReasons

1 parent ef8d267
File tree

1 file changed

pkg/analyze/velero.go

Lines changed: 52 additions & 9 deletions
@@ -388,16 +388,30 @@ func analyzeBackups(backups []*velerov1.Backup, count int) []*AnalyzeResult {
 		velerov1.BackupPhaseFailedValidation: true,
 	}
 
-	for _, backup := range backups {
+	// knownFailureReasons is a map of known failure messages to their resolutions
+	knownFailureReasons := map[string]string{
+		"some known error message": "Resolution for the known error.",
+	}
 
+	for _, backup := range backups {
 		if failedPhases[backup.Status.Phase] {
 			result := &AnalyzeResult{
 				Title: fmt.Sprintf("Backup %s", backup.Name),
 			}
+
+			// Check if the backup has a failure reason and it's in the map
+			if backup.Status.FailureReason != "" {
+				if resolution, found := knownFailureReasons[backup.Status.FailureReason]; found {
+					result.Message = fmt.Sprintf("Backup %s phase is %s. Reason: %s. Resolution: %s", backup.Name, backup.Status.Phase, backup.Status.FailureReason, resolution)
+				} else {
+					result.Message = fmt.Sprintf("Backup %s phase is %s. Reason: %s", backup.Name, backup.Status.Phase, backup.Status.FailureReason)
+				}
+			} else {
+				result.Message = fmt.Sprintf("Backup %s phase is %s", backup.Name, backup.Status.Phase)
+			}
+
 			result.IsFail = true
-			result.Message = fmt.Sprintf("Backup %s phase is %s", backup.Name, backup.Status.Phase)
 			results = append(results, result)
-
 		}
 	}
 	if len(backups) > 0 {
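
The first hunk adds an exact-match lookup from a backup's Status.FailureReason to a suggested resolution, falling back to the raw reason (or just the phase) when no entry matches. A minimal sketch of how the new path could be exercised from a test, assuming this file's package name is analyzer and using the placeholder map entry above; the test name and backup name are illustrative:

package analyzer

import (
	"strings"
	"testing"

	velerov1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestAnalyzeBackupsSurfacesKnownFailureReason(t *testing.T) {
	// A failed backup whose FailureReason exactly matches a knownFailureReasons key.
	backup := &velerov1.Backup{
		ObjectMeta: metav1.ObjectMeta{Name: "nightly"},
		Status: velerov1.BackupStatus{
			Phase:         velerov1.BackupPhaseFailed,
			FailureReason: "some known error message",
		},
	}

	results := analyzeBackups([]*velerov1.Backup{backup}, 1)

	// The fail result should now carry the mapped resolution text.
	if len(results) == 0 || !strings.Contains(results[0].Message, "Resolution:") {
		t.Fatalf("expected a fail result with a resolution, got %+v", results)
	}
}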
@@ -471,14 +485,31 @@ func analyzeDeleteBackupRequests(deleteBackupRequests []*velerov1.DeleteBackupRe
 func analyzePodVolumeBackups(podVolumeBackups []*velerov1.PodVolumeBackup) []*AnalyzeResult {
 	results := []*AnalyzeResult{}
 	failures := 0
+
+	// knownFailureMessages is a map of known failure messages to their resolutions
+	knownFailureMessages := map[string]string{
+		"example known error message": "Resolution for the known pod volume backup error.",
+	}
+
 	if len(podVolumeBackups) > 0 {
 		for _, podVolumeBackup := range podVolumeBackups {
 			if podVolumeBackup.Status.Phase == velerov1.PodVolumeBackupPhaseFailed {
 				result := &AnalyzeResult{
 					Title: fmt.Sprintf("Pod Volume Backup %s", podVolumeBackup.Name),
 				}
+
+				// Check if the pod volume backup has a status message and it's in the map
+				if podVolumeBackup.Status.Message != "" {
+					if resolution, found := knownFailureMessages[podVolumeBackup.Status.Message]; found {
+						result.Message = fmt.Sprintf("Pod Volume Backup %s phase is %s. Message: %s. Resolution: %s", podVolumeBackup.Name, podVolumeBackup.Status.Phase, podVolumeBackup.Status.Message, resolution)
+					} else {
+						result.Message = fmt.Sprintf("Pod Volume Backup %s phase is %s. Message: %s", podVolumeBackup.Name, podVolumeBackup.Status.Phase, podVolumeBackup.Status.Message)
+					}
+				} else {
+					result.Message = fmt.Sprintf("Pod Volume Backup %s phase is %s", podVolumeBackup.Name, podVolumeBackup.Status.Phase)
+				}
+
 				result.IsFail = true
-				result.Message = fmt.Sprintf("Pod Volume Backup %s phase is %s", podVolumeBackup.Name, podVolumeBackup.Status.Phase)
 				results = append(results, result)
 				failures++
 			}
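
analyzePodVolumeBackups gets the same treatment, keyed on Status.Message instead of FailureReason. Because the lookup is an exact string match, a message that merely contains a known error text falls through to the plain "Message: %s" branch. A sketch of that fallback path, continuing the hypothetical test file above (same package, imports, and assumptions):

func TestAnalyzePodVolumeBackupsUnknownMessage(t *testing.T) {
	// A failure message that is not a key in knownFailureMessages.
	pvb := &velerov1.PodVolumeBackup{
		ObjectMeta: metav1.ObjectMeta{Name: "pvb-1"},
		Status: velerov1.PodVolumeBackupStatus{
			Phase:   velerov1.PodVolumeBackupPhaseFailed,
			Message: "some unrecognized failure",
		},
	}

	results := analyzePodVolumeBackups([]*velerov1.PodVolumeBackup{pvb})

	// The raw message is still surfaced, just without a resolution suffix.
	if len(results) == 0 || !strings.Contains(results[0].Message, "some unrecognized failure") {
		t.Fatalf("expected the raw message to be surfaced, got %+v", results)
	}
}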
@@ -545,17 +576,29 @@ func analyzeRestores(restores []*velerov1.Restore, count int) []*AnalyzeResult {
 		velerov1.RestorePhaseFailedValidation: true,
 	}
 
-	// failureReasons := []string{
-	// 	"found a restore with status \"InProgress\" during the server starting, mark it as \"Failed\"",
-	// }
+	// knownFailureReasons maps specific failure messages to a suggested resolution
+	knownFailureReasons := map[string]string{
+		"found a restore with status \"InProgress\" during the server starting, mark it as \"Failed\"": "The Velero pod exited or restarted while a restore was already in progress, most likely due to running out of memory. Check the resource allocation of the velero pod and increase it or remove the memory limit.",
+	}
 
 	for _, restore := range restores {
-		if failedPhases[restore.Status.Phase] {
+		if failedPhases[restore.Status.Phase] || restore.Status.FailureReason != "" {
 			result := &AnalyzeResult{
 				Title: fmt.Sprintf("Restore %s", restore.Name),
 			}
+
+			// Check if the restore has a failure reason and it's in the map
+			if restore.Status.FailureReason != "" {
+				if resolution, found := knownFailureReasons[restore.Status.FailureReason]; found {
+					result.Message = fmt.Sprintf("Restore %s reported a FailureReason: %s. Resolution: %s", restore.Name, restore.Status.FailureReason, resolution)
+				} else {
+					result.Message = fmt.Sprintf("Restore %s phase is %s. Reason: %s", restore.Name, restore.Status.Phase, restore.Status.FailureReason)
+				}
+			} else {
+				result.Message = fmt.Sprintf("Restore %s phase is %s", restore.Name, restore.Status.Phase)
+			}
+
 			result.IsFail = true
-			result.Message = fmt.Sprintf("Restore %s phase is %s", restore.Name, restore.Status.Phase)
 			results = append(results, result)
 			failures++
 		}
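
Beyond turning the old commented-out list into a real lookup, this hunk widens the trigger: a restore is now flagged when its phase is in failedPhases or whenever Status.FailureReason is non-empty, so a recorded reason is surfaced regardless of phase. A sketch of that new branch using the one real map entry, same assumptions as the earlier tests; the Completed phase is contrived purely to show that the reason alone trips the check:

func TestAnalyzeRestoresFlagsFailureReasonOnAnyPhase(t *testing.T) {
	// The phase alone would not match failedPhases; the FailureReason does.
	restore := &velerov1.Restore{
		ObjectMeta: metav1.ObjectMeta{Name: "restore-1"},
		Status: velerov1.RestoreStatus{
			Phase:         velerov1.RestorePhaseCompleted,
			FailureReason: `found a restore with status "InProgress" during the server starting, mark it as "Failed"`,
		},
	}

	results := analyzeRestores([]*velerov1.Restore{restore}, 1)

	// The known reason should map to the out-of-memory resolution above.
	if len(results) == 0 || !results[0].IsFail || !strings.Contains(results[0].Message, "Resolution:") {
		t.Fatalf("expected a fail result with a resolution, got %+v", results)
	}
}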
