Skip to content

Commit cb461ca

Browse files
authored
[ROB-1014] Prometheus auto discovery (#375)
1 parent 1f7689d commit cb461ca

File tree

6 files changed

+622
-384
lines changed

6 files changed

+622
-384
lines changed

helm/holmes/values.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ toolsets:
4040
enabled: true
4141
internet:
4242
enabled: true
43+
prometheus/metrics:
44+
enabled: true
4345

4446

4547
resources:

holmes/plugins/toolsets/prometheus/prometheus.py

+27-17
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from requests import RequestException
2121

2222
from urllib.parse import urljoin
23-
23+
from holmes.plugins.toolsets.service_discovery import PrometheusDiscovery
2424
from holmes.plugins.toolsets.utils import (
2525
STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION,
2626
standard_start_datetime_tool_param_description,
@@ -36,7 +36,7 @@
3636

3737
class PrometheusConfig(BaseModel):
3838
# URL is optional because it can be set with an env var
39-
prometheus_url: Union[str, None]
39+
prometheus_url: Optional[str]
4040
healthcheck: str = "-/healthy"
4141
# Setting to None will remove the time window from the request for labels
4242
metrics_labels_time_window_hrs: Union[int, None] = 48
@@ -734,25 +734,35 @@ def _reload_llm_instructions(self):
734734
self._load_llm_instructions(jinja_template=f"file://{template_file_path}")
735735

736736
def prerequisites_callable(self, config: dict[str, Any]) -> Tuple[bool, str]:
    """
    Configure the toolset and report whether it can be used.

    The Prometheus URL is resolved in this order:
      1. the explicit toolset ``config`` (the outcome of ``self._is_healthy()``
         is returned in that case),
      2. the ``PROMETHEUS_URL`` environment variable,
      3. auto discovery through ``self.auto_detect_prometheus_url()``.

    Returns a ``(ok, message)`` tuple. On the env-var / auto-discovery path the
    message is empty on success and explains the failure otherwise.
    """
    if config:
        self.config = PrometheusConfig(**config)
        self._reload_llm_instructions()
        return self._is_healthy()

    prometheus_url = os.environ.get("PROMETHEUS_URL")
    auto_discovered = False
    if not prometheus_url:
        prometheus_url = self.auto_detect_prometheus_url()
        auto_discovered = True
        if not prometheus_url:
            return (
                False,
                "Unable to auto-detect prometheus. Define prometheus_url in the configuration for tool prometheus/metrics",
            )

    self.config = PrometheusConfig(
        prometheus_url=prometheus_url,
        headers=add_prometheus_auth(os.environ.get("PROMETHEUS_AUTH_HEADER")),
    )
    # Report where the URL actually came from: claiming "auto discovered" for an
    # env-var supplied URL is misleading when debugging a deployment.
    if auto_discovered:
        logging.info(f"Prometheus auto discovered at url {prometheus_url}")
    else:
        logging.info(
            f"Prometheus url taken from the PROMETHEUS_URL environment variable: {prometheus_url}"
        )
    self._reload_llm_instructions()
    return True, ""
758+
759+
def auto_detect_prometheus_url(self) -> Optional[str]:
    """
    Return the URL found by service discovery, or a falsy value when nothing is found.

    ``PrometheusDiscovery.find_prometheus_url()`` is consulted first;
    ``PrometheusDiscovery.find_vm_url()`` is only called when that lookup
    comes back empty.
    """
    # `or` short-circuits, so the second lookup runs only if the first one found nothing.
    return PrometheusDiscovery.find_prometheus_url() or PrometheusDiscovery.find_vm_url()
765+
756766
def _is_healthy(self) -> Tuple[bool, str]:
757767
if (
758768
not hasattr(self, "config")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import logging
2+
from kubernetes import client
3+
from kubernetes.client import V1ServiceList
4+
from kubernetes.client.models.v1_service import V1Service
5+
import os
6+
from typing import List, Optional
7+
from kubernetes import config
8+
9+
# DNS suffix appended to `<name>.<namespace>.svc.` when building service URLs;
# can be overridden with the CLUSTER_DOMAIN environment variable.
CLUSTER_DOMAIN = os.environ.get("CLUSTER_DOMAIN", "cluster.local")

# Load Kubernetes client credentials once, at import time. A missing
# configuration is tolerated (only logged) so the module can still be imported.
try:
    if not os.getenv("KUBERNETES_SERVICE_HOST"):
        # KUBERNETES_SERVICE_HOST is unset/empty: fall back to the local kube-config.
        config.load_kube_config()
    else:
        config.load_incluster_config()
except config.config_exception.ConfigException as e:
    logging.warning(f"Running without kube-config! e={e}")
18+
19+
20+
def find_service_url(label_selector: str) -> Optional[str]:
    """
    Get the url of an in-cluster service with a specific label

    Services matching ``label_selector`` are listed across all namespaces and the
    URL ``http://<name>.<namespace>.svc.<CLUSTER_DOMAIN>:<port>`` is built from the
    first match and the first port it declares.

    Returns None when no service matches, when the matched service declares no
    ports, or when the lookup fails for any reason (the failure is logged, never
    raised).
    """
    # we do it this way because there is a weird issue with hikaru's ServiceList.listServiceForAllNamespaces()
    try:
        v1 = client.CoreV1Api()
        svc_list: V1ServiceList = v1.list_service_for_all_namespaces(
            label_selector=label_selector
        )
        if not svc_list.items:
            return None
        svc: V1Service = svc_list.items[0]
        name = svc.metadata.name
        namespace = svc.metadata.namespace
        ports = svc.spec.ports
        if not ports:
            # Handle this explicitly instead of relying on an IndexError/TypeError
            # being swallowed by the blanket handler below.
            logging.warning(
                f"service `{name}` in namespace `{namespace}` matches label-selector: `{label_selector}` but exposes no ports"
            )
            return None
        port = ports[0].port
        url = f"http://{name}.{namespace}.svc.{CLUSTER_DOMAIN}:{port}"
        logging.info(
            f"discovered service with label-selector: `{label_selector}` at url: `{url}`"
        )
        return url
    except Exception as e:
        # Discovery is best-effort, so never raise; but do say which selector
        # failed and why, otherwise the warning is impossible to act on.
        logging.warning(
            f"Error finding url for label-selector: `{label_selector}`: {type(e).__name__}: {e}"
        )
        return None
44+
45+
46+
class ServiceDiscovery:
    """Helpers for locating in-cluster services by label selector."""

    @classmethod
    def find_url(cls, selectors: List[str], error_msg: str) -> Optional[str]:
        """
        Try to autodiscover the url of an in-cluster service

        The selectors are tried in the given order and the first URL found is
        returned. When none of them yields a URL, ``error_msg`` is logged at
        debug level and None is returned.
        """
        # Lazy generator: selectors after the first hit are never looked up.
        lookups = (find_service_url(selector) for selector in selectors)
        found = next((url for url in lookups if url), None)
        if found is None:
            logging.debug(error_msg)
        return found
60+
61+
62+
class PrometheusDiscovery(ServiceDiscovery):
    """Service discovery presets for Prometheus-compatible metrics backends."""

    @classmethod
    def find_prometheus_url(cls) -> Optional[str]:
        """
        Look for a Prometheus or Thanos query service using a fixed list of
        label selectors, passed to ``find_url`` in the order listed below.
        """
        prometheus_selectors = [
            "app=kube-prometheus-stack-prometheus",
            "app=prometheus,component=server,release!=kubecost",
            "app=prometheus-server",
            "app=prometheus-operator-prometheus",
            "app=rancher-monitoring-prometheus",
            "app=prometheus-prometheus",
            # Thanos query services
            "app.kubernetes.io/component=query,app.kubernetes.io/name=thanos",
            "app.kubernetes.io/name=thanos-query",
            "app=thanos-query",
            "app=thanos-querier",
        ]
        not_found_msg = "Prometheus url could not be found. Add 'prometheus_url' under your prometheus tools config"
        return super().find_url(selectors=prometheus_selectors, error_msg=not_found_msg)

    @classmethod
    def find_vm_url(cls) -> Optional[str]:
        """
        Look for a Victoria Metrics service using a fixed list of label
        selectors, passed to ``find_url`` in the order listed below.
        """
        vm_selectors = [
            "app.kubernetes.io/name=vmsingle",
            "app.kubernetes.io/name=victoria-metrics-single",
            "app.kubernetes.io/name=vmselect",
            "app=vmselect",
        ]
        not_found_msg = "Victoria Metrics url could not be found. Add 'prometheus_url' under your prometheus tools config"
        return super().find_url(selectors=vm_selectors, error_msg=not_found_msg)

0 commit comments

Comments
 (0)