Skip to content

Commit 21211b7

Browse files
done
1 parent 3c555f8 commit 21211b7

File tree

7 files changed

+118
-28
lines changed

7 files changed

+118
-28
lines changed

poetry.lock

+53-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ vertexai= "1.71.1"
3737
vllm = { version = "^0.6.3", optional = true }
3838
aiodocker = { version = "^0.24.0", optional = true }
3939
ray = { version = "^2.41.0", optional = true }
40+
posthog = "^3.11.0"
4041

4142
[tool.poetry.extras]
4243
vllm = ["vllm"]

src/bespokelabs/curator/constants.py

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
"""Defaults used across curator."""
22

33
BATCH_REQUEST_ID_TAG = "custom_id"
4+
_CURATOR_DEFAULT_CACHE_DIR = "~/.cache/curator"

src/bespokelabs/curator/llm/llm.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from pydantic import BaseModel
1212
from xxhash import xxh64
1313

14+
from bespokelabs.curator.constants import _CURATOR_DEFAULT_CACHE_DIR
1415
from bespokelabs.curator.db import MetadataDB
1516
from bespokelabs.curator.llm.prompt_formatter import PromptFormatter
1617
from bespokelabs.curator.request_processor._factory import _RequestProcessorFactory
@@ -19,7 +20,6 @@
1920
if TYPE_CHECKING:
2021
from datasets import Dataset
2122

22-
_CURATOR_DEFAULT_CACHE_DIR = "~/.cache/curator"
2323
T = TypeVar("T")
2424
_DictOrBaseModel = Dict[str, Any] | BaseModel
2525
logger = logging.getLogger(__name__)

src/bespokelabs/curator/status_tracker/batch_status_tracker.py

+9
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from rich.progress import BarColumn, Progress, TextColumn, TimeElapsedColumn
1111
from rich.table import Table
1212

13+
from bespokelabs.curator.telemetry.client import TelemetryEvent, telemetry_client
1314
from bespokelabs.curator.types.generic_batch import GenericBatch, GenericBatchStatus
1415
from bespokelabs.curator.types.generic_response import TokenUsage
1516

@@ -92,6 +93,14 @@ def stop_tracker(self):
9293
self._progress.stop()
9394
self.display_final_stats()
9495

96+
# update anonymized telemetry
97+
telemetry_client.capture(
98+
TelemetryEvent(
99+
event_type="BatchRequest",
100+
metadata=self.model_dump(),
101+
)
102+
)
103+
95104
def update_display(self):
96105
"""Update statistics with token usage and cost information."""
97106
# Format display texts

src/bespokelabs/curator/status_tracker/online_status_tracker.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22
import time
33
import typing as t
4-
from dataclasses import dataclass, field
4+
from dataclasses import asdict, dataclass, field
55
from enum import Enum
66
from typing import Optional
77

@@ -13,18 +13,23 @@
1313
from rich.progress import BarColumn, Progress, TextColumn, TimeElapsedColumn, TimeRemainingColumn
1414
from rich.table import Table
1515

16+
from bespokelabs.curator.telemetry.client import TelemetryEvent, telemetry_client
1617
from bespokelabs.curator.types.generic_response import TokenUsage
1718

1819
logger = logging.getLogger(__name__)
1920

2021

21-
class TokenLimitStrategy(Enum):
22+
class TokenLimitStrategy(str, Enum):
2223
"""Token limit Strategy enum."""
2324

2425
combined = "combined"
2526
seperate = "seperate"
2627
default = "combined"
2728

29+
def __str__(self):
30+
"""String representation of the token limit strategy."""
31+
return self.value
32+
2833

2934
class _TokenCount(BaseModel):
3035
input: int | float | None = 0
@@ -279,6 +284,13 @@ def stop_tracker(self):
279284

280285
self._console.print(table)
281286

287+
telemetry_client.capture(
288+
TelemetryEvent(
289+
event_type="OnlineRequest",
290+
metadata=asdict(self),
291+
)
292+
)
293+
282294
def __str__(self):
283295
"""String representation of the status tracker."""
284296
return (
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,58 @@
11
import logging
22
import os
3+
import uuid
34
from dataclasses import dataclass
45
from typing import Optional
56

67
import posthog
7-
from bespokelabs.curator.telemetry.events import TelemetryEvent
8+
9+
from bespokelabs.curator.constants import _CURATOR_DEFAULT_CACHE_DIR
810

911
logger = logging.getLogger(__name__)
1012

13+
14+
def _random_distinct_id():
15+
# check if cache
16+
curator_cache_dir = os.environ.get(
17+
"CURATOR_CACHE_DIR",
18+
os.path.expanduser(_CURATOR_DEFAULT_CACHE_DIR),
19+
)
20+
os.makedirs(curator_cache_dir, exist_ok=True)
21+
distinct_id_file = os.path.join(curator_cache_dir, ".distinct_id.txt")
22+
23+
if os.path.exists(distinct_id_file):
24+
with open(distinct_id_file) as f:
25+
random_distinct_id = uuid.UUID(f.read().strip())
26+
else:
27+
random_distinct_id = uuid.uuid4()
28+
with open(distinct_id_file, "w") as f:
29+
f.write(str(random_distinct_id))
30+
31+
return random_distinct_id
32+
33+
1134
@dataclass
1235
class TelemetryEvent:
1336
"""Base class for all telemetry events."""
37+
1438
event_type: str
15-
event_id: str
1639
metadata: dict
40+
distinct_id: str = _random_distinct_id()
1741

1842

1943
@dataclass
2044
class PosthogConfig:
2145
"""Configuration settings for PostHog client."""
46+
2247
api_key: str
2348
enabled: bool = True
2449
debug: bool = False
25-
disable_geoip: bool = False
2650
host: Optional[str] = None
2751

2852

2953
class PosthogClient:
30-
"""
31-
Client for sending telemetry events to PostHog analytics.
32-
54+
"""Client for sending telemetry events to PostHog analytics.
55+
3356
This uses a write-only project API key that can only create new events.
3457
It cannot read events or access other PostHog data, making it safe for public apps.
3558
"""
@@ -40,42 +63,34 @@ def __init__(self, config: PosthogConfig):
4063
self._initialize_client()
4164

4265
def _initialize_client(self) -> None:
43-
"""Set up the PostHog client with configuration settings"""
44-
66+
"""Set up the PostHog client with configuration settings."""
4567
posthog.project_api_key = self.config.api_key
4668
posthog.debug = self.config.debug
47-
posthog.disable_geoip = self.config.disable_geoip
48-
69+
4970
if self.config.host:
5071
posthog.host = self.config.host
5172

5273
def capture(self, event: TelemetryEvent) -> None:
53-
"""
54-
Capture and send a telemetry event to PostHog
55-
74+
"""Capture and send a telemetry event to PostHog.
75+
5676
Args:
5777
event: The telemetry event to capture
5878
"""
5979
if not self.config.enabled:
6080
return
61-
81+
6282
try:
63-
posthog.capture(
64-
distinct_id=event.event_id,
65-
event=event.event_type,
66-
properties=event.properties
67-
)
68-
except Exception as e:
69-
logger.error(f"Failed to capture telemetry event: {e}")
83+
posthog.capture(distinct_id=event.distinct_id, event=event.event_type, properties=event.metadata)
84+
except Exception:
85+
pass
7086

7187

7288
# Initialize the telemetry client with environment-based configuration
7389
config = PosthogConfig(
74-
api_key=os.getenv("POSTHOG_API_KEY"),
90+
api_key="phc_HGGTf1LmtsUnBaVBufgIwRsAwdkvH3cSsDKgW5RnJz8",
7591
enabled=os.getenv("TELEMETRY_ENABLED", "true").lower() in ("true", "1", "t"),
7692
debug=os.getenv("DEBUG_MODE", "false").lower() in ("true", "1", "t"),
77-
disable_geoip=os.getenv("TELEMETRY_ENABLED", "false").lower() in ("true", "1", "t"),
78-
host=os.getenv("POSTHOG_HOST")
93+
host=os.getenv("POSTHOG_HOST"),
7994
)
8095

8196
telemetry_client = PosthogClient(config=config)

0 commit comments

Comments
 (0)