Skip to content

Commit 24149db

Browse files
authored
feat(fortuna): Better metrics tracking for alerting (#2703)
* better metrics tracking for alerting
* better tracking
* pr comments
1 parent a58e20a commit 24149db

File tree

5 files changed

+167
-97
lines changed

5 files changed

+167
-97
lines changed

apps/fortuna/src/command/run.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ pub async fn run(opts: &RunOptions) -> Result<()> {
9999
.collect(),
100100
));
101101
for (chain_id, chain_config) in config.chains.clone() {
102+
keeper_metrics.add_chain(chain_id.clone(), config.provider.address);
102103
let keeper_metrics = keeper_metrics.clone();
103104
let keeper_private_key_option = keeper_private_key_option.clone();
104105
let chains = chains.clone();
@@ -168,7 +169,6 @@ async fn setup_chain_and_run_keeper(
168169
rpc_metrics.clone(),
169170
)
170171
.await?;
171-
keeper_metrics.add_chain(chain_id.clone(), state.provider_address);
172172
chains.write().await.insert(
173173
chain_id.clone(),
174174
ApiBlockChainState::Initialized(state.clone()),

apps/fortuna/src/keeper.rs

Lines changed: 49 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -178,45 +178,56 @@ pub async fn run_keeper_threads(
178178
};
179179

180180
loop {
181-
// There isn't a loop for indefinite trials. There is a new thread being spawned every `TRACK_INTERVAL` seconds.
182-
// If rpc start fails all of these threads will just exit, instead of retrying.
183-
// We are tracking rpc failures elsewhere, so it's fine.
184-
spawn(
185-
track_provider(
186-
chain_id.clone(),
187-
contract.clone(),
188-
provider_address,
189-
keeper_metrics.clone(),
190-
)
191-
.in_current_span(),
192-
);
193-
spawn(
194-
track_balance(
195-
chain_id.clone(),
196-
contract.client(),
197-
keeper_address,
198-
keeper_metrics.clone(),
199-
)
200-
.in_current_span(),
201-
);
202-
spawn(
203-
track_accrued_pyth_fees(
204-
chain_id.clone(),
205-
contract.clone(),
206-
keeper_metrics.clone(),
207-
)
208-
.in_current_span(),
209-
);
210-
spawn(
211-
track_block_timestamp_lag(
212-
chain_id.clone(),
213-
contract.client(),
214-
keeper_metrics.clone(),
215-
)
216-
.in_current_span(),
217-
);
218-
219181
time::sleep(TRACK_INTERVAL).await;
182+
183+
// Track provider info and balance sequentially. Note that the tracking is done sequentially with the
184+
// timestamp last. If there is a persistent error in any of these methods, the timestamp will lag behind
185+
// current time and trigger an alert.
186+
if let Err(e) = track_provider(
187+
chain_id.clone(),
188+
contract.clone(),
189+
provider_address,
190+
keeper_metrics.clone(),
191+
)
192+
.await
193+
{
194+
tracing::error!("Error tracking provider: {:?}", e);
195+
continue;
196+
}
197+
198+
if let Err(e) = track_balance(
199+
chain_id.clone(),
200+
contract.client(),
201+
keeper_address,
202+
keeper_metrics.clone(),
203+
)
204+
.await
205+
{
206+
tracing::error!("Error tracking balance: {:?}", e);
207+
continue;
208+
}
209+
210+
if let Err(e) = track_accrued_pyth_fees(
211+
chain_id.clone(),
212+
contract.clone(),
213+
keeper_metrics.clone(),
214+
)
215+
.await
216+
{
217+
tracing::error!("Error tracking accrued pyth fees: {:?}", e);
218+
continue;
219+
}
220+
221+
if let Err(e) = track_block_timestamp_lag(
222+
chain_id.clone(),
223+
contract.client(),
224+
keeper_metrics.clone(),
225+
)
226+
.await
227+
{
228+
tracing::error!("Error tracking block timestamp lag: {:?}", e);
229+
continue;
230+
}
220231
}
221232
}
222233
.in_current_span(),

apps/fortuna/src/keeper/block.rs

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,16 @@ use {
33
api::{self, BlockchainState},
44
chain::{ethereum::InstrumentedSignablePythContract, reader::BlockNumber},
55
eth_utils::utils::EscalationPolicy,
6-
keeper::keeper_metrics::KeeperMetrics,
6+
keeper::keeper_metrics::{ChainIdLabel, KeeperMetrics},
77
keeper::process_event::process_event_with_backoff,
88
},
99
anyhow::Result,
1010
ethers::types::U256,
11-
std::{collections::HashSet, sync::Arc},
11+
std::{
12+
collections::HashSet,
13+
sync::Arc,
14+
time::{SystemTime, UNIX_EPOCH},
15+
},
1216
tokio::{
1317
spawn,
1418
sync::{mpsc, RwLock},
@@ -115,6 +119,10 @@ pub async fn process_single_block_batch(
115119
metrics: Arc<KeeperMetrics>,
116120
fulfilled_requests_cache: Arc<RwLock<HashSet<u64>>>,
117121
) {
122+
let label = ChainIdLabel {
123+
chain_id: chain_state.id.clone(),
124+
};
125+
118126
loop {
119127
let events_res = chain_state
120128
.contract
@@ -125,6 +133,31 @@ pub async fn process_single_block_batch(
125133
)
126134
.await;
127135

136+
// Only update metrics if we successfully retrieved events.
137+
if events_res.is_ok() {
138+
// Track the last time blocks were processed. If anything happens to the processing thread, the
139+
// timestamp will lag, which will trigger an alert.
140+
let server_timestamp = SystemTime::now()
141+
.duration_since(UNIX_EPOCH)
142+
.map(|duration| duration.as_secs() as i64)
143+
.unwrap_or(0);
144+
metrics
145+
.process_event_timestamp
146+
.get_or_create(&label)
147+
.set(server_timestamp);
148+
149+
let current_block = metrics
150+
.process_event_block_number
151+
.get_or_create(&label)
152+
.get();
153+
if block_range.to > current_block as u64 {
154+
metrics
155+
.process_event_block_number
156+
.get_or_create(&label)
157+
.set(block_range.to as i64);
158+
}
159+
}
160+
128161
match events_res {
129162
Ok(events) => {
130163
tracing::info!(num_of_events = &events.len(), "Processing",);

apps/fortuna/src/keeper/keeper_metrics.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@ pub struct KeeperMetrics {
4444
pub gas_price_estimate: Family<AccountLabel, Gauge<f64, AtomicU64>>,
4545
pub accrued_pyth_fees: Family<ChainIdLabel, Gauge<f64, AtomicU64>>,
4646
pub block_timestamp_lag: Family<ChainIdLabel, Gauge>,
47+
pub latest_block_timestamp: Family<ChainIdLabel, Gauge>,
48+
pub process_event_timestamp: Family<ChainIdLabel, Gauge>,
49+
pub latest_block_number: Family<ChainIdLabel, Gauge>,
50+
pub process_event_block_number: Family<ChainIdLabel, Gauge>,
4751
}
4852

4953
impl Default for KeeperMetrics {
@@ -87,6 +91,10 @@ impl Default for KeeperMetrics {
8791
gas_price_estimate: Family::default(),
8892
accrued_pyth_fees: Family::default(),
8993
block_timestamp_lag: Family::default(),
94+
latest_block_timestamp: Family::default(),
95+
process_event_timestamp: Family::default(),
96+
latest_block_number: Family::default(),
97+
process_event_block_number: Family::default(),
9098
}
9199
}
92100
}
@@ -228,6 +236,30 @@ impl KeeperMetrics {
228236
keeper_metrics.block_timestamp_lag.clone(),
229237
);
230238

239+
writable_registry.register(
240+
"latest_block_timestamp",
241+
"The current block timestamp",
242+
keeper_metrics.latest_block_timestamp.clone(),
243+
);
244+
245+
writable_registry.register(
246+
"process_event_timestamp",
247+
"Timestamp of the last time the keeper updated the events",
248+
keeper_metrics.process_event_timestamp.clone(),
249+
);
250+
251+
writable_registry.register(
252+
"latest_block_number",
253+
"The current block number",
254+
keeper_metrics.latest_block_number.clone(),
255+
);
256+
257+
writable_registry.register(
258+
"process_event_block_number",
259+
"The highest block number for which events have been successfully retrieved and processed",
260+
keeper_metrics.process_event_block_number.clone(),
261+
);
262+
231263
// *Important*: When adding a new metric:
232264
// 1. Register it above using `writable_registry.register(...)`
233265
// 2. Add a get_or_create call in the add_chain function below to initialize it for each chain/provider pair
@@ -241,6 +273,12 @@ impl KeeperMetrics {
241273
};
242274
let _ = self.accrued_pyth_fees.get_or_create(&chain_id_label);
243275
let _ = self.block_timestamp_lag.get_or_create(&chain_id_label);
276+
let _ = self.latest_block_timestamp.get_or_create(&chain_id_label);
277+
let _ = self.process_event_timestamp.get_or_create(&chain_id_label);
278+
let _ = self.latest_block_number.get_or_create(&chain_id_label);
279+
let _ = self
280+
.process_event_block_number
281+
.get_or_create(&chain_id_label);
244282

245283
let account_label = AccountLabel {
246284
chain_id,

0 commit comments

Comments
 (0)