Skip to content

Commit a89e698

Browse files
committed
feat: congestion metrics
1 parent 60d5310 commit a89e698

File tree

11 files changed

+419
-41
lines changed

11 files changed

+419
-41
lines changed

Cargo.lock

Lines changed: 3 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,6 @@ unexpected_cfgs = { level = "warn", check-cfg = ["cfg(iroh_docsrs)", "cfg(iroh_l
4040

4141
[workspace.lints.clippy]
4242
unused-async = "warn"
43+
44+
[patch.crates-io]
45+
iroh-metrics = { git = "https://github.com/n0-computer/iroh-metrics", branch = "arqu/histograms" }

iroh-dns-server/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ hickory-server = { version = "0.25.1", features = ["https-ring"] }
2828
http = "1.0.0"
2929
humantime = "2.2.0"
3030
humantime-serde = "1.1.1"
31-
iroh-metrics = { version = "0.35", features = ["service"] }
31+
iroh-metrics = { git = "https://github.com/n0-computer/iroh-metrics", branch = "arqu/histograms", features = ["service"] }
3232
lru = "0.13"
3333
n0-future = "0.1.2"
3434
n0-snafu = "0.2.2"

iroh-relay/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ http-body-util = "0.1.0"
3232
hyper = { version = "1", features = ["server", "client", "http1"] }
3333
hyper-util = "0.1.1"
3434
iroh-base = { version = "0.92.0", path = "../iroh-base", default-features = false, features = ["key", "relay"] }
35-
iroh-metrics = { version = "0.35", default-features = false }
35+
iroh-metrics = { git = "https://github.com/n0-computer/iroh-metrics", branch = "arqu/histograms", default-features = false }
3636
n0-future = "0.1.2"
3737
num_enum = "0.7"
3838
pin-project = "1"

iroh/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ futures-buffered = "0.2.11"
8585
spki = { version = "0.7.3", features = ["std"] }
8686

8787
# metrics
88-
iroh-metrics = { version = "0.35", default-features = false }
88+
iroh-metrics = { git = "https://github.com/n0-computer/iroh-metrics", branch = "arqu/histograms", default-features = false }
8989

9090
# local-swarm-discovery
9191
swarm-discovery = { version = "0.4", optional = true }

iroh/src/magicsock.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -795,7 +795,8 @@ impl MagicSock {
795795
}
796796
disco::Message::Pong(pong) => {
797797
self.metrics.magicsock.recv_disco_pong.inc();
798-
self.node_map.handle_pong(sender, src, pong);
798+
self.node_map
799+
.handle_pong(sender, src, pong, &self.metrics.magicsock);
799800
}
800801
disco::Message::CallMeMaybe(cm) => {
801802
self.metrics.magicsock.recv_disco_call_me_maybe.inc();
@@ -2063,7 +2064,9 @@ impl Actor {
20632064
async fn handle_actor_message(&mut self, msg: ActorMessage) {
20642065
match msg {
20652066
ActorMessage::EndpointPingExpired(id, txid) => {
2066-
self.msock.node_map.notify_ping_timeout(id, txid);
2067+
self.msock
2068+
.node_map
2069+
.notify_ping_timeout(id, txid, &self.msock.metrics.magicsock);
20672070
}
20682071
ActorMessage::NetworkChange => {
20692072
self.network_monitor.network_change().await.ok();
@@ -2255,7 +2258,9 @@ impl Actor {
22552258
/// This is called when connectivity changes enough that we no longer trust the old routes.
22562259
#[instrument(skip_all)]
22572260
fn reset_endpoint_states(&mut self) {
2258-
self.msock.node_map.reset_node_states()
2261+
self.msock
2262+
.node_map
2263+
.reset_node_states(&self.msock.metrics.magicsock)
22592264
}
22602265
}
22612266

iroh/src/magicsock/metrics.rs

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
use iroh_metrics::{Counter, MetricsGroup};
1+
use iroh_metrics::{Counter, Histogram, MetricsGroup};
22
use serde::{Deserialize, Serialize};
33

44
/// Enum of metrics for the module
55
// TODO(frando): Add description doc strings for each metric.
66
#[allow(missing_docs)]
7-
#[derive(Debug, Default, Serialize, Deserialize, MetricsGroup)]
7+
#[derive(Debug, Serialize, Deserialize, MetricsGroup)]
88
#[non_exhaustive]
99
#[metrics(name = "magicsock")]
1010
pub struct Metrics {
@@ -77,4 +77,88 @@ pub struct Metrics {
7777
pub connection_handshake_success: Counter,
7878
/// Number of connections with a successful handshake that became direct.
7979
pub connection_became_direct: Counter,
80+
81+
/*
82+
* Path Congestion Metrics
83+
*/
84+
/// Number of times a path was marked as outdated due to consecutive ping failures.
85+
pub path_marked_outdated: Counter,
86+
/// Number of ping failures recorded across all paths.
87+
pub path_ping_failures: Counter,
88+
/// Number of consecutive failure resets (path recovered).
89+
pub path_failure_resets: Counter,
90+
/// Histogram of packet loss rates (0.0-1.0) observed on UDP paths.
91+
pub path_packet_loss_rate: Histogram,
92+
/// Histogram of RTT variance (in milliseconds) as a congestion indicator.
93+
pub path_rtt_variance_ms: Histogram,
94+
/// Histogram of path quality scores (0.0-1.0).
95+
pub path_quality_score: Histogram,
96+
}
97+
98+
impl Default for Metrics {
99+
fn default() -> Self {
100+
Self {
101+
update_direct_addrs: Counter::default(),
102+
send_ipv4: Counter::default(),
103+
send_ipv6: Counter::default(),
104+
send_relay: Counter::default(),
105+
send_relay_error: Counter::default(),
106+
send_data: Counter::default(),
107+
send_data_network_down: Counter::default(),
108+
recv_data_relay: Counter::default(),
109+
recv_data_ipv4: Counter::default(),
110+
recv_data_ipv6: Counter::default(),
111+
recv_datagrams: Counter::default(),
112+
recv_gro_datagrams: Counter::default(),
113+
send_disco_udp: Counter::default(),
114+
send_disco_relay: Counter::default(),
115+
sent_disco_udp: Counter::default(),
116+
sent_disco_relay: Counter::default(),
117+
sent_disco_ping: Counter::default(),
118+
sent_disco_pong: Counter::default(),
119+
sent_disco_call_me_maybe: Counter::default(),
120+
recv_disco_bad_key: Counter::default(),
121+
recv_disco_bad_parse: Counter::default(),
122+
recv_disco_udp: Counter::default(),
123+
recv_disco_relay: Counter::default(),
124+
recv_disco_ping: Counter::default(),
125+
recv_disco_pong: Counter::default(),
126+
recv_disco_call_me_maybe: Counter::default(),
127+
recv_disco_call_me_maybe_bad_disco: Counter::default(),
128+
relay_home_change: Counter::default(),
129+
num_direct_conns_added: Counter::default(),
130+
num_direct_conns_removed: Counter::default(),
131+
num_relay_conns_added: Counter::default(),
132+
num_relay_conns_removed: Counter::default(),
133+
actor_tick_main: Counter::default(),
134+
actor_tick_msg: Counter::default(),
135+
actor_tick_re_stun: Counter::default(),
136+
actor_tick_portmap_changed: Counter::default(),
137+
actor_tick_direct_addr_heartbeat: Counter::default(),
138+
actor_link_change: Counter::default(),
139+
actor_tick_other: Counter::default(),
140+
nodes_contacted: Counter::default(),
141+
nodes_contacted_directly: Counter::default(),
142+
connection_handshake_success: Counter::default(),
143+
connection_became_direct: Counter::default(),
144+
path_marked_outdated: Counter::default(),
145+
path_ping_failures: Counter::default(),
146+
path_failure_resets: Counter::default(),
147+
path_packet_loss_rate: packet_loss_buckets(),
148+
path_rtt_variance_ms: rtt_variance_buckets(),
149+
path_quality_score: quality_score_buckets(),
150+
}
151+
}
152+
}
153+
154+
fn packet_loss_buckets() -> Histogram {
155+
Histogram::new(vec![0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0])
156+
}
157+
158+
fn rtt_variance_buckets() -> Histogram {
159+
Histogram::new(vec![0.0, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0])
160+
}
161+
162+
fn quality_score_buckets() -> Histogram {
163+
Histogram::new(vec![0.0, 0.3, 0.5, 0.7, 0.85, 0.95, 1.0])
80164
}

iroh/src/magicsock/node_map.rs

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -192,14 +192,19 @@ impl NodeMap {
192192
}
193193
}
194194

195-
pub(super) fn notify_ping_timeout(&self, id: usize, tx_id: stun_rs::TransactionId) {
195+
pub(super) fn notify_ping_timeout(
196+
&self,
197+
id: usize,
198+
tx_id: stun_rs::TransactionId,
199+
metrics: &Metrics,
200+
) {
196201
if let Some(ep) = self
197202
.inner
198203
.lock()
199204
.expect("poisoned")
200205
.get_mut(NodeStateKey::Idx(id))
201206
{
202-
ep.ping_timeout(tx_id, Instant::now());
207+
ep.ping_timeout(tx_id, Instant::now(), metrics);
203208
}
204209
}
205210

@@ -228,11 +233,17 @@ impl NodeMap {
228233
.handle_ping(sender, src, tx_id)
229234
}
230235

231-
pub(super) fn handle_pong(&self, sender: PublicKey, src: &transports::Addr, pong: Pong) {
236+
pub(super) fn handle_pong(
237+
&self,
238+
sender: PublicKey,
239+
src: &transports::Addr,
240+
pong: Pong,
241+
metrics: &Metrics,
242+
) {
232243
self.inner
233244
.lock()
234245
.expect("poisoned")
235-
.handle_pong(sender, src, pong)
246+
.handle_pong(sender, src, pong, metrics)
236247
}
237248

238249
#[must_use = "actions must be handled"]
@@ -268,11 +279,11 @@ impl NodeMap {
268279
Some((public_key, udp_addr, relay_url, ping_actions))
269280
}
270281

271-
pub(super) fn reset_node_states(&self) {
282+
pub(super) fn reset_node_states(&self, metrics: &Metrics) {
272283
let now = Instant::now();
273284
let mut inner = self.inner.lock().expect("poisoned");
274285
for (_, ep) in inner.node_states_mut() {
275-
ep.note_connectivity_change(now);
286+
ep.note_connectivity_change(now, metrics);
276287
}
277288
}
278289

@@ -506,9 +517,15 @@ impl NodeMapInner {
506517
.map(|ep| ep.conn_type())
507518
}
508519

509-
fn handle_pong(&mut self, sender: NodeId, src: &transports::Addr, pong: Pong) {
520+
fn handle_pong(
521+
&mut self,
522+
sender: NodeId,
523+
src: &transports::Addr,
524+
pong: Pong,
525+
metrics: &Metrics,
526+
) {
510527
if let Some(ns) = self.get_mut(NodeStateKey::NodeId(sender)).as_mut() {
511-
let insert = ns.handle_pong(&pong, src.clone().into());
528+
let insert = ns.handle_pong(&pong, src.clone().into(), metrics);
512529
if let Some((src, key)) = insert {
513530
self.set_node_key_for_ip_port(src, &key);
514531
}
@@ -541,7 +558,7 @@ impl NodeMapInner {
541558
Some(ns) => {
542559
debug!(endpoints = ?cm.my_numbers, "received call-me-maybe");
543560

544-
ns.handle_call_me_maybe(cm)
561+
ns.handle_call_me_maybe(cm, metrics)
545562
}
546563
}
547564
}

0 commit comments

Comments
 (0)