From 4aabb7aa4832b2a77f9759b0204d0429cfff61df Mon Sep 17 00:00:00 2001 From: Sudan Landge Date: Wed, 11 Oct 2023 14:03:32 +0000 Subject: [PATCH 01/12] feat(metrics)(net): refactor and add per net device metrics Emit aggregate and per device metrics for network devices: - Move NetDeviceMetrics from logger to Net module. - Use NET_DEV_METRICS_PVT instead of adding an entry of NetDeviceMetrics in Net so that metrics can be flushed even from signal handlers. Note: this is part 1 of 12 commits to add per net device metrics. Signed-off-by: Sudan Landge --- src/vmm/src/devices/virtio/net/metrics.rs | 285 ++++++++++++++++++++++ src/vmm/src/devices/virtio/net/mod.rs | 1 + src/vmm/src/logger/metrics.rs | 127 +++------- 3 files changed, 318 insertions(+), 95 deletions(-) create mode 100644 src/vmm/src/devices/virtio/net/metrics.rs diff --git a/src/vmm/src/devices/virtio/net/metrics.rs b/src/vmm/src/devices/virtio/net/metrics.rs new file mode 100644 index 00000000000..11df1e223fe --- /dev/null +++ b/src/vmm/src/devices/virtio/net/metrics.rs @@ -0,0 +1,285 @@ +// Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Defines the metrics system for Network devices. +//! +//! # Metrics format +//! The metrics are flushed in JSON when requested by vmm::logger::metrics::METRICS.write(). +//! +//! ## JSON example with metrics: +//! ```json +//! { +//! "net": { +//! "activate_fails": "SharedIncMetric", +//! "cfg_fails": "SharedIncMetric", +//! "mac_address_updates": "SharedIncMetric", +//! "no_rx_avail_buffer": "SharedIncMetric", +//! "no_tx_avail_buffer": "SharedIncMetric", +//! ... +//! } +//! "net0": { +//! "activate_fails": "SharedIncMetric", +//! "cfg_fails": "SharedIncMetric", +//! "mac_address_updates": "SharedIncMetric", +//! "no_rx_avail_buffer": "SharedIncMetric", +//! "no_tx_avail_buffer": "SharedIncMetric", +//! ... +//! } +//! "net1": { +//! "activate_fails": "SharedIncMetric", +//! "cfg_fails": "SharedIncMetric", +//! "mac_address_updates": "SharedIncMetric", +//! "no_rx_avail_buffer": "SharedIncMetric", +//! "no_tx_avail_buffer": "SharedIncMetric", +//! ... +//! } +//! ... +//! "netN": { +//! "activate_fails": "SharedIncMetric", +//! "cfg_fails": "SharedIncMetric", +//! "mac_address_updates": "SharedIncMetric", +//! "no_rx_avail_buffer": "SharedIncMetric", +//! "no_tx_avail_buffer": "SharedIncMetric", +//! ... +//! } +//! } +//! ``` +//! Each `net` field in the example above is a serializable `NetDeviceMetrics` structure +//! collecting metrics such as `activate_fails`, `cfg_fails`, etc. for the network device. +//! `net0`, `net1` and `netN` in the above example represent metrics 0th, 1st and 'N'th +//! network device respectively and `net` is the aggregate of all the per device metrics. +//! +//! # Limitations +//! Network device currently do not have `vmm::logger::metrics::StoreMetrics` so aggregate +//! doesn't consider them. +//! +//! # Design +//! The main design goals of this system are: +//! * To improve network device metrics by logging them at per device granularity. +//! * Continue to provide aggregate net metrics to maintain backward compatibility. +//! * Move NetDeviceMetrics out of from logger and decouple it. +//! * Use lockless operations, preferably ones that don't require anything other than simple +//! reads/writes being atomic. +//! * Rely on `serde` to provide the actual serialization for writing the metrics. +//! * Since all metrics start at 0, we implement the `Default` trait via derive for all of them, to +//! avoid having to initialize everything by hand. +//! +//! The system implements 1 types of metrics: +//! * Shared Incremental Metrics (SharedIncMetrics) - dedicated for the metrics which need a counter +//! (i.e the number of times an API request failed). These metrics are reset upon flush. +//! We use NET_DEV_METRICS_PVT instead of adding an entry of NetDeviceMetrics +//! in Net so that metrics are accessible to be flushed even from signal handlers. + +use serde::ser::SerializeMap; +use serde::{Serialize, Serializer}; + +use crate::logger::{IncMetric, SharedIncMetric}; + +#[derive(Debug, Default)] +pub struct NetDeviceMetricsIndex(usize); + +impl NetDeviceMetricsIndex { + pub fn get(&self) -> &NetDeviceMetrics { + // SAFETY: This is called by individual Net devices + // on self (i.e. self.metrics.get()) so no race condition + // and it is safe. + unsafe { &NET_DEV_METRICS_PVT.metrics[self.0] } + } +} + +/// provides instance for net metrics +#[derive(Debug, Default)] +pub struct NetDeviceMetricsPool { + /// used to access per net device metrics + metrics: Vec, +} + +impl NetDeviceMetricsPool { + /// default construction + pub fn get() -> NetDeviceMetricsIndex { + // SAFETY: This is only called during creation of a Net device + // and since each device is created sequentially so there is no + // case of race condition. + unsafe { + NET_DEV_METRICS_PVT.metrics.push(NetDeviceMetrics::new()); + NetDeviceMetricsIndex(NET_DEV_METRICS_PVT.metrics.len() - 1) + } + } +} + +/// Contains Network-related metrics per device. +static mut NET_DEV_METRICS_PVT: NetDeviceMetricsPool = NetDeviceMetricsPool { + metrics: Vec::new(), +}; + +pub fn flush_metrics(serializer: S) -> Result { + // SAFETY: This is called from vmm::logger::metrics::METRICS.write which + // has a lock and takes care of race condition. + unsafe { + // +1 to accomodate aggregate net metrics + let mut seq = serializer.serialize_map(Some(1 + NET_DEV_METRICS_PVT.metrics.len()))?; + + let net_aggregated: NetDeviceMetrics = NET_DEV_METRICS_PVT.metrics.iter().fold( + NetDeviceMetrics::default(), + |mut net_agg, net| { + net_agg.aggregate(net); + net_agg + }, + ); + + seq.serialize_entry("net", &net_aggregated)?; + + for (i, metrics) in NET_DEV_METRICS_PVT.metrics.iter().enumerate() { + let devn = format!("net{}", i); + // let ser_metrics = NetDeviceMetrics::default().aggregate(&metrics); + seq.serialize_entry(&devn, &metrics)?; + } + seq.end() + } +} + +/// Network-related metrics. +// #[derive(Debug, Default)] +#[derive(Debug, Default, Serialize)] +pub struct NetDeviceMetrics { + /// Number of times when activate failed on a network device. + pub activate_fails: SharedIncMetric, + /// Number of times when interacting with the space config of a network device failed. + pub cfg_fails: SharedIncMetric, + /// Number of times the mac address was updated through the config space. + pub mac_address_updates: SharedIncMetric, + /// No available buffer for the net device rx queue. + pub no_rx_avail_buffer: SharedIncMetric, + /// No available buffer for the net device tx queue. + pub no_tx_avail_buffer: SharedIncMetric, + /// Number of times when handling events on a network device failed. + pub event_fails: SharedIncMetric, + /// Number of events associated with the receiving queue. + pub rx_queue_event_count: SharedIncMetric, + /// Number of events associated with the rate limiter installed on the receiving path. + pub rx_event_rate_limiter_count: SharedIncMetric, + /// Number of RX partial writes to guest. + pub rx_partial_writes: SharedIncMetric, + /// Number of RX rate limiter throttling events. + pub rx_rate_limiter_throttled: SharedIncMetric, + /// Number of events received on the associated tap. + pub rx_tap_event_count: SharedIncMetric, + /// Number of bytes received. + pub rx_bytes_count: SharedIncMetric, + /// Number of packets received. + pub rx_packets_count: SharedIncMetric, + /// Number of errors while receiving data. + pub rx_fails: SharedIncMetric, + /// Number of successful read operations while receiving data. + pub rx_count: SharedIncMetric, + /// Number of times reading from TAP failed. + pub tap_read_fails: SharedIncMetric, + /// Number of times writing to TAP failed. + pub tap_write_fails: SharedIncMetric, + /// Number of transmitted bytes. + pub tx_bytes_count: SharedIncMetric, + /// Number of malformed TX frames. + pub tx_malformed_frames: SharedIncMetric, + /// Number of errors while transmitting data. + pub tx_fails: SharedIncMetric, + /// Number of successful write operations while transmitting data. + pub tx_count: SharedIncMetric, + /// Number of transmitted packets. + pub tx_packets_count: SharedIncMetric, + /// Number of TX partial reads from guest. + pub tx_partial_reads: SharedIncMetric, + /// Number of events associated with the transmitting queue. + pub tx_queue_event_count: SharedIncMetric, + /// Number of events associated with the rate limiter installed on the transmitting path. + pub tx_rate_limiter_event_count: SharedIncMetric, + /// Number of RX rate limiter throttling events. + pub tx_rate_limiter_throttled: SharedIncMetric, + /// Number of packets with a spoofed mac, sent by the guest. + pub tx_spoofed_mac_count: SharedIncMetric, +} + +impl NetDeviceMetrics { + /// Const default construction. + pub const fn new() -> Self { + Self { + activate_fails: SharedIncMetric::new(), + cfg_fails: SharedIncMetric::new(), + mac_address_updates: SharedIncMetric::new(), + no_rx_avail_buffer: SharedIncMetric::new(), + no_tx_avail_buffer: SharedIncMetric::new(), + event_fails: SharedIncMetric::new(), + rx_queue_event_count: SharedIncMetric::new(), + rx_event_rate_limiter_count: SharedIncMetric::new(), + rx_partial_writes: SharedIncMetric::new(), + rx_rate_limiter_throttled: SharedIncMetric::new(), + rx_tap_event_count: SharedIncMetric::new(), + rx_bytes_count: SharedIncMetric::new(), + rx_packets_count: SharedIncMetric::new(), + rx_fails: SharedIncMetric::new(), + rx_count: SharedIncMetric::new(), + tap_read_fails: SharedIncMetric::new(), + tap_write_fails: SharedIncMetric::new(), + tx_bytes_count: SharedIncMetric::new(), + tx_malformed_frames: SharedIncMetric::new(), + tx_fails: SharedIncMetric::new(), + tx_count: SharedIncMetric::new(), + tx_packets_count: SharedIncMetric::new(), + tx_partial_reads: SharedIncMetric::new(), + tx_queue_event_count: SharedIncMetric::new(), + tx_rate_limiter_event_count: SharedIncMetric::new(), + tx_rate_limiter_throttled: SharedIncMetric::new(), + tx_spoofed_mac_count: SharedIncMetric::new(), + } + } + + /// Net metrics are SharedIncMetric where the diff of current vs + /// old is serialized i.e. serialize_u64(current-old). + /// So to have the aggregate serialized in same way we need to + /// fetch the diff of current vs old metrics and add it to the + /// aggregate. + pub fn aggregate(&mut self, other: &Self) { + self.activate_fails.add(other.activate_fails.fetch_diff()); + self.cfg_fails.add(other.cfg_fails.fetch_diff()); + self.mac_address_updates + .add(other.mac_address_updates.fetch_diff()); + self.no_rx_avail_buffer + .add(other.no_rx_avail_buffer.fetch_diff()); + self.no_tx_avail_buffer + .add(other.no_tx_avail_buffer.fetch_diff()); + self.event_fails.add(other.event_fails.fetch_diff()); + self.rx_queue_event_count + .add(other.rx_queue_event_count.fetch_diff()); + self.rx_event_rate_limiter_count + .add(other.rx_event_rate_limiter_count.fetch_diff()); + self.rx_partial_writes + .add(other.rx_partial_writes.fetch_diff()); + self.rx_rate_limiter_throttled + .add(other.rx_rate_limiter_throttled.fetch_diff()); + self.rx_tap_event_count + .add(other.rx_tap_event_count.fetch_diff()); + self.rx_bytes_count.add(other.rx_bytes_count.fetch_diff()); + self.rx_packets_count + .add(other.rx_packets_count.fetch_diff()); + self.rx_fails.add(other.rx_fails.fetch_diff()); + self.rx_count.add(other.rx_count.fetch_diff()); + self.tap_read_fails.add(other.tap_read_fails.fetch_diff()); + self.tap_write_fails.add(other.tap_write_fails.fetch_diff()); + self.tx_bytes_count.add(other.tx_bytes_count.fetch_diff()); + self.tx_malformed_frames + .add(other.tx_malformed_frames.fetch_diff()); + self.tx_fails.add(other.tx_fails.fetch_diff()); + self.tx_count.add(other.tx_count.fetch_diff()); + self.tx_packets_count + .add(other.tx_packets_count.fetch_diff()); + self.tx_partial_reads + .add(other.tx_partial_reads.fetch_diff()); + self.tx_queue_event_count + .add(other.tx_queue_event_count.fetch_diff()); + self.tx_rate_limiter_event_count + .add(other.tx_rate_limiter_event_count.fetch_diff()); + self.tx_rate_limiter_throttled + .add(other.tx_rate_limiter_throttled.fetch_diff()); + self.tx_spoofed_mac_count + .add(other.tx_spoofed_mac_count.fetch_diff()); + } +} diff --git a/src/vmm/src/devices/virtio/net/mod.rs b/src/vmm/src/devices/virtio/net/mod.rs index c7ea15f296a..ce96283e1a4 100644 --- a/src/vmm/src/devices/virtio/net/mod.rs +++ b/src/vmm/src/devices/virtio/net/mod.rs @@ -19,6 +19,7 @@ pub const TX_INDEX: usize = 1; pub mod device; mod event_handler; +pub mod metrics; pub mod persist; mod tap; pub mod test_utils; diff --git a/src/vmm/src/logger/metrics.rs b/src/vmm/src/logger/metrics.rs index 17b1a81ae45..56be93cedca 100644 --- a/src/vmm/src/logger/metrics.rs +++ b/src/vmm/src/logger/metrics.rs @@ -72,6 +72,7 @@ use serde::{Serialize, Serializer}; use vm_superio::rtc_pl031::RtcEvents; use super::FcLineWriter; +use crate::devices::virtio::net::metrics as net_metrics; #[cfg(target_arch = "aarch64")] use crate::warn; @@ -201,6 +202,10 @@ pub trait IncMetric { } /// Returns current value of the counter. fn count(&self) -> u64; + + /// Returns diff of current and old value of the counter. + /// Mostly used in process of aggregating per device metrics. + fn fetch_diff(&self) -> u64; } /// Used for defining new types of metrics that do not need a counter and act as a persistent @@ -254,6 +259,9 @@ impl IncMetric for SharedIncMetric { fn count(&self) -> u64 { self.0.load(Ordering::Relaxed) } + fn fetch_diff(&self) -> u64 { + self.0.load(Ordering::Relaxed) - self.1.load(Ordering::Relaxed) + } } impl StoreMetric for SharedStoreMetric { @@ -706,99 +714,6 @@ impl MmdsMetrics { } } -/// Network-related metrics. -#[derive(Debug, Default, Serialize)] -pub struct NetDeviceMetrics { - /// Number of times when activate failed on a network device. - pub activate_fails: SharedIncMetric, - /// Number of times when interacting with the space config of a network device failed. - pub cfg_fails: SharedIncMetric, - //// Number of times the mac address was updated through the config space. - pub mac_address_updates: SharedIncMetric, - /// No available buffer for the net device rx queue. - pub no_rx_avail_buffer: SharedIncMetric, - /// No available buffer for the net device tx queue. - pub no_tx_avail_buffer: SharedIncMetric, - /// Number of times when handling events on a network device failed. - pub event_fails: SharedIncMetric, - /// Number of events associated with the receiving queue. - pub rx_queue_event_count: SharedIncMetric, - /// Number of events associated with the rate limiter installed on the receiving path. - pub rx_event_rate_limiter_count: SharedIncMetric, - /// Number of RX partial writes to guest. - pub rx_partial_writes: SharedIncMetric, - /// Number of RX rate limiter throttling events. - pub rx_rate_limiter_throttled: SharedIncMetric, - /// Number of events received on the associated tap. - pub rx_tap_event_count: SharedIncMetric, - /// Number of bytes received. - pub rx_bytes_count: SharedIncMetric, - /// Number of packets received. - pub rx_packets_count: SharedIncMetric, - /// Number of errors while receiving data. - pub rx_fails: SharedIncMetric, - /// Number of successful read operations while receiving data. - pub rx_count: SharedIncMetric, - /// Number of times reading from TAP failed. - pub tap_read_fails: SharedIncMetric, - /// Number of times writing to TAP failed. - pub tap_write_fails: SharedIncMetric, - /// Number of transmitted bytes. - pub tx_bytes_count: SharedIncMetric, - /// Number of malformed TX frames. - pub tx_malformed_frames: SharedIncMetric, - /// Number of errors while transmitting data. - pub tx_fails: SharedIncMetric, - /// Number of successful write operations while transmitting data. - pub tx_count: SharedIncMetric, - /// Number of transmitted packets. - pub tx_packets_count: SharedIncMetric, - /// Number of TX partial reads from guest. - pub tx_partial_reads: SharedIncMetric, - /// Number of events associated with the transmitting queue. - pub tx_queue_event_count: SharedIncMetric, - /// Number of events associated with the rate limiter installed on the transmitting path. - pub tx_rate_limiter_event_count: SharedIncMetric, - /// Number of RX rate limiter throttling events. - pub tx_rate_limiter_throttled: SharedIncMetric, - /// Number of packets with a spoofed mac, sent by the guest. - pub tx_spoofed_mac_count: SharedIncMetric, -} -impl NetDeviceMetrics { - /// Const default construction. - pub const fn new() -> Self { - Self { - activate_fails: SharedIncMetric::new(), - cfg_fails: SharedIncMetric::new(), - mac_address_updates: SharedIncMetric::new(), - no_rx_avail_buffer: SharedIncMetric::new(), - no_tx_avail_buffer: SharedIncMetric::new(), - event_fails: SharedIncMetric::new(), - rx_queue_event_count: SharedIncMetric::new(), - rx_event_rate_limiter_count: SharedIncMetric::new(), - rx_partial_writes: SharedIncMetric::new(), - rx_rate_limiter_throttled: SharedIncMetric::new(), - rx_tap_event_count: SharedIncMetric::new(), - rx_bytes_count: SharedIncMetric::new(), - rx_packets_count: SharedIncMetric::new(), - rx_fails: SharedIncMetric::new(), - rx_count: SharedIncMetric::new(), - tap_read_fails: SharedIncMetric::new(), - tap_write_fails: SharedIncMetric::new(), - tx_bytes_count: SharedIncMetric::new(), - tx_malformed_frames: SharedIncMetric::new(), - tx_fails: SharedIncMetric::new(), - tx_count: SharedIncMetric::new(), - tx_packets_count: SharedIncMetric::new(), - tx_partial_reads: SharedIncMetric::new(), - tx_queue_event_count: SharedIncMetric::new(), - tx_rate_limiter_event_count: SharedIncMetric::new(), - tx_rate_limiter_throttled: SharedIncMetric::new(), - tx_spoofed_mac_count: SharedIncMetric::new(), - } - } -} - /// Performance metrics related for the moment only to snapshots. // These store the duration of creating/loading a snapshot and of // pausing/resuming the microVM. @@ -1146,6 +1061,27 @@ impl Serialize for SerializeToUtcTimestampMs { } } +#[derive(Default, Debug)] +// By using the below structure in FirecrackerMetrics it is easy +// to serialise Firecracker app_metrics as a signle json object which +// otherwise would have required extra string manipulation to pack +// net as part of the same json object as FirecrackerMetrics. +pub struct NetMetricsSerializer {} +impl NetMetricsSerializer { + pub const fn new() -> Self { + Self {} + } +} + +impl Serialize for NetMetricsSerializer { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + net_metrics::flush_metrics(serializer) + } +} + /// Structure storing all metrics while enforcing serialization support on them. #[derive(Debug, Default, Serialize)] pub struct FirecrackerMetrics { @@ -1168,8 +1104,9 @@ pub struct FirecrackerMetrics { pub logger: LoggerSystemMetrics, /// Metrics specific to MMDS functionality. pub mmds: MmdsMetrics, + #[serde(flatten)] /// A network device's related metrics. - pub net: NetDeviceMetrics, + pub net_ser: NetMetricsSerializer, /// Metrics related to API PATCH requests. pub patch_api_requests: PatchRequestsMetrics, /// Metrics related to API PUT requests. @@ -1206,7 +1143,7 @@ impl FirecrackerMetrics { latencies_us: PerformanceMetrics::new(), logger: LoggerSystemMetrics::new(), mmds: MmdsMetrics::new(), - net: NetDeviceMetrics::new(), + net_ser: NetMetricsSerializer::new(), patch_api_requests: PatchRequestsMetrics::new(), put_api_requests: PutRequestsMetrics::new(), #[cfg(target_arch = "aarch64")] From b370ce3cb0d0d532ec58c1f621d35383545a258c Mon Sep 17 00:00:00 2001 From: Sudan Landge Date: Wed, 11 Oct 2023 14:29:57 +0000 Subject: [PATCH 02/12] feat(metrics)(net): per device net metrics for report_net_event_fail Use individual instance of metrics instead of the METRICS.net. Since we do not use METRIC.net, modify report_net_event_fail to use self.metrics. Note: this is part 2 of 12 commits to add per net device metrics. Signed-off-by: Sudan Landge --- src/vmm/src/devices/mod.rs | 7 +- src/vmm/src/devices/virtio/net/device.rs | 181 ++++++++++-------- .../src/devices/virtio/net/event_handler.rs | 4 +- src/vmm/src/devices/virtio/net/test_utils.rs | 6 +- 4 files changed, 110 insertions(+), 88 deletions(-) diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs index 454e4330609..c1c0eef86f4 100644 --- a/src/vmm/src/devices/mod.rs +++ b/src/vmm/src/devices/mod.rs @@ -17,14 +17,15 @@ pub mod virtio; pub use bus::{Bus, BusDevice, BusError}; use log::error; +use crate::devices::virtio::metrics::NetDeviceMetricsIndex; use crate::devices::virtio::{QueueError, VsockError}; use crate::logger::{IncMetric, METRICS}; // Function used for reporting error in terms of logging -// but also in terms of METRICS net event fails. -pub(crate) fn report_net_event_fail(err: DeviceError) { +// but also in terms of metrics of net event fails. +pub(crate) fn report_net_event_fail(net_metrics: &NetDeviceMetricsIndex, err: DeviceError) { error!("{:?}", err); - METRICS.net.event_fails.inc(); + net_metrics.get().event_fails.inc(); } pub(crate) fn report_balloon_event_fail(err: virtio::balloon::BalloonError) { diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index d4d3f8448ec..59614595e88 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -37,6 +37,7 @@ use crate::vstate::memory::{ByteValued, Bytes, GuestMemoryMmap}; const FRAME_HEADER_MAX_LEN: usize = PAYLOAD_OFFSET + ETH_IPV4_FRAME_LEN; use crate::devices::virtio::iovec::IoVecBuffer; +use crate::devices::virtio::metrics::{NetDeviceMetricsIndex, NetDeviceMetricsPool}; use crate::devices::virtio::net::tap::Tap; use crate::devices::virtio::net::{ gen, NetError, NetQueue, MAX_BUFFER_SIZE, NET_QUEUE_SIZES, RX_INDEX, TX_INDEX, @@ -136,6 +137,7 @@ pub struct Net { /// The MMDS stack corresponding to this interface. /// Only if MMDS transport has been associated with it. pub mmds_ns: Option, + pub(crate) metrics: NetDeviceMetricsIndex, } impl Net { @@ -190,6 +192,7 @@ impl Net { device_state: DeviceState::Inactive, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(NetError::EventFd)?, mmds_ns: None, + metrics: NetDeviceMetricsPool::get(), }) } @@ -272,7 +275,7 @@ impl Net { self.irq_trigger .trigger_irq(IrqType::Vring) .map_err(|err| { - METRICS.net.event_fails.inc(); + self.metrics.get().event_fails.inc(); DeviceError::FailedSignalingIrq(err) })?; } @@ -305,7 +308,7 @@ impl Net { // Returns true on successful frame delivery. fn rate_limited_rx_single_frame(&mut self) -> bool { if !Self::rate_limiter_consume_op(&mut self.rx_rate_limiter, self.rx_bytes_read as u64) { - METRICS.net.rx_rate_limiter_throttled.inc(); + self.metrics.get().rx_rate_limiter_throttled.inc(); return false; } @@ -331,6 +334,7 @@ impl Net { mem: &GuestMemoryMmap, data: &[u8], head: DescriptorChain, + metrics: &NetDeviceMetricsIndex, ) -> Result<(), FrontendError> { let mut chunk = data; let mut next_descriptor = Some(head); @@ -343,13 +347,13 @@ impl Net { let len = std::cmp::min(chunk.len(), descriptor.len as usize); match mem.write_slice(&chunk[..len], descriptor.addr) { Ok(()) => { - METRICS.net.rx_count.inc(); + metrics.get().rx_count.inc(); chunk = &chunk[len..]; } Err(err) => { error!("Failed to write slice: {:?}", err); if let GuestMemoryError::PartialBuffer { .. } = err { - METRICS.net.rx_partial_writes.inc(); + metrics.get().rx_partial_writes.inc(); } return Err(FrontendError::GuestMemory(err)); } @@ -357,8 +361,8 @@ impl Net { // If chunk is empty we are done here. if chunk.is_empty() { - METRICS.net.rx_bytes_count.add(data.len() as u64); - METRICS.net.rx_packets_count.inc(); + metrics.get().rx_bytes_count.add(data.len() as u64); + metrics.get().rx_packets_count.inc(); return Ok(()); } @@ -376,7 +380,7 @@ impl Net { let queue = &mut self.queues[RX_INDEX]; let head_descriptor = queue.pop_or_enable_notification(mem).ok_or_else(|| { - METRICS.net.no_rx_avail_buffer.inc(); + self.metrics.get().no_rx_avail_buffer.inc(); FrontendError::EmptyQueue })?; let head_index = head_descriptor.index; @@ -385,10 +389,11 @@ impl Net { mem, &self.rx_frame_buf[..self.rx_bytes_read], head_descriptor, + &self.metrics, ); // Mark the descriptor chain as used. If an error occurred, skip the descriptor chain. let used_len = if result.is_err() { - METRICS.net.rx_fails.inc(); + self.metrics.get().rx_fails.inc(); 0 } else { // Safe to unwrap because a frame must be smaller than 2^16 bytes. @@ -432,18 +437,19 @@ impl Net { frame_iovec: &IoVecBuffer, tap: &mut Tap, guest_mac: Option, + metrics: &NetDeviceMetricsIndex, ) -> Result { // Read the frame headers from the IoVecBuffer. This will return None // if the frame_iovec is empty. let header_len = frame_iovec.read_at(headers, 0).ok_or_else(|| { error!("Received empty TX buffer"); - METRICS.net.tx_malformed_frames.inc(); + metrics.get().tx_malformed_frames.inc(); NetError::VnetHeaderMissing })?; let headers = frame_bytes_from_buf(&headers[..header_len]).map_err(|e| { error!("VNET headers missing in TX frame"); - METRICS.net.tx_malformed_frames.inc(); + metrics.get().tx_malformed_frames.inc(); e })?; @@ -470,20 +476,20 @@ impl Net { if let Some(guest_mac) = guest_mac { let _ = EthernetFrame::from_bytes(headers).map(|eth_frame| { if guest_mac != eth_frame.src_mac() { - METRICS.net.tx_spoofed_mac_count.inc(); + metrics.get().tx_spoofed_mac_count.inc(); } }); } match Self::write_tap(tap, frame_iovec) { Ok(_) => { - METRICS.net.tx_bytes_count.add(frame_iovec.len() as u64); - METRICS.net.tx_packets_count.inc(); - METRICS.net.tx_count.inc(); + metrics.get().tx_bytes_count.add(frame_iovec.len() as u64); + metrics.get().tx_packets_count.inc(); + metrics.get().tx_count.inc(); } Err(err) => { error!("Failed to write to tap: {:?}", err); - METRICS.net.tap_write_fails.inc(); + metrics.get().tap_write_fails.inc(); } }; Ok(false) @@ -512,7 +518,7 @@ impl Net { match self.read_from_mmds_or_tap() { Ok(count) => { self.rx_bytes_read = count; - METRICS.net.rx_count.inc(); + self.metrics.get().rx_count.inc(); if !self.rate_limited_rx_single_frame() { self.rx_deferred_frame = true; break; @@ -525,7 +531,7 @@ impl Net { Some(err) if err == EAGAIN => (), _ => { error!("Failed to read tap: {:?}", err); - METRICS.net.tap_read_fails.inc(); + self.metrics.get().tap_read_fails.inc(); return Err(DeviceError::FailedReadTap); } }; @@ -580,7 +586,7 @@ impl Net { let buffer = match IoVecBuffer::from_descriptor_chain(mem, head) { Ok(buffer) => buffer, Err(_) => { - METRICS.net.tx_fails.inc(); + self.metrics.get().tx_fails.inc(); tx_queue .add_used(mem, head_index, 0) .map_err(DeviceError::QueueError)?; @@ -589,7 +595,7 @@ impl Net { }; if !Self::rate_limiter_consume_op(&mut self.tx_rate_limiter, buffer.len() as u64) { tx_queue.undo_pop(); - METRICS.net.tx_rate_limiter_throttled.inc(); + self.metrics.get().tx_rate_limiter_throttled.inc(); break; } @@ -600,6 +606,7 @@ impl Net { &buffer, &mut self.tap, self.guest_mac, + &self.metrics, ) .unwrap_or(false); if frame_consumed_by_mmds && !self.rx_deferred_frame { @@ -614,7 +621,7 @@ impl Net { } if !used_any { - METRICS.net.no_tx_avail_buffer.inc(); + self.metrics.get().no_tx_avail_buffer.inc(); } self.signal_used_queue(NetQueue::Tx)?; @@ -654,37 +661,38 @@ impl Net { /// This is called by the event manager responding to the guest adding a new /// buffer in the RX queue. pub fn process_rx_queue_event(&mut self) { - METRICS.net.rx_queue_event_count.inc(); + self.metrics.get().rx_queue_event_count.inc(); if let Err(err) = self.queue_evts[RX_INDEX].read() { // rate limiters present but with _very high_ allowed rate error!("Failed to get rx queue event: {:?}", err); - METRICS.net.event_fails.inc(); + self.metrics.get().event_fails.inc(); } else if self.rx_rate_limiter.is_blocked() { - METRICS.net.rx_rate_limiter_throttled.inc(); + self.metrics.get().rx_rate_limiter_throttled.inc(); } else { // If the limiter is not blocked, resume the receiving of bytes. - self.resume_rx().unwrap_or_else(report_net_event_fail); + self.resume_rx() + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); } } pub fn process_tap_rx_event(&mut self) { // This is safe since we checked in the event handler that the device is activated. let mem = self.device_state.mem().unwrap(); - METRICS.net.rx_tap_event_count.inc(); + self.metrics.get().rx_tap_event_count.inc(); // While there are no available RX queue buffers and there's a deferred_frame // don't process any more incoming. Otherwise start processing a frame. In the // process the deferred_frame flag will be set in order to avoid freezing the // RX queue. if self.queues[RX_INDEX].is_empty(mem) && self.rx_deferred_frame { - METRICS.net.no_rx_avail_buffer.inc(); + self.metrics.get().no_rx_avail_buffer.inc(); return; } // While limiter is blocked, don't process any more incoming. if self.rx_rate_limiter.is_blocked() { - METRICS.net.rx_rate_limiter_throttled.inc(); + self.metrics.get().rx_rate_limiter_throttled.inc(); return; } @@ -693,9 +701,10 @@ impl Net { // until we manage to receive this deferred frame. { self.handle_deferred_frame() - .unwrap_or_else(report_net_event_fail); + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); } else { - self.process_rx().unwrap_or_else(report_net_event_fail); + self.process_rx() + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); } } @@ -704,48 +713,51 @@ impl Net { /// This is called by the event manager responding to the guest adding a new /// buffer in the TX queue. pub fn process_tx_queue_event(&mut self) { - METRICS.net.tx_queue_event_count.inc(); + self.metrics.get().tx_queue_event_count.inc(); if let Err(err) = self.queue_evts[TX_INDEX].read() { error!("Failed to get tx queue event: {:?}", err); - METRICS.net.event_fails.inc(); + self.metrics.get().event_fails.inc(); } else if !self.tx_rate_limiter.is_blocked() // If the limiter is not blocked, continue transmitting bytes. { - self.process_tx().unwrap_or_else(report_net_event_fail); + self.process_tx() + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); } else { - METRICS.net.tx_rate_limiter_throttled.inc(); + self.metrics.get().tx_rate_limiter_throttled.inc(); } } pub fn process_rx_rate_limiter_event(&mut self) { - METRICS.net.rx_event_rate_limiter_count.inc(); + self.metrics.get().rx_event_rate_limiter_count.inc(); // Upon rate limiter event, call the rate limiter handler // and restart processing the queue. match self.rx_rate_limiter.event_handler() { Ok(_) => { // There might be enough budget now to receive the frame. - self.resume_rx().unwrap_or_else(report_net_event_fail); + self.resume_rx() + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); } Err(err) => { error!("Failed to get rx rate-limiter event: {:?}", err); - METRICS.net.event_fails.inc(); + self.metrics.get().event_fails.inc(); } } } pub fn process_tx_rate_limiter_event(&mut self) { - METRICS.net.tx_rate_limiter_event_count.inc(); + self.metrics.get().tx_rate_limiter_event_count.inc(); // Upon rate limiter event, call the rate limiter handler // and restart processing the queue. match self.tx_rate_limiter.event_handler() { Ok(_) => { // There might be enough budget now to send the frame. - self.process_tx().unwrap_or_else(report_net_event_fail); + self.process_tx() + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); } Err(err) => { error!("Failed to get tx rate-limiter event: {:?}", err); - METRICS.net.event_fails.inc(); + self.metrics.get().event_fails.inc(); } } } @@ -799,7 +811,7 @@ impl VirtioDevice for Net { let config_len = config_space_bytes.len() as u64; if offset >= config_len { error!("Failed to read config space"); - METRICS.net.cfg_fails.inc(); + self.metrics.get().cfg_fails.inc(); return; } if let Some(end) = offset.checked_add(data.len() as u64) { @@ -820,13 +832,13 @@ impl VirtioDevice for Net { .and_then(|(start, end)| config_space_bytes.get_mut(start..end)) else { error!("Failed to write config space"); - METRICS.net.cfg_fails.inc(); + self.metrics.get().cfg_fails.inc(); return; }; dst.copy_from_slice(data); self.guest_mac = Some(self.config_space.guest_mac); - METRICS.net.mac_address_updates.inc(); + self.metrics.get().mac_address_updates.inc(); } fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { @@ -881,7 +893,7 @@ pub mod tests { use crate::dumbo::pdu::arp::{EthIPv4ArpFrame, ETH_IPV4_FRAME_LEN}; use crate::dumbo::pdu::ethernet::ETHERTYPE_ARP; use crate::dumbo::EthernetFrame; - use crate::logger::{IncMetric, METRICS}; + use crate::logger::IncMetric; use crate::rate_limiter::{BucketUpdate, RateLimiter, TokenBucket, TokenType}; use crate::vstate::memory::{Address, GuestMemory}; @@ -1008,7 +1020,7 @@ pub mod tests { // Check that the guest MAC was updated. let expected_guest_mac = MacAddr::from_bytes_unchecked(&new_config); assert_eq!(expected_guest_mac, net.guest_mac.unwrap()); - assert_eq!(METRICS.net.mac_address_updates.count(), 1); + assert_eq!(net.metrics.get().mac_address_updates.count(), 1); // Partial write (this is how the kernel sets a new mac address) - byte by byte. let new_config = [0x11, 0x22, 0x33, 0x44, 0x55, 0x66]; @@ -1041,7 +1053,7 @@ pub mod tests { th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); th.net().queue_evts[RX_INDEX].read().unwrap(); check_metric_after_block!( - METRICS.net.event_fails, + th.net().metrics.get().event_fails, 1, th.simulate_event(NetEvent::RxQueue) ); @@ -1136,7 +1148,7 @@ pub mod tests { // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&th.net(), 1000); check_metric_after_block!( - METRICS.net.rx_packets_count, + th.net().metrics.get().rx_packets_count, 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1177,7 +1189,7 @@ pub mod tests { // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&th.net(), 1000); check_metric_after_block!( - METRICS.net.rx_packets_count, + th.net().metrics.get().rx_packets_count, 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1217,7 +1229,7 @@ pub mod tests { let frame_1 = inject_tap_tx_frame(&th.net(), 200); let frame_2 = inject_tap_tx_frame(&th.net(), 300); check_metric_after_block!( - METRICS.net.rx_packets_count, + th.net().metrics.get().rx_packets_count, 2, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1248,7 +1260,7 @@ pub mod tests { th.add_desc_chain(NetQueue::Tx, 0, &[(0, 4096, 0)]); th.net().queue_evts[TX_INDEX].read().unwrap(); check_metric_after_block!( - METRICS.net.event_fails, + th.net().metrics.get().event_fails, 1, th.simulate_event(NetEvent::TxQueue) ); @@ -1287,7 +1299,7 @@ pub mod tests { // Send an invalid frame (too small, VNET header missing). th.add_desc_chain(NetQueue::Tx, 0, &[(0, 1, 0)]); check_metric_after_block!( - &METRICS.net.tx_malformed_frames, + &th.net().metrics.get().tx_malformed_frames, 1, th.event_manager.run_with_timeout(100) ); @@ -1309,7 +1321,7 @@ pub mod tests { // Send an invalid frame (too small, VNET header missing). th.add_desc_chain(NetQueue::Tx, 0, &[(0, 0, 0)]); check_metric_after_block!( - &METRICS.net.tx_malformed_frames, + &th.net().metrics.get().tx_malformed_frames, 1, th.event_manager.run_with_timeout(100) ); @@ -1347,7 +1359,7 @@ pub mod tests { // One frame is valid, one will not be handled because it includes write-only memory // so that leaves us with 2 malformed (no vnet header) frames. check_metric_after_block!( - &METRICS.net.tx_malformed_frames, + &th.net().metrics.get().tx_malformed_frames, 2, th.event_manager.run_with_timeout(100) ); @@ -1377,7 +1389,7 @@ pub mod tests { let frame = th.write_tx_frame(&desc_list, 1000); check_metric_after_block!( - METRICS.net.tx_packets_count, + th.net().metrics.get().tx_packets_count, 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1403,7 +1415,7 @@ pub mod tests { let _ = th.write_tx_frame(&desc_list, 1000); check_metric_after_block!( - METRICS.net.tap_write_fails, + th.net().metrics.get().tap_write_fails, 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1430,7 +1442,7 @@ pub mod tests { let frame_2 = th.write_tx_frame(&desc_list, 600); check_metric_after_block!( - METRICS.net.tx_packets_count, + th.net().metrics.get().tx_packets_count, 2, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1509,6 +1521,7 @@ pub mod tests { &buffer, &mut net.tap, Some(src_mac), + &net.metrics, ) .unwrap()) ); @@ -1537,7 +1550,7 @@ pub mod tests { // Check that a legit MAC doesn't affect the spoofed MAC metric. check_metric_after_block!( - &METRICS.net.tx_spoofed_mac_count, + &net.metrics.get().tx_spoofed_mac_count, 0, Net::write_to_mmds_or_tap( net.mmds_ns.as_mut(), @@ -1546,12 +1559,13 @@ pub mod tests { &buffer, &mut net.tap, Some(guest_mac), + &net.metrics, ) ); // Check that a spoofed MAC increases our spoofed MAC metric. check_metric_after_block!( - &METRICS.net.tx_spoofed_mac_count, + &net.metrics.get().tx_spoofed_mac_count, 1, Net::write_to_mmds_or_tap( net.mmds_ns.as_mut(), @@ -1560,6 +1574,7 @@ pub mod tests { &buffer, &mut net.tap, Some(not_guest_mac), + &net.metrics, ) ); } @@ -1572,7 +1587,7 @@ pub mod tests { // RX rate limiter events should error since the limiter is not blocked. // Validate that the event failed and failure was properly accounted for. check_metric_after_block!( - &METRICS.net.event_fails, + &th.net().metrics.get().event_fails, 1, th.simulate_event(NetEvent::RxRateLimiter) ); @@ -1580,7 +1595,7 @@ pub mod tests { // TX rate limiter events should error since the limiter is not blocked. // Validate that the event failed and failure was properly accounted for. check_metric_after_block!( - &METRICS.net.event_fails, + &th.net().metrics.get().event_fails, 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1598,7 +1613,7 @@ pub mod tests { // The RX queue is empty and rx_deffered_frame is set. th.net().rx_deferred_frame = true; check_metric_after_block!( - &METRICS.net.no_rx_avail_buffer, + &th.net().metrics.get().no_rx_avail_buffer, 1, th.simulate_event(NetEvent::Tap) ); @@ -1611,7 +1626,7 @@ pub mod tests { // Fake an avail buffer; this time, tap reading should error out. th.rxq.avail.idx.set(1); check_metric_after_block!( - &METRICS.net.tap_read_fails, + &th.net().metrics.get().tap_read_fails, 1, th.simulate_event(NetEvent::Tap) ); @@ -1623,19 +1638,22 @@ pub mod tests { th.activate_net(); th.net().tap.mocks.set_read_tap(ReadTapMock::TapFrame); - let rx_packets_count = METRICS.net.rx_packets_count.count(); + let rx_packets_count = th.net().metrics.get().rx_packets_count.count(); let _ = inject_tap_tx_frame(&th.net(), 1000); // Trigger a Tap event that. This should fail since there // are not any available descriptors in the queue check_metric_after_block!( - &METRICS.net.no_rx_avail_buffer, + &th.net().metrics.get().no_rx_avail_buffer, 1, th.simulate_event(NetEvent::Tap) ); // The frame we read from the tap should be deferred now and // no frames should have been transmitted assert!(th.net().rx_deferred_frame); - assert_eq!(METRICS.net.rx_packets_count.count(), rx_packets_count); + assert_eq!( + th.net().metrics.get().rx_packets_count.count(), + rx_packets_count + ); // Let's add a second frame, which should really have the same // fate. @@ -1646,19 +1664,22 @@ pub mod tests { // since there's only one Descriptor Chain in the queue. th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); check_metric_after_block!( - &METRICS.net.no_rx_avail_buffer, + &th.net().metrics.get().no_rx_avail_buffer, 1, th.simulate_event(NetEvent::Tap) ); // We should still have a deferred frame assert!(th.net().rx_deferred_frame); // However, we should have delivered the first frame - assert_eq!(METRICS.net.rx_packets_count.count(), rx_packets_count + 1); + assert_eq!( + th.net().metrics.get().rx_packets_count.count(), + rx_packets_count + 1 + ); // Let's add one more descriptor and try to handle the last frame as well. th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); check_metric_after_block!( - &METRICS.net.rx_packets_count, + &th.net().metrics.get().rx_packets_count, 1, th.simulate_event(NetEvent::RxQueue) ); @@ -1675,7 +1696,7 @@ pub mod tests { th.net().rx_rate_limiter = RateLimiter::new(0, 0, 0, 0, 0, 0).unwrap(); // There is no actual event on the rate limiter's timerfd. check_metric_after_block!( - &METRICS.net.event_fails, + &th.net().metrics.get().event_fails, 1, th.simulate_event(NetEvent::RxRateLimiter) ); @@ -1690,7 +1711,7 @@ pub mod tests { th.simulate_event(NetEvent::TxRateLimiter); // There is no actual event on the rate limiter's timerfd. check_metric_after_block!( - &METRICS.net.event_fails, + &th.net().metrics.get().event_fails, 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1720,7 +1741,7 @@ pub mod tests { // assert that limiter is blocked assert!(th.net().tx_rate_limiter.is_blocked()); - assert_eq!(METRICS.net.tx_rate_limiter_throttled.count(), 1); + assert_eq!(th.net().metrics.get().tx_rate_limiter_throttled.count(), 1); // make sure the data is still queued for processing assert_eq!(th.txq.used.idx.get(), 0); } @@ -1731,7 +1752,7 @@ pub mod tests { // trigger the RX queue event handler th.simulate_event(NetEvent::TxQueue); - assert_eq!(METRICS.net.tx_rate_limiter_throttled.count(), 2); + assert_eq!(th.net().metrics.get().tx_rate_limiter_throttled.count(), 2); } // wait for 100ms to give the rate-limiter timer a chance to replenish @@ -1742,7 +1763,7 @@ pub mod tests { { // tx_count increments 1 from write_to_mmds_or_tap() check_metric_after_block!( - &METRICS.net.tx_count, + &th.net().metrics.get().tx_count, 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1759,7 +1780,7 @@ pub mod tests { { // tx_count increments 1 from write_to_mmds_or_tap() check_metric_after_block!( - &METRICS.net.tx_count, + &th.net().metrics.get().tx_count, 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1791,7 +1812,7 @@ pub mod tests { // assert that limiter is blocked assert!(th.net().rx_rate_limiter.is_blocked()); - assert_eq!(METRICS.net.rx_rate_limiter_throttled.count(), 1); + assert_eq!(th.net().metrics.get().rx_rate_limiter_throttled.count(), 1); assert!(th.net().rx_deferred_frame); // assert that no operation actually completed (limiter blocked it) assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); @@ -1804,7 +1825,7 @@ pub mod tests { // trigger the RX queue event handler th.simulate_event(NetEvent::RxQueue); - assert_eq!(METRICS.net.rx_rate_limiter_throttled.count(), 2); + assert_eq!(th.net().metrics.get().rx_rate_limiter_throttled.count(), 2); } // wait for 100ms to give the rate-limiter timer a chance to replenish @@ -1816,7 +1837,7 @@ pub mod tests { let frame = &th.net().tap.mocks.read_tap.mock_frame(); // no longer throttled check_metric_after_block!( - &METRICS.net.rx_rate_limiter_throttled, + &th.net().metrics.get().rx_rate_limiter_throttled, 0, th.simulate_event(NetEvent::RxRateLimiter) ); @@ -1854,7 +1875,7 @@ pub mod tests { // trigger the TX handler th.add_desc_chain(NetQueue::Tx, 0, &[(0, 4096, 0)]); check_metric_after_block!( - METRICS.net.tx_rate_limiter_throttled, + th.net().metrics.get().tx_rate_limiter_throttled, 1, th.simulate_event(NetEvent::TxQueue) ); @@ -1873,7 +1894,7 @@ pub mod tests { { // no longer throttled check_metric_after_block!( - &METRICS.net.tx_rate_limiter_throttled, + &th.net().metrics.get().tx_rate_limiter_throttled, 0, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1902,14 +1923,14 @@ pub mod tests { { // trigger the RX handler check_metric_after_block!( - METRICS.net.rx_rate_limiter_throttled, + th.net().metrics.get().rx_rate_limiter_throttled, 1, th.simulate_event(NetEvent::Tap) ); // assert that limiter is blocked assert!(th.net().rx_rate_limiter.is_blocked()); - assert!(METRICS.net.rx_rate_limiter_throttled.count() >= 1); + assert!(th.net().metrics.get().rx_rate_limiter_throttled.count() >= 1); assert!(th.net().rx_deferred_frame); // assert that no operation actually completed (limiter blocked it) assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); diff --git a/src/vmm/src/devices/virtio/net/event_handler.rs b/src/vmm/src/devices/virtio/net/event_handler.rs index 496a9b79dea..3581dff776e 100644 --- a/src/vmm/src/devices/virtio/net/event_handler.rs +++ b/src/vmm/src/devices/virtio/net/event_handler.rs @@ -8,7 +8,7 @@ use utils::epoll::EventSet; use crate::devices::virtio::net::device::Net; use crate::devices::virtio::{VirtioDevice, RX_INDEX, TX_INDEX}; -use crate::logger::{debug, error, warn, IncMetric, METRICS}; +use crate::logger::{debug, error, warn, IncMetric}; impl Net { fn register_runtime_events(&self, ops: &mut EventOps) { @@ -84,7 +84,7 @@ impl MutEventSubscriber for Net { _ if activate_fd == source => self.process_activate_event(ops), _ => { warn!("Net: Spurious event received: {:?}", source); - METRICS.net.event_fails.inc(); + self.metrics.get().event_fails.inc(); } } } else { diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs index 29ccaac9ac4..ab6f364c62a 100644 --- a/src/vmm/src/devices/virtio/net/test_utils.rs +++ b/src/vmm/src/devices/virtio/net/test_utils.rs @@ -363,7 +363,7 @@ pub mod test { IrqType, Net, VirtioDevice, MAX_BUFFER_SIZE, RX_INDEX, TX_INDEX, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE, }; - use crate::logger::{IncMetric, METRICS}; + use crate::logger::IncMetric; use crate::vstate::memory::{ Address, Bytes, GuestAddress, GuestMemoryExtension, GuestMemoryMmap, }; @@ -497,7 +497,7 @@ pub mod test { // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&self.net(), frame_len); check_metric_after_block!( - METRICS.net.rx_packets_count, + self.net().metrics.get().rx_packets_count, 0, self.event_manager.run_with_timeout(100).unwrap() ); @@ -525,7 +525,7 @@ pub mod test { )], ); check_metric_after_block!( - METRICS.net.rx_packets_count, + self.net().metrics.get().rx_packets_count, 1, self.event_manager.run_with_timeout(100).unwrap() ); From 1e45431a5b3383084034e417b2c1884f72206876 Mon Sep 17 00:00:00 2001 From: Sudan Landge Date: Wed, 11 Oct 2023 17:35:15 +0000 Subject: [PATCH 03/12] feat(metrics)(net): Add per device net metrics in CHANGELOG Update CHANGELOGS to notify that Firecracker now emits per net device metrics. Note: this is part 3 of 12 commits to add per net device metrics. Signed-off-by: Sudan Landge --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9eca11f5f79..0571b2b6d96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,9 @@ capability checks that Firecracker performs during boot. If any of these fields are in use, minimal target snapshot version is restricted to 1.5. +- [#4145](https://github.com/firecracker-microvm/firecracker/pull/4145): + Added support for per net device metrics. Now, along with aggregated + network device metrics, metrics per network device with also be emitted. ### Changed From 13839e5b3c03bed49bbf866dd688d879ef262ccb Mon Sep 17 00:00:00 2001 From: Sudan Landge Date: Wed, 11 Oct 2023 21:54:44 +0000 Subject: [PATCH 04/12] feat(metrics)(net): Add unit test for net metrics Create a unit test that allocates 19 NetDeviceMetrics (19 because it is the max number of irqs we have for net devices), update and verify a metric. Note: this is part 4 of 12 commits to add per net device metrics. Signed-off-by: Sudan Landge --- src/vmm/src/devices/virtio/net/metrics.rs | 27 +++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/devices/virtio/net/metrics.rs b/src/vmm/src/devices/virtio/net/metrics.rs index 11df1e223fe..d1bc68b6b9d 100644 --- a/src/vmm/src/devices/virtio/net/metrics.rs +++ b/src/vmm/src/devices/virtio/net/metrics.rs @@ -75,7 +75,7 @@ use serde::{Serialize, Serializer}; use crate::logger::{IncMetric, SharedIncMetric}; -#[derive(Debug, Default)] +#[derive(Copy, Clone, Debug, Default)] pub struct NetDeviceMetricsIndex(usize); impl NetDeviceMetricsIndex { @@ -88,7 +88,7 @@ impl NetDeviceMetricsIndex { } /// provides instance for net metrics -#[derive(Debug, Default)] +#[derive(Debug)] pub struct NetDeviceMetricsPool { /// used to access per net device metrics metrics: Vec, @@ -283,3 +283,26 @@ impl NetDeviceMetrics { .add(other.tx_spoofed_mac_count.fetch_diff()); } } + +#[cfg(test)] +pub mod tests { + use super::*; + #[test] + fn test_net_dev_metrics() { + // we can have max 19 net devices + const MAX_NET_DEVICES: usize = 19; + let mut net_dev_metrics: [NetDeviceMetricsIndex; MAX_NET_DEVICES] = + [NetDeviceMetricsIndex::default(); MAX_NET_DEVICES]; + for metric in net_dev_metrics.iter_mut().take(MAX_NET_DEVICES) { + *metric = NetDeviceMetricsPool::get(); + metric.get().activate_fails.inc(); + } + // SAFETY: something + unsafe { + assert!(NET_DEV_METRICS_PVT.metrics.len() >= MAX_NET_DEVICES); + } + for metric in net_dev_metrics.iter_mut().take(MAX_NET_DEVICES) { + assert_eq!(metric.get().activate_fails.count(), 1); + } + } +} From 24ecef2bde3e633de63e50d4f31b8b5926d64685 Mon Sep 17 00:00:00 2001 From: Sudan Landge Date: Thu, 12 Oct 2023 17:09:45 +0000 Subject: [PATCH 05/12] feat(metrics)(net): Use a lock to access net metrics Use RwLock to make accessing Metrics threadsafe. Use better name to allocate NetDeviceMetrics Note: this is part 5 of 12 commits to add per net device metrics. Signed-off-by: Sudan Landge --- src/vmm/src/devices/mod.rs | 2 +- src/vmm/src/devices/virtio/net/device.rs | 302 +++++++++++++----- .../src/devices/virtio/net/event_handler.rs | 2 +- src/vmm/src/devices/virtio/net/metrics.rs | 29 +- src/vmm/src/devices/virtio/net/test_utils.rs | 4 +- 5 files changed, 247 insertions(+), 92 deletions(-) diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs index c1c0eef86f4..28d55611bc7 100644 --- a/src/vmm/src/devices/mod.rs +++ b/src/vmm/src/devices/mod.rs @@ -25,7 +25,7 @@ use crate::logger::{IncMetric, METRICS}; // but also in terms of metrics of net event fails. pub(crate) fn report_net_event_fail(net_metrics: &NetDeviceMetricsIndex, err: DeviceError) { error!("{:?}", err); - net_metrics.get().event_fails.inc(); + net_metrics.get().write().unwrap().event_fails.inc(); } pub(crate) fn report_balloon_event_fail(err: virtio::balloon::BalloonError) { diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 59614595e88..73301e29ea9 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -37,7 +37,7 @@ use crate::vstate::memory::{ByteValued, Bytes, GuestMemoryMmap}; const FRAME_HEADER_MAX_LEN: usize = PAYLOAD_OFFSET + ETH_IPV4_FRAME_LEN; use crate::devices::virtio::iovec::IoVecBuffer; -use crate::devices::virtio::metrics::{NetDeviceMetricsIndex, NetDeviceMetricsPool}; +use crate::devices::virtio::metrics::{NetDeviceMetricsAllocator, NetDeviceMetricsIndex}; use crate::devices::virtio::net::tap::Tap; use crate::devices::virtio::net::{ gen, NetError, NetQueue, MAX_BUFFER_SIZE, NET_QUEUE_SIZES, RX_INDEX, TX_INDEX, @@ -192,7 +192,7 @@ impl Net { device_state: DeviceState::Inactive, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(NetError::EventFd)?, mmds_ns: None, - metrics: NetDeviceMetricsPool::get(), + metrics: NetDeviceMetricsAllocator::alloc(), }) } @@ -275,7 +275,7 @@ impl Net { self.irq_trigger .trigger_irq(IrqType::Vring) .map_err(|err| { - self.metrics.get().event_fails.inc(); + self.metrics.get().write().unwrap().event_fails.inc(); DeviceError::FailedSignalingIrq(err) })?; } @@ -308,7 +308,12 @@ impl Net { // Returns true on successful frame delivery. fn rate_limited_rx_single_frame(&mut self) -> bool { if !Self::rate_limiter_consume_op(&mut self.rx_rate_limiter, self.rx_bytes_read as u64) { - self.metrics.get().rx_rate_limiter_throttled.inc(); + self.metrics + .get() + .write() + .unwrap() + .rx_rate_limiter_throttled + .inc(); return false; } @@ -347,13 +352,13 @@ impl Net { let len = std::cmp::min(chunk.len(), descriptor.len as usize); match mem.write_slice(&chunk[..len], descriptor.addr) { Ok(()) => { - metrics.get().rx_count.inc(); + metrics.get().write().unwrap().rx_count.inc(); chunk = &chunk[len..]; } Err(err) => { error!("Failed to write slice: {:?}", err); if let GuestMemoryError::PartialBuffer { .. } = err { - metrics.get().rx_partial_writes.inc(); + metrics.get().write().unwrap().rx_partial_writes.inc(); } return Err(FrontendError::GuestMemory(err)); } @@ -361,8 +366,13 @@ impl Net { // If chunk is empty we are done here. if chunk.is_empty() { - metrics.get().rx_bytes_count.add(data.len() as u64); - metrics.get().rx_packets_count.inc(); + metrics + .get() + .write() + .unwrap() + .rx_bytes_count + .add(data.len() as u64); + metrics.get().write().unwrap().rx_packets_count.inc(); return Ok(()); } @@ -380,7 +390,7 @@ impl Net { let queue = &mut self.queues[RX_INDEX]; let head_descriptor = queue.pop_or_enable_notification(mem).ok_or_else(|| { - self.metrics.get().no_rx_avail_buffer.inc(); + self.metrics.get().write().unwrap().no_rx_avail_buffer.inc(); FrontendError::EmptyQueue })?; let head_index = head_descriptor.index; @@ -393,7 +403,7 @@ impl Net { ); // Mark the descriptor chain as used. If an error occurred, skip the descriptor chain. let used_len = if result.is_err() { - self.metrics.get().rx_fails.inc(); + self.metrics.get().write().unwrap().rx_fails.inc(); 0 } else { // Safe to unwrap because a frame must be smaller than 2^16 bytes. @@ -443,13 +453,13 @@ impl Net { // if the frame_iovec is empty. let header_len = frame_iovec.read_at(headers, 0).ok_or_else(|| { error!("Received empty TX buffer"); - metrics.get().tx_malformed_frames.inc(); + metrics.get().write().unwrap().tx_malformed_frames.inc(); NetError::VnetHeaderMissing })?; let headers = frame_bytes_from_buf(&headers[..header_len]).map_err(|e| { error!("VNET headers missing in TX frame"); - metrics.get().tx_malformed_frames.inc(); + metrics.get().write().unwrap().tx_malformed_frames.inc(); e })?; @@ -476,20 +486,25 @@ impl Net { if let Some(guest_mac) = guest_mac { let _ = EthernetFrame::from_bytes(headers).map(|eth_frame| { if guest_mac != eth_frame.src_mac() { - metrics.get().tx_spoofed_mac_count.inc(); + metrics.get().write().unwrap().tx_spoofed_mac_count.inc(); } }); } match Self::write_tap(tap, frame_iovec) { Ok(_) => { - metrics.get().tx_bytes_count.add(frame_iovec.len() as u64); - metrics.get().tx_packets_count.inc(); - metrics.get().tx_count.inc(); + metrics + .get() + .write() + .unwrap() + .tx_bytes_count + .add(frame_iovec.len() as u64); + metrics.get().write().unwrap().tx_packets_count.inc(); + metrics.get().write().unwrap().tx_count.inc(); } Err(err) => { error!("Failed to write to tap: {:?}", err); - metrics.get().tap_write_fails.inc(); + metrics.get().write().unwrap().tap_write_fails.inc(); } }; Ok(false) @@ -518,7 +533,7 @@ impl Net { match self.read_from_mmds_or_tap() { Ok(count) => { self.rx_bytes_read = count; - self.metrics.get().rx_count.inc(); + self.metrics.get().write().unwrap().rx_count.inc(); if !self.rate_limited_rx_single_frame() { self.rx_deferred_frame = true; break; @@ -531,7 +546,7 @@ impl Net { Some(err) if err == EAGAIN => (), _ => { error!("Failed to read tap: {:?}", err); - self.metrics.get().tap_read_fails.inc(); + self.metrics.get().write().unwrap().tap_read_fails.inc(); return Err(DeviceError::FailedReadTap); } }; @@ -586,7 +601,7 @@ impl Net { let buffer = match IoVecBuffer::from_descriptor_chain(mem, head) { Ok(buffer) => buffer, Err(_) => { - self.metrics.get().tx_fails.inc(); + self.metrics.get().write().unwrap().tx_fails.inc(); tx_queue .add_used(mem, head_index, 0) .map_err(DeviceError::QueueError)?; @@ -595,7 +610,12 @@ impl Net { }; if !Self::rate_limiter_consume_op(&mut self.tx_rate_limiter, buffer.len() as u64) { tx_queue.undo_pop(); - self.metrics.get().tx_rate_limiter_throttled.inc(); + self.metrics + .get() + .write() + .unwrap() + .tx_rate_limiter_throttled + .inc(); break; } @@ -621,7 +641,7 @@ impl Net { } if !used_any { - self.metrics.get().no_tx_avail_buffer.inc(); + self.metrics.get().write().unwrap().no_tx_avail_buffer.inc(); } self.signal_used_queue(NetQueue::Tx)?; @@ -661,14 +681,24 @@ impl Net { /// This is called by the event manager responding to the guest adding a new /// buffer in the RX queue. pub fn process_rx_queue_event(&mut self) { - self.metrics.get().rx_queue_event_count.inc(); + self.metrics + .get() + .write() + .unwrap() + .rx_queue_event_count + .inc(); if let Err(err) = self.queue_evts[RX_INDEX].read() { // rate limiters present but with _very high_ allowed rate error!("Failed to get rx queue event: {:?}", err); - self.metrics.get().event_fails.inc(); + self.metrics.get().write().unwrap().event_fails.inc(); } else if self.rx_rate_limiter.is_blocked() { - self.metrics.get().rx_rate_limiter_throttled.inc(); + self.metrics + .get() + .write() + .unwrap() + .rx_rate_limiter_throttled + .inc(); } else { // If the limiter is not blocked, resume the receiving of bytes. self.resume_rx() @@ -679,20 +709,25 @@ impl Net { pub fn process_tap_rx_event(&mut self) { // This is safe since we checked in the event handler that the device is activated. let mem = self.device_state.mem().unwrap(); - self.metrics.get().rx_tap_event_count.inc(); + self.metrics.get().write().unwrap().rx_tap_event_count.inc(); // While there are no available RX queue buffers and there's a deferred_frame // don't process any more incoming. Otherwise start processing a frame. In the // process the deferred_frame flag will be set in order to avoid freezing the // RX queue. if self.queues[RX_INDEX].is_empty(mem) && self.rx_deferred_frame { - self.metrics.get().no_rx_avail_buffer.inc(); + self.metrics.get().write().unwrap().no_rx_avail_buffer.inc(); return; } // While limiter is blocked, don't process any more incoming. if self.rx_rate_limiter.is_blocked() { - self.metrics.get().rx_rate_limiter_throttled.inc(); + self.metrics + .get() + .write() + .unwrap() + .rx_rate_limiter_throttled + .inc(); return; } @@ -713,22 +748,37 @@ impl Net { /// This is called by the event manager responding to the guest adding a new /// buffer in the TX queue. pub fn process_tx_queue_event(&mut self) { - self.metrics.get().tx_queue_event_count.inc(); + self.metrics + .get() + .write() + .unwrap() + .tx_queue_event_count + .inc(); if let Err(err) = self.queue_evts[TX_INDEX].read() { error!("Failed to get tx queue event: {:?}", err); - self.metrics.get().event_fails.inc(); + self.metrics.get().write().unwrap().event_fails.inc(); } else if !self.tx_rate_limiter.is_blocked() // If the limiter is not blocked, continue transmitting bytes. { self.process_tx() .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); } else { - self.metrics.get().tx_rate_limiter_throttled.inc(); + self.metrics + .get() + .write() + .unwrap() + .tx_rate_limiter_throttled + .inc(); } } pub fn process_rx_rate_limiter_event(&mut self) { - self.metrics.get().rx_event_rate_limiter_count.inc(); + self.metrics + .get() + .write() + .unwrap() + .rx_event_rate_limiter_count + .inc(); // Upon rate limiter event, call the rate limiter handler // and restart processing the queue. @@ -740,13 +790,18 @@ impl Net { } Err(err) => { error!("Failed to get rx rate-limiter event: {:?}", err); - self.metrics.get().event_fails.inc(); + self.metrics.get().write().unwrap().event_fails.inc(); } } } pub fn process_tx_rate_limiter_event(&mut self) { - self.metrics.get().tx_rate_limiter_event_count.inc(); + self.metrics + .get() + .write() + .unwrap() + .tx_rate_limiter_event_count + .inc(); // Upon rate limiter event, call the rate limiter handler // and restart processing the queue. match self.tx_rate_limiter.event_handler() { @@ -757,7 +812,7 @@ impl Net { } Err(err) => { error!("Failed to get tx rate-limiter event: {:?}", err); - self.metrics.get().event_fails.inc(); + self.metrics.get().write().unwrap().event_fails.inc(); } } } @@ -811,7 +866,7 @@ impl VirtioDevice for Net { let config_len = config_space_bytes.len() as u64; if offset >= config_len { error!("Failed to read config space"); - self.metrics.get().cfg_fails.inc(); + self.metrics.get().write().unwrap().cfg_fails.inc(); return; } if let Some(end) = offset.checked_add(data.len() as u64) { @@ -832,13 +887,18 @@ impl VirtioDevice for Net { .and_then(|(start, end)| config_space_bytes.get_mut(start..end)) else { error!("Failed to write config space"); - self.metrics.get().cfg_fails.inc(); + self.metrics.get().write().unwrap().cfg_fails.inc(); return; }; dst.copy_from_slice(data); self.guest_mac = Some(self.config_space.guest_mac); - self.metrics.get().mac_address_updates.inc(); + self.metrics + .get() + .write() + .unwrap() + .mac_address_updates + .inc(); } fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { @@ -1020,7 +1080,15 @@ pub mod tests { // Check that the guest MAC was updated. let expected_guest_mac = MacAddr::from_bytes_unchecked(&new_config); assert_eq!(expected_guest_mac, net.guest_mac.unwrap()); - assert_eq!(net.metrics.get().mac_address_updates.count(), 1); + assert_eq!( + net.metrics + .get() + .write() + .unwrap() + .mac_address_updates + .count(), + 1 + ); // Partial write (this is how the kernel sets a new mac address) - byte by byte. let new_config = [0x11, 0x22, 0x33, 0x44, 0x55, 0x66]; @@ -1053,7 +1121,7 @@ pub mod tests { th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); th.net().queue_evts[RX_INDEX].read().unwrap(); check_metric_after_block!( - th.net().metrics.get().event_fails, + th.net().metrics.get().write().unwrap().event_fails, 1, th.simulate_event(NetEvent::RxQueue) ); @@ -1148,7 +1216,7 @@ pub mod tests { // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&th.net(), 1000); check_metric_after_block!( - th.net().metrics.get().rx_packets_count, + th.net().metrics.get().write().unwrap().rx_packets_count, 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1189,7 +1257,7 @@ pub mod tests { // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&th.net(), 1000); check_metric_after_block!( - th.net().metrics.get().rx_packets_count, + th.net().metrics.get().write().unwrap().rx_packets_count, 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1229,7 +1297,7 @@ pub mod tests { let frame_1 = inject_tap_tx_frame(&th.net(), 200); let frame_2 = inject_tap_tx_frame(&th.net(), 300); check_metric_after_block!( - th.net().metrics.get().rx_packets_count, + th.net().metrics.get().write().unwrap().rx_packets_count, 2, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1260,7 +1328,7 @@ pub mod tests { th.add_desc_chain(NetQueue::Tx, 0, &[(0, 4096, 0)]); th.net().queue_evts[TX_INDEX].read().unwrap(); check_metric_after_block!( - th.net().metrics.get().event_fails, + th.net().metrics.get().write().unwrap().event_fails, 1, th.simulate_event(NetEvent::TxQueue) ); @@ -1299,7 +1367,7 @@ pub mod tests { // Send an invalid frame (too small, VNET header missing). th.add_desc_chain(NetQueue::Tx, 0, &[(0, 1, 0)]); check_metric_after_block!( - &th.net().metrics.get().tx_malformed_frames, + &th.net().metrics.get().write().unwrap().tx_malformed_frames, 1, th.event_manager.run_with_timeout(100) ); @@ -1321,7 +1389,7 @@ pub mod tests { // Send an invalid frame (too small, VNET header missing). th.add_desc_chain(NetQueue::Tx, 0, &[(0, 0, 0)]); check_metric_after_block!( - &th.net().metrics.get().tx_malformed_frames, + &th.net().metrics.get().write().unwrap().tx_malformed_frames, 1, th.event_manager.run_with_timeout(100) ); @@ -1359,7 +1427,7 @@ pub mod tests { // One frame is valid, one will not be handled because it includes write-only memory // so that leaves us with 2 malformed (no vnet header) frames. check_metric_after_block!( - &th.net().metrics.get().tx_malformed_frames, + &th.net().metrics.get().write().unwrap().tx_malformed_frames, 2, th.event_manager.run_with_timeout(100) ); @@ -1389,7 +1457,7 @@ pub mod tests { let frame = th.write_tx_frame(&desc_list, 1000); check_metric_after_block!( - th.net().metrics.get().tx_packets_count, + th.net().metrics.get().write().unwrap().tx_packets_count, 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1415,7 +1483,7 @@ pub mod tests { let _ = th.write_tx_frame(&desc_list, 1000); check_metric_after_block!( - th.net().metrics.get().tap_write_fails, + th.net().metrics.get().write().unwrap().tap_write_fails, 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1442,7 +1510,7 @@ pub mod tests { let frame_2 = th.write_tx_frame(&desc_list, 600); check_metric_after_block!( - th.net().metrics.get().tx_packets_count, + th.net().metrics.get().write().unwrap().tx_packets_count, 2, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1550,7 +1618,7 @@ pub mod tests { // Check that a legit MAC doesn't affect the spoofed MAC metric. check_metric_after_block!( - &net.metrics.get().tx_spoofed_mac_count, + &net.metrics.get().write().unwrap().tx_spoofed_mac_count, 0, Net::write_to_mmds_or_tap( net.mmds_ns.as_mut(), @@ -1565,7 +1633,7 @@ pub mod tests { // Check that a spoofed MAC increases our spoofed MAC metric. check_metric_after_block!( - &net.metrics.get().tx_spoofed_mac_count, + &net.metrics.get().write().unwrap().tx_spoofed_mac_count, 1, Net::write_to_mmds_or_tap( net.mmds_ns.as_mut(), @@ -1587,7 +1655,7 @@ pub mod tests { // RX rate limiter events should error since the limiter is not blocked. // Validate that the event failed and failure was properly accounted for. check_metric_after_block!( - &th.net().metrics.get().event_fails, + &th.net().metrics.get().write().unwrap().event_fails, 1, th.simulate_event(NetEvent::RxRateLimiter) ); @@ -1595,7 +1663,7 @@ pub mod tests { // TX rate limiter events should error since the limiter is not blocked. // Validate that the event failed and failure was properly accounted for. check_metric_after_block!( - &th.net().metrics.get().event_fails, + &th.net().metrics.get().write().unwrap().event_fails, 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1613,7 +1681,7 @@ pub mod tests { // The RX queue is empty and rx_deffered_frame is set. th.net().rx_deferred_frame = true; check_metric_after_block!( - &th.net().metrics.get().no_rx_avail_buffer, + &th.net().metrics.get().write().unwrap().no_rx_avail_buffer, 1, th.simulate_event(NetEvent::Tap) ); @@ -1626,7 +1694,7 @@ pub mod tests { // Fake an avail buffer; this time, tap reading should error out. th.rxq.avail.idx.set(1); check_metric_after_block!( - &th.net().metrics.get().tap_read_fails, + &th.net().metrics.get().write().unwrap().tap_read_fails, 1, th.simulate_event(NetEvent::Tap) ); @@ -1638,12 +1706,19 @@ pub mod tests { th.activate_net(); th.net().tap.mocks.set_read_tap(ReadTapMock::TapFrame); - let rx_packets_count = th.net().metrics.get().rx_packets_count.count(); + let rx_packets_count = th + .net() + .metrics + .get() + .write() + .unwrap() + .rx_packets_count + .count(); let _ = inject_tap_tx_frame(&th.net(), 1000); // Trigger a Tap event that. This should fail since there // are not any available descriptors in the queue check_metric_after_block!( - &th.net().metrics.get().no_rx_avail_buffer, + &th.net().metrics.get().write().unwrap().no_rx_avail_buffer, 1, th.simulate_event(NetEvent::Tap) ); @@ -1651,7 +1726,13 @@ pub mod tests { // no frames should have been transmitted assert!(th.net().rx_deferred_frame); assert_eq!( - th.net().metrics.get().rx_packets_count.count(), + th.net() + .metrics + .get() + .write() + .unwrap() + .rx_packets_count + .count(), rx_packets_count ); @@ -1664,7 +1745,7 @@ pub mod tests { // since there's only one Descriptor Chain in the queue. th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); check_metric_after_block!( - &th.net().metrics.get().no_rx_avail_buffer, + &th.net().metrics.get().write().unwrap().no_rx_avail_buffer, 1, th.simulate_event(NetEvent::Tap) ); @@ -1672,14 +1753,20 @@ pub mod tests { assert!(th.net().rx_deferred_frame); // However, we should have delivered the first frame assert_eq!( - th.net().metrics.get().rx_packets_count.count(), + th.net() + .metrics + .get() + .write() + .unwrap() + .rx_packets_count + .count(), rx_packets_count + 1 ); // Let's add one more descriptor and try to handle the last frame as well. th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); check_metric_after_block!( - &th.net().metrics.get().rx_packets_count, + &th.net().metrics.get().write().unwrap().rx_packets_count, 1, th.simulate_event(NetEvent::RxQueue) ); @@ -1696,7 +1783,7 @@ pub mod tests { th.net().rx_rate_limiter = RateLimiter::new(0, 0, 0, 0, 0, 0).unwrap(); // There is no actual event on the rate limiter's timerfd. check_metric_after_block!( - &th.net().metrics.get().event_fails, + &th.net().metrics.get().write().unwrap().event_fails, 1, th.simulate_event(NetEvent::RxRateLimiter) ); @@ -1711,7 +1798,7 @@ pub mod tests { th.simulate_event(NetEvent::TxRateLimiter); // There is no actual event on the rate limiter's timerfd. check_metric_after_block!( - &th.net().metrics.get().event_fails, + &th.net().metrics.get().write().unwrap().event_fails, 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1741,7 +1828,16 @@ pub mod tests { // assert that limiter is blocked assert!(th.net().tx_rate_limiter.is_blocked()); - assert_eq!(th.net().metrics.get().tx_rate_limiter_throttled.count(), 1); + assert_eq!( + th.net() + .metrics + .get() + .write() + .unwrap() + .tx_rate_limiter_throttled + .count(), + 1 + ); // make sure the data is still queued for processing assert_eq!(th.txq.used.idx.get(), 0); } @@ -1752,7 +1848,16 @@ pub mod tests { // trigger the RX queue event handler th.simulate_event(NetEvent::TxQueue); - assert_eq!(th.net().metrics.get().tx_rate_limiter_throttled.count(), 2); + assert_eq!( + th.net() + .metrics + .get() + .write() + .unwrap() + .tx_rate_limiter_throttled + .count(), + 2 + ); } // wait for 100ms to give the rate-limiter timer a chance to replenish @@ -1763,7 +1868,7 @@ pub mod tests { { // tx_count increments 1 from write_to_mmds_or_tap() check_metric_after_block!( - &th.net().metrics.get().tx_count, + &th.net().metrics.get().write().unwrap().tx_count, 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1780,7 +1885,7 @@ pub mod tests { { // tx_count increments 1 from write_to_mmds_or_tap() check_metric_after_block!( - &th.net().metrics.get().tx_count, + &th.net().metrics.get().write().unwrap().tx_count, 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1812,7 +1917,16 @@ pub mod tests { // assert that limiter is blocked assert!(th.net().rx_rate_limiter.is_blocked()); - assert_eq!(th.net().metrics.get().rx_rate_limiter_throttled.count(), 1); + assert_eq!( + th.net() + .metrics + .get() + .write() + .unwrap() + .rx_rate_limiter_throttled + .count(), + 1 + ); assert!(th.net().rx_deferred_frame); // assert that no operation actually completed (limiter blocked it) assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); @@ -1825,7 +1939,16 @@ pub mod tests { // trigger the RX queue event handler th.simulate_event(NetEvent::RxQueue); - assert_eq!(th.net().metrics.get().rx_rate_limiter_throttled.count(), 2); + assert_eq!( + th.net() + .metrics + .get() + .write() + .unwrap() + .rx_rate_limiter_throttled + .count(), + 2 + ); } // wait for 100ms to give the rate-limiter timer a chance to replenish @@ -1837,7 +1960,12 @@ pub mod tests { let frame = &th.net().tap.mocks.read_tap.mock_frame(); // no longer throttled check_metric_after_block!( - &th.net().metrics.get().rx_rate_limiter_throttled, + &th.net() + .metrics + .get() + .write() + .unwrap() + .rx_rate_limiter_throttled, 0, th.simulate_event(NetEvent::RxRateLimiter) ); @@ -1875,7 +2003,12 @@ pub mod tests { // trigger the TX handler th.add_desc_chain(NetQueue::Tx, 0, &[(0, 4096, 0)]); check_metric_after_block!( - th.net().metrics.get().tx_rate_limiter_throttled, + th.net() + .metrics + .get() + .write() + .unwrap() + .tx_rate_limiter_throttled, 1, th.simulate_event(NetEvent::TxQueue) ); @@ -1894,7 +2027,12 @@ pub mod tests { { // no longer throttled check_metric_after_block!( - &th.net().metrics.get().tx_rate_limiter_throttled, + &th.net() + .metrics + .get() + .write() + .unwrap() + .tx_rate_limiter_throttled, 0, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1923,14 +2061,28 @@ pub mod tests { { // trigger the RX handler check_metric_after_block!( - th.net().metrics.get().rx_rate_limiter_throttled, + th.net() + .metrics + .get() + .write() + .unwrap() + .rx_rate_limiter_throttled, 1, th.simulate_event(NetEvent::Tap) ); // assert that limiter is blocked assert!(th.net().rx_rate_limiter.is_blocked()); - assert!(th.net().metrics.get().rx_rate_limiter_throttled.count() >= 1); + assert!( + th.net() + .metrics + .get() + .write() + .unwrap() + .rx_rate_limiter_throttled + .count() + >= 1 + ); assert!(th.net().rx_deferred_frame); // assert that no operation actually completed (limiter blocked it) assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); diff --git a/src/vmm/src/devices/virtio/net/event_handler.rs b/src/vmm/src/devices/virtio/net/event_handler.rs index 3581dff776e..86eda751654 100644 --- a/src/vmm/src/devices/virtio/net/event_handler.rs +++ b/src/vmm/src/devices/virtio/net/event_handler.rs @@ -84,7 +84,7 @@ impl MutEventSubscriber for Net { _ if activate_fd == source => self.process_activate_event(ops), _ => { warn!("Net: Spurious event received: {:?}", source); - self.metrics.get().event_fails.inc(); + self.metrics.get().write().unwrap().event_fails.inc(); } } } else { diff --git a/src/vmm/src/devices/virtio/net/metrics.rs b/src/vmm/src/devices/virtio/net/metrics.rs index d1bc68b6b9d..55985560e0c 100644 --- a/src/vmm/src/devices/virtio/net/metrics.rs +++ b/src/vmm/src/devices/virtio/net/metrics.rs @@ -70,6 +70,8 @@ //! We use NET_DEV_METRICS_PVT instead of adding an entry of NetDeviceMetrics //! in Net so that metrics are accessible to be flushed even from signal handlers. +use std::sync::RwLock; + use serde::ser::SerializeMap; use serde::{Serialize, Serializer}; @@ -79,7 +81,7 @@ use crate::logger::{IncMetric, SharedIncMetric}; pub struct NetDeviceMetricsIndex(usize); impl NetDeviceMetricsIndex { - pub fn get(&self) -> &NetDeviceMetrics { + pub fn get(&self) -> &RwLock { // SAFETY: This is called by individual Net devices // on self (i.e. self.metrics.get()) so no race condition // and it is safe. @@ -89,26 +91,28 @@ impl NetDeviceMetricsIndex { /// provides instance for net metrics #[derive(Debug)] -pub struct NetDeviceMetricsPool { +pub struct NetDeviceMetricsAllocator { /// used to access per net device metrics - metrics: Vec, + metrics: Vec>, } -impl NetDeviceMetricsPool { +impl NetDeviceMetricsAllocator { /// default construction - pub fn get() -> NetDeviceMetricsIndex { + pub fn alloc() -> NetDeviceMetricsIndex { // SAFETY: This is only called during creation of a Net device // and since each device is created sequentially so there is no // case of race condition. unsafe { - NET_DEV_METRICS_PVT.metrics.push(NetDeviceMetrics::new()); + NET_DEV_METRICS_PVT + .metrics + .push(RwLock::new(NetDeviceMetrics::new())); NetDeviceMetricsIndex(NET_DEV_METRICS_PVT.metrics.len() - 1) } } } /// Contains Network-related metrics per device. -static mut NET_DEV_METRICS_PVT: NetDeviceMetricsPool = NetDeviceMetricsPool { +static mut NET_DEV_METRICS_PVT: NetDeviceMetricsAllocator = NetDeviceMetricsAllocator { metrics: Vec::new(), }; @@ -122,7 +126,7 @@ pub fn flush_metrics(serializer: S) -> Result { let net_aggregated: NetDeviceMetrics = NET_DEV_METRICS_PVT.metrics.iter().fold( NetDeviceMetrics::default(), |mut net_agg, net| { - net_agg.aggregate(net); + net_agg.aggregate(&net.read().unwrap()); net_agg }, ); @@ -131,8 +135,7 @@ pub fn flush_metrics(serializer: S) -> Result { for (i, metrics) in NET_DEV_METRICS_PVT.metrics.iter().enumerate() { let devn = format!("net{}", i); - // let ser_metrics = NetDeviceMetrics::default().aggregate(&metrics); - seq.serialize_entry(&devn, &metrics)?; + seq.serialize_entry(&devn, &*metrics.read().unwrap())?; } seq.end() } @@ -294,15 +297,15 @@ pub mod tests { let mut net_dev_metrics: [NetDeviceMetricsIndex; MAX_NET_DEVICES] = [NetDeviceMetricsIndex::default(); MAX_NET_DEVICES]; for metric in net_dev_metrics.iter_mut().take(MAX_NET_DEVICES) { - *metric = NetDeviceMetricsPool::get(); - metric.get().activate_fails.inc(); + *metric = NetDeviceMetricsAllocator::alloc(); + metric.get().write().unwrap().activate_fails.inc(); } // SAFETY: something unsafe { assert!(NET_DEV_METRICS_PVT.metrics.len() >= MAX_NET_DEVICES); } for metric in net_dev_metrics.iter_mut().take(MAX_NET_DEVICES) { - assert_eq!(metric.get().activate_fails.count(), 1); + assert_eq!(metric.get().write().unwrap().activate_fails.count(), 1); } } } diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs index ab6f364c62a..04fa887f2f8 100644 --- a/src/vmm/src/devices/virtio/net/test_utils.rs +++ b/src/vmm/src/devices/virtio/net/test_utils.rs @@ -497,7 +497,7 @@ pub mod test { // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&self.net(), frame_len); check_metric_after_block!( - self.net().metrics.get().rx_packets_count, + self.net().metrics.get().write().unwrap().rx_packets_count, 0, self.event_manager.run_with_timeout(100).unwrap() ); @@ -525,7 +525,7 @@ pub mod test { )], ); check_metric_after_block!( - self.net().metrics.get().rx_packets_count, + self.net().metrics.get().write().unwrap().rx_packets_count, 1, self.event_manager.run_with_timeout(100).unwrap() ); From cce04ed36acd260bc139cd925db41a4f65b32ef7 Mon Sep 17 00:00:00 2001 From: Sudan Landge Date: Fri, 13 Oct 2023 11:02:38 +0000 Subject: [PATCH 06/12] feat(metrics)(net): Use iface_id to emit per net device metrics With Vec based approach, "net0" is metrics for the 1st net device created. However, if devices are not created in the same order all the time, the first device created could either be eth0 or eth1 and then net0 could sometimes point to eth0 and sometimes to eth1 which doesn't help with analysing the metrics. So, use Map instead of Vec to provide more clarity on which interface the metrics actually belongs to. Also, instead of net0 name the metrics json object as net_$iface_id e.g. net_eth0 or net_eth1 which should help in better analysis. Use "net_$iface_id" for the metrics name instead of "net_$tap_name" to be consistent with the net endpoint "/network-interfaces/{iface_id}" Note: this is part 6 of 12 commits to add per net device metrics. Signed-off-by: Sudan Landge --- CHANGELOG.md | 12 +- src/vmm/src/devices/mod.rs | 13 +- src/vmm/src/devices/virtio/net/device.rs | 384 ++++++------------ .../src/devices/virtio/net/event_handler.rs | 6 +- src/vmm/src/devices/virtio/net/metrics.rs | 206 ++++++---- src/vmm/src/devices/virtio/net/test_utils.rs | 13 +- src/vmm/src/devices/virtio/test_utils.rs | 14 + 7 files changed, 308 insertions(+), 340 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0571b2b6d96..8b38f945779 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,15 @@ ### Added +- [#4145](https://github.com/firecracker-microvm/firecracker/pull/4145): + Added support for per net device metrics. Now, along with aggregated + network device metrics, metrics per network device with also be emitted. + Aggregate metrics will continue to be emitted with label "net" in the + metrics json object while each net device metrics will be emitted with + label "net_{iface_id}". E.g. the associated metrics for the endpoint + "/network-interfaces/eth0" will be available under "net_eth0:" in each + metrics json object. + ### Changed - Simplified and clarified the removal policy of deprecated API elements @@ -34,9 +43,6 @@ capability checks that Firecracker performs during boot. If any of these fields are in use, minimal target snapshot version is restricted to 1.5. -- [#4145](https://github.com/firecracker-microvm/firecracker/pull/4145): - Added support for per net device metrics. Now, along with aggregated - network device metrics, metrics per network device with also be emitted. ### Changed diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs index 28d55611bc7..60e0ced8726 100644 --- a/src/vmm/src/devices/mod.rs +++ b/src/vmm/src/devices/mod.rs @@ -17,15 +17,20 @@ pub mod virtio; pub use bus::{Bus, BusDevice, BusError}; use log::error; -use crate::devices::virtio::metrics::NetDeviceMetricsIndex; +// get_net_metrics and NetMetricsPerDevice are the functions called by the +// macro NET_METRICS +use crate::devices::virtio::metrics::{get_net_metrics, NetMetricsPerDevice}; use crate::devices::virtio::{QueueError, VsockError}; use crate::logger::{IncMetric, METRICS}; +use crate::NET_METRICS; // Function used for reporting error in terms of logging // but also in terms of metrics of net event fails. -pub(crate) fn report_net_event_fail(net_metrics: &NetDeviceMetricsIndex, err: DeviceError) { - error!("{:?}", err); - net_metrics.get().write().unwrap().event_fails.inc(); +// network metrics is reported per device so we need `net_iface_id` +// to identify which device the metrics and `err` should be reported for. +pub(crate) fn report_net_event_fail(net_iface_id: &String, err: DeviceError) { + error!("{:?}:{:?}", net_iface_id, err); + NET_METRICS!(net_iface_id, event_fails.inc()); } pub(crate) fn report_balloon_event_fail(err: virtio::balloon::BalloonError) { diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 73301e29ea9..75c5e57782a 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -37,7 +37,9 @@ use crate::vstate::memory::{ByteValued, Bytes, GuestMemoryMmap}; const FRAME_HEADER_MAX_LEN: usize = PAYLOAD_OFFSET + ETH_IPV4_FRAME_LEN; use crate::devices::virtio::iovec::IoVecBuffer; -use crate::devices::virtio::metrics::{NetDeviceMetricsAllocator, NetDeviceMetricsIndex}; +// get_net_metrics and NetMetricsPerDevice are the functions called by the +// macro NET_METRICS +use crate::devices::virtio::metrics::{get_net_metrics, NetMetricsPerDevice}; use crate::devices::virtio::net::tap::Tap; use crate::devices::virtio::net::{ gen, NetError, NetQueue, MAX_BUFFER_SIZE, NET_QUEUE_SIZES, RX_INDEX, TX_INDEX, @@ -46,6 +48,7 @@ use crate::devices::virtio::{ ActivateError, DescriptorChain, DeviceState, IrqTrigger, IrqType, Queue, VirtioDevice, TYPE_NET, }; use crate::devices::{report_net_event_fail, DeviceError}; +use crate::NET_METRICS; #[derive(Debug)] enum FrontendError { @@ -137,7 +140,6 @@ pub struct Net { /// The MMDS stack corresponding to this interface. /// Only if MMDS transport has been associated with it. pub mmds_ns: Option, - pub(crate) metrics: NetDeviceMetricsIndex, } impl Net { @@ -173,6 +175,11 @@ impl Net { queues.push(Queue::new(size)); } + // allocate NetDeviceMetrics to log metrics for this device. + // we don't need a pointer to NetDeviceMetrics since we use + // NET_METRICS! macro with device's id to access the metrics + // and so alloc doesn't need to return anything. + NetMetricsPerDevice::alloc(id.clone()); Ok(Net { id, tap, @@ -192,7 +199,6 @@ impl Net { device_state: DeviceState::Inactive, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(NetError::EventFd)?, mmds_ns: None, - metrics: NetDeviceMetricsAllocator::alloc(), }) } @@ -275,7 +281,7 @@ impl Net { self.irq_trigger .trigger_irq(IrqType::Vring) .map_err(|err| { - self.metrics.get().write().unwrap().event_fails.inc(); + NET_METRICS!(&self.id, event_fails.inc()); DeviceError::FailedSignalingIrq(err) })?; } @@ -308,12 +314,7 @@ impl Net { // Returns true on successful frame delivery. fn rate_limited_rx_single_frame(&mut self) -> bool { if !Self::rate_limiter_consume_op(&mut self.rx_rate_limiter, self.rx_bytes_read as u64) { - self.metrics - .get() - .write() - .unwrap() - .rx_rate_limiter_throttled - .inc(); + NET_METRICS!(&self.id, rx_rate_limiter_throttled.inc()); return false; } @@ -339,7 +340,7 @@ impl Net { mem: &GuestMemoryMmap, data: &[u8], head: DescriptorChain, - metrics: &NetDeviceMetricsIndex, + net_iface_id: &String, ) -> Result<(), FrontendError> { let mut chunk = data; let mut next_descriptor = Some(head); @@ -352,13 +353,13 @@ impl Net { let len = std::cmp::min(chunk.len(), descriptor.len as usize); match mem.write_slice(&chunk[..len], descriptor.addr) { Ok(()) => { - metrics.get().write().unwrap().rx_count.inc(); + NET_METRICS!(net_iface_id, rx_count.inc()); chunk = &chunk[len..]; } Err(err) => { error!("Failed to write slice: {:?}", err); if let GuestMemoryError::PartialBuffer { .. } = err { - metrics.get().write().unwrap().rx_partial_writes.inc(); + NET_METRICS!(net_iface_id, rx_partial_writes.inc()); } return Err(FrontendError::GuestMemory(err)); } @@ -366,13 +367,9 @@ impl Net { // If chunk is empty we are done here. if chunk.is_empty() { - metrics - .get() - .write() - .unwrap() - .rx_bytes_count - .add(data.len() as u64); - metrics.get().write().unwrap().rx_packets_count.inc(); + let len = data.len() as u64; + NET_METRICS!(net_iface_id, rx_bytes_count.add(len)); + NET_METRICS!(net_iface_id, rx_packets_count.inc()); return Ok(()); } @@ -390,7 +387,7 @@ impl Net { let queue = &mut self.queues[RX_INDEX]; let head_descriptor = queue.pop_or_enable_notification(mem).ok_or_else(|| { - self.metrics.get().write().unwrap().no_rx_avail_buffer.inc(); + NET_METRICS!(&self.id, no_rx_avail_buffer.inc()); FrontendError::EmptyQueue })?; let head_index = head_descriptor.index; @@ -399,11 +396,11 @@ impl Net { mem, &self.rx_frame_buf[..self.rx_bytes_read], head_descriptor, - &self.metrics, + &self.id, ); // Mark the descriptor chain as used. If an error occurred, skip the descriptor chain. let used_len = if result.is_err() { - self.metrics.get().write().unwrap().rx_fails.inc(); + NET_METRICS!(&self.id, rx_fails.inc()); 0 } else { // Safe to unwrap because a frame must be smaller than 2^16 bytes. @@ -447,19 +444,19 @@ impl Net { frame_iovec: &IoVecBuffer, tap: &mut Tap, guest_mac: Option, - metrics: &NetDeviceMetricsIndex, + net_iface_id: &String, ) -> Result { // Read the frame headers from the IoVecBuffer. This will return None // if the frame_iovec is empty. let header_len = frame_iovec.read_at(headers, 0).ok_or_else(|| { error!("Received empty TX buffer"); - metrics.get().write().unwrap().tx_malformed_frames.inc(); + NET_METRICS!(net_iface_id, tx_malformed_frames.inc()); NetError::VnetHeaderMissing })?; let headers = frame_bytes_from_buf(&headers[..header_len]).map_err(|e| { error!("VNET headers missing in TX frame"); - metrics.get().write().unwrap().tx_malformed_frames.inc(); + NET_METRICS!(net_iface_id, tx_malformed_frames.inc()); e })?; @@ -486,25 +483,21 @@ impl Net { if let Some(guest_mac) = guest_mac { let _ = EthernetFrame::from_bytes(headers).map(|eth_frame| { if guest_mac != eth_frame.src_mac() { - metrics.get().write().unwrap().tx_spoofed_mac_count.inc(); + NET_METRICS!(net_iface_id, tx_spoofed_mac_count.inc()); } }); } match Self::write_tap(tap, frame_iovec) { Ok(_) => { - metrics - .get() - .write() - .unwrap() - .tx_bytes_count - .add(frame_iovec.len() as u64); - metrics.get().write().unwrap().tx_packets_count.inc(); - metrics.get().write().unwrap().tx_count.inc(); + let len = frame_iovec.len() as u64; + NET_METRICS!(net_iface_id, tx_bytes_count.add(len)); + NET_METRICS!(net_iface_id, tx_packets_count.inc()); + NET_METRICS!(net_iface_id, tx_count.inc()); } Err(err) => { error!("Failed to write to tap: {:?}", err); - metrics.get().write().unwrap().tap_write_fails.inc(); + NET_METRICS!(net_iface_id, tap_write_fails.inc()); } }; Ok(false) @@ -533,7 +526,7 @@ impl Net { match self.read_from_mmds_or_tap() { Ok(count) => { self.rx_bytes_read = count; - self.metrics.get().write().unwrap().rx_count.inc(); + NET_METRICS!(&self.id, rx_count.inc()); if !self.rate_limited_rx_single_frame() { self.rx_deferred_frame = true; break; @@ -546,7 +539,7 @@ impl Net { Some(err) if err == EAGAIN => (), _ => { error!("Failed to read tap: {:?}", err); - self.metrics.get().write().unwrap().tap_read_fails.inc(); + NET_METRICS!(&self.id, tap_read_fails.inc()); return Err(DeviceError::FailedReadTap); } }; @@ -601,7 +594,7 @@ impl Net { let buffer = match IoVecBuffer::from_descriptor_chain(mem, head) { Ok(buffer) => buffer, Err(_) => { - self.metrics.get().write().unwrap().tx_fails.inc(); + NET_METRICS!(&self.id, tx_fails.inc()); tx_queue .add_used(mem, head_index, 0) .map_err(DeviceError::QueueError)?; @@ -610,12 +603,7 @@ impl Net { }; if !Self::rate_limiter_consume_op(&mut self.tx_rate_limiter, buffer.len() as u64) { tx_queue.undo_pop(); - self.metrics - .get() - .write() - .unwrap() - .tx_rate_limiter_throttled - .inc(); + NET_METRICS!(&self.id, tx_rate_limiter_throttled.inc()); break; } @@ -626,7 +614,7 @@ impl Net { &buffer, &mut self.tap, self.guest_mac, - &self.metrics, + &self.id, ) .unwrap_or(false); if frame_consumed_by_mmds && !self.rx_deferred_frame { @@ -641,7 +629,7 @@ impl Net { } if !used_any { - self.metrics.get().write().unwrap().no_tx_avail_buffer.inc(); + NET_METRICS!(&self.id, no_tx_avail_buffer.inc()); } self.signal_used_queue(NetQueue::Tx)?; @@ -681,53 +669,38 @@ impl Net { /// This is called by the event manager responding to the guest adding a new /// buffer in the RX queue. pub fn process_rx_queue_event(&mut self) { - self.metrics - .get() - .write() - .unwrap() - .rx_queue_event_count - .inc(); + NET_METRICS!(&self.id, rx_queue_event_count.inc()); if let Err(err) = self.queue_evts[RX_INDEX].read() { // rate limiters present but with _very high_ allowed rate error!("Failed to get rx queue event: {:?}", err); - self.metrics.get().write().unwrap().event_fails.inc(); + NET_METRICS!(&self.id, event_fails.inc()); } else if self.rx_rate_limiter.is_blocked() { - self.metrics - .get() - .write() - .unwrap() - .rx_rate_limiter_throttled - .inc(); + NET_METRICS!(&self.id, rx_rate_limiter_throttled.inc()); } else { // If the limiter is not blocked, resume the receiving of bytes. self.resume_rx() - .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); + .unwrap_or_else(|err| report_net_event_fail(&self.id, err)); } } pub fn process_tap_rx_event(&mut self) { // This is safe since we checked in the event handler that the device is activated. let mem = self.device_state.mem().unwrap(); - self.metrics.get().write().unwrap().rx_tap_event_count.inc(); + NET_METRICS!(&self.id, rx_tap_event_count.inc()); // While there are no available RX queue buffers and there's a deferred_frame // don't process any more incoming. Otherwise start processing a frame. In the // process the deferred_frame flag will be set in order to avoid freezing the // RX queue. if self.queues[RX_INDEX].is_empty(mem) && self.rx_deferred_frame { - self.metrics.get().write().unwrap().no_rx_avail_buffer.inc(); + NET_METRICS!(&self.id, no_rx_avail_buffer.inc()); return; } // While limiter is blocked, don't process any more incoming. if self.rx_rate_limiter.is_blocked() { - self.metrics - .get() - .write() - .unwrap() - .rx_rate_limiter_throttled - .inc(); + NET_METRICS!(&self.id, rx_rate_limiter_throttled.inc()); return; } @@ -736,10 +709,10 @@ impl Net { // until we manage to receive this deferred frame. { self.handle_deferred_frame() - .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); + .unwrap_or_else(|err| report_net_event_fail(&self.id, err)); } else { self.process_rx() - .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); + .unwrap_or_else(|err| report_net_event_fail(&self.id, err)); } } @@ -748,37 +721,22 @@ impl Net { /// This is called by the event manager responding to the guest adding a new /// buffer in the TX queue. pub fn process_tx_queue_event(&mut self) { - self.metrics - .get() - .write() - .unwrap() - .tx_queue_event_count - .inc(); + NET_METRICS!(&self.id, tx_queue_event_count.inc()); if let Err(err) = self.queue_evts[TX_INDEX].read() { error!("Failed to get tx queue event: {:?}", err); - self.metrics.get().write().unwrap().event_fails.inc(); + NET_METRICS!(&self.id, event_fails.inc()); } else if !self.tx_rate_limiter.is_blocked() // If the limiter is not blocked, continue transmitting bytes. { self.process_tx() - .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); + .unwrap_or_else(|err| report_net_event_fail(&self.id, err)); } else { - self.metrics - .get() - .write() - .unwrap() - .tx_rate_limiter_throttled - .inc(); + NET_METRICS!(&self.id, tx_rate_limiter_throttled.inc()); } } pub fn process_rx_rate_limiter_event(&mut self) { - self.metrics - .get() - .write() - .unwrap() - .rx_event_rate_limiter_count - .inc(); + NET_METRICS!(&self.id, rx_event_rate_limiter_count.inc()); // Upon rate limiter event, call the rate limiter handler // and restart processing the queue. @@ -786,33 +744,28 @@ impl Net { Ok(_) => { // There might be enough budget now to receive the frame. self.resume_rx() - .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); + .unwrap_or_else(|err| report_net_event_fail(&self.id, err)); } Err(err) => { error!("Failed to get rx rate-limiter event: {:?}", err); - self.metrics.get().write().unwrap().event_fails.inc(); + NET_METRICS!(&self.id, event_fails.inc()); } } } pub fn process_tx_rate_limiter_event(&mut self) { - self.metrics - .get() - .write() - .unwrap() - .tx_rate_limiter_event_count - .inc(); + NET_METRICS!(&self.id, tx_rate_limiter_event_count.inc()); // Upon rate limiter event, call the rate limiter handler // and restart processing the queue. match self.tx_rate_limiter.event_handler() { Ok(_) => { // There might be enough budget now to send the frame. self.process_tx() - .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); + .unwrap_or_else(|err| report_net_event_fail(&self.id, err)); } Err(err) => { error!("Failed to get tx rate-limiter event: {:?}", err); - self.metrics.get().write().unwrap().event_fails.inc(); + NET_METRICS!(&self.id, event_fails.inc()); } } } @@ -866,7 +819,7 @@ impl VirtioDevice for Net { let config_len = config_space_bytes.len() as u64; if offset >= config_len { error!("Failed to read config space"); - self.metrics.get().write().unwrap().cfg_fails.inc(); + NET_METRICS!(&self.id, cfg_fails.inc()); return; } if let Some(end) = offset.checked_add(data.len() as u64) { @@ -887,18 +840,13 @@ impl VirtioDevice for Net { .and_then(|(start, end)| config_space_bytes.get_mut(start..end)) else { error!("Failed to write config space"); - self.metrics.get().write().unwrap().cfg_fails.inc(); + NET_METRICS!(&self.id, cfg_fails.inc()); return; }; dst.copy_from_slice(data); self.guest_mac = Some(self.config_space.guest_mac); - self.metrics - .get() - .write() - .unwrap() - .mac_address_updates - .inc(); + NET_METRICS!(&self.id, mac_address_updates.inc()); } fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { @@ -934,7 +882,6 @@ pub mod tests { use utils::net::mac::{MacAddr, MAC_ADDR_LEN}; use super::*; - use crate::check_metric_after_block; use crate::devices::virtio::gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::iovec::IoVecBuffer; use crate::devices::virtio::net::device::{ @@ -956,6 +903,7 @@ pub mod tests { use crate::logger::IncMetric; use crate::rate_limiter::{BucketUpdate, RateLimiter, TokenBucket, TokenType}; use crate::vstate::memory::{Address, GuestMemory}; + use crate::{check_metric_after_block, check_net_metric_after_block}; impl Net { pub(crate) fn read_tap(&mut self) -> io::Result { @@ -1080,15 +1028,7 @@ pub mod tests { // Check that the guest MAC was updated. let expected_guest_mac = MacAddr::from_bytes_unchecked(&new_config); assert_eq!(expected_guest_mac, net.guest_mac.unwrap()); - assert_eq!( - net.metrics - .get() - .write() - .unwrap() - .mac_address_updates - .count(), - 1 - ); + assert_eq!(NET_METRICS!(&net.id, mac_address_updates.count()), 1); // Partial write (this is how the kernel sets a new mac address) - byte by byte. let new_config = [0x11, 0x22, 0x33, 0x44, 0x55, 0x66]; @@ -1120,8 +1060,8 @@ pub mod tests { th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); th.net().queue_evts[RX_INDEX].read().unwrap(); - check_metric_after_block!( - th.net().metrics.get().write().unwrap().event_fails, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, event_fails.count()), 1, th.simulate_event(NetEvent::RxQueue) ); @@ -1215,8 +1155,8 @@ pub mod tests { // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&th.net(), 1000); - check_metric_after_block!( - th.net().metrics.get().write().unwrap().rx_packets_count, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, rx_packets_count.count()), 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1256,8 +1196,8 @@ pub mod tests { ); // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&th.net(), 1000); - check_metric_after_block!( - th.net().metrics.get().write().unwrap().rx_packets_count, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, rx_packets_count.count()), 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1296,8 +1236,8 @@ pub mod tests { // Inject 2 frames to tap and run epoll. let frame_1 = inject_tap_tx_frame(&th.net(), 200); let frame_2 = inject_tap_tx_frame(&th.net(), 300); - check_metric_after_block!( - th.net().metrics.get().write().unwrap().rx_packets_count, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, rx_packets_count.count()), 2, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1327,8 +1267,8 @@ pub mod tests { th.add_desc_chain(NetQueue::Tx, 0, &[(0, 4096, 0)]); th.net().queue_evts[TX_INDEX].read().unwrap(); - check_metric_after_block!( - th.net().metrics.get().write().unwrap().event_fails, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, event_fails.count()), 1, th.simulate_event(NetEvent::TxQueue) ); @@ -1366,8 +1306,8 @@ pub mod tests { // Send an invalid frame (too small, VNET header missing). th.add_desc_chain(NetQueue::Tx, 0, &[(0, 1, 0)]); - check_metric_after_block!( - &th.net().metrics.get().write().unwrap().tx_malformed_frames, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, tx_malformed_frames.count()), 1, th.event_manager.run_with_timeout(100) ); @@ -1388,8 +1328,8 @@ pub mod tests { // Send an invalid frame (too small, VNET header missing). th.add_desc_chain(NetQueue::Tx, 0, &[(0, 0, 0)]); - check_metric_after_block!( - &th.net().metrics.get().write().unwrap().tx_malformed_frames, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, tx_malformed_frames.count()), 1, th.event_manager.run_with_timeout(100) ); @@ -1426,8 +1366,8 @@ pub mod tests { // One frame is valid, one will not be handled because it includes write-only memory // so that leaves us with 2 malformed (no vnet header) frames. - check_metric_after_block!( - &th.net().metrics.get().write().unwrap().tx_malformed_frames, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, tx_malformed_frames.count()), 2, th.event_manager.run_with_timeout(100) ); @@ -1456,8 +1396,8 @@ pub mod tests { th.add_desc_chain(NetQueue::Tx, 0, &desc_list); let frame = th.write_tx_frame(&desc_list, 1000); - check_metric_after_block!( - th.net().metrics.get().write().unwrap().tx_packets_count, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, tx_packets_count.count()), 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1482,8 +1422,8 @@ pub mod tests { th.add_desc_chain(NetQueue::Tx, 0, &desc_list); let _ = th.write_tx_frame(&desc_list, 1000); - check_metric_after_block!( - th.net().metrics.get().write().unwrap().tap_write_fails, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, tap_write_fails.count()), 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1509,8 +1449,8 @@ pub mod tests { th.add_desc_chain(NetQueue::Tx, 500, &desc_list); let frame_2 = th.write_tx_frame(&desc_list, 600); - check_metric_after_block!( - th.net().metrics.get().write().unwrap().tx_packets_count, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, tx_packets_count.count()), 2, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1589,7 +1529,7 @@ pub mod tests { &buffer, &mut net.tap, Some(src_mac), - &net.metrics, + &net.id, ) .unwrap()) ); @@ -1617,8 +1557,8 @@ pub mod tests { let mut headers = vec![0; frame_hdr_len()]; // Check that a legit MAC doesn't affect the spoofed MAC metric. - check_metric_after_block!( - &net.metrics.get().write().unwrap().tx_spoofed_mac_count, + check_net_metric_after_block!( + NET_METRICS!(&net.id, tx_spoofed_mac_count.count()), 0, Net::write_to_mmds_or_tap( net.mmds_ns.as_mut(), @@ -1627,13 +1567,13 @@ pub mod tests { &buffer, &mut net.tap, Some(guest_mac), - &net.metrics, + &net.id, ) ); // Check that a spoofed MAC increases our spoofed MAC metric. - check_metric_after_block!( - &net.metrics.get().write().unwrap().tx_spoofed_mac_count, + check_net_metric_after_block!( + NET_METRICS!(&net.id, tx_spoofed_mac_count.count()), 1, Net::write_to_mmds_or_tap( net.mmds_ns.as_mut(), @@ -1642,7 +1582,7 @@ pub mod tests { &buffer, &mut net.tap, Some(not_guest_mac), - &net.metrics, + &net.id, ) ); } @@ -1654,16 +1594,16 @@ pub mod tests { // RX rate limiter events should error since the limiter is not blocked. // Validate that the event failed and failure was properly accounted for. - check_metric_after_block!( - &th.net().metrics.get().write().unwrap().event_fails, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, event_fails.count()), 1, th.simulate_event(NetEvent::RxRateLimiter) ); // TX rate limiter events should error since the limiter is not blocked. // Validate that the event failed and failure was properly accounted for. - check_metric_after_block!( - &th.net().metrics.get().write().unwrap().event_fails, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, event_fails.count()), 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1680,8 +1620,8 @@ pub mod tests { // The RX queue is empty and rx_deffered_frame is set. th.net().rx_deferred_frame = true; - check_metric_after_block!( - &th.net().metrics.get().write().unwrap().no_rx_avail_buffer, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, no_rx_avail_buffer.count()), 1, th.simulate_event(NetEvent::Tap) ); @@ -1693,8 +1633,8 @@ pub mod tests { // Fake an avail buffer; this time, tap reading should error out. th.rxq.avail.idx.set(1); - check_metric_after_block!( - &th.net().metrics.get().write().unwrap().tap_read_fails, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, tap_read_fails.count()), 1, th.simulate_event(NetEvent::Tap) ); @@ -1706,19 +1646,12 @@ pub mod tests { th.activate_net(); th.net().tap.mocks.set_read_tap(ReadTapMock::TapFrame); - let rx_packets_count = th - .net() - .metrics - .get() - .write() - .unwrap() - .rx_packets_count - .count(); + let rx_packets_count = NET_METRICS!(&th.net().id, rx_packets_count.count()); let _ = inject_tap_tx_frame(&th.net(), 1000); // Trigger a Tap event that. This should fail since there // are not any available descriptors in the queue - check_metric_after_block!( - &th.net().metrics.get().write().unwrap().no_rx_avail_buffer, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, no_rx_avail_buffer.count()), 1, th.simulate_event(NetEvent::Tap) ); @@ -1726,13 +1659,7 @@ pub mod tests { // no frames should have been transmitted assert!(th.net().rx_deferred_frame); assert_eq!( - th.net() - .metrics - .get() - .write() - .unwrap() - .rx_packets_count - .count(), + NET_METRICS!(&th.net().id, rx_packets_count.count()), rx_packets_count ); @@ -1744,8 +1671,8 @@ pub mod tests { // frame. However, this should try to handle the second tap as well and fail // since there's only one Descriptor Chain in the queue. th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); - check_metric_after_block!( - &th.net().metrics.get().write().unwrap().no_rx_avail_buffer, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, no_rx_avail_buffer.count()), 1, th.simulate_event(NetEvent::Tap) ); @@ -1753,20 +1680,14 @@ pub mod tests { assert!(th.net().rx_deferred_frame); // However, we should have delivered the first frame assert_eq!( - th.net() - .metrics - .get() - .write() - .unwrap() - .rx_packets_count - .count(), + NET_METRICS!(&th.net().id, rx_packets_count.count()), rx_packets_count + 1 ); // Let's add one more descriptor and try to handle the last frame as well. th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); - check_metric_after_block!( - &th.net().metrics.get().write().unwrap().rx_packets_count, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, rx_packets_count.count()), 1, th.simulate_event(NetEvent::RxQueue) ); @@ -1782,8 +1703,8 @@ pub mod tests { th.net().rx_rate_limiter = RateLimiter::new(0, 0, 0, 0, 0, 0).unwrap(); // There is no actual event on the rate limiter's timerfd. - check_metric_after_block!( - &th.net().metrics.get().write().unwrap().event_fails, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, event_fails.count()), 1, th.simulate_event(NetEvent::RxRateLimiter) ); @@ -1797,8 +1718,8 @@ pub mod tests { th.net().tx_rate_limiter = RateLimiter::new(0, 0, 0, 0, 0, 0).unwrap(); th.simulate_event(NetEvent::TxRateLimiter); // There is no actual event on the rate limiter's timerfd. - check_metric_after_block!( - &th.net().metrics.get().write().unwrap().event_fails, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, event_fails.count()), 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1829,13 +1750,7 @@ pub mod tests { // assert that limiter is blocked assert!(th.net().tx_rate_limiter.is_blocked()); assert_eq!( - th.net() - .metrics - .get() - .write() - .unwrap() - .tx_rate_limiter_throttled - .count(), + NET_METRICS!(&th.net().id, tx_rate_limiter_throttled.count()), 1 ); // make sure the data is still queued for processing @@ -1849,13 +1764,7 @@ pub mod tests { th.simulate_event(NetEvent::TxQueue); assert_eq!( - th.net() - .metrics - .get() - .write() - .unwrap() - .tx_rate_limiter_throttled - .count(), + NET_METRICS!(&th.net().id, tx_rate_limiter_throttled.count()), 2 ); } @@ -1867,8 +1776,8 @@ pub mod tests { // following TX procedure should succeed because bandwidth should now be available { // tx_count increments 1 from write_to_mmds_or_tap() - check_metric_after_block!( - &th.net().metrics.get().write().unwrap().tx_count, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, tx_count.count()), 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1884,8 +1793,8 @@ pub mod tests { // following TX procedure should succeed to handle the second frame as well { // tx_count increments 1 from write_to_mmds_or_tap() - check_metric_after_block!( - &th.net().metrics.get().write().unwrap().tx_count, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, tx_count.count()), 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1918,13 +1827,7 @@ pub mod tests { // assert that limiter is blocked assert!(th.net().rx_rate_limiter.is_blocked()); assert_eq!( - th.net() - .metrics - .get() - .write() - .unwrap() - .rx_rate_limiter_throttled - .count(), + NET_METRICS!(&th.net().id, rx_rate_limiter_throttled.count()), 1 ); assert!(th.net().rx_deferred_frame); @@ -1940,13 +1843,7 @@ pub mod tests { th.simulate_event(NetEvent::RxQueue); assert_eq!( - th.net() - .metrics - .get() - .write() - .unwrap() - .rx_rate_limiter_throttled - .count(), + NET_METRICS!(&th.net().id, rx_rate_limiter_throttled.count()), 2 ); } @@ -1959,13 +1856,8 @@ pub mod tests { { let frame = &th.net().tap.mocks.read_tap.mock_frame(); // no longer throttled - check_metric_after_block!( - &th.net() - .metrics - .get() - .write() - .unwrap() - .rx_rate_limiter_throttled, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, rx_rate_limiter_throttled.count()), 0, th.simulate_event(NetEvent::RxRateLimiter) ); @@ -2002,13 +1894,8 @@ pub mod tests { { // trigger the TX handler th.add_desc_chain(NetQueue::Tx, 0, &[(0, 4096, 0)]); - check_metric_after_block!( - th.net() - .metrics - .get() - .write() - .unwrap() - .tx_rate_limiter_throttled, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, tx_rate_limiter_throttled.count()), 1, th.simulate_event(NetEvent::TxQueue) ); @@ -2026,13 +1913,8 @@ pub mod tests { // following TX procedure should succeed because ops should now be available { // no longer throttled - check_metric_after_block!( - &th.net() - .metrics - .get() - .write() - .unwrap() - .tx_rate_limiter_throttled, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, tx_rate_limiter_throttled.count()), 0, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -2060,29 +1942,15 @@ pub mod tests { // following RX procedure should fail because of ops rate limiting { // trigger the RX handler - check_metric_after_block!( - th.net() - .metrics - .get() - .write() - .unwrap() - .rx_rate_limiter_throttled, + check_net_metric_after_block!( + NET_METRICS!(&th.net().id, rx_rate_limiter_throttled.count()), 1, th.simulate_event(NetEvent::Tap) ); // assert that limiter is blocked assert!(th.net().rx_rate_limiter.is_blocked()); - assert!( - th.net() - .metrics - .get() - .write() - .unwrap() - .rx_rate_limiter_throttled - .count() - >= 1 - ); + assert!(NET_METRICS!(&th.net().id, rx_rate_limiter_throttled.count()) >= 1); assert!(th.net().rx_deferred_frame); // assert that no operation actually completed (limiter blocked it) assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); diff --git a/src/vmm/src/devices/virtio/net/event_handler.rs b/src/vmm/src/devices/virtio/net/event_handler.rs index 86eda751654..5863a9ad0fe 100644 --- a/src/vmm/src/devices/virtio/net/event_handler.rs +++ b/src/vmm/src/devices/virtio/net/event_handler.rs @@ -7,8 +7,12 @@ use event_manager::{EventOps, Events, MutEventSubscriber}; use utils::epoll::EventSet; use crate::devices::virtio::net::device::Net; +// get_net_metrics and NetMetricsPerDevice are the functions called by the +// macro NET_METRICS +use crate::devices::virtio::net::metrics::{get_net_metrics, NetMetricsPerDevice}; use crate::devices::virtio::{VirtioDevice, RX_INDEX, TX_INDEX}; use crate::logger::{debug, error, warn, IncMetric}; +use crate::NET_METRICS; impl Net { fn register_runtime_events(&self, ops: &mut EventOps) { @@ -84,7 +88,7 @@ impl MutEventSubscriber for Net { _ if activate_fd == source => self.process_activate_event(ops), _ => { warn!("Net: Spurious event received: {:?}", source); - self.metrics.get().write().unwrap().event_fails.inc(); + NET_METRICS!(&self.id, event_fails.inc()); } } } else { diff --git a/src/vmm/src/devices/virtio/net/metrics.rs b/src/vmm/src/devices/virtio/net/metrics.rs index 55985560e0c..d2fa6cba2f4 100644 --- a/src/vmm/src/devices/virtio/net/metrics.rs +++ b/src/vmm/src/devices/virtio/net/metrics.rs @@ -9,7 +9,7 @@ //! ## JSON example with metrics: //! ```json //! { -//! "net": { +//! "net_eth0": { //! "activate_fails": "SharedIncMetric", //! "cfg_fails": "SharedIncMetric", //! "mac_address_updates": "SharedIncMetric", @@ -17,7 +17,7 @@ //! "no_tx_avail_buffer": "SharedIncMetric", //! ... //! } -//! "net0": { +//! "net_eth1": { //! "activate_fails": "SharedIncMetric", //! "cfg_fails": "SharedIncMetric", //! "mac_address_updates": "SharedIncMetric", @@ -25,7 +25,8 @@ //! "no_tx_avail_buffer": "SharedIncMetric", //! ... //! } -//! "net1": { +//! ... +//! "net_iface_id": { //! "activate_fails": "SharedIncMetric", //! "cfg_fails": "SharedIncMetric", //! "mac_address_updates": "SharedIncMetric", @@ -33,8 +34,7 @@ //! "no_tx_avail_buffer": "SharedIncMetric", //! ... //! } -//! ... -//! "netN": { +//! "net": { //! "activate_fails": "SharedIncMetric", //! "cfg_fails": "SharedIncMetric", //! "mac_address_updates": "SharedIncMetric", @@ -46,7 +46,9 @@ //! ``` //! Each `net` field in the example above is a serializable `NetDeviceMetrics` structure //! collecting metrics such as `activate_fails`, `cfg_fails`, etc. for the network device. -//! `net0`, `net1` and `netN` in the above example represent metrics 0th, 1st and 'N'th +//! `net_eth0` represent metrics for the endpoint "/network-interfaces/eth0", +//! `net_eth1` represent metrics for the endpoint "/network-interfaces/eth1", and +//! `net_iface_id` represent metrics for the endpoint "/network-interfaces/{iface_id}" //! network device respectively and `net` is the aggregate of all the per device metrics. //! //! # Limitations @@ -61,8 +63,13 @@ //! * Use lockless operations, preferably ones that don't require anything other than simple //! reads/writes being atomic. //! * Rely on `serde` to provide the actual serialization for writing the metrics. -//! * Since all metrics start at 0, we implement the `Default` trait via derive for all of them, to -//! avoid having to initialize everything by hand. +//! +//! * Devices could be created in any order i.e. the first device created could either be eth0 or +//! eth1 so if we use a vector for NetDeviceMetrics and call 1st device as net0, then net0 could +//! sometimes point to eth0 and sometimes to eth1 which doesn't help with analysing the metrics. +//! So, use Map instead of Vec to help understand which interface the metrics actually belongs to. +//! * We use "net_$iface_id" for the metrics name instead of "net_$tap_name" to be consistent with +//! the net endpoint "/network-interfaces/{iface_id}". //! //! The system implements 1 types of metrics: //! * Shared Incremental Metrics (SharedIncMetrics) - dedicated for the metrics which need a counter @@ -70,80 +77,102 @@ //! We use NET_DEV_METRICS_PVT instead of adding an entry of NetDeviceMetrics //! in Net so that metrics are accessible to be flushed even from signal handlers. -use std::sync::RwLock; +use std::collections::BTreeMap; +use std::sync::{RwLock, RwLockReadGuard}; use serde::ser::SerializeMap; use serde::{Serialize, Serializer}; use crate::logger::{IncMetric, SharedIncMetric}; -#[derive(Copy, Clone, Debug, Default)] -pub struct NetDeviceMetricsIndex(usize); - -impl NetDeviceMetricsIndex { - pub fn get(&self) -> &RwLock { - // SAFETY: This is called by individual Net devices - // on self (i.e. self.metrics.get()) so no race condition - // and it is safe. - unsafe { &NET_DEV_METRICS_PVT.metrics[self.0] } - } +/// returns lock-unwrapped NET_DEV_METRICS_PVT.metrics, +/// we have this function so that we don't have to make +/// NET_DEV_METRICS_PVT public +pub fn get_net_metrics() -> RwLockReadGuard<'static, NetMetricsPerDevice> { + // lock is always initialized so it is safe the unwrap the lock without a check. + NET_DEV_METRICS_PVT.read().unwrap() } -/// provides instance for net metrics +/// map of network interface id and metrics +/// this should be protected by a lock before accessing. #[derive(Debug)] -pub struct NetDeviceMetricsAllocator { +pub struct NetMetricsPerDevice { /// used to access per net device metrics - metrics: Vec>, + pub metrics: BTreeMap, } -impl NetDeviceMetricsAllocator { - /// default construction - pub fn alloc() -> NetDeviceMetricsIndex { - // SAFETY: This is only called during creation of a Net device - // and since each device is created sequentially so there is no - // case of race condition. - unsafe { +impl NetMetricsPerDevice { + /// Allocate `NetDeviceMetrics` for net device having + /// id `iface_id`. Also, allocate only if it doesn't + /// exist to avoid overwriting previously allocated data. + /// lock is always initialized so it is safe the unwrap + /// the lock without a check. + pub fn alloc(iface_id: String) { + if NET_DEV_METRICS_PVT + .read() + .unwrap() + .metrics + .get(&iface_id) + .is_none() + { NET_DEV_METRICS_PVT + .write() + .unwrap() .metrics - .push(RwLock::new(NetDeviceMetrics::new())); - NetDeviceMetricsIndex(NET_DEV_METRICS_PVT.metrics.len() - 1) + .insert(iface_id, NetDeviceMetrics::new()); } } } -/// Contains Network-related metrics per device. -static mut NET_DEV_METRICS_PVT: NetDeviceMetricsAllocator = NetDeviceMetricsAllocator { - metrics: Vec::new(), -}; +#[macro_export] +// Per device metrics are behind a ref lock and in a map which +// makes it difficult to return the metrics pointer via a function +// This macro is an attempt to make the code more readable and to +// centralize the error handling. +// The macro can be called in below ways where iface_id is String and +// activate_fails is a metric from struct `NetDeviceMetrics`: +// NET_METRICS!(&iface_id, activate_fails.inc()); +// NET_METRICS!(&iface_id, activate_fails.add(5)); +// NET_METRICS!(&iface_id, activate_fails.count()); +macro_rules! NET_METRICS { + ($iface_id:expr,$metric:ident.$action:ident($($value:tt)?)) => { + if get_net_metrics().metrics.get($iface_id).is_some() { + get_net_metrics().metrics.get($iface_id).unwrap().$metric.$action($($value)?) + }else{ + NetMetricsPerDevice::alloc($iface_id.to_string()); + get_net_metrics().metrics.get($iface_id).unwrap().$metric.$action($($value)?) + } + } +} -pub fn flush_metrics(serializer: S) -> Result { - // SAFETY: This is called from vmm::logger::metrics::METRICS.write which - // has a lock and takes care of race condition. - unsafe { - // +1 to accomodate aggregate net metrics - let mut seq = serializer.serialize_map(Some(1 + NET_DEV_METRICS_PVT.metrics.len()))?; +/// Pool of Network-related metrics per device behing a lock to +/// keep things thread safe. Since the lock is initialized here +/// it is safe to unwrap it without any check. +static NET_DEV_METRICS_PVT: RwLock = RwLock::new(NetMetricsPerDevice { + metrics: BTreeMap::new(), +}); - let net_aggregated: NetDeviceMetrics = NET_DEV_METRICS_PVT.metrics.iter().fold( - NetDeviceMetrics::default(), - |mut net_agg, net| { - net_agg.aggregate(&net.read().unwrap()); - net_agg - }, - ); +/// This function facilitates aggregation and serialization of +/// per net device metrics. +pub fn flush_metrics(serializer: S) -> Result { + let metrics_len = NET_DEV_METRICS_PVT.read().unwrap().metrics.len(); + // +1 to accomodate aggregate net metrics + let mut seq = serializer.serialize_map(Some(1 + metrics_len))?; - seq.serialize_entry("net", &net_aggregated)?; + let mut net_aggregated: NetDeviceMetrics = NetDeviceMetrics::new(); - for (i, metrics) in NET_DEV_METRICS_PVT.metrics.iter().enumerate() { - let devn = format!("net{}", i); - seq.serialize_entry(&devn, &*metrics.read().unwrap())?; - } - seq.end() + for metrics in NET_DEV_METRICS_PVT.read().unwrap().metrics.iter() { + let devn = format!("net_{}", metrics.0); + // serialization will flush the metrics so aggregate before it. + net_aggregated.aggregate(metrics.1); + seq.serialize_entry(&devn, &metrics.1)?; } + seq.serialize_entry("net", &net_aggregated)?; + seq.end() } /// Network-related metrics. -// #[derive(Debug, Default)] -#[derive(Debug, Default, Serialize)] +#[derive(Debug, Serialize)] pub struct NetDeviceMetrics { /// Number of times when activate failed on a network device. pub activate_fails: SharedIncMetric, @@ -290,22 +319,61 @@ impl NetDeviceMetrics { #[cfg(test)] pub mod tests { use super::*; + #[test] - fn test_net_dev_metrics() { - // we can have max 19 net devices + fn test_max_net_dev_metrics() { + // Note: this test has nothing to do with + // Net structure or IRQs, this is just to allocate + // metrics for max number of devices that system can have. + // we have 5-23 irq for net devices so max 19 net devices. const MAX_NET_DEVICES: usize = 19; - let mut net_dev_metrics: [NetDeviceMetricsIndex; MAX_NET_DEVICES] = - [NetDeviceMetricsIndex::default(); MAX_NET_DEVICES]; - for metric in net_dev_metrics.iter_mut().take(MAX_NET_DEVICES) { - *metric = NetDeviceMetricsAllocator::alloc(); - metric.get().write().unwrap().activate_fails.inc(); - } - // SAFETY: something - unsafe { - assert!(NET_DEV_METRICS_PVT.metrics.len() >= MAX_NET_DEVICES); + + assert!(NET_DEV_METRICS_PVT.read().is_ok()); + assert!(NET_DEV_METRICS_PVT.write().is_ok()); + + for i in 0..MAX_NET_DEVICES { + let devn: String = format!("eth{}", i); + NetMetricsPerDevice::alloc(devn.clone()); + NET_METRICS!(&devn, activate_fails.inc()); + NET_METRICS!(&devn, rx_bytes_count.add(10)); + NET_METRICS!(&devn, tx_bytes_count.add(5)); } - for metric in net_dev_metrics.iter_mut().take(MAX_NET_DEVICES) { - assert_eq!(metric.get().write().unwrap().activate_fails.count(), 1); + for i in 0..MAX_NET_DEVICES { + let devn: String = format!("eth{}", i); + assert!(NET_METRICS!(&devn, activate_fails.count()) >= 1); + assert!(NET_METRICS!(&devn, rx_bytes_count.count()) >= 10); + assert_eq!(NET_METRICS!(&devn, tx_bytes_count.count()), 5); } } + #[test] + fn test_signle_net_dev_metrics() { + // Use eth0 so that we can check thread safety with the + // `test_net_dev_metrics` which also uses the same name. + let devn = "eth0"; + + assert!(NET_DEV_METRICS_PVT.read().is_ok()); + assert!(NET_DEV_METRICS_PVT.write().is_ok()); + + NetMetricsPerDevice::alloc(String::from(devn)); + assert!(NET_DEV_METRICS_PVT.read().is_ok()); + assert!(get_net_metrics().metrics.get(devn).is_some()); + + NET_METRICS!(devn, activate_fails.inc()); + assert!( + NET_METRICS!(devn, activate_fails.count()) > 0, + "{}", + NET_METRICS!(devn, activate_fails.count()) + ); + // we expect only 2 tests (this and test_max_net_dev_metrics) + // to update activate_fails count for eth0. + assert!( + NET_METRICS!(devn, activate_fails.count()) <= 2, + "{}", + NET_METRICS!(devn, activate_fails.count()) + ); + + NET_METRICS!(&String::from(devn), activate_fails.inc()); + NET_METRICS!(&String::from(devn), rx_bytes_count.add(5)); + assert!(NET_METRICS!(&String::from(devn), rx_bytes_count.count()) >= 5); + } } diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs index 04fa887f2f8..c4bc8dd1d6a 100644 --- a/src/vmm/src/devices/virtio/net/test_utils.rs +++ b/src/vmm/src/devices/virtio/net/test_utils.rs @@ -352,9 +352,11 @@ pub mod test { use event_manager::{EventManager, SubscriberId, SubscriberOps}; - use crate::check_metric_after_block; use crate::devices::virtio::net::device::vnet_hdr_len; use crate::devices::virtio::net::gen::ETH_HLEN; + // get_net_metrics and NetMetricsPerDevice are the functions called by the + // macro NET_METRICS + use crate::devices::virtio::net::metrics::{get_net_metrics, NetMetricsPerDevice}; use crate::devices::virtio::net::test_utils::{ assign_queues, default_net, inject_tap_tx_frame, NetEvent, NetQueue, ReadTapMock, }; @@ -367,6 +369,7 @@ pub mod test { use crate::vstate::memory::{ Address, Bytes, GuestAddress, GuestMemoryExtension, GuestMemoryMmap, }; + use crate::{check_net_metric_after_block, NET_METRICS}; pub struct TestHelper<'a> { pub event_manager: EventManager>>, @@ -496,8 +499,8 @@ pub mod test { // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&self.net(), frame_len); - check_metric_after_block!( - self.net().metrics.get().write().unwrap().rx_packets_count, + check_net_metric_after_block!( + NET_METRICS!(&self.net().id, rx_packets_count.count()), 0, self.event_manager.run_with_timeout(100).unwrap() ); @@ -524,8 +527,8 @@ pub mod test { VIRTQ_DESC_F_WRITE, )], ); - check_metric_after_block!( - self.net().metrics.get().write().unwrap().rx_packets_count, + check_net_metric_after_block!( + NET_METRICS!(&self.net().id, rx_packets_count.count()), 1, self.event_manager.run_with_timeout(100).unwrap() ); diff --git a/src/vmm/src/devices/virtio/test_utils.rs b/src/vmm/src/devices/virtio/test_utils.rs index 564b59f7779..a5199f5683a 100644 --- a/src/vmm/src/devices/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/test_utils.rs @@ -22,6 +22,20 @@ macro_rules! check_metric_after_block { }}; } +#[macro_export] +// macro created from check_metric_after_block because `metric` will be +// emitted by NET_METRICS macro which does not fit with the $metric.count() +// that check_metric_after_block had. +// TODO: After all devices have per device metrics we won't need the +// check_metric_after_block macro. +macro_rules! check_net_metric_after_block { + ($metric:expr, $delta:expr, $block:expr) => {{ + let before = $metric; + let _ = $block; + assert_eq!($metric, before + $delta, "unexpected metric value"); + }}; +} + /// Creates a [`GuestMemoryMmap`] with a single region of the given size starting at guest physical /// address 0 pub fn single_region_mem(region_size: usize) -> GuestMemoryMmap { From 6a0fb2208e8091ab5a3005b379ec1590e9181149 Mon Sep 17 00:00:00 2001 From: Sudan Landge Date: Mon, 16 Oct 2023 12:02:55 +0000 Subject: [PATCH 07/12] chore(metrics)(net): better CHANGELOG & lowercase for net_metrics macro Use proper convention of lower case for macros. Make CHANHELOG less verbose. Note: this is part 7 of 12 commits to add per net device metrics. Signed-off-by: Sudan Landge --- CHANGELOG.md | 11 +- src/vmm/src/devices/mod.rs | 7 +- src/vmm/src/devices/virtio/net/device.rs | 154 +++++++++--------- .../src/devices/virtio/net/event_handler.rs | 7 +- src/vmm/src/devices/virtio/net/metrics.rs | 50 +++--- src/vmm/src/devices/virtio/net/test_utils.rs | 7 +- src/vmm/src/logger/metrics.rs | 2 +- 7 files changed, 112 insertions(+), 126 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b38f945779..8acff946895 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,13 +5,10 @@ ### Added - [#4145](https://github.com/firecracker-microvm/firecracker/pull/4145): - Added support for per net device metrics. Now, along with aggregated - network device metrics, metrics per network device with also be emitted. - Aggregate metrics will continue to be emitted with label "net" in the - metrics json object while each net device metrics will be emitted with - label "net_{iface_id}". E.g. the associated metrics for the endpoint - "/network-interfaces/eth0" will be available under "net_eth0:" in each - metrics json object. + Added support for per net device metrics. In addition to aggregate metrics `net`, + each individual net device will emit metrics under the label `"net_{iface_id}"`. + E.g. the associated metrics for the endpoint `"/network-interfaces/eth0"` will + be available under `"net_eth0"` in the metrics json object. ### Changed diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs index 60e0ced8726..7a865ce6ad1 100644 --- a/src/vmm/src/devices/mod.rs +++ b/src/vmm/src/devices/mod.rs @@ -17,12 +17,9 @@ pub mod virtio; pub use bus::{Bus, BusDevice, BusError}; use log::error; -// get_net_metrics and NetMetricsPerDevice are the functions called by the -// macro NET_METRICS -use crate::devices::virtio::metrics::{get_net_metrics, NetMetricsPerDevice}; use crate::devices::virtio::{QueueError, VsockError}; use crate::logger::{IncMetric, METRICS}; -use crate::NET_METRICS; +use crate::net_metrics; // Function used for reporting error in terms of logging // but also in terms of metrics of net event fails. @@ -30,7 +27,7 @@ use crate::NET_METRICS; // to identify which device the metrics and `err` should be reported for. pub(crate) fn report_net_event_fail(net_iface_id: &String, err: DeviceError) { error!("{:?}:{:?}", net_iface_id, err); - NET_METRICS!(net_iface_id, event_fails.inc()); + net_metrics!(net_iface_id, event_fails.inc()); } pub(crate) fn report_balloon_event_fail(err: virtio::balloon::BalloonError) { diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 75c5e57782a..1fb41655c74 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -37,9 +37,7 @@ use crate::vstate::memory::{ByteValued, Bytes, GuestMemoryMmap}; const FRAME_HEADER_MAX_LEN: usize = PAYLOAD_OFFSET + ETH_IPV4_FRAME_LEN; use crate::devices::virtio::iovec::IoVecBuffer; -// get_net_metrics and NetMetricsPerDevice are the functions called by the -// macro NET_METRICS -use crate::devices::virtio::metrics::{get_net_metrics, NetMetricsPerDevice}; +use crate::devices::virtio::metrics::NetMetricsPerDevice; use crate::devices::virtio::net::tap::Tap; use crate::devices::virtio::net::{ gen, NetError, NetQueue, MAX_BUFFER_SIZE, NET_QUEUE_SIZES, RX_INDEX, TX_INDEX, @@ -48,7 +46,7 @@ use crate::devices::virtio::{ ActivateError, DescriptorChain, DeviceState, IrqTrigger, IrqType, Queue, VirtioDevice, TYPE_NET, }; use crate::devices::{report_net_event_fail, DeviceError}; -use crate::NET_METRICS; +use crate::net_metrics; #[derive(Debug)] enum FrontendError { @@ -177,7 +175,7 @@ impl Net { // allocate NetDeviceMetrics to log metrics for this device. // we don't need a pointer to NetDeviceMetrics since we use - // NET_METRICS! macro with device's id to access the metrics + // net_metrics! macro with device's id to access the metrics // and so alloc doesn't need to return anything. NetMetricsPerDevice::alloc(id.clone()); Ok(Net { @@ -281,7 +279,7 @@ impl Net { self.irq_trigger .trigger_irq(IrqType::Vring) .map_err(|err| { - NET_METRICS!(&self.id, event_fails.inc()); + net_metrics!(&self.id, event_fails.inc()); DeviceError::FailedSignalingIrq(err) })?; } @@ -314,7 +312,7 @@ impl Net { // Returns true on successful frame delivery. fn rate_limited_rx_single_frame(&mut self) -> bool { if !Self::rate_limiter_consume_op(&mut self.rx_rate_limiter, self.rx_bytes_read as u64) { - NET_METRICS!(&self.id, rx_rate_limiter_throttled.inc()); + net_metrics!(&self.id, rx_rate_limiter_throttled.inc()); return false; } @@ -353,13 +351,13 @@ impl Net { let len = std::cmp::min(chunk.len(), descriptor.len as usize); match mem.write_slice(&chunk[..len], descriptor.addr) { Ok(()) => { - NET_METRICS!(net_iface_id, rx_count.inc()); + net_metrics!(net_iface_id, rx_count.inc()); chunk = &chunk[len..]; } Err(err) => { error!("Failed to write slice: {:?}", err); if let GuestMemoryError::PartialBuffer { .. } = err { - NET_METRICS!(net_iface_id, rx_partial_writes.inc()); + net_metrics!(net_iface_id, rx_partial_writes.inc()); } return Err(FrontendError::GuestMemory(err)); } @@ -368,8 +366,8 @@ impl Net { // If chunk is empty we are done here. if chunk.is_empty() { let len = data.len() as u64; - NET_METRICS!(net_iface_id, rx_bytes_count.add(len)); - NET_METRICS!(net_iface_id, rx_packets_count.inc()); + net_metrics!(net_iface_id, rx_bytes_count.add(len)); + net_metrics!(net_iface_id, rx_packets_count.inc()); return Ok(()); } @@ -387,7 +385,7 @@ impl Net { let queue = &mut self.queues[RX_INDEX]; let head_descriptor = queue.pop_or_enable_notification(mem).ok_or_else(|| { - NET_METRICS!(&self.id, no_rx_avail_buffer.inc()); + net_metrics!(&self.id, no_rx_avail_buffer.inc()); FrontendError::EmptyQueue })?; let head_index = head_descriptor.index; @@ -400,7 +398,7 @@ impl Net { ); // Mark the descriptor chain as used. If an error occurred, skip the descriptor chain. let used_len = if result.is_err() { - NET_METRICS!(&self.id, rx_fails.inc()); + net_metrics!(&self.id, rx_fails.inc()); 0 } else { // Safe to unwrap because a frame must be smaller than 2^16 bytes. @@ -450,13 +448,13 @@ impl Net { // if the frame_iovec is empty. let header_len = frame_iovec.read_at(headers, 0).ok_or_else(|| { error!("Received empty TX buffer"); - NET_METRICS!(net_iface_id, tx_malformed_frames.inc()); + net_metrics!(net_iface_id, tx_malformed_frames.inc()); NetError::VnetHeaderMissing })?; let headers = frame_bytes_from_buf(&headers[..header_len]).map_err(|e| { error!("VNET headers missing in TX frame"); - NET_METRICS!(net_iface_id, tx_malformed_frames.inc()); + net_metrics!(net_iface_id, tx_malformed_frames.inc()); e })?; @@ -483,7 +481,7 @@ impl Net { if let Some(guest_mac) = guest_mac { let _ = EthernetFrame::from_bytes(headers).map(|eth_frame| { if guest_mac != eth_frame.src_mac() { - NET_METRICS!(net_iface_id, tx_spoofed_mac_count.inc()); + net_metrics!(net_iface_id, tx_spoofed_mac_count.inc()); } }); } @@ -491,13 +489,13 @@ impl Net { match Self::write_tap(tap, frame_iovec) { Ok(_) => { let len = frame_iovec.len() as u64; - NET_METRICS!(net_iface_id, tx_bytes_count.add(len)); - NET_METRICS!(net_iface_id, tx_packets_count.inc()); - NET_METRICS!(net_iface_id, tx_count.inc()); + net_metrics!(net_iface_id, tx_bytes_count.add(len)); + net_metrics!(net_iface_id, tx_packets_count.inc()); + net_metrics!(net_iface_id, tx_count.inc()); } Err(err) => { error!("Failed to write to tap: {:?}", err); - NET_METRICS!(net_iface_id, tap_write_fails.inc()); + net_metrics!(net_iface_id, tap_write_fails.inc()); } }; Ok(false) @@ -526,7 +524,7 @@ impl Net { match self.read_from_mmds_or_tap() { Ok(count) => { self.rx_bytes_read = count; - NET_METRICS!(&self.id, rx_count.inc()); + net_metrics!(&self.id, rx_count.inc()); if !self.rate_limited_rx_single_frame() { self.rx_deferred_frame = true; break; @@ -539,7 +537,7 @@ impl Net { Some(err) if err == EAGAIN => (), _ => { error!("Failed to read tap: {:?}", err); - NET_METRICS!(&self.id, tap_read_fails.inc()); + net_metrics!(&self.id, tap_read_fails.inc()); return Err(DeviceError::FailedReadTap); } }; @@ -594,7 +592,7 @@ impl Net { let buffer = match IoVecBuffer::from_descriptor_chain(mem, head) { Ok(buffer) => buffer, Err(_) => { - NET_METRICS!(&self.id, tx_fails.inc()); + net_metrics!(&self.id, tx_fails.inc()); tx_queue .add_used(mem, head_index, 0) .map_err(DeviceError::QueueError)?; @@ -603,7 +601,7 @@ impl Net { }; if !Self::rate_limiter_consume_op(&mut self.tx_rate_limiter, buffer.len() as u64) { tx_queue.undo_pop(); - NET_METRICS!(&self.id, tx_rate_limiter_throttled.inc()); + net_metrics!(&self.id, tx_rate_limiter_throttled.inc()); break; } @@ -629,7 +627,7 @@ impl Net { } if !used_any { - NET_METRICS!(&self.id, no_tx_avail_buffer.inc()); + net_metrics!(&self.id, no_tx_avail_buffer.inc()); } self.signal_used_queue(NetQueue::Tx)?; @@ -669,14 +667,14 @@ impl Net { /// This is called by the event manager responding to the guest adding a new /// buffer in the RX queue. pub fn process_rx_queue_event(&mut self) { - NET_METRICS!(&self.id, rx_queue_event_count.inc()); + net_metrics!(&self.id, rx_queue_event_count.inc()); if let Err(err) = self.queue_evts[RX_INDEX].read() { // rate limiters present but with _very high_ allowed rate error!("Failed to get rx queue event: {:?}", err); - NET_METRICS!(&self.id, event_fails.inc()); + net_metrics!(&self.id, event_fails.inc()); } else if self.rx_rate_limiter.is_blocked() { - NET_METRICS!(&self.id, rx_rate_limiter_throttled.inc()); + net_metrics!(&self.id, rx_rate_limiter_throttled.inc()); } else { // If the limiter is not blocked, resume the receiving of bytes. self.resume_rx() @@ -687,20 +685,20 @@ impl Net { pub fn process_tap_rx_event(&mut self) { // This is safe since we checked in the event handler that the device is activated. let mem = self.device_state.mem().unwrap(); - NET_METRICS!(&self.id, rx_tap_event_count.inc()); + net_metrics!(&self.id, rx_tap_event_count.inc()); // While there are no available RX queue buffers and there's a deferred_frame // don't process any more incoming. Otherwise start processing a frame. In the // process the deferred_frame flag will be set in order to avoid freezing the // RX queue. if self.queues[RX_INDEX].is_empty(mem) && self.rx_deferred_frame { - NET_METRICS!(&self.id, no_rx_avail_buffer.inc()); + net_metrics!(&self.id, no_rx_avail_buffer.inc()); return; } // While limiter is blocked, don't process any more incoming. if self.rx_rate_limiter.is_blocked() { - NET_METRICS!(&self.id, rx_rate_limiter_throttled.inc()); + net_metrics!(&self.id, rx_rate_limiter_throttled.inc()); return; } @@ -721,22 +719,22 @@ impl Net { /// This is called by the event manager responding to the guest adding a new /// buffer in the TX queue. pub fn process_tx_queue_event(&mut self) { - NET_METRICS!(&self.id, tx_queue_event_count.inc()); + net_metrics!(&self.id, tx_queue_event_count.inc()); if let Err(err) = self.queue_evts[TX_INDEX].read() { error!("Failed to get tx queue event: {:?}", err); - NET_METRICS!(&self.id, event_fails.inc()); + net_metrics!(&self.id, event_fails.inc()); } else if !self.tx_rate_limiter.is_blocked() // If the limiter is not blocked, continue transmitting bytes. { self.process_tx() .unwrap_or_else(|err| report_net_event_fail(&self.id, err)); } else { - NET_METRICS!(&self.id, tx_rate_limiter_throttled.inc()); + net_metrics!(&self.id, tx_rate_limiter_throttled.inc()); } } pub fn process_rx_rate_limiter_event(&mut self) { - NET_METRICS!(&self.id, rx_event_rate_limiter_count.inc()); + net_metrics!(&self.id, rx_event_rate_limiter_count.inc()); // Upon rate limiter event, call the rate limiter handler // and restart processing the queue. @@ -748,13 +746,13 @@ impl Net { } Err(err) => { error!("Failed to get rx rate-limiter event: {:?}", err); - NET_METRICS!(&self.id, event_fails.inc()); + net_metrics!(&self.id, event_fails.inc()); } } } pub fn process_tx_rate_limiter_event(&mut self) { - NET_METRICS!(&self.id, tx_rate_limiter_event_count.inc()); + net_metrics!(&self.id, tx_rate_limiter_event_count.inc()); // Upon rate limiter event, call the rate limiter handler // and restart processing the queue. match self.tx_rate_limiter.event_handler() { @@ -765,7 +763,7 @@ impl Net { } Err(err) => { error!("Failed to get tx rate-limiter event: {:?}", err); - NET_METRICS!(&self.id, event_fails.inc()); + net_metrics!(&self.id, event_fails.inc()); } } } @@ -819,7 +817,7 @@ impl VirtioDevice for Net { let config_len = config_space_bytes.len() as u64; if offset >= config_len { error!("Failed to read config space"); - NET_METRICS!(&self.id, cfg_fails.inc()); + net_metrics!(&self.id, cfg_fails.inc()); return; } if let Some(end) = offset.checked_add(data.len() as u64) { @@ -840,13 +838,13 @@ impl VirtioDevice for Net { .and_then(|(start, end)| config_space_bytes.get_mut(start..end)) else { error!("Failed to write config space"); - NET_METRICS!(&self.id, cfg_fails.inc()); + net_metrics!(&self.id, cfg_fails.inc()); return; }; dst.copy_from_slice(data); self.guest_mac = Some(self.config_space.guest_mac); - NET_METRICS!(&self.id, mac_address_updates.inc()); + net_metrics!(&self.id, mac_address_updates.inc()); } fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { @@ -1028,7 +1026,7 @@ pub mod tests { // Check that the guest MAC was updated. let expected_guest_mac = MacAddr::from_bytes_unchecked(&new_config); assert_eq!(expected_guest_mac, net.guest_mac.unwrap()); - assert_eq!(NET_METRICS!(&net.id, mac_address_updates.count()), 1); + assert_eq!(net_metrics!(&net.id, mac_address_updates.count()), 1); // Partial write (this is how the kernel sets a new mac address) - byte by byte. let new_config = [0x11, 0x22, 0x33, 0x44, 0x55, 0x66]; @@ -1061,7 +1059,7 @@ pub mod tests { th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); th.net().queue_evts[RX_INDEX].read().unwrap(); check_net_metric_after_block!( - NET_METRICS!(&th.net().id, event_fails.count()), + net_metrics!(&th.net().id, event_fails.count()), 1, th.simulate_event(NetEvent::RxQueue) ); @@ -1156,7 +1154,7 @@ pub mod tests { // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&th.net(), 1000); check_net_metric_after_block!( - NET_METRICS!(&th.net().id, rx_packets_count.count()), + net_metrics!(&th.net().id, rx_packets_count.count()), 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1197,7 +1195,7 @@ pub mod tests { // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&th.net(), 1000); check_net_metric_after_block!( - NET_METRICS!(&th.net().id, rx_packets_count.count()), + net_metrics!(&th.net().id, rx_packets_count.count()), 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1237,7 +1235,7 @@ pub mod tests { let frame_1 = inject_tap_tx_frame(&th.net(), 200); let frame_2 = inject_tap_tx_frame(&th.net(), 300); check_net_metric_after_block!( - NET_METRICS!(&th.net().id, rx_packets_count.count()), + net_metrics!(&th.net().id, rx_packets_count.count()), 2, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1268,7 +1266,7 @@ pub mod tests { th.add_desc_chain(NetQueue::Tx, 0, &[(0, 4096, 0)]); th.net().queue_evts[TX_INDEX].read().unwrap(); check_net_metric_after_block!( - NET_METRICS!(&th.net().id, event_fails.count()), + net_metrics!(&th.net().id, event_fails.count()), 1, th.simulate_event(NetEvent::TxQueue) ); @@ -1307,7 +1305,7 @@ pub mod tests { // Send an invalid frame (too small, VNET header missing). th.add_desc_chain(NetQueue::Tx, 0, &[(0, 1, 0)]); check_net_metric_after_block!( - NET_METRICS!(&th.net().id, tx_malformed_frames.count()), + net_metrics!(&th.net().id, tx_malformed_frames.count()), 1, th.event_manager.run_with_timeout(100) ); @@ -1329,7 +1327,7 @@ pub mod tests { // Send an invalid frame (too small, VNET header missing). th.add_desc_chain(NetQueue::Tx, 0, &[(0, 0, 0)]); check_net_metric_after_block!( - NET_METRICS!(&th.net().id, tx_malformed_frames.count()), + net_metrics!(&th.net().id, tx_malformed_frames.count()), 1, th.event_manager.run_with_timeout(100) ); @@ -1367,7 +1365,7 @@ pub mod tests { // One frame is valid, one will not be handled because it includes write-only memory // so that leaves us with 2 malformed (no vnet header) frames. check_net_metric_after_block!( - NET_METRICS!(&th.net().id, tx_malformed_frames.count()), + net_metrics!(&th.net().id, tx_malformed_frames.count()), 2, th.event_manager.run_with_timeout(100) ); @@ -1397,7 +1395,7 @@ pub mod tests { let frame = th.write_tx_frame(&desc_list, 1000); check_net_metric_after_block!( - NET_METRICS!(&th.net().id, tx_packets_count.count()), + net_metrics!(&th.net().id, tx_packets_count.count()), 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1423,7 +1421,7 @@ pub mod tests { let _ = th.write_tx_frame(&desc_list, 1000); check_net_metric_after_block!( - NET_METRICS!(&th.net().id, tap_write_fails.count()), + net_metrics!(&th.net().id, tap_write_fails.count()), 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1450,7 +1448,7 @@ pub mod tests { let frame_2 = th.write_tx_frame(&desc_list, 600); check_net_metric_after_block!( - NET_METRICS!(&th.net().id, tx_packets_count.count()), + net_metrics!(&th.net().id, tx_packets_count.count()), 2, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1558,7 +1556,7 @@ pub mod tests { // Check that a legit MAC doesn't affect the spoofed MAC metric. check_net_metric_after_block!( - NET_METRICS!(&net.id, tx_spoofed_mac_count.count()), + net_metrics!(&net.id, tx_spoofed_mac_count.count()), 0, Net::write_to_mmds_or_tap( net.mmds_ns.as_mut(), @@ -1573,7 +1571,7 @@ pub mod tests { // Check that a spoofed MAC increases our spoofed MAC metric. check_net_metric_after_block!( - NET_METRICS!(&net.id, tx_spoofed_mac_count.count()), + net_metrics!(&net.id, tx_spoofed_mac_count.count()), 1, Net::write_to_mmds_or_tap( net.mmds_ns.as_mut(), @@ -1595,7 +1593,7 @@ pub mod tests { // RX rate limiter events should error since the limiter is not blocked. // Validate that the event failed and failure was properly accounted for. check_net_metric_after_block!( - NET_METRICS!(&th.net().id, event_fails.count()), + net_metrics!(&th.net().id, event_fails.count()), 1, th.simulate_event(NetEvent::RxRateLimiter) ); @@ -1603,7 +1601,7 @@ pub mod tests { // TX rate limiter events should error since the limiter is not blocked. // Validate that the event failed and failure was properly accounted for. check_net_metric_after_block!( - NET_METRICS!(&th.net().id, event_fails.count()), + net_metrics!(&th.net().id, event_fails.count()), 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1621,7 +1619,7 @@ pub mod tests { // The RX queue is empty and rx_deffered_frame is set. th.net().rx_deferred_frame = true; check_net_metric_after_block!( - NET_METRICS!(&th.net().id, no_rx_avail_buffer.count()), + net_metrics!(&th.net().id, no_rx_avail_buffer.count()), 1, th.simulate_event(NetEvent::Tap) ); @@ -1634,7 +1632,7 @@ pub mod tests { // Fake an avail buffer; this time, tap reading should error out. th.rxq.avail.idx.set(1); check_net_metric_after_block!( - NET_METRICS!(&th.net().id, tap_read_fails.count()), + net_metrics!(&th.net().id, tap_read_fails.count()), 1, th.simulate_event(NetEvent::Tap) ); @@ -1646,12 +1644,12 @@ pub mod tests { th.activate_net(); th.net().tap.mocks.set_read_tap(ReadTapMock::TapFrame); - let rx_packets_count = NET_METRICS!(&th.net().id, rx_packets_count.count()); + let rx_packets_count = net_metrics!(&th.net().id, rx_packets_count.count()); let _ = inject_tap_tx_frame(&th.net(), 1000); // Trigger a Tap event that. This should fail since there // are not any available descriptors in the queue check_net_metric_after_block!( - NET_METRICS!(&th.net().id, no_rx_avail_buffer.count()), + net_metrics!(&th.net().id, no_rx_avail_buffer.count()), 1, th.simulate_event(NetEvent::Tap) ); @@ -1659,7 +1657,7 @@ pub mod tests { // no frames should have been transmitted assert!(th.net().rx_deferred_frame); assert_eq!( - NET_METRICS!(&th.net().id, rx_packets_count.count()), + net_metrics!(&th.net().id, rx_packets_count.count()), rx_packets_count ); @@ -1672,7 +1670,7 @@ pub mod tests { // since there's only one Descriptor Chain in the queue. th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); check_net_metric_after_block!( - NET_METRICS!(&th.net().id, no_rx_avail_buffer.count()), + net_metrics!(&th.net().id, no_rx_avail_buffer.count()), 1, th.simulate_event(NetEvent::Tap) ); @@ -1680,14 +1678,14 @@ pub mod tests { assert!(th.net().rx_deferred_frame); // However, we should have delivered the first frame assert_eq!( - NET_METRICS!(&th.net().id, rx_packets_count.count()), + net_metrics!(&th.net().id, rx_packets_count.count()), rx_packets_count + 1 ); // Let's add one more descriptor and try to handle the last frame as well. th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); check_net_metric_after_block!( - NET_METRICS!(&th.net().id, rx_packets_count.count()), + net_metrics!(&th.net().id, rx_packets_count.count()), 1, th.simulate_event(NetEvent::RxQueue) ); @@ -1704,7 +1702,7 @@ pub mod tests { th.net().rx_rate_limiter = RateLimiter::new(0, 0, 0, 0, 0, 0).unwrap(); // There is no actual event on the rate limiter's timerfd. check_net_metric_after_block!( - NET_METRICS!(&th.net().id, event_fails.count()), + net_metrics!(&th.net().id, event_fails.count()), 1, th.simulate_event(NetEvent::RxRateLimiter) ); @@ -1719,7 +1717,7 @@ pub mod tests { th.simulate_event(NetEvent::TxRateLimiter); // There is no actual event on the rate limiter's timerfd. check_net_metric_after_block!( - NET_METRICS!(&th.net().id, event_fails.count()), + net_metrics!(&th.net().id, event_fails.count()), 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1750,7 +1748,7 @@ pub mod tests { // assert that limiter is blocked assert!(th.net().tx_rate_limiter.is_blocked()); assert_eq!( - NET_METRICS!(&th.net().id, tx_rate_limiter_throttled.count()), + net_metrics!(&th.net().id, tx_rate_limiter_throttled.count()), 1 ); // make sure the data is still queued for processing @@ -1764,7 +1762,7 @@ pub mod tests { th.simulate_event(NetEvent::TxQueue); assert_eq!( - NET_METRICS!(&th.net().id, tx_rate_limiter_throttled.count()), + net_metrics!(&th.net().id, tx_rate_limiter_throttled.count()), 2 ); } @@ -1777,7 +1775,7 @@ pub mod tests { { // tx_count increments 1 from write_to_mmds_or_tap() check_net_metric_after_block!( - NET_METRICS!(&th.net().id, tx_count.count()), + net_metrics!(&th.net().id, tx_count.count()), 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1794,7 +1792,7 @@ pub mod tests { { // tx_count increments 1 from write_to_mmds_or_tap() check_net_metric_after_block!( - NET_METRICS!(&th.net().id, tx_count.count()), + net_metrics!(&th.net().id, tx_count.count()), 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1827,7 +1825,7 @@ pub mod tests { // assert that limiter is blocked assert!(th.net().rx_rate_limiter.is_blocked()); assert_eq!( - NET_METRICS!(&th.net().id, rx_rate_limiter_throttled.count()), + net_metrics!(&th.net().id, rx_rate_limiter_throttled.count()), 1 ); assert!(th.net().rx_deferred_frame); @@ -1843,7 +1841,7 @@ pub mod tests { th.simulate_event(NetEvent::RxQueue); assert_eq!( - NET_METRICS!(&th.net().id, rx_rate_limiter_throttled.count()), + net_metrics!(&th.net().id, rx_rate_limiter_throttled.count()), 2 ); } @@ -1857,7 +1855,7 @@ pub mod tests { let frame = &th.net().tap.mocks.read_tap.mock_frame(); // no longer throttled check_net_metric_after_block!( - NET_METRICS!(&th.net().id, rx_rate_limiter_throttled.count()), + net_metrics!(&th.net().id, rx_rate_limiter_throttled.count()), 0, th.simulate_event(NetEvent::RxRateLimiter) ); @@ -1895,7 +1893,7 @@ pub mod tests { // trigger the TX handler th.add_desc_chain(NetQueue::Tx, 0, &[(0, 4096, 0)]); check_net_metric_after_block!( - NET_METRICS!(&th.net().id, tx_rate_limiter_throttled.count()), + net_metrics!(&th.net().id, tx_rate_limiter_throttled.count()), 1, th.simulate_event(NetEvent::TxQueue) ); @@ -1914,7 +1912,7 @@ pub mod tests { { // no longer throttled check_net_metric_after_block!( - NET_METRICS!(&th.net().id, tx_rate_limiter_throttled.count()), + net_metrics!(&th.net().id, tx_rate_limiter_throttled.count()), 0, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1943,14 +1941,14 @@ pub mod tests { { // trigger the RX handler check_net_metric_after_block!( - NET_METRICS!(&th.net().id, rx_rate_limiter_throttled.count()), + net_metrics!(&th.net().id, rx_rate_limiter_throttled.count()), 1, th.simulate_event(NetEvent::Tap) ); // assert that limiter is blocked assert!(th.net().rx_rate_limiter.is_blocked()); - assert!(NET_METRICS!(&th.net().id, rx_rate_limiter_throttled.count()) >= 1); + assert!(net_metrics!(&th.net().id, rx_rate_limiter_throttled.count()) >= 1); assert!(th.net().rx_deferred_frame); // assert that no operation actually completed (limiter blocked it) assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); diff --git a/src/vmm/src/devices/virtio/net/event_handler.rs b/src/vmm/src/devices/virtio/net/event_handler.rs index 5863a9ad0fe..b58e6a8b682 100644 --- a/src/vmm/src/devices/virtio/net/event_handler.rs +++ b/src/vmm/src/devices/virtio/net/event_handler.rs @@ -7,12 +7,9 @@ use event_manager::{EventOps, Events, MutEventSubscriber}; use utils::epoll::EventSet; use crate::devices::virtio::net::device::Net; -// get_net_metrics and NetMetricsPerDevice are the functions called by the -// macro NET_METRICS -use crate::devices::virtio::net::metrics::{get_net_metrics, NetMetricsPerDevice}; use crate::devices::virtio::{VirtioDevice, RX_INDEX, TX_INDEX}; use crate::logger::{debug, error, warn, IncMetric}; -use crate::NET_METRICS; +use crate::net_metrics; impl Net { fn register_runtime_events(&self, ops: &mut EventOps) { @@ -88,7 +85,7 @@ impl MutEventSubscriber for Net { _ if activate_fd == source => self.process_activate_event(ops), _ => { warn!("Net: Spurious event received: {:?}", source); - NET_METRICS!(&self.id, event_fails.inc()); + net_metrics!(&self.id, event_fails.inc()); } } } else { diff --git a/src/vmm/src/devices/virtio/net/metrics.rs b/src/vmm/src/devices/virtio/net/metrics.rs index d2fa6cba2f4..c658c826d7a 100644 --- a/src/vmm/src/devices/virtio/net/metrics.rs +++ b/src/vmm/src/devices/virtio/net/metrics.rs @@ -131,21 +131,21 @@ impl NetMetricsPerDevice { // centralize the error handling. // The macro can be called in below ways where iface_id is String and // activate_fails is a metric from struct `NetDeviceMetrics`: -// NET_METRICS!(&iface_id, activate_fails.inc()); -// NET_METRICS!(&iface_id, activate_fails.add(5)); -// NET_METRICS!(&iface_id, activate_fails.count()); -macro_rules! NET_METRICS { - ($iface_id:expr,$metric:ident.$action:ident($($value:tt)?)) => { - if get_net_metrics().metrics.get($iface_id).is_some() { - get_net_metrics().metrics.get($iface_id).unwrap().$metric.$action($($value)?) +// net_metrics!(&iface_id, activate_fails.inc()); +// net_metrics!(&iface_id, activate_fails.add(5)); +// net_metrics!(&iface_id, activate_fails.count()); +macro_rules! net_metrics { + ($iface_id:expr,$metric:ident.$action:ident($($value:tt)?)) => {{ + if $crate::devices::virtio::net::metrics::get_net_metrics().metrics.get($iface_id).is_some() { + $crate::devices::virtio::net::metrics::get_net_metrics().metrics.get($iface_id).unwrap().$metric.$action($($value)?) }else{ - NetMetricsPerDevice::alloc($iface_id.to_string()); - get_net_metrics().metrics.get($iface_id).unwrap().$metric.$action($($value)?) + $crate::devices::virtio::net::metrics::NetMetricsPerDevice::alloc($iface_id.to_string()); + $crate::devices::virtio::net::metrics::get_net_metrics().metrics.get($iface_id).unwrap().$metric.$action($($value)?) } - } + }} } -/// Pool of Network-related metrics per device behing a lock to +/// Pool of Network-related metrics per device behind a lock to /// keep things thread safe. Since the lock is initialized here /// it is safe to unwrap it without any check. static NET_DEV_METRICS_PVT: RwLock = RwLock::new(NetMetricsPerDevice { @@ -334,15 +334,15 @@ pub mod tests { for i in 0..MAX_NET_DEVICES { let devn: String = format!("eth{}", i); NetMetricsPerDevice::alloc(devn.clone()); - NET_METRICS!(&devn, activate_fails.inc()); - NET_METRICS!(&devn, rx_bytes_count.add(10)); - NET_METRICS!(&devn, tx_bytes_count.add(5)); + net_metrics!(&devn, activate_fails.inc()); + net_metrics!(&devn, rx_bytes_count.add(10)); + net_metrics!(&devn, tx_bytes_count.add(5)); } for i in 0..MAX_NET_DEVICES { let devn: String = format!("eth{}", i); - assert!(NET_METRICS!(&devn, activate_fails.count()) >= 1); - assert!(NET_METRICS!(&devn, rx_bytes_count.count()) >= 10); - assert_eq!(NET_METRICS!(&devn, tx_bytes_count.count()), 5); + assert!(net_metrics!(&devn, activate_fails.count()) >= 1); + assert!(net_metrics!(&devn, rx_bytes_count.count()) >= 10); + assert_eq!(net_metrics!(&devn, tx_bytes_count.count()), 5); } } #[test] @@ -358,22 +358,22 @@ pub mod tests { assert!(NET_DEV_METRICS_PVT.read().is_ok()); assert!(get_net_metrics().metrics.get(devn).is_some()); - NET_METRICS!(devn, activate_fails.inc()); + net_metrics!(devn, activate_fails.inc()); assert!( - NET_METRICS!(devn, activate_fails.count()) > 0, + net_metrics!(devn, activate_fails.count()) > 0, "{}", - NET_METRICS!(devn, activate_fails.count()) + net_metrics!(devn, activate_fails.count()) ); // we expect only 2 tests (this and test_max_net_dev_metrics) // to update activate_fails count for eth0. assert!( - NET_METRICS!(devn, activate_fails.count()) <= 2, + net_metrics!(devn, activate_fails.count()) <= 2, "{}", - NET_METRICS!(devn, activate_fails.count()) + net_metrics!(devn, activate_fails.count()) ); - NET_METRICS!(&String::from(devn), activate_fails.inc()); - NET_METRICS!(&String::from(devn), rx_bytes_count.add(5)); - assert!(NET_METRICS!(&String::from(devn), rx_bytes_count.count()) >= 5); + net_metrics!(&String::from(devn), activate_fails.inc()); + net_metrics!(&String::from(devn), rx_bytes_count.add(5)); + assert!(net_metrics!(&String::from(devn), rx_bytes_count.count()) >= 5); } } diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs index c4bc8dd1d6a..3726362226c 100644 --- a/src/vmm/src/devices/virtio/net/test_utils.rs +++ b/src/vmm/src/devices/virtio/net/test_utils.rs @@ -354,9 +354,6 @@ pub mod test { use crate::devices::virtio::net::device::vnet_hdr_len; use crate::devices::virtio::net::gen::ETH_HLEN; - // get_net_metrics and NetMetricsPerDevice are the functions called by the - // macro NET_METRICS - use crate::devices::virtio::net::metrics::{get_net_metrics, NetMetricsPerDevice}; use crate::devices::virtio::net::test_utils::{ assign_queues, default_net, inject_tap_tx_frame, NetEvent, NetQueue, ReadTapMock, }; @@ -500,7 +497,7 @@ pub mod test { // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&self.net(), frame_len); check_net_metric_after_block!( - NET_METRICS!(&self.net().id, rx_packets_count.count()), + net_metrics!(&self.net().id, rx_packets_count.count()), 0, self.event_manager.run_with_timeout(100).unwrap() ); @@ -528,7 +525,7 @@ pub mod test { )], ); check_net_metric_after_block!( - NET_METRICS!(&self.net().id, rx_packets_count.count()), + net_metrics!(&self.net().id, rx_packets_count.count()), 1, self.event_manager.run_with_timeout(100).unwrap() ); diff --git a/src/vmm/src/logger/metrics.rs b/src/vmm/src/logger/metrics.rs index 56be93cedca..c647029720b 100644 --- a/src/vmm/src/logger/metrics.rs +++ b/src/vmm/src/logger/metrics.rs @@ -1066,7 +1066,7 @@ impl Serialize for SerializeToUtcTimestampMs { // to serialise Firecracker app_metrics as a signle json object which // otherwise would have required extra string manipulation to pack // net as part of the same json object as FirecrackerMetrics. -pub struct NetMetricsSerializer {} +pub struct NetMetricsSerializer; impl NetMetricsSerializer { pub const fn new() -> Self { Self {} From ddc4430453d706f7ee94910e467aa902ae771b2b Mon Sep 17 00:00:00 2001 From: Sudan Landge Date: Mon, 16 Oct 2023 12:28:40 +0000 Subject: [PATCH 08/12] chore(metrics)(net): better name for NetMetricsSerializer NetMetricsSerializer naming was confusing because its not a Serializer in serde's sense (it implements Serialize, not Serializer). So give it to a better name. Note: this is part 8 of 12 commits to add per net device metrics. Signed-off-by: Sudan Landge --- src/vmm/src/logger/metrics.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/vmm/src/logger/metrics.rs b/src/vmm/src/logger/metrics.rs index c647029720b..5ac80db1d75 100644 --- a/src/vmm/src/logger/metrics.rs +++ b/src/vmm/src/logger/metrics.rs @@ -1066,14 +1066,14 @@ impl Serialize for SerializeToUtcTimestampMs { // to serialise Firecracker app_metrics as a signle json object which // otherwise would have required extra string manipulation to pack // net as part of the same json object as FirecrackerMetrics. -pub struct NetMetricsSerializer; -impl NetMetricsSerializer { +pub struct NetMetricsSerializeProxy; +impl NetMetricsSerializeProxy { pub const fn new() -> Self { Self {} } } -impl Serialize for NetMetricsSerializer { +impl Serialize for NetMetricsSerializeProxy { fn serialize(&self, serializer: S) -> Result where S: Serializer, @@ -1106,7 +1106,7 @@ pub struct FirecrackerMetrics { pub mmds: MmdsMetrics, #[serde(flatten)] /// A network device's related metrics. - pub net_ser: NetMetricsSerializer, + pub net_ser: NetMetricsSerializeProxy, /// Metrics related to API PATCH requests. pub patch_api_requests: PatchRequestsMetrics, /// Metrics related to API PUT requests. @@ -1143,7 +1143,7 @@ impl FirecrackerMetrics { latencies_us: PerformanceMetrics::new(), logger: LoggerSystemMetrics::new(), mmds: MmdsMetrics::new(), - net_ser: NetMetricsSerializer::new(), + net_ser: NetMetricsSerializeProxy::new(), patch_api_requests: PatchRequestsMetrics::new(), put_api_requests: PutRequestsMetrics::new(), #[cfg(target_arch = "aarch64")] From 9b486368be0bebd4048a40e5a5ffbfd15c9514a3 Mon Sep 17 00:00:00 2001 From: Sudan Landge Date: Mon, 16 Oct 2023 12:39:24 +0000 Subject: [PATCH 09/12] chore(metrics)(net): remove new() for NetDeviceMetrics and proxy NetDeviceMetrics was moved from logger which needed the new() because default didn't provide the const implementation that FirecrackerMetrics::new() needed. But since that is not a requirement anymore we rely on default to provide 0 initialized NetDeviceMetrics. NetMetricsSerializeProxy doesn't have any fields and so there is no need to define new() for it. Note: this is part 9 of 12 commits to add per net device metrics. Signed-off-by: Sudan Landge --- src/vmm/src/devices/virtio/net/metrics.rs | 41 +++-------------------- src/vmm/src/logger/metrics.rs | 7 +--- 2 files changed, 6 insertions(+), 42 deletions(-) diff --git a/src/vmm/src/devices/virtio/net/metrics.rs b/src/vmm/src/devices/virtio/net/metrics.rs index c658c826d7a..0cf37409fb0 100644 --- a/src/vmm/src/devices/virtio/net/metrics.rs +++ b/src/vmm/src/devices/virtio/net/metrics.rs @@ -63,6 +63,8 @@ //! * Use lockless operations, preferably ones that don't require anything other than simple //! reads/writes being atomic. //! * Rely on `serde` to provide the actual serialization for writing the metrics. +//! * Since all metrics start at 0, we implement the `Default` trait via derive for all of them, to +//! avoid having to initialize everything by hand. //! //! * Devices could be created in any order i.e. the first device created could either be eth0 or //! eth1 so if we use a vector for NetDeviceMetrics and call 1st device as net0, then net0 could @@ -119,7 +121,7 @@ impl NetMetricsPerDevice { .write() .unwrap() .metrics - .insert(iface_id, NetDeviceMetrics::new()); + .insert(iface_id, NetDeviceMetrics::default()); } } } @@ -159,7 +161,7 @@ pub fn flush_metrics(serializer: S) -> Result { // +1 to accomodate aggregate net metrics let mut seq = serializer.serialize_map(Some(1 + metrics_len))?; - let mut net_aggregated: NetDeviceMetrics = NetDeviceMetrics::new(); + let mut net_aggregated: NetDeviceMetrics = NetDeviceMetrics::default(); for metrics in NET_DEV_METRICS_PVT.read().unwrap().metrics.iter() { let devn = format!("net_{}", metrics.0); @@ -172,7 +174,7 @@ pub fn flush_metrics(serializer: S) -> Result { } /// Network-related metrics. -#[derive(Debug, Serialize)] +#[derive(Default, Debug, Serialize)] pub struct NetDeviceMetrics { /// Number of times when activate failed on a network device. pub activate_fails: SharedIncMetric, @@ -231,39 +233,6 @@ pub struct NetDeviceMetrics { } impl NetDeviceMetrics { - /// Const default construction. - pub const fn new() -> Self { - Self { - activate_fails: SharedIncMetric::new(), - cfg_fails: SharedIncMetric::new(), - mac_address_updates: SharedIncMetric::new(), - no_rx_avail_buffer: SharedIncMetric::new(), - no_tx_avail_buffer: SharedIncMetric::new(), - event_fails: SharedIncMetric::new(), - rx_queue_event_count: SharedIncMetric::new(), - rx_event_rate_limiter_count: SharedIncMetric::new(), - rx_partial_writes: SharedIncMetric::new(), - rx_rate_limiter_throttled: SharedIncMetric::new(), - rx_tap_event_count: SharedIncMetric::new(), - rx_bytes_count: SharedIncMetric::new(), - rx_packets_count: SharedIncMetric::new(), - rx_fails: SharedIncMetric::new(), - rx_count: SharedIncMetric::new(), - tap_read_fails: SharedIncMetric::new(), - tap_write_fails: SharedIncMetric::new(), - tx_bytes_count: SharedIncMetric::new(), - tx_malformed_frames: SharedIncMetric::new(), - tx_fails: SharedIncMetric::new(), - tx_count: SharedIncMetric::new(), - tx_packets_count: SharedIncMetric::new(), - tx_partial_reads: SharedIncMetric::new(), - tx_queue_event_count: SharedIncMetric::new(), - tx_rate_limiter_event_count: SharedIncMetric::new(), - tx_rate_limiter_throttled: SharedIncMetric::new(), - tx_spoofed_mac_count: SharedIncMetric::new(), - } - } - /// Net metrics are SharedIncMetric where the diff of current vs /// old is serialized i.e. serialize_u64(current-old). /// So to have the aggregate serialized in same way we need to diff --git a/src/vmm/src/logger/metrics.rs b/src/vmm/src/logger/metrics.rs index 5ac80db1d75..947c619cf40 100644 --- a/src/vmm/src/logger/metrics.rs +++ b/src/vmm/src/logger/metrics.rs @@ -1067,11 +1067,6 @@ impl Serialize for SerializeToUtcTimestampMs { // otherwise would have required extra string manipulation to pack // net as part of the same json object as FirecrackerMetrics. pub struct NetMetricsSerializeProxy; -impl NetMetricsSerializeProxy { - pub const fn new() -> Self { - Self {} - } -} impl Serialize for NetMetricsSerializeProxy { fn serialize(&self, serializer: S) -> Result @@ -1143,7 +1138,7 @@ impl FirecrackerMetrics { latencies_us: PerformanceMetrics::new(), logger: LoggerSystemMetrics::new(), mmds: MmdsMetrics::new(), - net_ser: NetMetricsSerializeProxy::new(), + net_ser: NetMetricsSerializeProxy {}, patch_api_requests: PatchRequestsMetrics::new(), put_api_requests: PutRequestsMetrics::new(), #[cfg(target_arch = "aarch64")] From 6d470362d666626f16fc0e0b5b1314a00ac57b57 Mon Sep 17 00:00:00 2001 From: Sudan Landge Date: Mon, 16 Oct 2023 13:46:04 +0000 Subject: [PATCH 10/12] chore(metrics)(net): implement review feedback for flush_metrics Better implementation for flush_metrics as suggested in feedback. Note: this is part 10 of 12 commits to add per net device metrics. Signed-off-by: Sudan Landge --- src/vmm/src/devices/virtio/net/metrics.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/vmm/src/devices/virtio/net/metrics.rs b/src/vmm/src/devices/virtio/net/metrics.rs index 0cf37409fb0..40e9632fc4a 100644 --- a/src/vmm/src/devices/virtio/net/metrics.rs +++ b/src/vmm/src/devices/virtio/net/metrics.rs @@ -157,17 +157,18 @@ static NET_DEV_METRICS_PVT: RwLock = RwLock::new(NetMetrics /// This function facilitates aggregation and serialization of /// per net device metrics. pub fn flush_metrics(serializer: S) -> Result { - let metrics_len = NET_DEV_METRICS_PVT.read().unwrap().metrics.len(); + let net_metrics = get_net_metrics(); + let metrics_len = net_metrics.metrics.len(); // +1 to accomodate aggregate net metrics let mut seq = serializer.serialize_map(Some(1 + metrics_len))?; let mut net_aggregated: NetDeviceMetrics = NetDeviceMetrics::default(); - for metrics in NET_DEV_METRICS_PVT.read().unwrap().metrics.iter() { - let devn = format!("net_{}", metrics.0); + for (name, metrics) in net_metrics.metrics.iter() { + let devn = format!("net_{}", name); // serialization will flush the metrics so aggregate before it. - net_aggregated.aggregate(metrics.1); - seq.serialize_entry(&devn, &metrics.1)?; + net_aggregated.aggregate(metrics); + seq.serialize_entry(&devn, metrics)?; } seq.serialize_entry("net", &net_aggregated)?; seq.end() From c3e365bc21d5c1ddb702c9ca2cbd1e1ab0f9c262 Mon Sep 17 00:00:00 2001 From: Sudan Landge Date: Mon, 16 Oct 2023 21:29:35 +0000 Subject: [PATCH 11/12] feat(metrics)(net): add ci test to validate net metrics Make sure there is no breaking change for NetDeviceMetrics. Add random number of net devices and validate that values of "net" are aggregate of all "net_{iface_id}". Note: this is part 11 of 12 commits to add per net device metrics. Signed-off-by: Sudan Landge --- .../functional/test_metrics.py | 145 ++++++++++++++++-- 1 file changed, 135 insertions(+), 10 deletions(-) diff --git a/tests/integration_tests/functional/test_metrics.py b/tests/integration_tests/functional/test_metrics.py index 036cc9fab54..27f2bae7c72 100644 --- a/tests/integration_tests/functional/test_metrics.py +++ b/tests/integration_tests/functional/test_metrics.py @@ -7,17 +7,15 @@ import platform -def test_flush_metrics(test_microvm_with_api): +def _validate_metrics(metrics): """ - Check the `FlushMetrics` vmm action. + This functions makes sure that all components + of FirecrackerMetrics struct are present. + In depth validation of metrics for each component + should be implemented in its own test. + e.g. validation of NetDeviceMetrics should implement + _validate_net_metrics() to check for breaking change etc. """ - microvm = test_microvm_with_api - microvm.spawn() - microvm.basic_config() - microvm.start() - - metrics = microvm.flush_metrics() - exp_keys = [ "utc_timestamp_ms", "api_server", @@ -44,7 +42,7 @@ def test_flush_metrics(test_microvm_with_api): if platform.machine() == "aarch64": exp_keys.append("rtc") - assert set(metrics.keys()) == set(exp_keys) + assert set(exp_keys).issubset(metrics.keys()) utc_time = datetime.datetime.now(datetime.timezone.utc) utc_timestamp_ms = math.floor(utc_time.timestamp() * 1000) @@ -54,3 +52,130 @@ def test_flush_metrics(test_microvm_with_api): # Epoch.Regression test for: # https://github.com/firecracker-microvm/firecracker/issues/2639 assert abs(utc_timestamp_ms - metrics["utc_timestamp_ms"]) < 1000 + + +class FcDeviceMetrics: + """ + Provides functions to validate breaking change and + aggregation of metrics + """ + + def __init__(self, name, validate_fn, num_dev): + self.dev_name = name + self.validate_dev_metrics = validate_fn + self.num_dev = num_dev + + def validate(self, microvm): + """ + validate breaking change of device metrics + """ + fc_metrics = microvm.flush_metrics() + + # make sure all items of FirecrackerMetrics are as expected + _validate_metrics(fc_metrics) + + # check for breaking change in device specific metrics + self.validate_dev_metrics(fc_metrics[self.dev_name]) + + # make sure "{self.name}" is aggregate of "{self.name}_*" + # and that there are only {num_dev} entries of "{self.name}_*" + self.validate_aggregation(fc_metrics) + print(f"\nsuccessfully validated aggregate of {self.dev_name} metrics") + + def validate_aggregation(self, fc_metrics): + """ + validate aggregation of device metrics + """ + metrics_aggregate = fc_metrics[self.dev_name] + metrics_calculated = {} + actual_num_devices = 0 + print(f"In aggregation of {self.dev_name} expected {self.num_dev=}") + for component_metric_names, component_metric_values in fc_metrics.items(): + if f"{self.dev_name}_" in component_metric_names: + print(f"found {component_metric_names} during aggr of {self.dev_name}") + actual_num_devices += 1 + for metrics_name, metric_value in component_metric_values.items(): + if metrics_name not in metrics_calculated: + metrics_calculated[metrics_name] = 0 + metrics_calculated[metrics_name] += metric_value + assert metrics_aggregate == metrics_calculated + assert self.num_dev == actual_num_devices + + +def test_flush_metrics(test_microvm_with_api): + """ + Check the `FlushMetrics` vmm action. + """ + microvm = test_microvm_with_api + microvm.spawn() + microvm.basic_config() + microvm.start() + + metrics = microvm.flush_metrics() + _validate_metrics(metrics) + + +def _validate_net_metrics(net_metrics): + exp_keys = [ + "activate_fails", + "cfg_fails", + "mac_address_updates", + "no_rx_avail_buffer", + "no_tx_avail_buffer", + "event_fails", + "rx_queue_event_count", + "rx_event_rate_limiter_count", + "rx_partial_writes", + "rx_rate_limiter_throttled", + "rx_tap_event_count", + "rx_bytes_count", + "rx_packets_count", + "rx_fails", + "rx_count", + "tap_read_fails", + "tap_write_fails", + "tx_bytes_count", + "tx_malformed_frames", + "tx_fails", + "tx_count", + "tx_packets_count", + "tx_partial_reads", + "tx_queue_event_count", + "tx_rate_limiter_event_count", + "tx_rate_limiter_throttled", + "tx_spoofed_mac_count", + ] + assert set(net_metrics.keys()) == set(exp_keys) + + +def test_net_metrics(test_microvm_with_api): + """ + Validate that NetDeviceMetrics doesn't have a breaking change + and "net" is aggregate of all "net_*" in the json object. + """ + test_microvm = test_microvm_with_api + test_microvm.spawn() + + # Set up a basic microVM. + test_microvm.basic_config() + + # randomly selected 10 as the number of net devices to test + num_net_devices = 10 + + net_metrics = FcDeviceMetrics("net", _validate_net_metrics, num_net_devices) + + # create more than 1 net devices to test aggregation + for _ in range(num_net_devices): + test_microvm.add_net_iface() + test_microvm.start() + + # check that the started microvm has "net" and "NUM_NET_DEVICES" number of "net_" metrics + net_metrics.validate(test_microvm) + + for i in range(num_net_devices): + # Test that network devices attached are operational. + # Verify if guest can run commands. + exit_code, _, _ = test_microvm.ssh_iface(i).run("sync") + # test that we get metrics while interacting with different interfaces + net_metrics.validate(test_microvm) + assert exit_code == 0 From 8d4b2f37806c77740f365fd2c779302cdf41971b Mon Sep 17 00:00:00 2001 From: Sudan Landge Date: Tue, 17 Oct 2023 00:41:04 +0000 Subject: [PATCH 12/12] feat(metrics)(net): Use Arc to allow storing metris in Net Use Arc for NetDeviceMetrics since this allows us to store NetDeviceMetrics directly inside Net and get rid of the net_metrics! macro. Also, use better name for net metrics. Note: this is part 12 of 12 commits to add per net device metrics. Signed-off-by: Sudan Landge --- src/vmm/src/devices/mod.rs | 12 +- src/vmm/src/devices/virtio/net/device.rs | 257 ++++++++---------- .../src/devices/virtio/net/event_handler.rs | 3 +- src/vmm/src/devices/virtio/net/metrics.rs | 218 ++++++++++----- src/vmm/src/devices/virtio/net/test_utils.rs | 10 +- src/vmm/src/devices/virtio/test_utils.rs | 14 - 6 files changed, 283 insertions(+), 231 deletions(-) diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs index 7a865ce6ad1..e2738677b81 100644 --- a/src/vmm/src/devices/mod.rs +++ b/src/vmm/src/devices/mod.rs @@ -17,17 +17,17 @@ pub mod virtio; pub use bus::{Bus, BusDevice, BusError}; use log::error; +use crate::devices::virtio::metrics::NetDeviceMetrics; use crate::devices::virtio::{QueueError, VsockError}; use crate::logger::{IncMetric, METRICS}; -use crate::net_metrics; // Function used for reporting error in terms of logging // but also in terms of metrics of net event fails. -// network metrics is reported per device so we need `net_iface_id` -// to identify which device the metrics and `err` should be reported for. -pub(crate) fn report_net_event_fail(net_iface_id: &String, err: DeviceError) { - error!("{:?}:{:?}", net_iface_id, err); - net_metrics!(net_iface_id, event_fails.inc()); +// network metrics is reported per device so we need a handle to each net device's +// metrics `net_iface_metrics` to report metrics for that device. +pub(crate) fn report_net_event_fail(net_iface_metrics: &NetDeviceMetrics, err: DeviceError) { + error!("{:?}", err); + net_iface_metrics.event_fails.inc(); } pub(crate) fn report_balloon_event_fail(err: virtio::balloon::BalloonError) { diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 1fb41655c74..9525e2f972d 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -37,7 +37,7 @@ use crate::vstate::memory::{ByteValued, Bytes, GuestMemoryMmap}; const FRAME_HEADER_MAX_LEN: usize = PAYLOAD_OFFSET + ETH_IPV4_FRAME_LEN; use crate::devices::virtio::iovec::IoVecBuffer; -use crate::devices::virtio::metrics::NetMetricsPerDevice; +use crate::devices::virtio::metrics::{NetDeviceMetrics, NetMetricsPerDevice}; use crate::devices::virtio::net::tap::Tap; use crate::devices::virtio::net::{ gen, NetError, NetQueue, MAX_BUFFER_SIZE, NET_QUEUE_SIZES, RX_INDEX, TX_INDEX, @@ -46,7 +46,6 @@ use crate::devices::virtio::{ ActivateError, DescriptorChain, DeviceState, IrqTrigger, IrqType, Queue, VirtioDevice, TYPE_NET, }; use crate::devices::{report_net_event_fail, DeviceError}; -use crate::net_metrics; #[derive(Debug)] enum FrontendError { @@ -138,6 +137,7 @@ pub struct Net { /// The MMDS stack corresponding to this interface. /// Only if MMDS transport has been associated with it. pub mmds_ns: Option, + pub(crate) metrics: Arc, } impl Net { @@ -173,13 +173,8 @@ impl Net { queues.push(Queue::new(size)); } - // allocate NetDeviceMetrics to log metrics for this device. - // we don't need a pointer to NetDeviceMetrics since we use - // net_metrics! macro with device's id to access the metrics - // and so alloc doesn't need to return anything. - NetMetricsPerDevice::alloc(id.clone()); Ok(Net { - id, + id: id.clone(), tap, avail_features, acked_features: 0u64, @@ -197,6 +192,7 @@ impl Net { device_state: DeviceState::Inactive, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(NetError::EventFd)?, mmds_ns: None, + metrics: NetMetricsPerDevice::alloc(id), }) } @@ -279,7 +275,7 @@ impl Net { self.irq_trigger .trigger_irq(IrqType::Vring) .map_err(|err| { - net_metrics!(&self.id, event_fails.inc()); + self.metrics.event_fails.inc(); DeviceError::FailedSignalingIrq(err) })?; } @@ -312,7 +308,7 @@ impl Net { // Returns true on successful frame delivery. fn rate_limited_rx_single_frame(&mut self) -> bool { if !Self::rate_limiter_consume_op(&mut self.rx_rate_limiter, self.rx_bytes_read as u64) { - net_metrics!(&self.id, rx_rate_limiter_throttled.inc()); + self.metrics.rx_rate_limiter_throttled.inc(); return false; } @@ -338,7 +334,7 @@ impl Net { mem: &GuestMemoryMmap, data: &[u8], head: DescriptorChain, - net_iface_id: &String, + net_metrics: &NetDeviceMetrics, ) -> Result<(), FrontendError> { let mut chunk = data; let mut next_descriptor = Some(head); @@ -351,13 +347,13 @@ impl Net { let len = std::cmp::min(chunk.len(), descriptor.len as usize); match mem.write_slice(&chunk[..len], descriptor.addr) { Ok(()) => { - net_metrics!(net_iface_id, rx_count.inc()); + net_metrics.rx_count.inc(); chunk = &chunk[len..]; } Err(err) => { error!("Failed to write slice: {:?}", err); if let GuestMemoryError::PartialBuffer { .. } = err { - net_metrics!(net_iface_id, rx_partial_writes.inc()); + net_metrics.rx_partial_writes.inc(); } return Err(FrontendError::GuestMemory(err)); } @@ -366,8 +362,8 @@ impl Net { // If chunk is empty we are done here. if chunk.is_empty() { let len = data.len() as u64; - net_metrics!(net_iface_id, rx_bytes_count.add(len)); - net_metrics!(net_iface_id, rx_packets_count.inc()); + net_metrics.rx_bytes_count.add(len); + net_metrics.rx_packets_count.inc(); return Ok(()); } @@ -385,7 +381,7 @@ impl Net { let queue = &mut self.queues[RX_INDEX]; let head_descriptor = queue.pop_or_enable_notification(mem).ok_or_else(|| { - net_metrics!(&self.id, no_rx_avail_buffer.inc()); + self.metrics.no_rx_avail_buffer.inc(); FrontendError::EmptyQueue })?; let head_index = head_descriptor.index; @@ -394,11 +390,11 @@ impl Net { mem, &self.rx_frame_buf[..self.rx_bytes_read], head_descriptor, - &self.id, + &self.metrics, ); // Mark the descriptor chain as used. If an error occurred, skip the descriptor chain. let used_len = if result.is_err() { - net_metrics!(&self.id, rx_fails.inc()); + self.metrics.rx_fails.inc(); 0 } else { // Safe to unwrap because a frame must be smaller than 2^16 bytes. @@ -442,19 +438,19 @@ impl Net { frame_iovec: &IoVecBuffer, tap: &mut Tap, guest_mac: Option, - net_iface_id: &String, + net_metrics: &NetDeviceMetrics, ) -> Result { // Read the frame headers from the IoVecBuffer. This will return None // if the frame_iovec is empty. let header_len = frame_iovec.read_at(headers, 0).ok_or_else(|| { error!("Received empty TX buffer"); - net_metrics!(net_iface_id, tx_malformed_frames.inc()); + net_metrics.tx_malformed_frames.inc(); NetError::VnetHeaderMissing })?; let headers = frame_bytes_from_buf(&headers[..header_len]).map_err(|e| { error!("VNET headers missing in TX frame"); - net_metrics!(net_iface_id, tx_malformed_frames.inc()); + net_metrics.tx_malformed_frames.inc(); e })?; @@ -481,7 +477,7 @@ impl Net { if let Some(guest_mac) = guest_mac { let _ = EthernetFrame::from_bytes(headers).map(|eth_frame| { if guest_mac != eth_frame.src_mac() { - net_metrics!(net_iface_id, tx_spoofed_mac_count.inc()); + net_metrics.tx_spoofed_mac_count.inc(); } }); } @@ -489,13 +485,13 @@ impl Net { match Self::write_tap(tap, frame_iovec) { Ok(_) => { let len = frame_iovec.len() as u64; - net_metrics!(net_iface_id, tx_bytes_count.add(len)); - net_metrics!(net_iface_id, tx_packets_count.inc()); - net_metrics!(net_iface_id, tx_count.inc()); + net_metrics.tx_bytes_count.add(len); + net_metrics.tx_packets_count.inc(); + net_metrics.tx_count.inc(); } Err(err) => { error!("Failed to write to tap: {:?}", err); - net_metrics!(net_iface_id, tap_write_fails.inc()); + net_metrics.tap_write_fails.inc(); } }; Ok(false) @@ -524,7 +520,7 @@ impl Net { match self.read_from_mmds_or_tap() { Ok(count) => { self.rx_bytes_read = count; - net_metrics!(&self.id, rx_count.inc()); + self.metrics.rx_count.inc(); if !self.rate_limited_rx_single_frame() { self.rx_deferred_frame = true; break; @@ -537,7 +533,7 @@ impl Net { Some(err) if err == EAGAIN => (), _ => { error!("Failed to read tap: {:?}", err); - net_metrics!(&self.id, tap_read_fails.inc()); + self.metrics.tap_read_fails.inc(); return Err(DeviceError::FailedReadTap); } }; @@ -592,7 +588,7 @@ impl Net { let buffer = match IoVecBuffer::from_descriptor_chain(mem, head) { Ok(buffer) => buffer, Err(_) => { - net_metrics!(&self.id, tx_fails.inc()); + self.metrics.tx_fails.inc(); tx_queue .add_used(mem, head_index, 0) .map_err(DeviceError::QueueError)?; @@ -601,7 +597,7 @@ impl Net { }; if !Self::rate_limiter_consume_op(&mut self.tx_rate_limiter, buffer.len() as u64) { tx_queue.undo_pop(); - net_metrics!(&self.id, tx_rate_limiter_throttled.inc()); + self.metrics.tx_rate_limiter_throttled.inc(); break; } @@ -612,7 +608,7 @@ impl Net { &buffer, &mut self.tap, self.guest_mac, - &self.id, + &self.metrics, ) .unwrap_or(false); if frame_consumed_by_mmds && !self.rx_deferred_frame { @@ -627,7 +623,7 @@ impl Net { } if !used_any { - net_metrics!(&self.id, no_tx_avail_buffer.inc()); + self.metrics.no_tx_avail_buffer.inc(); } self.signal_used_queue(NetQueue::Tx)?; @@ -667,38 +663,38 @@ impl Net { /// This is called by the event manager responding to the guest adding a new /// buffer in the RX queue. pub fn process_rx_queue_event(&mut self) { - net_metrics!(&self.id, rx_queue_event_count.inc()); + self.metrics.rx_queue_event_count.inc(); if let Err(err) = self.queue_evts[RX_INDEX].read() { // rate limiters present but with _very high_ allowed rate error!("Failed to get rx queue event: {:?}", err); - net_metrics!(&self.id, event_fails.inc()); + self.metrics.event_fails.inc(); } else if self.rx_rate_limiter.is_blocked() { - net_metrics!(&self.id, rx_rate_limiter_throttled.inc()); + self.metrics.rx_rate_limiter_throttled.inc(); } else { // If the limiter is not blocked, resume the receiving of bytes. self.resume_rx() - .unwrap_or_else(|err| report_net_event_fail(&self.id, err)); + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); } } pub fn process_tap_rx_event(&mut self) { // This is safe since we checked in the event handler that the device is activated. let mem = self.device_state.mem().unwrap(); - net_metrics!(&self.id, rx_tap_event_count.inc()); + self.metrics.rx_tap_event_count.inc(); // While there are no available RX queue buffers and there's a deferred_frame // don't process any more incoming. Otherwise start processing a frame. In the // process the deferred_frame flag will be set in order to avoid freezing the // RX queue. if self.queues[RX_INDEX].is_empty(mem) && self.rx_deferred_frame { - net_metrics!(&self.id, no_rx_avail_buffer.inc()); + self.metrics.no_rx_avail_buffer.inc(); return; } // While limiter is blocked, don't process any more incoming. if self.rx_rate_limiter.is_blocked() { - net_metrics!(&self.id, rx_rate_limiter_throttled.inc()); + self.metrics.rx_rate_limiter_throttled.inc(); return; } @@ -707,10 +703,10 @@ impl Net { // until we manage to receive this deferred frame. { self.handle_deferred_frame() - .unwrap_or_else(|err| report_net_event_fail(&self.id, err)); + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); } else { self.process_rx() - .unwrap_or_else(|err| report_net_event_fail(&self.id, err)); + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); } } @@ -719,22 +715,22 @@ impl Net { /// This is called by the event manager responding to the guest adding a new /// buffer in the TX queue. pub fn process_tx_queue_event(&mut self) { - net_metrics!(&self.id, tx_queue_event_count.inc()); + self.metrics.tx_queue_event_count.inc(); if let Err(err) = self.queue_evts[TX_INDEX].read() { error!("Failed to get tx queue event: {:?}", err); - net_metrics!(&self.id, event_fails.inc()); + self.metrics.event_fails.inc(); } else if !self.tx_rate_limiter.is_blocked() // If the limiter is not blocked, continue transmitting bytes. { self.process_tx() - .unwrap_or_else(|err| report_net_event_fail(&self.id, err)); + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); } else { - net_metrics!(&self.id, tx_rate_limiter_throttled.inc()); + self.metrics.tx_rate_limiter_throttled.inc(); } } pub fn process_rx_rate_limiter_event(&mut self) { - net_metrics!(&self.id, rx_event_rate_limiter_count.inc()); + self.metrics.rx_event_rate_limiter_count.inc(); // Upon rate limiter event, call the rate limiter handler // and restart processing the queue. @@ -742,28 +738,28 @@ impl Net { Ok(_) => { // There might be enough budget now to receive the frame. self.resume_rx() - .unwrap_or_else(|err| report_net_event_fail(&self.id, err)); + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); } Err(err) => { error!("Failed to get rx rate-limiter event: {:?}", err); - net_metrics!(&self.id, event_fails.inc()); + self.metrics.event_fails.inc(); } } } pub fn process_tx_rate_limiter_event(&mut self) { - net_metrics!(&self.id, tx_rate_limiter_event_count.inc()); + self.metrics.tx_rate_limiter_event_count.inc(); // Upon rate limiter event, call the rate limiter handler // and restart processing the queue. match self.tx_rate_limiter.event_handler() { Ok(_) => { // There might be enough budget now to send the frame. self.process_tx() - .unwrap_or_else(|err| report_net_event_fail(&self.id, err)); + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); } Err(err) => { error!("Failed to get tx rate-limiter event: {:?}", err); - net_metrics!(&self.id, event_fails.inc()); + self.metrics.event_fails.inc(); } } } @@ -817,7 +813,7 @@ impl VirtioDevice for Net { let config_len = config_space_bytes.len() as u64; if offset >= config_len { error!("Failed to read config space"); - net_metrics!(&self.id, cfg_fails.inc()); + self.metrics.cfg_fails.inc(); return; } if let Some(end) = offset.checked_add(data.len() as u64) { @@ -838,13 +834,13 @@ impl VirtioDevice for Net { .and_then(|(start, end)| config_space_bytes.get_mut(start..end)) else { error!("Failed to write config space"); - net_metrics!(&self.id, cfg_fails.inc()); + self.metrics.cfg_fails.inc(); return; }; dst.copy_from_slice(data); self.guest_mac = Some(self.config_space.guest_mac); - net_metrics!(&self.id, mac_address_updates.inc()); + self.metrics.mac_address_updates.inc(); } fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { @@ -880,6 +876,7 @@ pub mod tests { use utils::net::mac::{MacAddr, MAC_ADDR_LEN}; use super::*; + use crate::check_metric_after_block; use crate::devices::virtio::gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::iovec::IoVecBuffer; use crate::devices::virtio::net::device::{ @@ -901,7 +898,6 @@ pub mod tests { use crate::logger::IncMetric; use crate::rate_limiter::{BucketUpdate, RateLimiter, TokenBucket, TokenType}; use crate::vstate::memory::{Address, GuestMemory}; - use crate::{check_metric_after_block, check_net_metric_after_block}; impl Net { pub(crate) fn read_tap(&mut self) -> io::Result { @@ -1026,7 +1022,7 @@ pub mod tests { // Check that the guest MAC was updated. let expected_guest_mac = MacAddr::from_bytes_unchecked(&new_config); assert_eq!(expected_guest_mac, net.guest_mac.unwrap()); - assert_eq!(net_metrics!(&net.id, mac_address_updates.count()), 1); + assert_eq!(net.metrics.mac_address_updates.count(), 1); // Partial write (this is how the kernel sets a new mac address) - byte by byte. let new_config = [0x11, 0x22, 0x33, 0x44, 0x55, 0x66]; @@ -1058,8 +1054,8 @@ pub mod tests { th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); th.net().queue_evts[RX_INDEX].read().unwrap(); - check_net_metric_after_block!( - net_metrics!(&th.net().id, event_fails.count()), + check_metric_after_block!( + th.net().metrics.event_fails, 1, th.simulate_event(NetEvent::RxQueue) ); @@ -1153,8 +1149,8 @@ pub mod tests { // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&th.net(), 1000); - check_net_metric_after_block!( - net_metrics!(&th.net().id, rx_packets_count.count()), + check_metric_after_block!( + th.net().metrics.rx_packets_count, 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1194,8 +1190,8 @@ pub mod tests { ); // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&th.net(), 1000); - check_net_metric_after_block!( - net_metrics!(&th.net().id, rx_packets_count.count()), + check_metric_after_block!( + th.net().metrics.rx_packets_count, 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1234,8 +1230,8 @@ pub mod tests { // Inject 2 frames to tap and run epoll. let frame_1 = inject_tap_tx_frame(&th.net(), 200); let frame_2 = inject_tap_tx_frame(&th.net(), 300); - check_net_metric_after_block!( - net_metrics!(&th.net().id, rx_packets_count.count()), + check_metric_after_block!( + th.net().metrics.rx_packets_count, 2, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1265,8 +1261,8 @@ pub mod tests { th.add_desc_chain(NetQueue::Tx, 0, &[(0, 4096, 0)]); th.net().queue_evts[TX_INDEX].read().unwrap(); - check_net_metric_after_block!( - net_metrics!(&th.net().id, event_fails.count()), + check_metric_after_block!( + th.net().metrics.event_fails, 1, th.simulate_event(NetEvent::TxQueue) ); @@ -1304,8 +1300,8 @@ pub mod tests { // Send an invalid frame (too small, VNET header missing). th.add_desc_chain(NetQueue::Tx, 0, &[(0, 1, 0)]); - check_net_metric_after_block!( - net_metrics!(&th.net().id, tx_malformed_frames.count()), + check_metric_after_block!( + th.net().metrics.tx_malformed_frames, 1, th.event_manager.run_with_timeout(100) ); @@ -1326,8 +1322,8 @@ pub mod tests { // Send an invalid frame (too small, VNET header missing). th.add_desc_chain(NetQueue::Tx, 0, &[(0, 0, 0)]); - check_net_metric_after_block!( - net_metrics!(&th.net().id, tx_malformed_frames.count()), + check_metric_after_block!( + th.net().metrics.tx_malformed_frames, 1, th.event_manager.run_with_timeout(100) ); @@ -1364,8 +1360,8 @@ pub mod tests { // One frame is valid, one will not be handled because it includes write-only memory // so that leaves us with 2 malformed (no vnet header) frames. - check_net_metric_after_block!( - net_metrics!(&th.net().id, tx_malformed_frames.count()), + check_metric_after_block!( + th.net().metrics.tx_malformed_frames, 2, th.event_manager.run_with_timeout(100) ); @@ -1394,8 +1390,8 @@ pub mod tests { th.add_desc_chain(NetQueue::Tx, 0, &desc_list); let frame = th.write_tx_frame(&desc_list, 1000); - check_net_metric_after_block!( - net_metrics!(&th.net().id, tx_packets_count.count()), + check_metric_after_block!( + th.net().metrics.tx_packets_count, 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1420,8 +1416,8 @@ pub mod tests { th.add_desc_chain(NetQueue::Tx, 0, &desc_list); let _ = th.write_tx_frame(&desc_list, 1000); - check_net_metric_after_block!( - net_metrics!(&th.net().id, tap_write_fails.count()), + check_metric_after_block!( + th.net().metrics.tap_write_fails, 1, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1447,8 +1443,8 @@ pub mod tests { th.add_desc_chain(NetQueue::Tx, 500, &desc_list); let frame_2 = th.write_tx_frame(&desc_list, 600); - check_net_metric_after_block!( - net_metrics!(&th.net().id, tx_packets_count.count()), + check_metric_after_block!( + th.net().metrics.tx_packets_count, 2, th.event_manager.run_with_timeout(100).unwrap() ); @@ -1527,7 +1523,7 @@ pub mod tests { &buffer, &mut net.tap, Some(src_mac), - &net.id, + &net.metrics, ) .unwrap()) ); @@ -1555,8 +1551,8 @@ pub mod tests { let mut headers = vec![0; frame_hdr_len()]; // Check that a legit MAC doesn't affect the spoofed MAC metric. - check_net_metric_after_block!( - net_metrics!(&net.id, tx_spoofed_mac_count.count()), + check_metric_after_block!( + net.metrics.tx_spoofed_mac_count, 0, Net::write_to_mmds_or_tap( net.mmds_ns.as_mut(), @@ -1565,13 +1561,13 @@ pub mod tests { &buffer, &mut net.tap, Some(guest_mac), - &net.id, + &net.metrics, ) ); // Check that a spoofed MAC increases our spoofed MAC metric. - check_net_metric_after_block!( - net_metrics!(&net.id, tx_spoofed_mac_count.count()), + check_metric_after_block!( + net.metrics.tx_spoofed_mac_count, 1, Net::write_to_mmds_or_tap( net.mmds_ns.as_mut(), @@ -1580,7 +1576,7 @@ pub mod tests { &buffer, &mut net.tap, Some(not_guest_mac), - &net.id, + &net.metrics, ) ); } @@ -1592,16 +1588,16 @@ pub mod tests { // RX rate limiter events should error since the limiter is not blocked. // Validate that the event failed and failure was properly accounted for. - check_net_metric_after_block!( - net_metrics!(&th.net().id, event_fails.count()), + check_metric_after_block!( + th.net().metrics.event_fails, 1, th.simulate_event(NetEvent::RxRateLimiter) ); // TX rate limiter events should error since the limiter is not blocked. // Validate that the event failed and failure was properly accounted for. - check_net_metric_after_block!( - net_metrics!(&th.net().id, event_fails.count()), + check_metric_after_block!( + th.net().metrics.event_fails, 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1618,8 +1614,8 @@ pub mod tests { // The RX queue is empty and rx_deffered_frame is set. th.net().rx_deferred_frame = true; - check_net_metric_after_block!( - net_metrics!(&th.net().id, no_rx_avail_buffer.count()), + check_metric_after_block!( + th.net().metrics.no_rx_avail_buffer, 1, th.simulate_event(NetEvent::Tap) ); @@ -1631,8 +1627,8 @@ pub mod tests { // Fake an avail buffer; this time, tap reading should error out. th.rxq.avail.idx.set(1); - check_net_metric_after_block!( - net_metrics!(&th.net().id, tap_read_fails.count()), + check_metric_after_block!( + th.net().metrics.tap_read_fails, 1, th.simulate_event(NetEvent::Tap) ); @@ -1644,22 +1640,19 @@ pub mod tests { th.activate_net(); th.net().tap.mocks.set_read_tap(ReadTapMock::TapFrame); - let rx_packets_count = net_metrics!(&th.net().id, rx_packets_count.count()); + let rx_packets_count = th.net().metrics.rx_packets_count.count(); let _ = inject_tap_tx_frame(&th.net(), 1000); // Trigger a Tap event that. This should fail since there // are not any available descriptors in the queue - check_net_metric_after_block!( - net_metrics!(&th.net().id, no_rx_avail_buffer.count()), + check_metric_after_block!( + th.net().metrics.no_rx_avail_buffer, 1, th.simulate_event(NetEvent::Tap) ); // The frame we read from the tap should be deferred now and // no frames should have been transmitted assert!(th.net().rx_deferred_frame); - assert_eq!( - net_metrics!(&th.net().id, rx_packets_count.count()), - rx_packets_count - ); + assert_eq!(th.net().metrics.rx_packets_count.count(), rx_packets_count); // Let's add a second frame, which should really have the same // fate. @@ -1669,8 +1662,8 @@ pub mod tests { // frame. However, this should try to handle the second tap as well and fail // since there's only one Descriptor Chain in the queue. th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); - check_net_metric_after_block!( - net_metrics!(&th.net().id, no_rx_avail_buffer.count()), + check_metric_after_block!( + th.net().metrics.no_rx_avail_buffer, 1, th.simulate_event(NetEvent::Tap) ); @@ -1678,14 +1671,14 @@ pub mod tests { assert!(th.net().rx_deferred_frame); // However, we should have delivered the first frame assert_eq!( - net_metrics!(&th.net().id, rx_packets_count.count()), + th.net().metrics.rx_packets_count.count(), rx_packets_count + 1 ); // Let's add one more descriptor and try to handle the last frame as well. th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); - check_net_metric_after_block!( - net_metrics!(&th.net().id, rx_packets_count.count()), + check_metric_after_block!( + th.net().metrics.rx_packets_count, 1, th.simulate_event(NetEvent::RxQueue) ); @@ -1701,8 +1694,8 @@ pub mod tests { th.net().rx_rate_limiter = RateLimiter::new(0, 0, 0, 0, 0, 0).unwrap(); // There is no actual event on the rate limiter's timerfd. - check_net_metric_after_block!( - net_metrics!(&th.net().id, event_fails.count()), + check_metric_after_block!( + th.net().metrics.event_fails, 1, th.simulate_event(NetEvent::RxRateLimiter) ); @@ -1716,8 +1709,8 @@ pub mod tests { th.net().tx_rate_limiter = RateLimiter::new(0, 0, 0, 0, 0, 0).unwrap(); th.simulate_event(NetEvent::TxRateLimiter); // There is no actual event on the rate limiter's timerfd. - check_net_metric_after_block!( - net_metrics!(&th.net().id, event_fails.count()), + check_metric_after_block!( + th.net().metrics.event_fails, 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1747,10 +1740,7 @@ pub mod tests { // assert that limiter is blocked assert!(th.net().tx_rate_limiter.is_blocked()); - assert_eq!( - net_metrics!(&th.net().id, tx_rate_limiter_throttled.count()), - 1 - ); + assert_eq!(th.net().metrics.tx_rate_limiter_throttled.count(), 1); // make sure the data is still queued for processing assert_eq!(th.txq.used.idx.get(), 0); } @@ -1761,10 +1751,7 @@ pub mod tests { // trigger the RX queue event handler th.simulate_event(NetEvent::TxQueue); - assert_eq!( - net_metrics!(&th.net().id, tx_rate_limiter_throttled.count()), - 2 - ); + assert_eq!(th.net().metrics.tx_rate_limiter_throttled.count(), 2); } // wait for 100ms to give the rate-limiter timer a chance to replenish @@ -1774,8 +1761,8 @@ pub mod tests { // following TX procedure should succeed because bandwidth should now be available { // tx_count increments 1 from write_to_mmds_or_tap() - check_net_metric_after_block!( - net_metrics!(&th.net().id, tx_count.count()), + check_metric_after_block!( + th.net().metrics.tx_count, 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1791,8 +1778,8 @@ pub mod tests { // following TX procedure should succeed to handle the second frame as well { // tx_count increments 1 from write_to_mmds_or_tap() - check_net_metric_after_block!( - net_metrics!(&th.net().id, tx_count.count()), + check_metric_after_block!( + th.net().metrics.tx_count, 1, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1824,10 +1811,7 @@ pub mod tests { // assert that limiter is blocked assert!(th.net().rx_rate_limiter.is_blocked()); - assert_eq!( - net_metrics!(&th.net().id, rx_rate_limiter_throttled.count()), - 1 - ); + assert_eq!(th.net().metrics.rx_rate_limiter_throttled.count(), 1); assert!(th.net().rx_deferred_frame); // assert that no operation actually completed (limiter blocked it) assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); @@ -1840,10 +1824,7 @@ pub mod tests { // trigger the RX queue event handler th.simulate_event(NetEvent::RxQueue); - assert_eq!( - net_metrics!(&th.net().id, rx_rate_limiter_throttled.count()), - 2 - ); + assert_eq!(th.net().metrics.rx_rate_limiter_throttled.count(), 2); } // wait for 100ms to give the rate-limiter timer a chance to replenish @@ -1854,8 +1835,8 @@ pub mod tests { { let frame = &th.net().tap.mocks.read_tap.mock_frame(); // no longer throttled - check_net_metric_after_block!( - net_metrics!(&th.net().id, rx_rate_limiter_throttled.count()), + check_metric_after_block!( + th.net().metrics.rx_rate_limiter_throttled, 0, th.simulate_event(NetEvent::RxRateLimiter) ); @@ -1892,8 +1873,8 @@ pub mod tests { { // trigger the TX handler th.add_desc_chain(NetQueue::Tx, 0, &[(0, 4096, 0)]); - check_net_metric_after_block!( - net_metrics!(&th.net().id, tx_rate_limiter_throttled.count()), + check_metric_after_block!( + th.net().metrics.tx_rate_limiter_throttled, 1, th.simulate_event(NetEvent::TxQueue) ); @@ -1911,8 +1892,8 @@ pub mod tests { // following TX procedure should succeed because ops should now be available { // no longer throttled - check_net_metric_after_block!( - net_metrics!(&th.net().id, tx_rate_limiter_throttled.count()), + check_metric_after_block!( + th.net().metrics.tx_rate_limiter_throttled, 0, th.simulate_event(NetEvent::TxRateLimiter) ); @@ -1940,15 +1921,15 @@ pub mod tests { // following RX procedure should fail because of ops rate limiting { // trigger the RX handler - check_net_metric_after_block!( - net_metrics!(&th.net().id, rx_rate_limiter_throttled.count()), + check_metric_after_block!( + th.net().metrics.rx_rate_limiter_throttled, 1, th.simulate_event(NetEvent::Tap) ); // assert that limiter is blocked assert!(th.net().rx_rate_limiter.is_blocked()); - assert!(net_metrics!(&th.net().id, rx_rate_limiter_throttled.count()) >= 1); + assert!(th.net().metrics.rx_rate_limiter_throttled.count() >= 1); assert!(th.net().rx_deferred_frame); // assert that no operation actually completed (limiter blocked it) assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); diff --git a/src/vmm/src/devices/virtio/net/event_handler.rs b/src/vmm/src/devices/virtio/net/event_handler.rs index b58e6a8b682..a3bd0499f18 100644 --- a/src/vmm/src/devices/virtio/net/event_handler.rs +++ b/src/vmm/src/devices/virtio/net/event_handler.rs @@ -9,7 +9,6 @@ use utils::epoll::EventSet; use crate::devices::virtio::net::device::Net; use crate::devices::virtio::{VirtioDevice, RX_INDEX, TX_INDEX}; use crate::logger::{debug, error, warn, IncMetric}; -use crate::net_metrics; impl Net { fn register_runtime_events(&self, ops: &mut EventOps) { @@ -85,7 +84,7 @@ impl MutEventSubscriber for Net { _ if activate_fd == source => self.process_activate_event(ops), _ => { warn!("Net: Spurious event received: {:?}", source); - net_metrics!(&self.id, event_fails.inc()); + self.metrics.event_fails.inc(); } } } else { diff --git a/src/vmm/src/devices/virtio/net/metrics.rs b/src/vmm/src/devices/virtio/net/metrics.rs index 40e9632fc4a..084b37bbbe3 100644 --- a/src/vmm/src/devices/virtio/net/metrics.rs +++ b/src/vmm/src/devices/virtio/net/metrics.rs @@ -76,31 +76,23 @@ //! The system implements 1 types of metrics: //! * Shared Incremental Metrics (SharedIncMetrics) - dedicated for the metrics which need a counter //! (i.e the number of times an API request failed). These metrics are reset upon flush. -//! We use NET_DEV_METRICS_PVT instead of adding an entry of NetDeviceMetrics +//! We use NET_METRICS instead of adding an entry of NetDeviceMetrics //! in Net so that metrics are accessible to be flushed even from signal handlers. use std::collections::BTreeMap; -use std::sync::{RwLock, RwLockReadGuard}; +use std::sync::{Arc, RwLock}; use serde::ser::SerializeMap; use serde::{Serialize, Serializer}; use crate::logger::{IncMetric, SharedIncMetric}; -/// returns lock-unwrapped NET_DEV_METRICS_PVT.metrics, -/// we have this function so that we don't have to make -/// NET_DEV_METRICS_PVT public -pub fn get_net_metrics() -> RwLockReadGuard<'static, NetMetricsPerDevice> { - // lock is always initialized so it is safe the unwrap the lock without a check. - NET_DEV_METRICS_PVT.read().unwrap() -} - /// map of network interface id and metrics /// this should be protected by a lock before accessing. #[derive(Debug)] pub struct NetMetricsPerDevice { /// used to access per net device metrics - pub metrics: BTreeMap, + pub metrics: BTreeMap>, } impl NetMetricsPerDevice { @@ -109,55 +101,35 @@ impl NetMetricsPerDevice { /// exist to avoid overwriting previously allocated data. /// lock is always initialized so it is safe the unwrap /// the lock without a check. - pub fn alloc(iface_id: String) { - if NET_DEV_METRICS_PVT - .read() - .unwrap() - .metrics - .get(&iface_id) - .is_none() - { - NET_DEV_METRICS_PVT + pub fn alloc(iface_id: String) -> Arc { + if NET_METRICS.read().unwrap().metrics.get(&iface_id).is_none() { + NET_METRICS .write() .unwrap() .metrics - .insert(iface_id, NetDeviceMetrics::default()); + .insert(iface_id.clone(), Arc::new(NetDeviceMetrics::default())); } + NET_METRICS + .read() + .unwrap() + .metrics + .get(&iface_id) + .unwrap() + .clone() } } -#[macro_export] -// Per device metrics are behind a ref lock and in a map which -// makes it difficult to return the metrics pointer via a function -// This macro is an attempt to make the code more readable and to -// centralize the error handling. -// The macro can be called in below ways where iface_id is String and -// activate_fails is a metric from struct `NetDeviceMetrics`: -// net_metrics!(&iface_id, activate_fails.inc()); -// net_metrics!(&iface_id, activate_fails.add(5)); -// net_metrics!(&iface_id, activate_fails.count()); -macro_rules! net_metrics { - ($iface_id:expr,$metric:ident.$action:ident($($value:tt)?)) => {{ - if $crate::devices::virtio::net::metrics::get_net_metrics().metrics.get($iface_id).is_some() { - $crate::devices::virtio::net::metrics::get_net_metrics().metrics.get($iface_id).unwrap().$metric.$action($($value)?) - }else{ - $crate::devices::virtio::net::metrics::NetMetricsPerDevice::alloc($iface_id.to_string()); - $crate::devices::virtio::net::metrics::get_net_metrics().metrics.get($iface_id).unwrap().$metric.$action($($value)?) - } - }} -} - /// Pool of Network-related metrics per device behind a lock to /// keep things thread safe. Since the lock is initialized here /// it is safe to unwrap it without any check. -static NET_DEV_METRICS_PVT: RwLock = RwLock::new(NetMetricsPerDevice { +static NET_METRICS: RwLock = RwLock::new(NetMetricsPerDevice { metrics: BTreeMap::new(), }); /// This function facilitates aggregation and serialization of /// per net device metrics. pub fn flush_metrics(serializer: S) -> Result { - let net_metrics = get_net_metrics(); + let net_metrics = NET_METRICS.read().unwrap(); let metrics_len = net_metrics.metrics.len(); // +1 to accomodate aggregate net metrics let mut seq = serializer.serialize_map(Some(1 + metrics_len))?; @@ -167,8 +139,9 @@ pub fn flush_metrics(serializer: S) -> Result { for (name, metrics) in net_metrics.metrics.iter() { let devn = format!("net_{}", name); // serialization will flush the metrics so aggregate before it. - net_aggregated.aggregate(metrics); - seq.serialize_entry(&devn, metrics)?; + let m: &NetDeviceMetrics = metrics; + net_aggregated.aggregate(m); + seq.serialize_entry(&devn, m)?; } seq.serialize_entry("net", &net_aggregated)?; seq.end() @@ -298,21 +271,73 @@ pub mod tests { // we have 5-23 irq for net devices so max 19 net devices. const MAX_NET_DEVICES: usize = 19; - assert!(NET_DEV_METRICS_PVT.read().is_ok()); - assert!(NET_DEV_METRICS_PVT.write().is_ok()); + assert!(NET_METRICS.read().is_ok()); + assert!(NET_METRICS.write().is_ok()); for i in 0..MAX_NET_DEVICES { let devn: String = format!("eth{}", i); NetMetricsPerDevice::alloc(devn.clone()); - net_metrics!(&devn, activate_fails.inc()); - net_metrics!(&devn, rx_bytes_count.add(10)); - net_metrics!(&devn, tx_bytes_count.add(5)); + NET_METRICS + .read() + .unwrap() + .metrics + .get(&devn) + .unwrap() + .activate_fails + .inc(); + NET_METRICS + .read() + .unwrap() + .metrics + .get(&devn) + .unwrap() + .rx_bytes_count + .add(10); + NET_METRICS + .read() + .unwrap() + .metrics + .get(&devn) + .unwrap() + .tx_bytes_count + .add(5); } + for i in 0..MAX_NET_DEVICES { let devn: String = format!("eth{}", i); - assert!(net_metrics!(&devn, activate_fails.count()) >= 1); - assert!(net_metrics!(&devn, rx_bytes_count.count()) >= 10); - assert_eq!(net_metrics!(&devn, tx_bytes_count.count()), 5); + assert!( + NET_METRICS + .read() + .unwrap() + .metrics + .get(&devn) + .unwrap() + .activate_fails + .count() + >= 1 + ); + assert!( + NET_METRICS + .read() + .unwrap() + .metrics + .get(&devn) + .unwrap() + .rx_bytes_count + .count() + >= 10 + ); + assert_eq!( + NET_METRICS + .read() + .unwrap() + .metrics + .get(&devn) + .unwrap() + .tx_bytes_count + .count(), + 5 + ); } } #[test] @@ -321,29 +346,90 @@ pub mod tests { // `test_net_dev_metrics` which also uses the same name. let devn = "eth0"; - assert!(NET_DEV_METRICS_PVT.read().is_ok()); - assert!(NET_DEV_METRICS_PVT.write().is_ok()); + assert!(NET_METRICS.read().is_ok()); + assert!(NET_METRICS.write().is_ok()); NetMetricsPerDevice::alloc(String::from(devn)); - assert!(NET_DEV_METRICS_PVT.read().is_ok()); - assert!(get_net_metrics().metrics.get(devn).is_some()); + assert!(NET_METRICS.read().is_ok()); + assert!(NET_METRICS.read().unwrap().metrics.get(devn).is_some()); - net_metrics!(devn, activate_fails.inc()); + NET_METRICS + .read() + .unwrap() + .metrics + .get(devn) + .unwrap() + .activate_fails + .inc(); assert!( - net_metrics!(devn, activate_fails.count()) > 0, + NET_METRICS + .read() + .unwrap() + .metrics + .get(devn) + .unwrap() + .activate_fails + .count() + > 0, "{}", - net_metrics!(devn, activate_fails.count()) + NET_METRICS + .read() + .unwrap() + .metrics + .get(devn) + .unwrap() + .activate_fails + .count() ); // we expect only 2 tests (this and test_max_net_dev_metrics) // to update activate_fails count for eth0. assert!( - net_metrics!(devn, activate_fails.count()) <= 2, + NET_METRICS + .read() + .unwrap() + .metrics + .get(devn) + .unwrap() + .activate_fails + .count() + <= 2, "{}", - net_metrics!(devn, activate_fails.count()) + NET_METRICS + .read() + .unwrap() + .metrics + .get(devn) + .unwrap() + .activate_fails + .count() ); - net_metrics!(&String::from(devn), activate_fails.inc()); - net_metrics!(&String::from(devn), rx_bytes_count.add(5)); - assert!(net_metrics!(&String::from(devn), rx_bytes_count.count()) >= 5); + NET_METRICS + .read() + .unwrap() + .metrics + .get(devn) + .unwrap() + .activate_fails + .inc(); + NET_METRICS + .read() + .unwrap() + .metrics + .get(devn) + .unwrap() + .rx_bytes_count + .add(5); + assert!( + NET_METRICS + .read() + .unwrap() + .metrics + .get(devn) + .unwrap() + .rx_bytes_count + .count() + >= 5 + ); } } diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs index 3726362226c..376920f56b6 100644 --- a/src/vmm/src/devices/virtio/net/test_utils.rs +++ b/src/vmm/src/devices/virtio/net/test_utils.rs @@ -352,6 +352,7 @@ pub mod test { use event_manager::{EventManager, SubscriberId, SubscriberOps}; + use crate::check_metric_after_block; use crate::devices::virtio::net::device::vnet_hdr_len; use crate::devices::virtio::net::gen::ETH_HLEN; use crate::devices::virtio::net::test_utils::{ @@ -366,7 +367,6 @@ pub mod test { use crate::vstate::memory::{ Address, Bytes, GuestAddress, GuestMemoryExtension, GuestMemoryMmap, }; - use crate::{check_net_metric_after_block, NET_METRICS}; pub struct TestHelper<'a> { pub event_manager: EventManager>>, @@ -496,8 +496,8 @@ pub mod test { // Inject frame to tap and run epoll. let frame = inject_tap_tx_frame(&self.net(), frame_len); - check_net_metric_after_block!( - net_metrics!(&self.net().id, rx_packets_count.count()), + check_metric_after_block!( + self.net().metrics.rx_packets_count, 0, self.event_manager.run_with_timeout(100).unwrap() ); @@ -524,8 +524,8 @@ pub mod test { VIRTQ_DESC_F_WRITE, )], ); - check_net_metric_after_block!( - net_metrics!(&self.net().id, rx_packets_count.count()), + check_metric_after_block!( + self.net().metrics.rx_packets_count, 1, self.event_manager.run_with_timeout(100).unwrap() ); diff --git a/src/vmm/src/devices/virtio/test_utils.rs b/src/vmm/src/devices/virtio/test_utils.rs index a5199f5683a..564b59f7779 100644 --- a/src/vmm/src/devices/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/test_utils.rs @@ -22,20 +22,6 @@ macro_rules! check_metric_after_block { }}; } -#[macro_export] -// macro created from check_metric_after_block because `metric` will be -// emitted by NET_METRICS macro which does not fit with the $metric.count() -// that check_metric_after_block had. -// TODO: After all devices have per device metrics we won't need the -// check_metric_after_block macro. -macro_rules! check_net_metric_after_block { - ($metric:expr, $delta:expr, $block:expr) => {{ - let before = $metric; - let _ = $block; - assert_eq!($metric, before + $delta, "unexpected metric value"); - }}; -} - /// Creates a [`GuestMemoryMmap`] with a single region of the given size starting at guest physical /// address 0 pub fn single_region_mem(region_size: usize) -> GuestMemoryMmap {