Skip to content

Commit

Permalink
[network metrics] instance network interface metrics
Browse files Browse the repository at this point in the history
Overview
========

This is related to the ongoing work in [plumbing instance/guest
metrics](#742) through to an
oximeter producer in propolis.

Includes:
* An update to `NetworkInterfaceRequest` to include an interface_id, which is part of the request on the [sled instance in Omicron](oxidecomputer/omicron#6414).
* Within initialization, with concrete types, we create interface_ids mapping nic UUIDs => `minor` number of the device instance of viona.
  - The device instance number matches the Kstat we'll look for.
  - We track this mapping via a type alias `InterfaceIdentifiers`, which is passed through from Machine initialization to VM objects (and VM objects shared), the latter of which we can use as part of registering the oximeter producer.
* A new stats module for collecting network_interface metrics.
  - In `to_samples` we generate multiple network interface targets based on vnics per instance, appropriately updating the `interface_id`, which is tracked in `interface_ids`.
* Move kstat_types out of virtual machine-specific stats and use its mocking for other stats tests

Dependencies
------------

- [ ] oxidecomputer/omicron#6414
- [ ] https://code.illumos.org/c/illumos-gate/+/3630
  • Loading branch information
zeeshanlakhani committed Aug 26, 2024
1 parent 64350cd commit bbef1cf
Show file tree
Hide file tree
Showing 17 changed files with 1,452 additions and 403 deletions.
1,001 changes: 761 additions & 240 deletions Cargo.lock

Large diffs are not rendered by default.

19 changes: 18 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ bitflags = "2.4"
bitstruct = "0.1"
bitvec = "1.0"
byteorder = "1"
bytes = "1.1"
bytes = "1.7.1"
camino = "1.1.6"
cargo_metadata = "0.18.1"
cc = "1.0.73"
Expand Down Expand Up @@ -164,3 +164,20 @@ tracing-subscriber = "0.3.14"
usdt = { version = "0.5", default-features = false }
uuid = "1.3.2"
zerocopy = "0.7.34"


#
# It's common during development to use a local copy of various complex
# dependencies. If you want to use those, uncomment one of these blocks.
#
# [patch."https://github.com/oxidecomputer/omicron"]
# internal-dns = { path = "../omicron/internal-dns" }
# nexus-client = { path = "../omicron/clients/nexus-client" }
# omicron-common = { path = "../omicron/common" }
# omicron-zone-package = "0.9.0"
# oximeter-instruments = { path = "../omicron/oximeter/instruments" }
# oximeter-producer = { path = "../omicron/oximeter/producer" }
# oximeter = { path = "../omicron/oximeter/oximeter" }
# [patch."https://github.com/oxidecomputer/crucible"]
# crucible = { path = "../crucible/upstairs" }
# crucible-client-types = { path = "../crucible/crucible-client-types" }
12 changes: 9 additions & 3 deletions bin/propolis-server/src/lib/initializer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@ use std::os::unix::fs::FileTypeExt;
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};


use crate::serial::Serial;
use crate::stats::virtual_machine::VirtualMachine;
use crate::vm::{BlockBackendMap, CrucibleBackendMap, DeviceMap};
use crate::stats::VirtualMachine;
use crate::vm::{BlockBackendMap, CrucibleBackendMap, DeviceMap, InterfaceIdentifiers};
use anyhow::{Context, Result};
use crucible_client_types::VolumeConstructionRequest;
pub use nexus_client::Client as NexusClient;
Expand All @@ -37,7 +38,7 @@ use propolis_api_types::instance_spec::{self, v0::InstanceSpecV0};
use propolis_api_types::InstanceProperties;
use slog::info;

// Arbitrary ROM limit for now
/// Arbitrary ROM limit for now
const MAX_ROM_SIZE: usize = 0x20_0000;

fn get_spec_guest_ram_limits(spec: &InstanceSpecV0) -> (usize, usize) {
Expand Down Expand Up @@ -111,6 +112,7 @@ pub struct MachineInitializer<'a> {
pub(crate) log: slog::Logger,
pub(crate) machine: &'a Machine,
pub(crate) devices: DeviceMap,
pub(crate) interface_ids: InterfaceIdentifiers,
pub(crate) block_backends: BlockBackendMap,
pub(crate) crucible_backends: CrucibleBackendMap,
pub(crate) spec: &'a InstanceSpecV0,
Expand Down Expand Up @@ -633,6 +635,10 @@ impl<'a> MachineInitializer<'a> {
)?;
self.devices
.insert(format!("pci-virtio-viona-{}", bdf), viona.clone());

self.interface_ids
.push((vnic_spec.interface_id, viona.instance_id()?));

chipset.pci_attach(bdf, viona);
}
Ok(())
Expand Down
1 change: 1 addition & 0 deletions bin/propolis-server/src/lib/spec/api_request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ pub(super) fn parse_nic_from_request(
let (device_name, backend_name) = super::pci_path_to_nic_names(pci_path);
let device_spec = NetworkDeviceV0::VirtioNic(VirtioNic {
backend_name: backend_name.clone(),
interface_id: nic.interface_id,
pci_path,
});

Expand Down
3 changes: 3 additions & 0 deletions bin/propolis-server/src/lib/spec/config_toml.rs
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,9 @@ pub(super) fn parse_network_device_from_config(

let device_spec = NetworkDeviceV0::VirtioNic(VirtioNic {
backend_name: backend_name.clone(),
// We don't allow for configuration to specify the interface_id, so we
// generate a new one.
interface_id: uuid::Uuid::new_v4(),
pci_path,
});

Expand Down
244 changes: 205 additions & 39 deletions bin/propolis-server/src/lib/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,31 +10,43 @@ use oximeter::{
types::{Cumulative, ProducerRegistry, Sample},
Metric, MetricsError, Producer,
};
use oximeter_producer::{Config, Error, Server};
use slog::Logger;

use std::net::SocketAddr;
use std::sync::{Arc, Mutex};
use uuid::Uuid;

use crate::server::MetricsEndpointConfig;
use crate::stats::virtual_machine::VirtualMachine;

// Propolis is built and some tests are run on non-illumos systems. The real
// `kstat` infrastructure cannot be built there, so some conditional compilation
// tricks are needed
#[cfg(all(not(test), target_os = "illumos"))]
use oximeter_instruments::kstat::KstatSampler;
use oximeter_producer::{self, Config, Server};
use slog::Logger;
use std::{
net::SocketAddr,
sync::{Arc, Mutex},
};

use crate::server::MetricsEndpointConfig;

mod network_interface;
mod pvpanic;
pub(crate) mod virtual_machine;
pub use self::pvpanic::PvpanicProducer;
mod virtual_machine;

// Interval on which we ask `oximeter` to poll us for metric data.
pub(crate) use self::network_interface::InstanceNetworkInterfaces;
pub(crate) use self::pvpanic::PvpanicProducer;
pub(crate) use self::virtual_machine::VirtualMachine;

/// Interval on which we ask `oximeter` to poll us for metric data.
///
/// This is used when setting-up the producer endpoint.
const OXIMETER_STAT_INTERVAL: tokio::time::Duration =
tokio::time::Duration::from_secs(30);

// Interval on which we produce vCPU metrics.
/// Interval on which we sample instance/guest network interface metrics.
///
/// This matches what we're currently using for sampling
/// sled link metrics.
#[cfg(all(not(test), target_os = "illumos"))]
const NETWORK_INTERFACE_SAMPLE_INTERVAL: std::time::Duration =
std::time::Duration::from_secs(10);

/// Interval on which we produce vCPU metrics.
#[cfg(all(not(test), target_os = "illumos"))]
const VCPU_KSTAT_INTERVAL: std::time::Duration =
std::time::Duration::from_secs(5);
Expand Down Expand Up @@ -132,11 +144,11 @@ impl Producer for ServerStatsOuter {
/// task, and will periodically renew that registration. The returned server is
/// running, and need not be poked or renewed to successfully serve metric data.
pub fn start_oximeter_server(
id: Uuid,
id: uuid::Uuid,
config: &MetricsEndpointConfig,
log: &Logger,
registry: &ProducerRegistry,
) -> Result<Server, Error> {
) -> Result<Server, oximeter_producer::Error> {
// Request an ephemeral port on which to serve metrics.
let producer_address = SocketAddr::new(config.listen_addr, 0);
let registration_address = config.registration_addr;
Expand Down Expand Up @@ -181,16 +193,16 @@ pub fn start_oximeter_server(
#[cfg_attr(not(all(not(test), target_os = "illumos")), allow(unused_variables))]
pub async fn register_server_metrics(
registry: &ProducerRegistry,
nics: InstanceNetworkInterfaces,
virtual_machine: VirtualMachine,
log: &Logger,
) -> anyhow::Result<ServerStatsOuter> {
let stats = ServerStats::new(virtual_machine.clone());

let stats_outer = ServerStatsOuter {
server_stats_wrapped: Arc::new(Mutex::new(stats)),
// Setup the collection of kstats for this instance.
#[cfg(all(not(test), target_os = "illumos"))]
kstat_sampler: setup_kstat_tracking(log, virtual_machine).await,
kstat_sampler: setup_kstat_tracking(log, nics, virtual_machine).await,
};

registry.register_producer(stats_outer.clone())?;
Expand All @@ -201,34 +213,188 @@ pub async fn register_server_metrics(
#[cfg(all(not(test), target_os = "illumos"))]
async fn setup_kstat_tracking(
log: &Logger,
nics: InstanceNetworkInterfaces,
virtual_machine: VirtualMachine,
) -> Option<KstatSampler> {
let kstat_limit =
usize::try_from(virtual_machine.n_vcpus() * KSTAT_LIMIT_PER_VCPU)
.unwrap();
match KstatSampler::with_sample_limit(log, kstat_limit) {
Ok(sampler) => {
let details = oximeter_instruments::kstat::CollectionDetails::never(
VCPU_KSTAT_INTERVAL,
);
if let Err(e) = sampler.add_target(virtual_machine, details).await {
slog::error!(
log,
"failed to add VirtualMachine target, \
vCPU stats will be unavailable";
"error" => ?e,
);
}
Some(sampler)
}
Err(e) => {
let n_interfaces = nics.interface_ids.len() as u32;
let kstat_limit = usize::try_from(
(virtual_machine.n_vcpus() * KSTAT_LIMIT_PER_VCPU)
+ (n_interfaces * KSTAT_LIMIT_PER_VCPU),
)
.unwrap();

let sampler = KstatSampler::with_sample_limit(log, kstat_limit)
.map_err(|err| {
slog::error!(
log,
"failed to create KstatSampler, \
vCPU and link stats will be unavailable";
"error" => ?err,
);
err
})
.ok()?;

let vcpu_details = oximeter_instruments::kstat::CollectionDetails::never(
VCPU_KSTAT_INTERVAL,
);

// We don't need to check the result of this operation, as we'll keep the
// sampler around even if we can't add the target.
let _ = sampler
.add_target(virtual_machine, vcpu_details)
.await
.map_err(|err| {
slog::error!(
log,
"failed to add VirtualMachine target, \
vCPU stats will be unavailable";
"error" => ?e,
"error" => ?err,
);
None
err
})
.map(|_| {
slog::debug!(
log,
"Added new VirtualMachine target to kstat sampler"
)
});

let nic_details = oximeter_instruments::kstat::CollectionDetails::never(
NETWORK_INTERFACE_SAMPLE_INTERVAL,
);
let interface_id = nics.target.interface_id;

// We don't need to check the result of this operation, as we'll keep the
// sampler around even if we can't add the target.
let _ = sampler
.add_target(nics, nic_details)
.await
.map_err(|err| {
slog::error!(
log,
"failed to add network interface target, \
network interface stats will be unavailable";
"network_interface_id" => %interface_id,
"error" => ?err
);
err
})
.map(|_| {
slog::debug!(
log,
"Added new network interface target to kstat sampler";
"network_interface_id" => %interface_id,
)
});

Some(sampler)
}

#[cfg(all(not(test), target_os = "illumos"))]
mod kstat_types {
pub(crate) use kstat_rs::{Data, Kstat, Named, NamedData};
pub(crate) use oximeter_instruments::kstat::{
hrtime_to_utc, ConvertNamedData, Error, KstatList, KstatTarget,
};
}

/// Mock the relevant subset of `kstat-rs` types needed for tests.
#[cfg(not(all(not(test), target_os = "illumos")))]
mod kstat_types {
use chrono::DateTime;
use chrono::Utc;
use oximeter::{Sample, Target};

pub(crate) type KstatList<'a, 'k> =
&'a [(DateTime<Utc>, Kstat<'k>, Data<'k>)];

pub(crate) trait KstatTarget:
Target + Send + Sync + 'static + std::fmt::Debug
{
/// Return true for any kstat you're interested in.
fn interested(&self, kstat: &Kstat<'_>) -> bool;

/// Convert from a kstat and its data to a list of samples.
fn to_samples(
&self,
kstats: KstatList<'_, '_>,
) -> Result<Vec<Sample>, Error>;
}

#[derive(Debug, Clone)]
pub(crate) enum Data<'a> {
Named(Vec<Named<'a>>),
#[allow(dead_code)]
Null,
}

#[derive(Debug, Clone)]
pub(crate) enum NamedData<'a> {
UInt32(u32),
UInt64(u64),
String(&'a str),
}

#[derive(Debug)]
pub(crate) struct Kstat<'a> {
pub ks_module: &'a str,
pub ks_instance: i32,
pub ks_name: &'a str,
pub ks_snaptime: i64,
}

#[derive(Debug, Clone)]
pub(crate) struct Named<'a> {
pub name: &'a str,
pub value: NamedData<'a>,
}

#[allow(unused)]
pub(crate) trait ConvertNamedData {
fn as_i32(&self) -> Result<i32, Error>;
fn as_u32(&self) -> Result<u32, Error>;
fn as_i64(&self) -> Result<i64, Error>;
fn as_u64(&self) -> Result<u64, Error>;
}

impl<'a> ConvertNamedData for NamedData<'a> {
fn as_i32(&self) -> Result<i32, Error> {
unimplemented!()
}

fn as_u32(&self) -> Result<u32, Error> {
if let NamedData::UInt32(x) = self {
Ok(*x)
} else {
panic!()
}
}

fn as_i64(&self) -> Result<i64, Error> {
unimplemented!()
}

fn as_u64(&self) -> Result<u64, Error> {
if let NamedData::UInt64(x) = self {
Ok(*x)
} else {
panic!()
}
}
}

#[derive(thiserror::Error, Clone, Debug)]
pub(crate) enum Error {
#[error("No such kstat")]
NoSuchKstat,
#[error("Expected a named kstat")]
ExpectedNamedKstat,
#[error("Sample error")]
Sample(#[from] oximeter::MetricsError),
}

pub(crate) fn hrtime_to_utc(_: i64) -> Result<DateTime<Utc>, Error> {
Ok(Utc::now())
}
}
Loading

0 comments on commit bbef1cf

Please sign in to comment.