From a7271cc473fef34999f947d92c9cf760b9926652 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 14 Aug 2024 15:33:56 -0700 Subject: [PATCH 01/77] sketch schema for sled sensor measurements --- .../oximeter/schema/sensor-measurement.toml | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 oximeter/oximeter/schema/sensor-measurement.toml diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml new file mode 100644 index 0000000000..54bd95e854 --- /dev/null +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -0,0 +1,67 @@ +[target] +name = "sled" +description = "A compute sled" +authz_scope = "fleet" +versions = [ + { version = 1, fields = ["rack_id", "sled_id", "sled_model", "sled_model", "sled_revision", "sled_serial" ]} +] + +[fields.rack_id] +type = "uuid" +description = "ID for the sled's rack" + +[fields.sled_id] +type = "uuid" +description = "ID for the sled" + +[fields.sled_model] +type = "string" +description = "Model number of the the sled" + +[fields.sled_revision] +type = "u32" +description = "Revision number of the sled" + +[fields.sled_serial] +type = "string" +description = "Serial number of the sled" + +[fields.sensor] +type = "string" +description = "A name identifying the sensor that recorded a measurement" + +[[metrics]] +name = "temperature" +description = "Temperature reading in degrees Celcius" +units = "degrees_celcius" +datum_type = "f64" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "current" +description = "Electric current reading in amperes" +units = "amps" +datum_type = "f64" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "voltage" +description = "A voltage reading" +units = "volts" +datum_type = "f64" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "fan_speed" +description = "A fan speed measurement, in rotations per minute" +units = "rpm" +datum_type = 
"f64" +versions = [ + { added_in = 1, fields = ["sensor"]} +] From 7d49243c996a113ee991ffe4b13f7758a004ead3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 14 Aug 2024 19:39:11 -0700 Subject: [PATCH 02/77] add errors to schema --- .../oximeter/schema/sensor-measurement.toml | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index 54bd95e854..06c69d36a0 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -30,11 +30,15 @@ description = "Serial number of the sled" type = "string" description = "A name identifying the sensor that recorded a measurement" +[fields.error] +type = "string" +description = "A string identifying the type of sensor error that occurred" + [[metrics]] name = "temperature" description = "Temperature reading in degrees Celcius" units = "degrees_celcius" -datum_type = "f64" +datum_type = "f32" versions = [ { added_in = 1, fields = ["sensor"]} ] @@ -43,7 +47,7 @@ versions = [ name = "current" description = "Electric current reading in amperes" units = "amps" -datum_type = "f64" +datum_type = "f32" versions = [ { added_in = 1, fields = ["sensor"]} ] @@ -52,7 +56,7 @@ versions = [ name = "voltage" description = "A voltage reading" units = "volts" -datum_type = "f64" +datum_type = "f32" versions = [ { added_in = 1, fields = ["sensor"]} ] @@ -61,7 +65,16 @@ versions = [ name = "fan_speed" description = "A fan speed measurement, in rotations per minute" units = "rpm" -datum_type = "f64" +datum_type = "f32" versions = [ { added_in = 1, fields = ["sensor"]} ] + +[[metrics]] +name = "sensor_error_count" +description = "Cumulative count of errors reported by a sensor" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = ["sensor", "error"]} +] From 41c071ecc4b82493422986babaaeb65dc4e54f91 Mon Sep 17 00:00:00 2001 
From: Eliza Weisman Date: Thu, 15 Aug 2024 09:51:02 -0700 Subject: [PATCH 03/77] wip --- Cargo.lock | 1 + gateway/Cargo.toml | 1 + gateway/src/lib.rs | 1 + gateway/src/metrics.rs | 7 +++++++ 4 files changed, 10 insertions(+) create mode 100644 gateway/src/metrics.rs diff --git a/Cargo.lock b/Cargo.lock index 837015f3bc..507350ee2d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5979,6 +5979,7 @@ dependencies = [ "omicron-test-utils", "omicron-workspace-hack", "once_cell", + "oximeter", "schemars", "serde", "serde_json", diff --git a/gateway/Cargo.toml b/gateway/Cargo.toml index 3cfd1d447b..19efb28070 100644 --- a/gateway/Cargo.toml +++ b/gateway/Cargo.toml @@ -39,6 +39,7 @@ tokio-tungstenite.workspace = true toml.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true +oximeter.workspace = true [dev-dependencies] expectorate.workspace = true diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs index e1eed05334..afe2b3bb70 100644 --- a/gateway/src/lib.rs +++ b/gateway/src/lib.rs @@ -6,6 +6,7 @@ mod config; mod context; mod error; mod management_switch; +mod metrics; mod serial_console; pub mod http_entrypoints; // TODO pub only for testing - is this right? diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs new file mode 100644 index 0000000000..07a5304377 --- /dev/null +++ b/gateway/src/metrics.rs @@ -0,0 +1,7 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use oximeter::types::ProducerRegistry; + +oximeter::use_timeseries!("sensor-measurement.toml"); From 358bd9614a4b8011720bc351ac774b1115fd1d6e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 15 Aug 2024 10:17:48 -0700 Subject: [PATCH 04/77] also add chrono dep (whoops) --- Cargo.lock | 1 + gateway/Cargo.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 507350ee2d..362dd9c533 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5961,6 +5961,7 @@ dependencies = [ "anyhow", "base64 0.22.1", "camino", + "chrono", "clap", "dropshot", "expectorate", diff --git a/gateway/Cargo.toml b/gateway/Cargo.toml index 19efb28070..e7b45684c9 100644 --- a/gateway/Cargo.toml +++ b/gateway/Cargo.toml @@ -11,6 +11,7 @@ workspace = true anyhow.workspace = true base64.workspace = true camino.workspace = true +chrono.workspace = true clap.workspace = true dropshot.workspace = true futures.workspace = true From 8f13131586d19470c8d86685ac7b9ce318f41f31 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 15 Aug 2024 10:20:34 -0700 Subject: [PATCH 05/77] change schema to use component as target --- .../oximeter/schema/sensor-measurement.toml | 41 ++++++++++++++----- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index 06c69d36a0..66e829bf26 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -1,9 +1,22 @@ +format_version = 1 + [target] -name = "sled" -description = "A compute sled" +name = "sled_component" +description = "A component on a compute sled which reports sensor measurements" authz_scope = "fleet" versions = [ - { version = 1, fields = ["rack_id", "sled_id", "sled_model", "sled_model", "sled_revision", "sled_serial" ]} + { + version = 1, + fields = [ + "rack_id", + "sled_id", + "sled_model", + "sled_revision", + "sled_serial", + "component", + "device", + ] + } 
] [fields.rack_id] @@ -26,9 +39,17 @@ description = "Revision number of the sled" type = "string" description = "Serial number of the sled" -[fields.sensor] +[fields.component] +type = "string" +description = "The service processor component ID uniquely identifying the component on the sled" + +[fields.device] +type = "string" +description = "The name of the device which recorded a sensor reading" + +[fields.measurement] type = "string" -description = "A name identifying the sensor that recorded a measurement" +description = "A name identifying the quantity measured by a sensor measurement" [fields.error] type = "string" @@ -40,7 +61,7 @@ description = "Temperature reading in degrees Celcius" units = "degrees_celcius" datum_type = "f32" versions = [ - { added_in = 1, fields = ["sensor"]} + { added_in = 1, fields = ["measurement"]} ] [[metrics]] @@ -49,7 +70,7 @@ description = "Electric current reading in amperes" units = "amps" datum_type = "f32" versions = [ - { added_in = 1, fields = ["sensor"]} + { added_in = 1, fields = ["measurement"]} ] [[metrics]] @@ -58,7 +79,7 @@ description = "A voltage reading" units = "volts" datum_type = "f32" versions = [ - { added_in = 1, fields = ["sensor"]} + { added_in = 1, fields = ["measurement"]} ] [[metrics]] @@ -67,7 +88,7 @@ description = "A fan speed measurement, in rotations per minute" units = "rpm" datum_type = "f32" versions = [ - { added_in = 1, fields = ["sensor"]} + { added_in = 1, fields = ["measurement"]} ] [[metrics]] @@ -76,5 +97,5 @@ description = "Cumulative count of errors reported by a sensor" units = "count" datum_type = "cumulative_u64" versions = [ - { added_in = 1, fields = ["sensor", "error"]} + { added_in = 1, fields = ["measurement", "error"]} ] From 0e8a1c4ff88d646631e7ab3db4454d9ed7784737 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 15 Aug 2024 10:32:49 -0700 Subject: [PATCH 06/77] TOML syntax is bizarre... 
--- .../oximeter/schema/sensor-measurement.toml | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index 66e829bf26..36b144b6de 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -5,18 +5,15 @@ name = "sled_component" description = "A component on a compute sled which reports sensor measurements" authz_scope = "fleet" versions = [ - { - version = 1, - fields = [ - "rack_id", - "sled_id", - "sled_model", - "sled_revision", - "sled_serial", - "component", - "device", - ] - } + { version = 1, fields = [ + "rack_id", + "sled_id", + "sled_model", + "sled_revision", + "sled_serial", + "component", + "device", + ]} ] [fields.rack_id] From 71de57f538607b36009a476dc6cb0bb5d4dbb3fd Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 15 Aug 2024 10:35:53 -0700 Subject: [PATCH 07/77] add switch_component target --- .../oximeter/schema/sensor-measurement.toml | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index 36b144b6de..87ca4f20ea 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -16,6 +16,23 @@ versions = [ ]} ] +[target] +name = "switch_component" +description = "A component on a rack switch which reports sensor measurements" +authz_scope = "fleet" +versions = [ + { version = 1, fields = [ + "rack_id", + "switch_id", + "switch_revision", + "switch_component", + "switch_serial", + "component", + "device", + ]} +] + + [fields.rack_id] type = "uuid" description = "ID for the sled's rack" @@ -36,6 +53,22 @@ description = "Revision number of the sled" type = "string" description = "Serial number of the sled" +[fields.switch_id] +type = "uuid" +description = "ID for the switch" + 
+[fields.switch_model] +type = "string" +description = "Model number of the the switch" + +[fields.switch_revision] +type = "u32" +description = "Revision number of the switch" + +[fields.switch_serial] +type = "string" +description = "Serial number of the switch" + [fields.component] type = "string" description = "The service processor component ID uniquely identifying the component on the sled" From d7ee2a485220cff4c6069f52b840523c84960615 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 15 Aug 2024 10:39:38 -0700 Subject: [PATCH 08/77] urgh you can't have two targets in a schema --- .../oximeter/schema/sensor-measurement.toml | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index 87ca4f20ea..4e6393cda6 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -16,22 +16,21 @@ versions = [ ]} ] -[target] -name = "switch_component" -description = "A component on a rack switch which reports sensor measurements" -authz_scope = "fleet" -versions = [ - { version = 1, fields = [ - "rack_id", - "switch_id", - "switch_revision", - "switch_component", - "switch_serial", - "component", - "device", - ]} -] - +# [target] +# name = "switch_component" +# description = "A component on a rack switch which reports sensor measurements" +# authz_scope = "fleet" +# versions = [ +# { version = 1, fields = [ +# "rack_id", +# "switch_id", +# "switch_revision", +# "switch_component", +# "switch_serial", +# "component", +# "device", +# ]} +# ] [fields.rack_id] type = "uuid" From 8c6406925b10fbe5ea7f9b3271d747278a61fa5e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 15 Aug 2024 13:09:35 -0700 Subject: [PATCH 09/77] basically all the plumbing now we just need to "actually read the sensors!" 
--- Cargo.lock | 1 + clients/nexus-client/src/lib.rs | 4 + clients/oximeter-client/src/lib.rs | 1 + common/src/api/internal/nexus.rs | 2 + gateway/Cargo.toml | 1 + gateway/src/lib.rs | 12 +- gateway/src/metrics.rs | 229 ++++++++++++++++++++++++ nexus/db-model/src/producer_endpoint.rs | 7 + openapi/nexus-internal.json | 7 + openapi/oximeter.json | 7 + 10 files changed, 270 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 362dd9c533..fbd3e406ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5981,6 +5981,7 @@ dependencies = [ "omicron-workspace-hack", "once_cell", "oximeter", + "oximeter-producer", "schemars", "serde", "serde_json", diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 62366c45e1..a55c5d4013 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -213,6 +213,7 @@ impl From fn from(kind: omicron_common::api::internal::nexus::ProducerKind) -> Self { use omicron_common::api::internal::nexus::ProducerKind; match kind { + ProducerKind::ManagementGateway => Self::ManagementGateway, ProducerKind::SledAgent => Self::SledAgent, ProducerKind::Service => Self::Service, ProducerKind::Instance => Self::Instance, @@ -390,6 +391,9 @@ impl From fn from(kind: types::ProducerKind) -> Self { use omicron_common::api::internal::nexus::ProducerKind; match kind { + types::ProducerKind::ManagementGateway => { + ProducerKind::ManagementGateway + } types::ProducerKind::SledAgent => ProducerKind::SledAgent, types::ProducerKind::Instance => ProducerKind::Instance, types::ProducerKind::Service => ProducerKind::Service, diff --git a/clients/oximeter-client/src/lib.rs b/clients/oximeter-client/src/lib.rs index 74fc6968e8..c23e5177a0 100644 --- a/clients/oximeter-client/src/lib.rs +++ b/clients/oximeter-client/src/lib.rs @@ -26,6 +26,7 @@ impl From fn from(kind: omicron_common::api::internal::nexus::ProducerKind) -> Self { use omicron_common::api::internal::nexus; match kind { + 
nexus::ProducerKind::ManagementGateway => Self::ManagementGateway, nexus::ProducerKind::Service => Self::Service, nexus::ProducerKind::SledAgent => Self::SledAgent, nexus::ProducerKind::Instance => Self::Instance, diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 7f4eb358a4..4daea6a198 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -223,6 +223,8 @@ pub enum ProducerKind { Service, /// The producer is a Propolis VMM managing a guest instance. Instance, + /// The producer is a management gateway service. + ManagementGateway, } /// Information announced by a metric server, used so that clients can contact it and collect diff --git a/gateway/Cargo.toml b/gateway/Cargo.toml index e7b45684c9..2dce15892d 100644 --- a/gateway/Cargo.toml +++ b/gateway/Cargo.toml @@ -41,6 +41,7 @@ toml.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true oximeter.workspace = true +oximeter-producer.workspace = true [dev-dependencies] expectorate.workspace = true diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs index afe2b3bb70..50ef95b8bd 100644 --- a/gateway/src/lib.rs +++ b/gateway/src/lib.rs @@ -63,6 +63,8 @@ pub struct Server { /// `http_servers` all_servers_shutdown: FuturesUnordered, request_body_max_bytes: usize, + /// handle to the SP sensor metrics subsystem + metrics: metrics::Metrics, log: Logger, } @@ -152,6 +154,10 @@ impl Server { let mut http_servers = HashMap::with_capacity(args.addresses.len()); let all_servers_shutdown = FuturesUnordered::new(); + let metrics = metrics::Metrics::new(&log, &args).map_err(|err| { + format!("failed to initialize metrics subsystem: {err}") + })?; + for addr in args.addresses { start_dropshot_server( &apictx, @@ -168,6 +174,7 @@ impl Server { http_servers, all_servers_shutdown, request_body_max_bytes: config.dropshot.request_body_max_bytes, + metrics, log, }) } @@ -276,12 +283,14 @@ impl Server { server.close().await?; } + 
self.metrics.update_server_addrs(addresses).await; + Ok(()) } /// The rack_id will be set on a refresh of the SMF property when the sled /// agent starts. - pub fn set_rack_id(&self, rack_id: Option) { + pub fn set_rack_id(&mut self, rack_id: Option) { if let Some(rack_id) = rack_id { let val = self.apictx.rack_id.get_or_init(|| rack_id); if *val != rack_id { @@ -292,6 +301,7 @@ impl Server { "ignored_new_rack_id" => %rack_id); } else { info!(self.apictx.log, "Set rack_id"; "rack_id" => %rack_id); + self.metrics.set_rack_id(rack_id); } } else { warn!(self.apictx.log, "SMF refresh called without a rack id"); diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 07a5304377..1855ddeeba 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -2,6 +2,235 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use anyhow::Context; +use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; use oximeter::types::ProducerRegistry; +use oximeter::types::Sample; +use oximeter::MetricsError; +use std::net::IpAddr; +use std::net::SocketAddr; +use std::net::SocketAddrV6; +use std::sync::{Arc, Mutex}; +use std::time::Duration; +use tokio::sync::oneshot; +use tokio::sync::watch; +use tokio::task::JoinHandle; +use uuid::Uuid; + +use crate::MgsArguments; oximeter::use_timeseries!("sensor-measurement.toml"); + +/// Handle to the metrics task. +pub struct Metrics { + addrs_tx: watch::Sender>, + rack_id_tx: Option>, + manager: JoinHandle>, + poller: JoinHandle>, +} + +/// Actually polls SP sensor readings +struct Poller { + samples: Arc>>, + log: slog::Logger, +} + +/// Manages a metrics server and stuff. +struct Manager { + log: slog::Logger, + addrs: watch::Receiver>, + registry: ProducerRegistry, +} + +#[derive(Debug)] +struct Producer(Arc>>); + +/// The interval on which we ask `oximeter` to poll us for metric data. 
+// N.B.: I picked this pretty arbitrarily... +const METRIC_COLLECTION_INTERVAL: Duration = Duration::from_secs(10); + +/// The maximum Dropshot request size for the metrics server. +const METRIC_REQUEST_MAX_SIZE: usize = 10 * 1024 * 1024; + +impl Metrics { + pub fn new( + log: &slog::Logger, + MgsArguments { id, rack_id, addresses }: &MgsArguments, + ) -> anyhow::Result { + let registry = ProducerRegistry::with_id(*id); + let samples = Arc::new(Mutex::new(Vec::new())); + + registry + .register_producer(Producer(samples.clone())) + .context("failed to register metrics producer")?; + + let (rack_id_tx, rack_id_rx) = oneshot::channel(); + let rack_id_tx = if let Some(rack_id) = *rack_id { + rack_id_tx.send(rack_id).expect( + "we just created the channel; it therefore will not be \ + closed", + ); + None + } else { + Some(rack_id_tx) + }; + let poller = tokio::spawn( + Poller { + samples, + log: log.new(slog::o!("component" => "sensor-poller")), + } + .run(rack_id_rx), + ); + + let (addrs_tx, addrs_rx) = + tokio::sync::watch::channel(addresses.clone()); + let manager = tokio::spawn( + Manager { + log: log.new(slog::o!("component" => "producer-server")), + addrs: addrs_rx, + registry, + } + .run(), + ); + Ok(Self { addrs_tx, rack_id_tx, manager, poller }) + } + + pub fn set_rack_id(&mut self, rack_id: Uuid) { + if let Some(tx) = self.rack_id_tx.take() { + tx.send(rack_id).expect("why has the sensor-poller task gone away?") + } + // ignoring duplicate attempt to set the rack ID... + } + + pub async fn update_server_addrs(&self, new_addrs: &[SocketAddrV6]) { + self.addrs_tx.send_if_modified(|current_addrs| { + if current_addrs.len() == new_addrs.len() + // N.B. that we could make this "faster" with a `HashSet`, + // but...the size of this Vec of addresses is probably going to + // two or three items, max, so the linear scan actually probably + // outperforms it... 
+ && current_addrs.iter().all(|addr| new_addrs.contains(addr)) + { + return false; + } + + // Reuse existing `Vec` capacity if possible.This is almost + // certainly not performance-critical, but it makes me feel happy. + current_addrs.clear(); + current_addrs.extend_from_slice(new_addrs); + true + }); + } +} + +impl oximeter::Producer for Producer { + fn produce( + &mut self, + ) -> Result>, MetricsError> { + let samples = { + let mut lock = self.0.lock().unwrap(); + std::mem::take(&mut *lock) + }; + Ok(Box::new(samples.into_iter())) + } +} + +impl Poller { + async fn run(self, rack_id: oneshot::Receiver) -> anyhow::Result<()> { + // First, wait until we know what the rack ID is... + let rack_id = rack_id.await.context( + "rack ID sender has gone away...we must be shutting down", + )?; + + anyhow::bail!("TODO(eliza): draw the rest of the owl!") + } +} + +impl Manager { + async fn run(mut self) -> anyhow::Result<()> { + let mut current_server: Option = None; + loop { + let current_ip = current_server.as_ref().map(|s| s.address().ip()); + let mut new_ip = None; + for addr in self.addrs.borrow_and_update().iter() { + let &ip = addr.ip(); + // Don't bind the metrics endpoint on ::1 + if ip.is_loopback() { + continue; + } + // If our current address is contained in the new addresses, + // no need to rebind. + if current_ip == Some(IpAddr::V6(ip)) { + new_ip = None; + break; + } else { + new_ip = Some(ip); + } + } + + if let Some(ip) = new_ip { + slog::info!( + &self.log, + "rebinding producer server on new IP"; + "new_ip" => ?ip, + "current_ip" => ?current_ip, + ); + let server = { + // Listen on any available socket, using the provided underlay IP. 
+ let address = SocketAddr::new(ip.into(), 0); + + // Discover Nexus via DNS + let registration_address = None; + + let server_info = ProducerEndpoint { + id: self.registry.producer_id(), + kind: ProducerKind::ManagementGateway, + address, + interval: METRIC_COLLECTION_INTERVAL, + }; + let config = oximeter_producer::Config { + server_info, + registration_address, + request_body_max_bytes: METRIC_REQUEST_MAX_SIZE, + log: oximeter_producer::LogConfig::Logger( + self.log.clone(), + ), + }; + oximeter_producer::Server::with_registry( + self.registry.clone(), + &config, + ) + .context("failed to start producer server")? + }; + + slog::info!( + &self.log, + "bound metrics producer server"; + "address" => %server.address(), + ); + + if let Some(old_server) = current_server.replace(server) { + let old_addr = old_server.address(); + if let Err(error) = old_server.close().await { + slog::error!( + &self.log, + "failed to close old metrics producer server"; + "address" => %old_addr, + "error" => %error, + ); + } else { + slog::debug!( + &self.log, + "old metrics producer server shut down"; + "address" => %old_addr, + ) + } + } + } + + // Wait for a subsequent address change. 
+ self.addrs.changed().await?; + } + } +} diff --git a/nexus/db-model/src/producer_endpoint.rs b/nexus/db-model/src/producer_endpoint.rs index 74a7356adb..c2fab2de5a 100644 --- a/nexus/db-model/src/producer_endpoint.rs +++ b/nexus/db-model/src/producer_endpoint.rs @@ -22,6 +22,7 @@ impl_enum_type!( #[diesel(sql_type = ProducerKindEnum)] pub enum ProducerKind; + ManagementGateway => b"management_gateway" SledAgent => b"sled_agent" Service => b"service" Instance => b"instance" @@ -30,6 +31,9 @@ impl_enum_type!( impl From for ProducerKind { fn from(kind: internal::nexus::ProducerKind) -> Self { match kind { + internal::nexus::ProducerKind::ManagementGateway => { + ProducerKind::ManagementGateway + } internal::nexus::ProducerKind::SledAgent => ProducerKind::SledAgent, internal::nexus::ProducerKind::Service => ProducerKind::Service, internal::nexus::ProducerKind::Instance => ProducerKind::Instance, @@ -40,6 +44,9 @@ impl From for ProducerKind { impl From for internal::nexus::ProducerKind { fn from(kind: ProducerKind) -> Self { match kind { + ProducerKind::ManagementGateway => { + internal::nexus::ProducerKind::ManagementGateway + } ProducerKind::SledAgent => internal::nexus::ProducerKind::SledAgent, ProducerKind::Service => internal::nexus::ProducerKind::Service, ProducerKind::Instance => internal::nexus::ProducerKind::Instance, diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 54b4822e51..111bd552d0 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -4443,6 +4443,13 @@ "enum": [ "instance" ] + }, + { + "description": "The producer is a management gateway service.", + "type": "string", + "enum": [ + "management_gateway" + ] } ] }, diff --git a/openapi/oximeter.json b/openapi/oximeter.json index f596ac6ee6..327351d961 100644 --- a/openapi/oximeter.json +++ b/openapi/oximeter.json @@ -277,6 +277,13 @@ "enum": [ "instance" ] + }, + { + "description": "The producer is a management gateway service.", + "type": "string", + 
"enum": [ + "management_gateway" + ] } ] } From 38de1666e3c12beffba84fff224323a94f9698b3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 15 Aug 2024 13:29:37 -0700 Subject: [PATCH 10/77] oh, okay, that's how you talk to the SPs! --- gateway/src/lib.rs | 7 ++++--- gateway/src/metrics.rs | 12 ++++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs index 50ef95b8bd..a7464465f0 100644 --- a/gateway/src/lib.rs +++ b/gateway/src/lib.rs @@ -154,9 +154,10 @@ impl Server { let mut http_servers = HashMap::with_capacity(args.addresses.len()); let all_servers_shutdown = FuturesUnordered::new(); - let metrics = metrics::Metrics::new(&log, &args).map_err(|err| { - format!("failed to initialize metrics subsystem: {err}") - })?; + let metrics = metrics::Metrics::new(&log, &args, apictx.clone()) + .map_err(|err| { + format!("failed to initialize metrics subsystem: {err}") + })?; for addr in args.addresses { start_dropshot_server( diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 1855ddeeba..69b4c414ee 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -1,7 +1,7 @@ // This Source Code Form is subject to the terms of the Mozilla Public // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. - +use crate::ServerContext; use anyhow::Context; use omicron_common::api::internal::nexus::ProducerEndpoint; use omicron_common::api::internal::nexus::ProducerKind; @@ -34,6 +34,7 @@ pub struct Metrics { struct Poller { samples: Arc>>, log: slog::Logger, + apictx: Arc, } /// Manages a metrics server and stuff. @@ -56,7 +57,8 @@ const METRIC_REQUEST_MAX_SIZE: usize = 10 * 1024 * 1024; impl Metrics { pub fn new( log: &slog::Logger, - MgsArguments { id, rack_id, addresses }: &MgsArguments, + MgsArguments { id, rack_id, addresses, .. 
}: &MgsArguments, + apictx: Arc, ) -> anyhow::Result { let registry = ProducerRegistry::with_id(*id); let samples = Arc::new(Mutex::new(Vec::new())); @@ -65,6 +67,11 @@ impl Metrics { .register_producer(Producer(samples.clone())) .context("failed to register metrics producer")?; + // Using a channel for this is, admittedly, a bit of an end-run around + // the `OnceLock` on the `ServerContext` that *also* stores the rack ID, + // but it has the nice benefit of allowing the `Poller` task to _await_ + // the rack ID being set...we might want to change other code to use a + // similar approach in the future. let (rack_id_tx, rack_id_rx) = oneshot::channel(); let rack_id_tx = if let Some(rack_id) = *rack_id { rack_id_tx.send(rack_id).expect( @@ -78,6 +85,7 @@ impl Metrics { let poller = tokio::spawn( Poller { samples, + apictx, log: log.new(slog::o!("component" => "sensor-poller")), } .run(rack_id_rx), From 12d660b6d76f39cc715b86bf32aff0eb2e753d84 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 15 Aug 2024 14:28:27 -0700 Subject: [PATCH 11/77] schema munging --- .../oximeter/schema/sensor-measurement.toml | 69 ++++++------------- 1 file changed, 21 insertions(+), 48 deletions(-) diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index 4e6393cda6..f951b5848e 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -1,76 +1,49 @@ format_version = 1 [target] -name = "sled_component" -description = "A component on a compute sled which reports sensor measurements" +name = "component" +description = "A component which reports sensor measurements" authz_scope = "fleet" versions = [ { version = 1, fields = [ "rack_id", - "sled_id", - "sled_model", - "sled_revision", - "sled_serial", + "chassis_type", + "model", + "revision", + "serial", "component", "device", ]} ] -# [target] -# name = "switch_component" -# description = "A component on a rack 
switch which reports sensor measurements" -# authz_scope = "fleet" -# versions = [ -# { version = 1, fields = [ -# "rack_id", -# "switch_id", -# "switch_revision", -# "switch_component", -# "switch_serial", -# "component", -# "device", -# ]} -# ] - [fields.rack_id] type = "uuid" -description = "ID for the sled's rack" - -[fields.sled_id] -type = "uuid" -description = "ID for the sled" +description = "ID of the rack on which this measurement was recorded." -[fields.sled_model] +[fields.model] type = "string" -description = "Model number of the the sled" +description = "Model number of the sled, switch, or power shelf" -[fields.sled_revision] +[fields.revision] type = "u32" -description = "Revision number of the sled" +description = "Revision number of the sled, switch, or power shelf" -[fields.sled_serial] +[fields.serial] type = "string" -description = "Serial number of the sled" - -[fields.switch_id] -type = "uuid" -description = "ID for the switch" - -[fields.switch_model] -type = "string" -description = "Model number of the the switch" - -[fields.switch_revision] -type = "u32" -description = "Revision number of the switch" +description = "Serial number of the sled, switch, or power shelf" -[fields.switch_serial] +[fields.chassis_type] type = "string" -description = "Serial number of the switch" +description = """ +What kind of thing the component resides on. 
This will be one of 'sled'\ +for components on compute sled; 'switch', for components on rack switches; \ +or 'power', for components on power shelves.""" [fields.component] type = "string" -description = "The service processor component ID uniquely identifying the component on the sled" +description = """ +The service processor component ID uniquely identifying the component on +the sled, switch, or power shelf.""" [fields.device] type = "string" From f584881df64da74165545fa75099c07f6e35371d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 15 Aug 2024 16:31:01 -0700 Subject: [PATCH 12/77] more schema munging --- .../oximeter/schema/sensor-measurement.toml | 42 +++++++++++++++---- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index f951b5848e..3fd2e051e8 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -8,9 +8,10 @@ versions = [ { version = 1, fields = [ "rack_id", "chassis_type", + "serial", "model", "revision", - "serial", + "firmware_id", "component", "device", ]} @@ -32,6 +33,10 @@ description = "Revision number of the sled, switch, or power shelf" type = "string" description = "Serial number of the sled, switch, or power shelf" +[fields.firmware_id] +type = "string" +description = "Hubris archive ID of the service processor when the measurement was recorded." 
+ [fields.chassis_type] type = "string" description = """ @@ -49,7 +54,7 @@ the sled, switch, or power shelf.""" type = "string" description = "The name of the device which recorded a sensor reading" -[fields.measurement] +[fields.name] type = "string" description = "A name identifying the quantity measured by a sensor measurement" @@ -63,34 +68,55 @@ description = "Temperature reading in degrees Celcius" units = "degrees_celcius" datum_type = "f32" versions = [ - { added_in = 1, fields = ["measurement"]} + { added_in = 1, fields = ["name"]} ] [[metrics]] name = "current" -description = "Electric current reading in amperes" +description = "Output current reading in amperes" units = "amps" datum_type = "f32" versions = [ + { added_in = 1, fields = ["name"]} +] { added_in = 1, fields = ["measurement"]} ] [[metrics]] name = "voltage" -description = "A voltage reading" +description = "Output voltage reading, in volts" units = "volts" datum_type = "f32" versions = [ - { added_in = 1, fields = ["measurement"]} + { added_in = 1, fields = ["name"]} ] +[[metrics]] +name = "input_current" +description = "Input electric current reading in amperes" +units = "amps" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["name"]} +] + +[[metrics]] +name = "input_voltage" +description = "Input electric voltage reading, in volts" +units = "volts" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["name"]} +] + + [[metrics]] name = "fan_speed" description = "A fan speed measurement, in rotations per minute" units = "rpm" datum_type = "f32" versions = [ - { added_in = 1, fields = ["measurement"]} + { added_in = 1, fields = ["name"]} ] [[metrics]] @@ -99,5 +125,5 @@ description = "Cumulative count of errors reported by a sensor" units = "count" datum_type = "cumulative_u64" versions = [ - { added_in = 1, fields = ["measurement", "error"]} + { added_in = 1, fields = ["name", "error"]} ] From 7fd33a7ae88fe7c5dc6e9a8bee53cbe7bce24782 Mon Sep 17 00:00:00 2001 From: Eliza 
Weisman Date: Thu, 15 Aug 2024 17:24:03 -0700 Subject: [PATCH 13/77] oops i forgot to do watts --- openapi/nexus.json | 1 + oximeter/oximeter/schema/sensor-measurement.toml | 9 ++++++++- oximeter/schema/src/codegen.rs | 1 + oximeter/types/src/schema.rs | 1 + 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/openapi/nexus.json b/openapi/nexus.json index 285dcd82bb..747a90c6f1 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -19934,6 +19934,7 @@ "nanoseconds", "volts", "amps", + "watts", "degrees_celcius" ] }, diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index 3fd2e051e8..e23693d9ac 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -79,7 +79,14 @@ datum_type = "f32" versions = [ { added_in = 1, fields = ["name"]} ] - { added_in = 1, fields = ["measurement"]} + +[[metrics]] +name = "power" +description = "Power reading, in watts" +units = "watts" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["name"]} ] [[metrics]] diff --git a/oximeter/schema/src/codegen.rs b/oximeter/schema/src/codegen.rs index 0429cf0534..df9d54a3fd 100644 --- a/oximeter/schema/src/codegen.rs +++ b/oximeter/schema/src/codegen.rs @@ -512,6 +512,7 @@ fn quote_units(units: Units) -> TokenStream { } Units::Amps => quote! { ::oximeter::schema::Units::Amps }, Units::Volts => quote! { ::oximeter::schema::Units::Volts }, + Units::Watts => quote! { ::oximeter::schema::Units::Watts }, Units::DegreesCelcius => { quote! { ::oximeter::schema::Units::DegreesCelcius } } diff --git a/oximeter/types/src/schema.rs b/oximeter/types/src/schema.rs index 80aaa6f101..649da87cca 100644 --- a/oximeter/types/src/schema.rs +++ b/oximeter/types/src/schema.rs @@ -189,6 +189,7 @@ pub enum Units { Nanoseconds, Volts, Amps, + Watts, DegreesCelcius, /// Rotations per minute. 
Rpm, From 0bd914bcc80662572fa62305e557af7a5b2d578a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 15 Aug 2024 17:25:27 -0700 Subject: [PATCH 14/77] draw the rest of the owl --- gateway/src/metrics.rs | 258 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 254 insertions(+), 4 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 69b4c414ee..ce377f0d7c 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -1,13 +1,22 @@ // This Source Code Form is subject to the terms of the Mozilla Public // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use crate::management_switch::SpIdentifier; +use crate::management_switch::SpType; use crate::ServerContext; use anyhow::Context; +use gateway_messages::measurement::MeasurementKind; +use gateway_messages::ComponentDetails; +use gateway_messages::DeviceCapabilities; +use gateway_sp_comms::SpComponent; +use gateway_sp_comms::VersionedSpState; use omicron_common::api::internal::nexus::ProducerEndpoint; use omicron_common::api::internal::nexus::ProducerKind; use oximeter::types::ProducerRegistry; use oximeter::types::Sample; use oximeter::MetricsError; +use std::borrow::Cow; +use std::collections::HashMap; use std::net::IpAddr; use std::net::SocketAddr; use std::net::SocketAddrV6; @@ -32,7 +41,7 @@ pub struct Metrics { /// Actually polls SP sensor readings struct Poller { - samples: Arc>>, + samples: Arc>>>, log: slog::Logger, apictx: Arc, } @@ -44,13 +53,31 @@ struct Manager { registry: ProducerRegistry, } +struct SpPoller { + apictx: Arc, + spid: SpIdentifier, + my_understanding: Mutex, + log: slog::Logger, + rack_id: Uuid, +} + +#[derive(Default)] +struct SpUnderstanding { + state: Option, + devices: Vec<(SpComponent, component::Component)>, +} + #[derive(Debug)] -struct Producer(Arc>>); +struct Producer(Arc>>>); /// The interval on which we ask `oximeter` to poll us for metric data. 
// N.B.: I picked this pretty arbitrarily... const METRIC_COLLECTION_INTERVAL: Duration = Duration::from_secs(10); +/// The interval at which we poll sensor readings from SPs. Bryan wants to try +/// 1Hz and see if the SP can handle it. +const POLL_INTERVAL: Duration = Duration::from_secs(1); + /// The maximum Dropshot request size for the metrics server. const METRIC_REQUEST_MAX_SIZE: usize = 10 * 1024 * 1024; @@ -140,18 +167,241 @@ impl oximeter::Producer for Producer { let mut lock = self.0.lock().unwrap(); std::mem::take(&mut *lock) }; - Ok(Box::new(samples.into_iter())) + Ok(Box::new(samples.into_iter().flatten())) } } impl Poller { async fn run(self, rack_id: oneshot::Receiver) -> anyhow::Result<()> { + let switch = &self.apictx.mgmt_switch; + // First, wait until we know what the rack ID is... let rack_id = rack_id.await.context( "rack ID sender has gone away...we must be shutting down", )?; - anyhow::bail!("TODO(eliza): draw the rest of the owl!") + let mut poll_interval = tokio::time::interval(POLL_INTERVAL); + let mut sps_as_i_understand_them = HashMap::new(); + let mut tasks = tokio::task::JoinSet::new(); + loop { + poll_interval.tick().await; + + // Wait for SP discovery to complete, if it hasn't already. + // TODO(eliza): presently, we busy-poll here. It would be nicer to + // replace the `OnceLock` in `ManagementSwitch` + // with a `tokio::sync::watch` + if !switch.is_discovery_complete() { + continue; + } + + let sps = match switch.all_sps() { + Ok(sps) => sps, + Err(e) => { + slog::warn!( + &self.log, + "failed to enumerate service processors! 
will try again in a bit"; + "error" => %e, + ); + continue; + } + }; + + for (spid, _) in sps { + let understanding = sps_as_i_understand_them + .entry(spid) + .or_insert_with(|| { + slog::debug!(&self.log, "found a new little friend!"; "sp" => ?spid); + Arc::new(SpPoller { + spid, + rack_id, + apictx: self.apictx.clone(), + log: self.log.new(slog::o!("slot" => spid.slot, "sp_type" => format!("{:?}", spid.typ))), + my_understanding: Mutex::new(Default::default()), + }) + }) + .clone(); + tasks.spawn(understanding.clone().poll_sp()); + } + + while let Some(result) = tasks.join_next().await { + match result { + Ok(Ok(samples)) => { + // No sense copying all the samples into the big vec thing, + // just push the vec instead. + self.samples.lock().unwrap().push(samples); + } + Ok(Err(error)) => { + // TODO(eliza): actually handle errors polling a SP + // nicely... + slog::error!( + &self.log, + "something bad happened"; + "error" => %error, + ); + } + Err(_) => { + unreachable!( + "tasks on the joinset never get aborted, and we \ + compile with panic=abort, so they won't panic" + ) + } + } + } + } + } +} + +impl SpPoller { + async fn poll_sp(self: Arc) -> anyhow::Result> { + let switch = &self.apictx.mgmt_switch; + let sp = switch.sp(self.spid)?; + // Check if the SP's state has changed. If it has, we need to make sure + // we still know what all of its sensors are. + let current_state = sp.state().await?; + let known_state = self.my_understanding.lock().unwrap().state.clone(); + + let devices = if Some(¤t_state) != known_state.as_ref() { + slog::debug!( + &self.log, + "our little friend seems to have changed in some kind of way"; + "current_state" => ?current_state, + "known_state" => ?known_state, + ); + let inv_devices = sp.inventory().await?.devices; + let mut devices: Vec<(SpComponent, component::Component)> = + Vec::with_capacity(inv_devices.len()); + + // Reimplement this ourselves because we don't really care about + // reading the RoT state at present. 
This is unfortunately copied + // from `gateway_messages`. + fn stringify_byte_string(bytes: &[u8]) -> String { + // We expect serial and model numbers to be ASCII and 0-padded: find the first 0 + // byte and convert to a string. If that fails, hexlify the entire slice. + let first_zero = + bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len()); + + std::str::from_utf8(&bytes[..first_zero]) + .map(|s| s.to_string()) + .unwrap_or_else(|_err| hex::encode(bytes)) + } + let (model, serial, hubris_archive_id, revision) = + match current_state { + VersionedSpState::V1(v) => ( + stringify_byte_string(&v.model), + stringify_byte_string(&v.serial_number[..]), + hex::encode(v.hubris_archive_id), + v.revision, + ), + VersionedSpState::V2(v) => ( + stringify_byte_string(&v.model), + stringify_byte_string(&v.serial_number[..]), + hex::encode(v.hubris_archive_id), + v.revision, + ), + VersionedSpState::V3(v) => ( + stringify_byte_string(&v.model), + stringify_byte_string(&v.serial_number[..]), + hex::encode(v.hubris_archive_id), + v.revision, + ), + }; + for dev in inv_devices { + // Skip devices which have nothing interesting for us. + if !dev + .capabilities + .contains(DeviceCapabilities::HAS_MEASUREMENT_CHANNELS) + { + continue; + } + let component_name = match dev.component.as_str() { + Some(c) => c, + None => { + // These are supposed to always be strings. But, if we + // see one that's not a string, bail instead of panicking. + slog::error!(&self.log, "a SP component ID was not a string! this isn't supposed to happen!"; "device" => ?dev); + anyhow::bail!("a SP component ID was not stringy!"); + } + }; + // TODO(eliza): i hate having to clone all these strings for + // every device on the SP...it would be cool if Oximeter let us + // reference count them... 
+ let target = component::Component { + chassis_type: match self.spid.typ { + SpType::Sled => Cow::Borrowed("sled"), + SpType::Switch => Cow::Borrowed("switch"), + SpType::Power => Cow::Borrowed("power"), + }, + component: Cow::Owned(component_name.to_string()), + device: Cow::Owned(dev.device), + model: Cow::Owned(model.clone()), + revision, + serial: Cow::Owned(serial.clone()), + rack_id: self.rack_id, + firmware_id: Cow::Owned(hubris_archive_id.clone()), + }; + devices.push((dev.component, target)) + } + devices + } else { + // This is a bit goofy, but we have to release the lock before + // hitting any `await` points, so just move the inventory out of it. + // We'll put it back when we're done. This lock is *actually* owned + // exclusively by this `SpPoller`, but since it lives in a HashMap, + // rust doesn't understand that. + std::mem::take(&mut self.my_understanding.lock().unwrap().devices) + }; + + let mut samples = Vec::with_capacity(devices.len()); + for (c, target) in &devices { + let details = match sp.component_details(*c).await { + Ok(deets) => deets, + Err(error) => { + slog::warn!( + &self.log, + "failed to read details on SP component"; + "component" => %c, + "error" => %error, + ); + // TODO(eliza): we should increment a metric here... 
+ continue; + } + }; + for d in details.entries { + let ComponentDetails::Measurement(m) = d else { continue }; + let name = Cow::Owned(m.name); + let sample = match (m.value, m.kind) { + (Ok(datum), MeasurementKind::Temperature) => Sample::new( + target, + &component::Temperature { name, datum }, + ), + (Ok(datum), MeasurementKind::Current) => { + Sample::new(target, &component::Current { name, datum }) + } + (Ok(datum), MeasurementKind::Voltage) => { + Sample::new(target, &component::Voltage { name, datum }) + } + (Ok(datum), MeasurementKind::Power) => { + Sample::new(target, &component::Power { name, datum }) + } + (Ok(datum), MeasurementKind::InputCurrent) => Sample::new( + target, + &component::InputCurrent { name, datum }, + ), + (Ok(datum), MeasurementKind::InputVoltage) => Sample::new( + target, + &component::InputVoltage { name, datum }, + ), + (Ok(datum), MeasurementKind::Speed) => Sample::new( + target, + &component::FanSpeed { name, datum }, + ), + (Err(error), _) => todo!(), + }?; + samples.push(sample); + } + } + + Ok(samples) } } From f9ff0f2129d6e3568156ff6d7d902bce3bdba3f0 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 15 Aug 2024 20:40:22 -0700 Subject: [PATCH 15/77] actually update our understanding --- gateway/src/metrics.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index ce377f0d7c..601728cb9f 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -401,6 +401,11 @@ impl SpPoller { } } + // Update our understanding again. 
+ let mut understanding = self.my_understanding.lock().unwrap(); + understanding.devices = devices; + understanding.state = Some(current_state); + Ok(samples) } } @@ -439,7 +444,8 @@ impl Manager { let address = SocketAddr::new(ip.into(), 0); // Discover Nexus via DNS - let registration_address = None; + let registration_address = + Some("[]::1]:12223".parse().unwrap()); let server_info = ProducerEndpoint { id: self.registry.producer_id(), From f38bf1bf54082f6daa99218e4b267f9ae3278be6 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 16 Aug 2024 09:10:24 -0700 Subject: [PATCH 16/77] remove dbgs --- gateway/src/metrics.rs | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 601728cb9f..4062b545cb 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -228,6 +228,7 @@ impl Poller { Ok(Ok(samples)) => { // No sense copying all the samples into the big vec thing, // just push the vec instead. + slog::info!(&self.log, "made a thingy: {samples:#?}"); self.samples.lock().unwrap().push(samples); } Ok(Err(error)) => { @@ -359,13 +360,21 @@ impl SpPoller { slog::warn!( &self.log, "failed to read details on SP component"; - "component" => %c, + "sp_component" => %c, "error" => %error, ); // TODO(eliza): we should increment a metric here... continue; } }; + if details.entries.is_empty() { + slog::warn!( + &self.log, + "a component which claimed to have measurement channels \ + had empty details. 
this seems weird..."; + "sp_component" => %c, + ); + } for d in details.entries { let ComponentDetails::Measurement(m) = d else { continue }; let name = Cow::Owned(m.name); @@ -419,9 +428,9 @@ impl Manager { for addr in self.addrs.borrow_and_update().iter() { let &ip = addr.ip(); // Don't bind the metrics endpoint on ::1 - if ip.is_loopback() { - continue; - } + // if ip.is_loopback() { + // continue; + // } // If our current address is contained in the new addresses, // no need to rebind. if current_ip == Some(IpAddr::V6(ip)) { @@ -444,8 +453,10 @@ impl Manager { let address = SocketAddr::new(ip.into(), 0); // Discover Nexus via DNS + + // TODO(eliza) HARDCODED DEMO ADDRESS LOL let registration_address = - Some("[]::1]:12223".parse().unwrap()); + Some("[::1]:12221".parse().unwrap()); let server_info = ProducerEndpoint { id: self.registry.producer_id(), From 4b03501d9684a0a893c933b371935782933e89e7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 16 Aug 2024 09:39:20 -0700 Subject: [PATCH 17/77] more schema tweaks --- gateway/src/metrics.rs | 17 +++++++++++++---- .../oximeter/schema/sensor-measurement.toml | 15 ++++++++++++--- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 4062b545cb..9931bfc58d 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -210,15 +210,23 @@ impl Poller { let understanding = sps_as_i_understand_them .entry(spid) .or_insert_with(|| { - slog::debug!(&self.log, "found a new little friend!"; "sp" => ?spid); + slog::debug!( + &self.log, + "found a new little friend!"; + "sp_slot" => ?spid.slot, + "chassis_type" => ?spid.typ, + ); Arc::new(SpPoller { spid, rack_id, apictx: self.apictx.clone(), - log: self.log.new(slog::o!("slot" => spid.slot, "sp_type" => format!("{:?}", spid.typ))), + log: self.log.new(slog::o!( + "sp_slot" => spid.slot, + "chassis_type" => format!("{:?}", spid.typ), + )), my_understanding: Mutex::new(Default::default()), }) - }) + }) 
.clone(); tasks.spawn(understanding.clone().poll_sp()); } @@ -332,13 +340,14 @@ impl SpPoller { SpType::Switch => Cow::Borrowed("switch"), SpType::Power => Cow::Borrowed("power"), }, + slot: self.spid.slot as u32, component: Cow::Owned(component_name.to_string()), device: Cow::Owned(dev.device), model: Cow::Owned(model.clone()), revision, serial: Cow::Owned(serial.clone()), rack_id: self.rack_id, - firmware_id: Cow::Owned(hubris_archive_id.clone()), + hubris_archive_id: Cow::Owned(hubris_archive_id.clone()), }; devices.push((dev.component, target)) } diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index e23693d9ac..fd5db1c36b 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -7,11 +7,12 @@ authz_scope = "fleet" versions = [ { version = 1, fields = [ "rack_id", + "slot", "chassis_type", "serial", "model", "revision", - "firmware_id", + "hubris_archive_id", "component", "device", ]} @@ -21,6 +22,12 @@ versions = [ type = "uuid" description = "ID of the rack on which this measurement was recorded." +[fields.slot] +type = "u32" +description = """ +The cubby number or switch slot of the service processor reporting the \ +measurement""" + [fields.model] type = "string" description = "Model number of the sled, switch, or power shelf" @@ -33,9 +40,11 @@ description = "Revision number of the sled, switch, or power shelf" type = "string" description = "Serial number of the sled, switch, or power shelf" -[fields.firmware_id] +[fields.hubris_archive_id] type = "string" -description = "Hubris archive ID of the service processor when the measurement was recorded." 
+description = """ +Hubris firmware archive ID of the service processor when the measurement \ +was recorded.""" [fields.chassis_type] type = "string" From a84f394247f8033f888bbd4a7420a0cce953bde7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 16 Aug 2024 09:39:39 -0700 Subject: [PATCH 18/77] sp_sim_config.test.toml has no sensors --- gateway-test-utils/configs/sp_sim_config.test.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gateway-test-utils/configs/sp_sim_config.test.toml b/gateway-test-utils/configs/sp_sim_config.test.toml index cc08eec30b..b19ff102ae 100644 --- a/gateway-test-utils/configs/sp_sim_config.test.toml +++ b/gateway-test-utils/configs/sp_sim_config.test.toml @@ -20,6 +20,10 @@ device = "fake-tmp-sensor" description = "FAKE temperature sensor 1" capabilities = 0x2 presence = "Present" +sensors = [ + { name = "Southwest", kind = "Temperature", sensor_id = 0, last_data.value = 41.7890625, last_data.timestamp = 1234 }, +] + [[simulated_sps.sidecar.components]] id = "dev-1" From a41dc72933fe669d2b126bd94e719d25215cda5f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 16 Aug 2024 09:40:10 -0700 Subject: [PATCH 19/77] add dev-only MGS args instead of hardcoding --- gateway/src/bin/mgs.rs | 6 ++++- gateway/src/lib.rs | 3 ++- gateway/src/metrics.rs | 58 ++++++++++++++++++++++++++++++------------ 3 files changed, 49 insertions(+), 18 deletions(-) diff --git a/gateway/src/bin/mgs.rs b/gateway/src/bin/mgs.rs index 91290bffae..505f41f7a5 100644 --- a/gateway/src/bin/mgs.rs +++ b/gateway/src/bin/mgs.rs @@ -47,6 +47,9 @@ enum Args { required_unless_present = "id_and_address_from_smf" )] address: Option, + + #[clap(flatten)] + metrics_args: omicron_gateway::metrics::Args, }, } @@ -73,6 +76,7 @@ async fn do_run() -> Result<(), CmdError> { id_and_address_from_smf, id, address, + metrics_args, } => { let config = Config::from_file(&config_file_path) .map_err(anyhow::Error::new) @@ -92,7 +96,7 @@ async fn do_run() -> Result<(), CmdError> { 
// `id_and_address_from_smf` is false, so we can safely unwrap. (id.unwrap(), vec![address.unwrap()], rack_id) }; - let args = MgsArguments { id, addresses, rack_id }; + let args = MgsArguments { id, addresses, rack_id, metrics_args }; let mut server = start_server(config, args) .await .map_err(|e| CmdError::Failure(anyhow!(e)))?; diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs index a7464465f0..8e09b09729 100644 --- a/gateway/src/lib.rs +++ b/gateway/src/lib.rs @@ -6,7 +6,7 @@ mod config; mod context; mod error; mod management_switch; -mod metrics; +pub mod metrics; mod serial_console; pub mod http_entrypoints; // TODO pub only for testing - is this right? @@ -49,6 +49,7 @@ pub struct MgsArguments { pub id: Uuid, pub addresses: Vec, pub rack_id: Option, + pub metrics_args: metrics::Args, } type HttpServer = dropshot::HttpServer>; diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 9931bfc58d..498d5cbd9b 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -5,6 +5,7 @@ use crate::management_switch::SpIdentifier; use crate::management_switch::SpType; use crate::ServerContext; use anyhow::Context; +use clap::Parser; use gateway_messages::measurement::MeasurementKind; use gateway_messages::ComponentDetails; use gateway_messages::DeviceCapabilities; @@ -39,6 +40,26 @@ pub struct Metrics { poller: JoinHandle>, } +/// CLI arguments for configuring metrics. +#[derive(Copy, Clone, Debug, clap::Parser)] +#[clap(next_help_heading = "SP Metrics Development Configuration")] +pub struct Args { + /// Override the Nexus address used to register the SP metrics Oximeter + /// producer. This is intended for use in development and testing. + /// + /// If this argument is not present, Nexus is discovered through DNS. + #[clap(long = "dev-nexus-address")] + nexus_address: Option, + + /// Allow the metrics producer endpoint to bind on loopback. 
+ /// + /// This should be disabled in production, as Nexus will not be able to + /// reach the loopback interface, but is necessary for local development and + /// test purposes. + #[clap(long = "dev-metrics-bind-loopback")] + bind_loopback: bool, +} + /// Actually polls SP sensor readings struct Poller { samples: Arc>>>, @@ -47,10 +68,11 @@ struct Poller { } /// Manages a metrics server and stuff. -struct Manager { +struct ServerManager { log: slog::Logger, addrs: watch::Receiver>, registry: ProducerRegistry, + args: Args, } struct SpPoller { @@ -84,10 +106,11 @@ const METRIC_REQUEST_MAX_SIZE: usize = 10 * 1024 * 1024; impl Metrics { pub fn new( log: &slog::Logger, - MgsArguments { id, rack_id, addresses, .. }: &MgsArguments, + args: &MgsArguments, apictx: Arc, ) -> anyhow::Result { - let registry = ProducerRegistry::with_id(*id); + let &MgsArguments { id, rack_id, ref addresses, metrics_args } = args; + let registry = ProducerRegistry::with_id(id); let samples = Arc::new(Mutex::new(Vec::new())); registry @@ -100,7 +123,7 @@ impl Metrics { // the rack ID being set...we might want to change other code to use a // similar approach in the future. 
let (rack_id_tx, rack_id_rx) = oneshot::channel(); - let rack_id_tx = if let Some(rack_id) = *rack_id { + let rack_id_tx = if let Some(rack_id) = rack_id { rack_id_tx.send(rack_id).expect( "we just created the channel; it therefore will not be \ closed", @@ -121,10 +144,11 @@ impl Metrics { let (addrs_tx, addrs_rx) = tokio::sync::watch::channel(addresses.clone()); let manager = tokio::spawn( - Manager { + ServerManager { log: log.new(slog::o!("component" => "producer-server")), addrs: addrs_rx, registry, + cfg: metrics_args, } .run(), ); @@ -428,8 +452,16 @@ impl SpPoller { } } -impl Manager { +impl ServerManager { async fn run(mut self) -> anyhow::Result<()> { + if self.args.nexus_address.is_some() || self.args.bind_loopback { + slog::warn!( + &self.log, + "using development metrics configuration overrides!"; + "nexus_address" => ?self.args.nexus_address, + "bind_loopback" => self.args.bind_loopback, + ); + } let mut current_server: Option = None; loop { let current_ip = current_server.as_ref().map(|s| s.address().ip()); @@ -437,9 +469,9 @@ impl Manager { for addr in self.addrs.borrow_and_update().iter() { let &ip = addr.ip(); // Don't bind the metrics endpoint on ::1 - // if ip.is_loopback() { - // continue; - // } + if ip.is_loopback() && !self.args.bind_loopback { + continue; + } // If our current address is contained in the new addresses, // no need to rebind. if current_ip == Some(IpAddr::V6(ip)) { @@ -461,12 +493,6 @@ impl Manager { // Listen on any available socket, using the provided underlay IP. 
let address = SocketAddr::new(ip.into(), 0); - // Discover Nexus via DNS - - // TODO(eliza) HARDCODED DEMO ADDRESS LOL - let registration_address = - Some("[::1]:12221".parse().unwrap()); - let server_info = ProducerEndpoint { id: self.registry.producer_id(), kind: ProducerKind::ManagementGateway, @@ -475,7 +501,7 @@ impl Manager { }; let config = oximeter_producer::Config { server_info, - registration_address, + registration_address: self.args.nexus_address, request_body_max_bytes: METRIC_REQUEST_MAX_SIZE, log: oximeter_producer::LogConfig::Logger( self.log.clone(), From 0e82aa541ba56b08701009084cb97bf032b534a7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 16 Aug 2024 09:50:08 -0700 Subject: [PATCH 20/77] thread through metrics dev args perhaps this should've been in the config file, but i feel like it's nicer for explicit overrides to be provided by the user when running the command... --- gateway-test-utils/src/setup.rs | 6 +++++- gateway/src/metrics.rs | 5 ++--- nexus/test-utils/src/lib.rs | 2 ++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/gateway-test-utils/src/setup.rs b/gateway-test-utils/src/setup.rs index 46bc55805a..5874ac9636 100644 --- a/gateway-test-utils/src/setup.rs +++ b/gateway-test-utils/src/setup.rs @@ -8,6 +8,7 @@ use camino::Utf8Path; use dropshot::test_util::ClientTestContext; use dropshot::test_util::LogContext; use gateway_messages::SpPort; +pub use omicron_gateway::metrics::Args as MgsMetricsArgs; use omicron_gateway::MgsArguments; use omicron_gateway::SpType; use omicron_gateway::SwitchPortConfig; @@ -69,6 +70,7 @@ pub async fn test_setup( server_config, &sp_sim_config, None, + Default::default(), ) .await } @@ -99,6 +101,7 @@ pub async fn test_setup_with_config( mut server_config: omicron_gateway::Config, sp_sim_config: &sp_sim::Config, listen_addr: Option, + metrics_args: MgsMetricsArgs, ) -> GatewayTestContext { // Can't be `const` because `SocketAddrV6::new()` isn't const yet let localhost_port_0 = 
SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0); @@ -144,7 +147,8 @@ pub async fn test_setup_with_config( // Start gateway server let rack_id = Some(Uuid::parse_str(RACK_UUID).unwrap()); - let args = MgsArguments { id: Uuid::new_v4(), addresses, rack_id }; + let args = + MgsArguments { id: Uuid::new_v4(), addresses, rack_id, metrics_args }; let server = omicron_gateway::Server::start( server_config.clone(), args, diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 498d5cbd9b..98796c1d79 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -5,7 +5,6 @@ use crate::management_switch::SpIdentifier; use crate::management_switch::SpType; use crate::ServerContext; use anyhow::Context; -use clap::Parser; use gateway_messages::measurement::MeasurementKind; use gateway_messages::ComponentDetails; use gateway_messages::DeviceCapabilities; @@ -41,7 +40,7 @@ pub struct Metrics { } /// CLI arguments for configuring metrics. -#[derive(Copy, Clone, Debug, clap::Parser)] +#[derive(Copy, Clone, Debug, Default, clap::Parser)] #[clap(next_help_heading = "SP Metrics Development Configuration")] pub struct Args { /// Override the Nexus address used to register the SP metrics Oximeter @@ -148,7 +147,7 @@ impl Metrics { log: log.new(slog::o!("component" => "producer-server")), addrs: addrs_rx, registry, - cfg: metrics_args, + args: metrics_args, } .run(), ); diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index acee46ce10..90114fb6ca 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -510,6 +510,8 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { mgs_config, &sp_sim_config, mgs_addr, + // TODO(eliza): pass the nexus address here... 
+ Default::default(), ) .await; self.gateway.insert(switch_location, gateway); From af5b827bff204bd5040d66fb044aaf91b0fa829c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 16 Aug 2024 10:02:11 -0700 Subject: [PATCH 21/77] actually finish plumbing through metrics dev args --- dev-tools/mgs-dev/src/main.rs | 17 +++++++++++------ gateway-test-utils/src/setup.rs | 17 +++++++++++++++++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/dev-tools/mgs-dev/src/main.rs b/dev-tools/mgs-dev/src/main.rs index 85b1313d68..76fe3f0750 100644 --- a/dev-tools/mgs-dev/src/main.rs +++ b/dev-tools/mgs-dev/src/main.rs @@ -36,7 +36,10 @@ enum MgsDevCmd { } #[derive(Clone, Debug, Args)] -struct MgsRunArgs {} +struct MgsRunArgs { + #[clap(flatten)] + mgs_metrics_args: gateway_test_utils::setup::MgsMetricsArgs, +} impl MgsRunArgs { async fn exec(&self) -> Result<(), anyhow::Error> { @@ -46,11 +49,13 @@ impl MgsRunArgs { let mut signal_stream = signals.fuse(); println!("mgs-dev: setting up MGS ... 
"); - let gwtestctx = gateway_test_utils::setup::test_setup( - "mgs-dev", - gateway_messages::SpPort::One, - ) - .await; + let gwtestctx = + gateway_test_utils::setup::test_setup_with_metrics_args( + "mgs-dev", + gateway_messages::SpPort::One, + self.mgs_metrics_args, + ) + .await; println!("mgs-dev: MGS is running."); let addr = gwtestctx.client.bind_address; diff --git a/gateway-test-utils/src/setup.rs b/gateway-test-utils/src/setup.rs index 5874ac9636..a66d26d046 100644 --- a/gateway-test-utils/src/setup.rs +++ b/gateway-test-utils/src/setup.rs @@ -75,6 +75,23 @@ pub async fn test_setup( .await } +pub async fn test_setup_with_metrics_args( + test_name: &str, + sp_port: SpPort, + metrics_args: MgsMetricsArgs, +) -> GatewayTestContext { + let (server_config, sp_sim_config) = load_test_config(); + test_setup_with_config( + test_name, + sp_port, + server_config, + &sp_sim_config, + None, + metrics_args, + ) + .await +} + fn expected_location( config: &omicron_gateway::Config, sp_port: SpPort, From 9e13ee752b89dd121e01618c4b4882066e14d16f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 16 Aug 2024 10:02:36 -0700 Subject: [PATCH 22/77] misc cleanup --- gateway/src/metrics.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 98796c1d79..4aef6f2422 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -182,6 +182,14 @@ impl Metrics { } } +impl Drop for Metrics { + fn drop(&mut self) { + // Clean up our children on drop. 
+ self.manager.abort(); + self.poller.abort(); + } +} + impl oximeter::Producer for Producer { fn produce( &mut self, From 2c7a0295d4a91f81b4d9f185b07c32490e0ab25f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 16 Aug 2024 10:12:33 -0700 Subject: [PATCH 23/77] rough pass on sensor errors --- .../configs/sp_sim_config.test.toml | 4 ++ gateway/src/metrics.rs | 39 ++++++++++++++++++- .../oximeter/schema/sensor-measurement.toml | 10 ++++- 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/gateway-test-utils/configs/sp_sim_config.test.toml b/gateway-test-utils/configs/sp_sim_config.test.toml index b19ff102ae..f29c66502d 100644 --- a/gateway-test-utils/configs/sp_sim_config.test.toml +++ b/gateway-test-utils/configs/sp_sim_config.test.toml @@ -31,6 +31,10 @@ device = "fake-tmp-sensor" description = "FAKE temperature sensor 2" capabilities = 0x2 presence = "Failed" +sensors = [ + { name = "South", kind = "Temperature", sensor_id = 1, last_error.value = "DeviceError", last_error.timestamp = 1234 }, +] + [[simulated_sps.sidecar]] multicast_addr = "::1" diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 4aef6f2422..c9d0e4bd52 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -5,6 +5,7 @@ use crate::management_switch::SpIdentifier; use crate::management_switch::SpType; use crate::ServerContext; use anyhow::Context; +use gateway_messages::measurement::MeasurementError; use gateway_messages::measurement::MeasurementKind; use gateway_messages::ComponentDetails; use gateway_messages::DeviceCapabilities; @@ -416,7 +417,11 @@ impl SpPoller { ); } for d in details.entries { - let ComponentDetails::Measurement(m) = d else { continue }; + let ComponentDetails::Measurement(m) = d else { + // If the component details are switch port details rather + // than measurement channels, ignore it for now. 
+ continue; + }; let name = Cow::Owned(m.name); let sample = match (m.value, m.kind) { (Ok(datum), MeasurementKind::Temperature) => Sample::new( @@ -444,7 +449,37 @@ impl SpPoller { target, &component::FanSpeed { name, datum }, ), - (Err(error), _) => todo!(), + (Err(e), kind) => { + let sensor_kind = match kind { + MeasurementKind::Temperature => "temperature", + MeasurementKind::Current => "current", + MeasurementKind::Voltage => "voltage", + MeasurementKind::Power => "power", + MeasurementKind::InputCurrent => "input_current", + MeasurementKind::InputVoltage => "input_voltage", + MeasurementKind::Speed => "fan_speed", + }; + let error = match e { + MeasurementError::InvalidSensor => "invalid_sensor", + MeasurementError::NoReading => "no_reading", + MeasurementError::NotPresent => "not_present", + MeasurementError::DeviceError => "device_error", + MeasurementError::DeviceUnavailable => { + "device_unavailable" + } + MeasurementError::DeviceTimeout => "device_timeout", + MeasurementError::DeviceOff => "device_off", + }; + Sample::new( + target, + &component::SensorErrorCount { + error: Cow::Borrowed(error), + name, + datum: oximeter::types::Cumulative::new(1), + sensor_kind: Cow::Borrowed(sensor_kind), + }, + ) + } }?; samples.push(sample); } diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index fd5db1c36b..3d6ac0fdb0 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -71,6 +71,14 @@ description = "A name identifying the quantity measured by a sensor measurement" type = "string" description = "A string identifying the type of sensor error that occurred" +[fields.sensor_kind] +type = "string" +description = """ +A string identifying which sensor could not be read. 
This will be one of \ +'temperature', 'current', 'power', 'voltage', 'input_current', \ +'input_voltage', or 'fan_speed' (the same names as the metrics emitted by \ +these sensors when they are read successfully).""" + [[metrics]] name = "temperature" description = "Temperature reading in degrees Celcius" @@ -141,5 +149,5 @@ description = "Cumulative count of errors reported by a sensor" units = "count" datum_type = "cumulative_u64" versions = [ - { added_in = 1, fields = ["name", "error"]} + { added_in = 1, fields = ["name", "error", "sensor_kind"]} ] From 7ae1ab60d69ac65e2ae83231d1630e3da6439d5d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 16 Aug 2024 10:26:43 -0700 Subject: [PATCH 24/77] add some more simulated devices values taken from REAL gimlets on madrid. i skipped some, but i wanted at least simulate more sensor kinds. --- .../configs/sp_sim_config.test.toml | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/gateway-test-utils/configs/sp_sim_config.test.toml b/gateway-test-utils/configs/sp_sim_config.test.toml index f29c66502d..a840ea86ff 100644 --- a/gateway-test-utils/configs/sp_sim_config.test.toml +++ b/gateway-test-utils/configs/sp_sim_config.test.toml @@ -64,6 +64,82 @@ device = "fake-tmp-sensor" description = "FAKE temperature sensor" capabilities = 0x2 presence = "Failed" +sensors = [ + { name = "Southwest", kind = "Temperature", sensor_id = 0, last_error.value = "DeviceError", last_error.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-1" +device = "tmp117" +description = "FAKE temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "South", kind = "Temperature", sensor_id = 1, last_data.value = 42.5625, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-2" +device = "tmp117" +description = "FAKE Southeast temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southeast", kind = "Temperature", sensor_id 
= 2, last_data.value = 41.570313, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-6" +device = "at24csw080" +description = "FAKE U.2 Sharkfin A VPD" +capabilities = 0x0 +presence = "Present" + +[[simulated_sps.gimlet.components]] +id = "dev-7" +device = "max5970" +description = "FAKE U.2 Sharkfin A hot swap controller" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "V12_U2A_A0", kind = "Current", sensor_id = 3, last_data.value = 0.45898438, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Current", sensor_id = 4, last_data.value = 0.024414063, last_data.timestamp = 1234 }, + { name = "V12_U2A_A0", kind = "Voltage", sensor_id = 5, last_data.value = 12.03125, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Voltage", sensor_id = 6, last_data.value = 3.328125, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-8" +device = "nvme_bmc" +description = "FAKE U.2 A NVMe Basic Management Command" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "U2_N0", kind = "Temperature", sensor_id = 7, last_data.value = 56.0, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-39" +device = "tmp451" +description = "FAKE T6 temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "t6", kind = "Temperature", sensor_id = 9, last_data.value = 70.625, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-53" +device = "max31790" +description = "FAKE Fan controller" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southeast", kind = "Speed", sensor_id = 10, last_data.value = 2607.0, last_data.timestamp = 1234 }, + { name = "Northeast", kind = "Speed", sensor_id = 11, last_data.value = 2476.0, last_data.timestamp = 1234 }, + { name = "South", kind = "Speed", sensor_id = 12, last_data.value = 2553.0, last_data.timestamp = 1234 }, + { name = "North", kind = 
"Speed", sensor_id = 13, last_data.value = 2265.0, last_data.timestamp = 1234 }, + { name = "Southwest", kind = "Speed", sensor_id = 14, last_data.value = 2649.0, last_data.timestamp = 1234 }, + { name = "Northwest", kind = "Speed", sensor_id = 15, last_data.value = 2275.0, last_data.timestamp = 1234 }, +] + [[simulated_sps.gimlet]] multicast_addr = "::1" From 0f66a39914d5a811de0f9513fe795fb9837c3d14 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 16 Aug 2024 10:42:13 -0700 Subject: [PATCH 25/77] include MGS uuids in metrics --- gateway/src/metrics.rs | 5 +++++ oximeter/oximeter/schema/sensor-measurement.toml | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index c9d0e4bd52..98fc6e927a 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -65,6 +65,7 @@ struct Poller { samples: Arc>>>, log: slog::Logger, apictx: Arc, + mgs_id: Uuid, } /// Manages a metrics server and stuff. @@ -81,6 +82,7 @@ struct SpPoller { my_understanding: Mutex, log: slog::Logger, rack_id: Uuid, + mgs_id: Uuid, } #[derive(Default)] @@ -137,6 +139,7 @@ impl Metrics { samples, apictx, log: log.new(slog::o!("component" => "sensor-poller")), + mgs_id: id, } .run(rack_id_rx), ); @@ -251,6 +254,7 @@ impl Poller { Arc::new(SpPoller { spid, rack_id, + mgs_id: self.mgs_id, apictx: self.apictx.clone(), log: self.log.new(slog::o!( "sp_slot" => spid.slot, @@ -379,6 +383,7 @@ impl SpPoller { revision, serial: Cow::Owned(serial.clone()), rack_id: self.rack_id, + gateway_id: self.mgs_id, hubris_archive_id: Cow::Owned(hubris_archive_id.clone()), }; devices.push((dev.component, target)) diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index 3d6ac0fdb0..09a64191d1 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -13,6 +13,7 @@ versions = [ "model", "revision", "hubris_archive_id", + "gateway_id", 
"component", "device", ]} @@ -46,6 +47,12 @@ description = """ Hubris firmware archive ID of the service processor when the measurement \ was recorded.""" + +[fields.gateway_id] +type = "uuid" +description = """ +ID of the Management Gateway Service process which recorded the measurement.""" + [fields.chassis_type] type = "string" description = """ From 9d4b53c322df5d58b378fa43bf75c9a9525b9597 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 16 Aug 2024 10:42:56 -0700 Subject: [PATCH 26/77] remove obnoxious log line --- gateway/src/metrics.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 98fc6e927a..32543632e0 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -272,7 +272,6 @@ impl Poller { Ok(Ok(samples)) => { // No sense copying all the samples into the big vec thing, // just push the vec instead. - slog::info!(&self.log, "made a thingy: {samples:#?}"); self.samples.lock().unwrap().push(samples); } Ok(Err(error)) => { From b96f9c986f0b2a26384c9089d04a4b2eaffacddf Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 16 Aug 2024 10:50:55 -0700 Subject: [PATCH 27/77] give the other sim gimlet some sensors also --- .../configs/sp_sim_config.test.toml | 96 ++++++++++++++++++- 1 file changed, 93 insertions(+), 3 deletions(-) diff --git a/gateway-test-utils/configs/sp_sim_config.test.toml b/gateway-test-utils/configs/sp_sim_config.test.toml index a840ea86ff..fd430f027b 100644 --- a/gateway-test-utils/configs/sp_sim_config.test.toml +++ b/gateway-test-utils/configs/sp_sim_config.test.toml @@ -21,10 +21,9 @@ description = "FAKE temperature sensor 1" capabilities = 0x2 presence = "Present" sensors = [ - { name = "Southwest", kind = "Temperature", sensor_id = 0, last_data.value = 41.7890625, last_data.timestamp = 1234 }, + {name = "Southwest", kind = "Temperature", sensor_id = 0, last_data.value = 41.7890625, last_data.timestamp = 1234 }, ] - [[simulated_sps.sidecar.components]] id = "dev-1" 
device = "fake-tmp-sensor" @@ -35,7 +34,6 @@ sensors = [ { name = "South", kind = "Temperature", sensor_id = 1, last_error.value = "DeviceError", last_error.timestamp = 1234 }, ] - [[simulated_sps.sidecar]] multicast_addr = "::1" bind_addrs = ["[::1]:0", "[::1]:0"] @@ -156,6 +154,98 @@ capabilities = 0 presence = "Present" serial_console = "[::1]:0" + +[[simulated_sps.gimlet.components]] +id = "sp3-host-cpu" +device = "sp3-host-cpu" +description = "FAKE host cpu" +capabilities = 0 +presence = "Present" +serial_console = "[::1]:0" + +[[simulated_sps.gimlet.components]] +id = "dev-0" +device = "tmp117" +description = "FAKE temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southwest", kind = "Temperature", sensor_id = 0, last_data.value = 41.3629, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-1" +device = "tmp117" +description = "FAKE temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "South", kind = "Temperature", sensor_id = 1, last_data.value = 42.5625, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-2" +device = "tmp117" +description = "FAKE Southeast temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southeast", kind = "Temperature", sensor_id = 2, last_data.value = 41.570313, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-6" +device = "at24csw080" +description = "FAKE U.2 Sharkfin A VPD" +capabilities = 0x0 +presence = "Present" + +[[simulated_sps.gimlet.components]] +id = "dev-7" +device = "max5970" +description = "FAKE U.2 Sharkfin A hot swap controller" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "V12_U2A_A0", kind = "Current", sensor_id = 3, last_data.value = 0.41893438, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Current", sensor_id = 4, last_data.value = 0.025614603, last_data.timestamp = 1234 }, + { name 
= "V12_U2A_A0", kind = "Voltage", sensor_id = 5, last_data.value = 12.02914, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Voltage", sensor_id = 6, last_data.value = 3.2618, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-8" +device = "nvme_bmc" +description = "FAKE U.2 A NVMe Basic Management Command" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "U2_N0", kind = "Temperature", sensor_id = 7, last_data.value = 56.0, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-39" +device = "tmp451" +description = "FAKE T6 temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "t6", kind = "Temperature", sensor_id = 9, last_data.value = 70.625, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-53" +device = "max31790" +description = "FAKE Fan controller" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southeast", kind = "Speed", sensor_id = 10, last_data.value = 2510.0, last_data.timestamp = 1234 }, + { name = "Northeast", kind = "Speed", sensor_id = 11, last_data.value = 2390.0, last_data.timestamp = 1234 }, + { name = "South", kind = "Speed", sensor_id = 12, last_data.value = 2467.0, last_data.timestamp = 1234 }, + { name = "North", kind = "Speed", sensor_id = 13, last_data.value = 2195.0, last_data.timestamp = 1234 }, + { name = "Southwest", kind = "Speed", sensor_id = 14, last_data.value = 2680.0, last_data.timestamp = 1234 }, + { name = "Northwest", kind = "Speed", sensor_id = 15, last_data.value = 2212.0, last_data.timestamp = 1234 }, +] + + # # NOTE: for the test suite, the [log] section is ignored; sp-sim logs are rolled # into the gateway logfile. 
From 45f54ef6bd7206ea81fdf728e6d51e0a99280f70 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 16 Aug 2024 15:36:22 -0700 Subject: [PATCH 28/77] rewrite the whole thing to use less memory etc --- .../configs/sp_sim_config.test.toml | 2 +- gateway/src/metrics.rs | 651 +++++++++++------- 2 files changed, 390 insertions(+), 263 deletions(-) diff --git a/gateway-test-utils/configs/sp_sim_config.test.toml b/gateway-test-utils/configs/sp_sim_config.test.toml index fd430f027b..6be27c28e6 100644 --- a/gateway-test-utils/configs/sp_sim_config.test.toml +++ b/gateway-test-utils/configs/sp_sim_config.test.toml @@ -161,7 +161,7 @@ device = "sp3-host-cpu" description = "FAKE host cpu" capabilities = 0 presence = "Present" -serial_console = "[::1]:0" +# serial_console = "[::1]:0" [[simulated_sps.gimlet.components]] id = "dev-0" diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 32543632e0..8ad76533d9 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -21,8 +21,9 @@ use std::collections::HashMap; use std::net::IpAddr; use std::net::SocketAddr; use std::net::SocketAddrV6; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use std::time::Duration; +use tokio::sync::broadcast; use tokio::sync::oneshot; use tokio::sync::watch; use tokio::task::JoinHandle; @@ -60,39 +61,39 @@ pub struct Args { bind_loopback: bool, } -/// Actually polls SP sensor readings -struct Poller { - samples: Arc>>>, +/// Manages SP pollers, making sure that every SP has a poller task. +struct PollerManager { log: slog::Logger, apictx: Arc, mgs_id: Uuid, + /// Poller tasks + tasks: tokio::task::JoinSet>, + /// The manager doesn't actually produce samples, but it needs to be able to + /// clone a sender for every poller task it spawns. + sample_tx: broadcast::Sender>, } -/// Manages a metrics server and stuff. 
-struct ServerManager { - log: slog::Logger, - addrs: watch::Receiver>, - registry: ProducerRegistry, - args: Args, -} - +/// Polls sensor readings from an individual SP. struct SpPoller { apictx: Arc, spid: SpIdentifier, - my_understanding: Mutex, + devices: Vec<(SpComponent, component::Component)>, log: slog::Logger, rack_id: Uuid, mgs_id: Uuid, + sample_tx: broadcast::Sender>, } -#[derive(Default)] -struct SpUnderstanding { - state: Option, - devices: Vec<(SpComponent, component::Component)>, +/// Manages a metrics server and stuff. +struct ServerManager { + log: slog::Logger, + addrs: watch::Receiver>, + registry: ProducerRegistry, + args: Args, } #[derive(Debug)] -struct Producer(Arc>>>); +struct Producer(broadcast::Receiver>); /// The interval on which we ask `oximeter` to poll us for metric data. // N.B.: I picked this pretty arbitrarily... @@ -105,6 +106,44 @@ const POLL_INTERVAL: Duration = Duration::from_secs(1); /// The maximum Dropshot request size for the metrics server. const METRIC_REQUEST_MAX_SIZE: usize = 10 * 1024 * 1024; +/// The expected number of SPs in a fully-loaded rack. +/// +/// N.B. that there *might* be more than this; we shouldn't ever panic or +/// otherwise misbehave if we see more than this number. This is just intended +/// for sizing buffers/map allocations and so forth; we can always realloc if we +/// see a bonus SP or two. That's why it's called "normal number of SPs" and not +/// "MAX_SPS" or similar. +/// +/// Additionally, note that we always determine the channel capacity based on +/// the assumption that *someday*, the rack might be fully loaded with compute +/// sleds, even if it isn't *right now*. A rack with 16 sleds could always grow +/// another 16 later! +const NORMAL_NUMBER_OF_SPS: usize = + 32 // 32 compute sleds + + 2 // two switches + + 2 // two power shelves, someday. + ; + +/// Number of sample vectors from individual SPs to buffer. 
+const SAMPLE_CHANNEL_CAPACITY: usize = { + // Roughly how many times will we poll SPs for each metrics collection + // interval? + let polls_per_metrics_interval = (METRIC_COLLECTION_INTERVAL.as_secs() + / POLL_INTERVAL.as_secs()) + as usize; + // How many sample collection intervals do we want to allow to elapse before + // we start putting stuff on the floor? + // + // Let's say 16. Chosen totally arbitrarily but seems reasonable-ish. + let sloppiness = 16; + let capacity = + NORMAL_NUMBER_OF_SPS * polls_per_metrics_interval * sloppiness; + // Finally, the buffer capacity will probably be allocated in a power of two + // anyway, so let's make sure our thing is a power of two so we don't waste + // the allocation we're gonna get anyway. + capacity.next_power_of_two() +}; + impl Metrics { pub fn new( log: &slog::Logger, @@ -113,15 +152,28 @@ impl Metrics { ) -> anyhow::Result { let &MgsArguments { id, rack_id, ref addresses, metrics_args } = args; let registry = ProducerRegistry::with_id(id); - let samples = Arc::new(Mutex::new(Vec::new())); + + // Create a channel for the SP poller tasks to send samples to the + // Oximeter producer endpoint. + // + // A broadcast channel is used here, not because we are actually + // multi-consumer (`Producer::produce` is never called concurrently), + // but because the broadcast channel has properly ring-buffer-like + // behavior, where earlier messages are discarded, rather than exerting + // backpressure on senders (as Tokio's MPSC channel does). This + // is what we want, as we would prefer a full buffer to result in + // clobbering the oldest measurements, rather than leaving the newest + // ones on the floor. 
+ let (sample_tx, sample_rx) = + broadcast::channel(SAMPLE_CHANNEL_CAPACITY); registry - .register_producer(Producer(samples.clone())) + .register_producer(Producer(sample_rx)) .context("failed to register metrics producer")?; // Using a channel for this is, admittedly, a bit of an end-run around // the `OnceLock` on the `ServerContext` that *also* stores the rack ID, - // but it has the nice benefit of allowing the `Poller` task to _await_ + // but it has the nice benefit of allowing the `PollerManager` task to _await_ // the rack ID being set...we might want to change other code to use a // similar approach in the future. let (rack_id_tx, rack_id_rx) = oneshot::channel(); @@ -135,9 +187,10 @@ impl Metrics { Some(rack_id_tx) }; let poller = tokio::spawn( - Poller { - samples, + PollerManager { + sample_tx, apictx, + tasks: tokio::task::JoinSet::new(), log: log.new(slog::o!("component" => "sensor-poller")), mgs_id: id, } @@ -198,16 +251,37 @@ impl oximeter::Producer for Producer { fn produce( &mut self, ) -> Result>, MetricsError> { - let samples = { - let mut lock = self.0.lock().unwrap(); - std::mem::take(&mut *lock) - }; + // Drain all samples currently in the queue into a `Vec`. + // + // N.B. it may be tempting to pursue an alternative design where we + // implement `Iterator` for a `broadcast::Receiver>` and + // just return that using `Receiver::resubscribe`...DON'T DO THAT! The + // `resubscribe` function creates a receiver at the current *tail* of + // the ringbuffer, so it won't see any samples produced *before* now. + // Which is the opposite of what we want! + let mut samples = Vec::with_capacity(self.0.len()); + use broadcast::error::TryRecvError; + loop { + match self.0.try_recv() { + Ok(sample_chunk) => samples.push(sample_chunk), + // This error indicates that an old ringbuffer entry was + // overwritten. That's fine, just get the next one. + Err(TryRecvError::Lagged(_)) => continue, + // We've drained all currently available samples! 
We're done here! + Err(TryRecvError::Empty) | Err(TryRecvError::Closed) => break, + } + } + + // There you go, that's all I've got. Ok(Box::new(samples.into_iter().flatten())) } } -impl Poller { - async fn run(self, rack_id: oneshot::Receiver) -> anyhow::Result<()> { +impl PollerManager { + async fn run( + mut self, + rack_id: oneshot::Receiver, + ) -> anyhow::Result<()> { let switch = &self.apictx.mgmt_switch; // First, wait until we know what the rack ID is... @@ -216,19 +290,22 @@ impl Poller { )?; let mut poll_interval = tokio::time::interval(POLL_INTERVAL); - let mut sps_as_i_understand_them = HashMap::new(); - let mut tasks = tokio::task::JoinSet::new(); - loop { + let mut known_sps: HashMap = + HashMap::with_capacity(NORMAL_NUMBER_OF_SPS); + // Wait for SP discovery to complete, if it hasn't already. + // TODO(eliza): presently, we busy-poll here. It would be nicer to + // replace the `OnceLock` in `ManagementSwitch` + // with a `tokio::sync::watch` + while !switch.is_discovery_complete() { poll_interval.tick().await; + } - // Wait for SP discovery to complete, if it hasn't already. - // TODO(eliza): presently, we busy-poll here. It would be nicer to - // replace the `OnceLock` in `ManagementSwitch` - // with a `tokio::sync::watch` - if !switch.is_discovery_complete() { - continue; - } + slog::info!( + &self.log, + "SP discovery complete! starting to poll sensors..." + ); + loop { let sps = match switch.all_sps() { Ok(sps) => sps, Err(e) => { @@ -237,264 +314,314 @@ impl Poller { "failed to enumerate service processors! will try again in a bit"; "error" => %e, ); + poll_interval.tick().await; continue; } }; for (spid, _) in sps { - let understanding = sps_as_i_understand_them - .entry(spid) - .or_insert_with(|| { - slog::debug!( + // Do we know about this li'l guy already? + match known_sps.get(&spid) { + // Okay, and has it got someone checking up on it? Right? + Some(poller) if poller.is_finished() => { + // Welp. 
+ slog::info!( &self.log, - "found a new little friend!"; + "uh-oh! a known SP's poller task has gone AWOL. restarting it..."; "sp_slot" => ?spid.slot, "chassis_type" => ?spid.typ, ); - Arc::new(SpPoller { - spid, - rack_id, - mgs_id: self.mgs_id, - apictx: self.apictx.clone(), - log: self.log.new(slog::o!( - "sp_slot" => spid.slot, - "chassis_type" => format!("{:?}", spid.typ), - )), - my_understanding: Mutex::new(Default::default()), - }) - }) - .clone(); - tasks.spawn(understanding.clone().poll_sp()); - } - - while let Some(result) = tasks.join_next().await { - match result { - Ok(Ok(samples)) => { - // No sense copying all the samples into the big vec thing, - // just push the vec instead. - self.samples.lock().unwrap().push(samples); } - Ok(Err(error)) => { - // TODO(eliza): actually handle errors polling a SP - // nicely... - slog::error!( + Some(_) => continue, + None => { + slog::info!( &self.log, - "something bad happened"; - "error" => %error, + "found a new little friend!"; + "sp_slot" => ?spid.slot, + "chassis_type" => ?spid.typ, ); } - Err(_) => { - unreachable!( - "tasks on the joinset never get aborted, and we \ - compile with panic=abort, so they won't panic" - ) - } } + + let poller = SpPoller { + spid, + rack_id, + mgs_id: self.mgs_id, + apictx: self.apictx.clone(), + log: self.log.new(slog::o!( + "sp_slot" => spid.slot, + "chassis_type" => format!("{:?}", spid.typ), + )), + devices: Vec::new(), + sample_tx: self.sample_tx.clone(), + }; + let poller_handle = self.tasks.spawn(poller.run(POLL_INTERVAL)); + let _prev_poller = known_sps.insert(spid, poller_handle); + debug_assert!( + _prev_poller.map(|p| p.is_finished()).unwrap_or(true), + "if we clobbered an existing poller task, it better have \ + been because it was dead..." + ); + } + + // All pollers started! Now wait to see if any of them have died... 
+ let mut err = self.tasks.join_next().await; + while let Some(Ok(Err(error))) = err { + // TODO(eliza): actually handle errors polling a SP + // nicely... + slog::error!( + &self.log, + "something bad happened while polling a SP..."; + "error" => %error, + ); + // drain any remaining errors + err = self.tasks.try_join_next(); } } } } +impl Drop for PollerManager { + fn drop(&mut self) { + // This is why the `JoinSet` is a field on the `PollerManager` struct + // rather than a local variable in `async fn run()`! + self.tasks.abort_all(); + } +} + impl SpPoller { - async fn poll_sp(self: Arc) -> anyhow::Result> { + async fn run(mut self, poll_interval: Duration) -> anyhow::Result<()> { let switch = &self.apictx.mgmt_switch; let sp = switch.sp(self.spid)?; - // Check if the SP's state has changed. If it has, we need to make sure - // we still know what all of its sensors are. - let current_state = sp.state().await?; - let known_state = self.my_understanding.lock().unwrap().state.clone(); - - let devices = if Some(¤t_state) != known_state.as_ref() { - slog::debug!( - &self.log, - "our little friend seems to have changed in some kind of way"; - "current_state" => ?current_state, - "known_state" => ?known_state, - ); - let inv_devices = sp.inventory().await?.devices; - let mut devices: Vec<(SpComponent, component::Component)> = - Vec::with_capacity(inv_devices.len()); - - // Reimplement this ourselves because we don't really care about - // reading the RoT state at present. This is unfortunately copied - // from `gateway_messages`. - fn stringify_byte_string(bytes: &[u8]) -> String { - // We expect serial and model numbers to be ASCII and 0-padded: find the first 0 - // byte and convert to a string. If that fails, hexlify the entire slice. 
- let first_zero = - bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len()); - - std::str::from_utf8(&bytes[..first_zero]) - .map(|s| s.to_string()) - .unwrap_or_else(|_err| hex::encode(bytes)) - } - let (model, serial, hubris_archive_id, revision) = - match current_state { - VersionedSpState::V1(v) => ( - stringify_byte_string(&v.model), - stringify_byte_string(&v.serial_number[..]), - hex::encode(v.hubris_archive_id), - v.revision, - ), - VersionedSpState::V2(v) => ( - stringify_byte_string(&v.model), - stringify_byte_string(&v.serial_number[..]), - hex::encode(v.hubris_archive_id), - v.revision, - ), - VersionedSpState::V3(v) => ( - stringify_byte_string(&v.model), - stringify_byte_string(&v.serial_number[..]), - hex::encode(v.hubris_archive_id), - v.revision, - ), - }; - for dev in inv_devices { - // Skip devices which have nothing interesting for us. - if !dev - .capabilities - .contains(DeviceCapabilities::HAS_MEASUREMENT_CHANNELS) - { - continue; + let mut interval = tokio::time::interval(poll_interval); + let mut known_state = None; + loop { + interval.tick().await; + slog::trace!(&self.log, "interval elapsed, polling SP..."); + + // Check if the SP's state has changed. If it has, we need to make sure + // we still know what all of its sensors are. + let current_state = sp.state().await?; + if Some(¤t_state) != known_state.as_ref() { + // The SP's state appears to have changed. Time to make sure our + // understanding of its devices and identity is up to date! + slog::debug!( + &self.log, + "our little friend seems to have changed in some kind of way"; + "current_state" => ?current_state, + "known_state" => ?known_state, + ); + let inv_devices = sp.inventory().await?.devices; + + // Clear out any previously-known devices, and preallocate capacity + // for all the new ones. + self.devices.clear(); + self.devices.reserve(inv_devices.len()); + + // Reimplement this ourselves because we don't really care about + // reading the RoT state at present. 
This is unfortunately copied + // from `gateway_messages`. + fn stringify_byte_string(bytes: &[u8]) -> String { + // We expect serial and model numbers to be ASCII and 0-padded: find the first 0 + // byte and convert to a string. If that fails, hexlify the entire slice. + let first_zero = bytes + .iter() + .position(|&b| b == 0) + .unwrap_or(bytes.len()); + + std::str::from_utf8(&bytes[..first_zero]) + .map(|s| s.to_string()) + .unwrap_or_else(|_err| hex::encode(bytes)) } - let component_name = match dev.component.as_str() { - Some(c) => c, - None => { - // These are supposed to always be strings. But, if we - // see one that's not a string, bail instead of panicking. - slog::error!(&self.log, "a SP component ID was not a string! this isn't supposed to happen!"; "device" => ?dev); - anyhow::bail!("a SP component ID was not stringy!"); + let (model, serial, hubris_archive_id, revision) = + match current_state { + VersionedSpState::V1(ref v) => ( + stringify_byte_string(&v.model), + stringify_byte_string(&v.serial_number[..]), + hex::encode(v.hubris_archive_id), + v.revision, + ), + VersionedSpState::V2(ref v) => ( + stringify_byte_string(&v.model), + stringify_byte_string(&v.serial_number[..]), + hex::encode(v.hubris_archive_id), + v.revision, + ), + VersionedSpState::V3(ref v) => ( + stringify_byte_string(&v.model), + stringify_byte_string(&v.serial_number[..]), + hex::encode(v.hubris_archive_id), + v.revision, + ), + }; + for dev in inv_devices { + // Skip devices which have nothing interesting for us. + if !dev + .capabilities + .contains(DeviceCapabilities::HAS_MEASUREMENT_CHANNELS) + { + continue; } - }; - // TODO(eliza): i hate having to clone all these strings for - // every device on the SP...it would be cool if Oximeter let us - // reference count them... 
- let target = component::Component { - chassis_type: match self.spid.typ { - SpType::Sled => Cow::Borrowed("sled"), - SpType::Switch => Cow::Borrowed("switch"), - SpType::Power => Cow::Borrowed("power"), - }, - slot: self.spid.slot as u32, - component: Cow::Owned(component_name.to_string()), - device: Cow::Owned(dev.device), - model: Cow::Owned(model.clone()), - revision, - serial: Cow::Owned(serial.clone()), - rack_id: self.rack_id, - gateway_id: self.mgs_id, - hubris_archive_id: Cow::Owned(hubris_archive_id.clone()), - }; - devices.push((dev.component, target)) + let component_name = match dev.component.as_str() { + Some(c) => c, + None => { + // These are supposed to always be strings. But, if we + // see one that's not a string, bail instead of panicking. + slog::error!( + &self.log, + "a SP component ID was not a string! this isn't supposed to happen!"; + "device" => ?dev, + ); + anyhow::bail!("a SP component ID was not stringy!"); + } + }; + // TODO(eliza): i hate having to clone all these strings for + // every device on the SP...it would be cool if Oximeter let us + // reference count them... + let target = component::Component { + chassis_type: match self.spid.typ { + SpType::Sled => Cow::Borrowed("sled"), + SpType::Switch => Cow::Borrowed("switch"), + SpType::Power => Cow::Borrowed("power"), + }, + slot: self.spid.slot as u32, + component: Cow::Owned(component_name.to_string()), + device: Cow::Owned(dev.device), + model: Cow::Owned(model.clone()), + revision, + serial: Cow::Owned(serial.clone()), + rack_id: self.rack_id, + gateway_id: self.mgs_id, + hubris_archive_id: Cow::Owned( + hubris_archive_id.clone(), + ), + }; + self.devices.push((dev.component, target)) + } + + known_state = Some(current_state); } - devices - } else { - // This is a bit goofy, but we have to release the lock before - // hitting any `await` points, so just move the inventory out of it. - // We'll put it back when we're done. 
This lock is *actually* owned - // exclusively by this `SpPoller`, but since it lives in a HashMap, - // rust doesn't understand that. - std::mem::take(&mut self.my_understanding.lock().unwrap().devices) - }; - let mut samples = Vec::with_capacity(devices.len()); - for (c, target) in &devices { - let details = match sp.component_details(*c).await { - Ok(deets) => deets, - Err(error) => { + let mut samples = Vec::with_capacity(self.devices.len()); + for (c, target) in &self.devices { + let details = match sp.component_details(*c).await { + Ok(deets) => deets, + Err(error) => { + slog::warn!( + &self.log, + "failed to read details on SP component"; + "sp_component" => %c, + "error" => %error, + ); + // TODO(eliza): we should increment a metric here... + continue; + } + }; + if details.entries.is_empty() { slog::warn!( &self.log, - "failed to read details on SP component"; + "a component which claimed to have measurement channels \ + had empty details. this seems weird..."; "sp_component" => %c, - "error" => %error, ); - // TODO(eliza): we should increment a metric here... - continue; } - }; - if details.entries.is_empty() { - slog::warn!( + for d in details.entries { + let ComponentDetails::Measurement(m) = d else { + // If the component details are switch port details rather + // than measurement channels, ignore it for now. 
+ continue; + }; + let name = Cow::Owned(m.name); + let sample = match (m.value, m.kind) { + (Ok(datum), MeasurementKind::Temperature) => { + Sample::new( + target, + &component::Temperature { name, datum }, + ) + } + (Ok(datum), MeasurementKind::Current) => Sample::new( + target, + &component::Current { name, datum }, + ), + (Ok(datum), MeasurementKind::Voltage) => Sample::new( + target, + &component::Voltage { name, datum }, + ), + (Ok(datum), MeasurementKind::Power) => Sample::new( + target, + &component::Power { name, datum }, + ), + (Ok(datum), MeasurementKind::InputCurrent) => { + Sample::new( + target, + &component::InputCurrent { name, datum }, + ) + } + (Ok(datum), MeasurementKind::InputVoltage) => { + Sample::new( + target, + &component::InputVoltage { name, datum }, + ) + } + (Ok(datum), MeasurementKind::Speed) => Sample::new( + target, + &component::FanSpeed { name, datum }, + ), + (Err(e), kind) => { + let sensor_kind = match kind { + MeasurementKind::Temperature => "temperature", + MeasurementKind::Current => "current", + MeasurementKind::Voltage => "voltage", + MeasurementKind::Power => "power", + MeasurementKind::InputCurrent => { + "input_current" + } + MeasurementKind::InputVoltage => { + "input_voltage" + } + MeasurementKind::Speed => "fan_speed", + }; + let error = match e { + MeasurementError::InvalidSensor => { + "invalid_sensor" + } + MeasurementError::NoReading => "no_reading", + MeasurementError::NotPresent => "not_present", + MeasurementError::DeviceError => "device_error", + MeasurementError::DeviceUnavailable => { + "device_unavailable" + } + MeasurementError::DeviceTimeout => { + "device_timeout" + } + MeasurementError::DeviceOff => "device_off", + }; + Sample::new( + target, + &component::SensorErrorCount { + error: Cow::Borrowed(error), + name, + datum: oximeter::types::Cumulative::new(1), + sensor_kind: Cow::Borrowed(sensor_kind), + }, + ) + } + }?; + samples.push(sample); + } + } + // No sense cluttering the ringbuffer with empty 
vecs... + if samples.is_empty() { + continue; + } + if let Err(_) = self.sample_tx.send(samples) { + slog::info!( &self.log, - "a component which claimed to have measurement channels \ - had empty details. this seems weird..."; - "sp_component" => %c, + "all sample receiver handles have been dropped! presumably we are shutting down..."; ); - } - for d in details.entries { - let ComponentDetails::Measurement(m) = d else { - // If the component details are switch port details rather - // than measurement channels, ignore it for now. - continue; - }; - let name = Cow::Owned(m.name); - let sample = match (m.value, m.kind) { - (Ok(datum), MeasurementKind::Temperature) => Sample::new( - target, - &component::Temperature { name, datum }, - ), - (Ok(datum), MeasurementKind::Current) => { - Sample::new(target, &component::Current { name, datum }) - } - (Ok(datum), MeasurementKind::Voltage) => { - Sample::new(target, &component::Voltage { name, datum }) - } - (Ok(datum), MeasurementKind::Power) => { - Sample::new(target, &component::Power { name, datum }) - } - (Ok(datum), MeasurementKind::InputCurrent) => Sample::new( - target, - &component::InputCurrent { name, datum }, - ), - (Ok(datum), MeasurementKind::InputVoltage) => Sample::new( - target, - &component::InputVoltage { name, datum }, - ), - (Ok(datum), MeasurementKind::Speed) => Sample::new( - target, - &component::FanSpeed { name, datum }, - ), - (Err(e), kind) => { - let sensor_kind = match kind { - MeasurementKind::Temperature => "temperature", - MeasurementKind::Current => "current", - MeasurementKind::Voltage => "voltage", - MeasurementKind::Power => "power", - MeasurementKind::InputCurrent => "input_current", - MeasurementKind::InputVoltage => "input_voltage", - MeasurementKind::Speed => "fan_speed", - }; - let error = match e { - MeasurementError::InvalidSensor => "invalid_sensor", - MeasurementError::NoReading => "no_reading", - MeasurementError::NotPresent => "not_present", - MeasurementError::DeviceError => 
"device_error", - MeasurementError::DeviceUnavailable => { - "device_unavailable" - } - MeasurementError::DeviceTimeout => "device_timeout", - MeasurementError::DeviceOff => "device_off", - }; - Sample::new( - target, - &component::SensorErrorCount { - error: Cow::Borrowed(error), - name, - datum: oximeter::types::Cumulative::new(1), - sensor_kind: Cow::Borrowed(sensor_kind), - }, - ) - } - }?; - samples.push(sample); + return Ok(()); } } - - // Update our understanding again. - let mut understanding = self.my_understanding.lock().unwrap(); - understanding.devices = devices; - understanding.state = Some(current_state); - - Ok(samples) } } From c4402b9d03c23263bad06a8de61a00fb0bda9a9b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 17 Aug 2024 11:33:15 -0700 Subject: [PATCH 29/77] i guess we have to increment cumulative metrics? --- gateway/src/metrics.rs | 78 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 8 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 8ad76533d9..a5bf134d4d 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -13,11 +13,13 @@ use gateway_sp_comms::SpComponent; use gateway_sp_comms::VersionedSpState; use omicron_common::api::internal::nexus::ProducerEndpoint; use omicron_common::api::internal::nexus::ProducerKind; +use oximeter::types::Cumulative; use oximeter::types::ProducerRegistry; use oximeter::types::Sample; use oximeter::MetricsError; use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::hash_map; +use std::collections::hash_map::HashMap; use std::net::IpAddr; use std::net::SocketAddr; use std::net::SocketAddrV6; @@ -77,13 +79,25 @@ struct PollerManager { struct SpPoller { apictx: Arc, spid: SpIdentifier, - devices: Vec<(SpComponent, component::Component)>, + devices: HashMap, log: slog::Logger, rack_id: Uuid, mgs_id: Uuid, sample_tx: broadcast::Sender>, } +struct PollerDevice { + target: component::Component, + sensor_errors: 
HashMap>, +} + +#[derive(Eq, PartialEq, Hash)] +struct SensorErrorKey { + name: Cow<'static, str>, + kind: &'static str, + error: &'static str, +} + /// Manages a metrics server and stuff. struct ServerManager { log: slog::Logger, @@ -352,7 +366,7 @@ impl PollerManager { "sp_slot" => spid.slot, "chassis_type" => format!("{:?}", spid.typ), )), - devices: Vec::new(), + devices: HashMap::new(), sample_tx: self.sample_tx.clone(), }; let poller_handle = self.tasks.spawn(poller.run(POLL_INTERVAL)); @@ -395,6 +409,7 @@ impl SpPoller { let sp = switch.sp(self.spid)?; let mut interval = tokio::time::interval(poll_interval); let mut known_state = None; + loop { interval.tick().await; slog::trace!(&self.log, "interval elapsed, polling SP..."); @@ -496,14 +511,47 @@ impl SpPoller { hubris_archive_id.clone(), ), }; - self.devices.push((dev.component, target)) + match self.devices.entry(dev.component) { + // Found a new device! + hash_map::Entry::Vacant(entry) => { + slog::debug!( + &self.log, + "discovered a new component!"; + "component" => ?dev.component, + ); + entry.insert(PollerDevice { + target, + sensor_errors: HashMap::new(), + }); + } + // We previously had a known device for this thing, but + // the metrics target has changed, so we should reset + // its cumulative metrics. + hash_map::Entry::Occupied(mut entry) + if entry.get().target != target => + { + slog::trace!( + &self.log, + "target has changed, resetting cumulative metrics for component."; + "component" => ?dev.component, + ); + entry.insert(PollerDevice { + target, + sensor_errors: HashMap::new(), + }); + } + + // The target for this device hasn't changed, don't reset it. 
+ hash_map::Entry::Occupied(_) => {} + } } known_state = Some(current_state); } let mut samples = Vec::with_capacity(self.devices.len()); - for (c, target) in &self.devices { + for (c, PollerDevice { target, sensor_errors }) in &mut self.devices + { let details = match sp.component_details(*c).await { Ok(deets) => deets, Err(error) => { @@ -568,7 +616,7 @@ impl SpPoller { &component::FanSpeed { name, datum }, ), (Err(e), kind) => { - let sensor_kind = match kind { + let kind = match kind { MeasurementKind::Temperature => "temperature", MeasurementKind::Current => "current", MeasurementKind::Voltage => "voltage", @@ -596,13 +644,27 @@ impl SpPoller { } MeasurementError::DeviceOff => "device_off", }; + let datum = sensor_errors + .entry(SensorErrorKey { + name: name.clone(), + kind, + error, + }) + .or_insert(Cumulative::new(0)); + // TODO(eliza): perhaps we should treat this as + // "level-triggered" and only increment the counter + // when the sensor has *changed* to an errored + // state after we have seen at least one good + // measurement from it since the last time the error + // was observed? 
+ datum.increment(); Sample::new( target, &component::SensorErrorCount { error: Cow::Borrowed(error), name, - datum: oximeter::types::Cumulative::new(1), - sensor_kind: Cow::Borrowed(sensor_kind), + datum: *datum, + sensor_kind: Cow::Borrowed(kind), }, ) } From 3de80cb7656b8d56475c5758c7f4a8c9f3c413aa Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 17 Aug 2024 11:40:37 -0700 Subject: [PATCH 30/77] start doing poll error metrics too --- gateway/src/metrics.rs | 28 +++++++++++-------- .../oximeter/schema/sensor-measurement.toml | 10 +++++++ 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index a5bf134d4d..1ed7bafe6f 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -79,16 +79,19 @@ struct PollerManager { struct SpPoller { apictx: Arc, spid: SpIdentifier, - devices: HashMap, + components: HashMap, log: slog::Logger, rack_id: Uuid, mgs_id: Uuid, sample_tx: broadcast::Sender>, } -struct PollerDevice { +struct ComponentMetrics { target: component::Component, + /// Counts of errors reported by sensors on this component. sensor_errors: HashMap>, + /// Counts of errors that occurred whilst polling the d + poll_errors: HashMap<&'static str, Cumulative>, } #[derive(Eq, PartialEq, Hash)] @@ -366,7 +369,7 @@ impl PollerManager { "sp_slot" => spid.slot, "chassis_type" => format!("{:?}", spid.typ), )), - devices: HashMap::new(), + components: HashMap::new(), sample_tx: self.sample_tx.clone(), }; let poller_handle = self.tasks.spawn(poller.run(POLL_INTERVAL)); @@ -430,8 +433,8 @@ impl SpPoller { // Clear out any previously-known devices, and preallocate capacity // for all the new ones. - self.devices.clear(); - self.devices.reserve(inv_devices.len()); + self.components.clear(); + self.components.reserve(inv_devices.len()); // Reimplement this ourselves because we don't really care about // reading the RoT state at present. 
This is unfortunately copied @@ -511,7 +514,7 @@ impl SpPoller { hubris_archive_id.clone(), ), }; - match self.devices.entry(dev.component) { + match self.components.entry(dev.component) { // Found a new device! hash_map::Entry::Vacant(entry) => { slog::debug!( @@ -519,9 +522,10 @@ impl SpPoller { "discovered a new component!"; "component" => ?dev.component, ); - entry.insert(PollerDevice { + entry.insert(ComponentMetrics { target, sensor_errors: HashMap::new(), + poll_errors: HashMap::new(), }); } // We previously had a known device for this thing, but @@ -532,12 +536,13 @@ impl SpPoller { { slog::trace!( &self.log, - "target has changed, resetting cumulative metrics for component."; + "target has changed, resetting cumulative metrics for component"; "component" => ?dev.component, ); - entry.insert(PollerDevice { + entry.insert(ComponentMetrics { target, sensor_errors: HashMap::new(), + poll_errors: HashMap::new(), }); } @@ -549,8 +554,9 @@ impl SpPoller { known_state = Some(current_state); } - let mut samples = Vec::with_capacity(self.devices.len()); - for (c, PollerDevice { target, sensor_errors }) in &mut self.devices + let mut samples = Vec::with_capacity(self.components.len()); + for (c, ComponentMetrics { target, sensor_errors, poll_errors }) in + &mut self.components { let details = match sp.component_details(*c).await { Ok(deets) => deets, diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index 09a64191d1..bba1e3667f 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -158,3 +158,13 @@ datum_type = "cumulative_u64" versions = [ { added_in = 1, fields = ["name", "error", "sensor_kind"]} ] + +[[metrics]] +name = "poll_error_count" +description = """ +Cumulative count of errors encountered whilst polling a component's sensors.""" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = ["error"] } 
+] From 74d82e1a302df6226962db1477a5ed71a7ce620f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 17 Aug 2024 13:56:17 -0700 Subject: [PATCH 31/77] redo error handling, record poll error metrics --- gateway/src/metrics.rs | 666 ++++++++++++++++++++++++----------------- 1 file changed, 393 insertions(+), 273 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 1ed7bafe6f..14ed2782d2 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -1,14 +1,18 @@ // This Source Code Form is subject to the terms of the Mozilla Public // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use crate::error::CommunicationError; +use crate::error::SpCommsError; use crate::management_switch::SpIdentifier; use crate::management_switch::SpType; +use crate::MgsArguments; use crate::ServerContext; use anyhow::Context; use gateway_messages::measurement::MeasurementError; use gateway_messages::measurement::MeasurementKind; use gateway_messages::ComponentDetails; use gateway_messages::DeviceCapabilities; +use gateway_sp_comms::SingleSp; use gateway_sp_comms::SpComponent; use gateway_sp_comms::VersionedSpState; use omicron_common::api::internal::nexus::ProducerEndpoint; @@ -31,8 +35,6 @@ use tokio::sync::watch; use tokio::task::JoinHandle; use uuid::Uuid; -use crate::MgsArguments; - oximeter::use_timeseries!("sensor-measurement.toml"); /// Handle to the metrics task. @@ -69,7 +71,7 @@ struct PollerManager { apictx: Arc, mgs_id: Uuid, /// Poller tasks - tasks: tokio::task::JoinSet>, + tasks: tokio::task::JoinSet>, /// The manager doesn't actually produce samples, but it needs to be able to /// clone a sender for every poller task it spawns. sample_tx: broadcast::Sender>, @@ -77,8 +79,8 @@ struct PollerManager { /// Polls sensor readings from an individual SP. 
struct SpPoller { - apictx: Arc, spid: SpIdentifier, + known_state: Option, components: HashMap, log: slog::Logger, rack_id: Uuid, @@ -364,15 +366,17 @@ impl PollerManager { spid, rack_id, mgs_id: self.mgs_id, - apictx: self.apictx.clone(), log: self.log.new(slog::o!( "sp_slot" => spid.slot, "chassis_type" => format!("{:?}", spid.typ), )), components: HashMap::new(), + known_state: None, sample_tx: self.sample_tx.clone(), }; - let poller_handle = self.tasks.spawn(poller.run(POLL_INTERVAL)); + let poller_handle = self + .tasks + .spawn(poller.run(POLL_INTERVAL, self.apictx.clone())); let _prev_poller = known_sps.insert(spid, poller_handle); debug_assert!( _prev_poller.map(|p| p.is_finished()).unwrap_or(true), @@ -382,17 +386,27 @@ impl PollerManager { } // All pollers started! Now wait to see if any of them have died... - let mut err = self.tasks.join_next().await; - while let Some(Ok(Err(error))) = err { - // TODO(eliza): actually handle errors polling a SP - // nicely... - slog::error!( - &self.log, - "something bad happened while polling a SP..."; - "error" => %error, - ); + let mut joined = self.tasks.join_next().await; + while let Some(result) = joined { + if let Err(e) = result { + if cfg!(debug_assertions) { + unreachable!( + "we compile with `panic=\"abort\"`, so a spawned task \ + panicking should abort the whole process..." + ); + } else { + slog::error!( + &self.log, + "a spawned SP poller task panicked! 
this should \ + never happen: we compile with `panic=\"abort\"`, so \ + a spawned task panicking should abort the whole \ + process..."; + "error" => %e, + ); + } + } // drain any remaining errors - err = self.tasks.try_join_next(); + joined = self.tasks.try_join_next(); } } } @@ -407,289 +421,336 @@ impl Drop for PollerManager { } impl SpPoller { - async fn run(mut self, poll_interval: Duration) -> anyhow::Result<()> { - let switch = &self.apictx.mgmt_switch; - let sp = switch.sp(self.spid)?; + async fn run( + mut self, + poll_interval: Duration, + apictx: Arc, + ) -> Result<(), SpCommsError> { let mut interval = tokio::time::interval(poll_interval); - let mut known_state = None; + let switch = &apictx.mgmt_switch; + let sp = switch.sp(self.spid)?; loop { interval.tick().await; slog::trace!(&self.log, "interval elapsed, polling SP..."); - // Check if the SP's state has changed. If it has, we need to make sure - // we still know what all of its sensors are. - let current_state = sp.state().await?; - if Some(¤t_state) != known_state.as_ref() { - // The SP's state appears to have changed. Time to make sure our - // understanding of its devices and identity is up to date! - slog::debug!( - &self.log, - "our little friend seems to have changed in some kind of way"; - "current_state" => ?current_state, - "known_state" => ?known_state, - ); - let inv_devices = sp.inventory().await?.devices; - - // Clear out any previously-known devices, and preallocate capacity - // for all the new ones. - self.components.clear(); - self.components.reserve(inv_devices.len()); - - // Reimplement this ourselves because we don't really care about - // reading the RoT state at present. This is unfortunately copied - // from `gateway_messages`. - fn stringify_byte_string(bytes: &[u8]) -> String { - // We expect serial and model numbers to be ASCII and 0-padded: find the first 0 - // byte and convert to a string. If that fails, hexlify the entire slice. 
- let first_zero = bytes - .iter() - .position(|&b| b == 0) - .unwrap_or(bytes.len()); - - std::str::from_utf8(&bytes[..first_zero]) - .map(|s| s.to_string()) - .unwrap_or_else(|_err| hex::encode(bytes)) + match self.poll(sp).await { + // No sense cluttering the ringbuffer with empty vecs... + Ok(samples) if samples.is_empty() => { + slog::trace!(&self.log, "polled SP, no samples returned"; "num_samples" => 0usize); } - let (model, serial, hubris_archive_id, revision) = - match current_state { - VersionedSpState::V1(ref v) => ( - stringify_byte_string(&v.model), - stringify_byte_string(&v.serial_number[..]), - hex::encode(v.hubris_archive_id), - v.revision, - ), - VersionedSpState::V2(ref v) => ( - stringify_byte_string(&v.model), - stringify_byte_string(&v.serial_number[..]), - hex::encode(v.hubris_archive_id), - v.revision, - ), - VersionedSpState::V3(ref v) => ( - stringify_byte_string(&v.model), - stringify_byte_string(&v.serial_number[..]), - hex::encode(v.hubris_archive_id), - v.revision, - ), - }; - for dev in inv_devices { - // Skip devices which have nothing interesting for us. - if !dev - .capabilities - .contains(DeviceCapabilities::HAS_MEASUREMENT_CHANNELS) - { - continue; - } - let component_name = match dev.component.as_str() { - Some(c) => c, - None => { - // These are supposed to always be strings. But, if we - // see one that's not a string, bail instead of panicking. - slog::error!( - &self.log, - "a SP component ID was not a string! this isn't supposed to happen!"; - "device" => ?dev, - ); - anyhow::bail!("a SP component ID was not stringy!"); - } - }; - // TODO(eliza): i hate having to clone all these strings for - // every device on the SP...it would be cool if Oximeter let us - // reference count them... 
- let target = component::Component { - chassis_type: match self.spid.typ { - SpType::Sled => Cow::Borrowed("sled"), - SpType::Switch => Cow::Borrowed("switch"), - SpType::Power => Cow::Borrowed("power"), - }, - slot: self.spid.slot as u32, - component: Cow::Owned(component_name.to_string()), - device: Cow::Owned(dev.device), - model: Cow::Owned(model.clone()), - revision, - serial: Cow::Owned(serial.clone()), - rack_id: self.rack_id, - gateway_id: self.mgs_id, - hubris_archive_id: Cow::Owned( - hubris_archive_id.clone(), - ), - }; - match self.components.entry(dev.component) { - // Found a new device! - hash_map::Entry::Vacant(entry) => { - slog::debug!( - &self.log, - "discovered a new component!"; - "component" => ?dev.component, - ); - entry.insert(ComponentMetrics { - target, - sensor_errors: HashMap::new(), - poll_errors: HashMap::new(), - }); - } - // We previously had a known device for this thing, but - // the metrics target has changed, so we should reset - // its cumulative metrics. - hash_map::Entry::Occupied(mut entry) - if entry.get().target != target => - { - slog::trace!( - &self.log, - "target has changed, resetting cumulative metrics for component"; - "component" => ?dev.component, - ); - entry.insert(ComponentMetrics { - target, - sensor_errors: HashMap::new(), - poll_errors: HashMap::new(), - }); - } + Ok(samples) => { + slog::trace!(&self.log, "polled SP successfully"; "num_samples" => samples.len()); - // The target for this device hasn't changed, don't reset it. - hash_map::Entry::Occupied(_) => {} + if let Err(_) = self.sample_tx.send(samples) { + slog::info!( + &self.log, + "all sample receiver handles have been dropped! \ + presumably we are shutting down..."; + ); + return Ok(()); } } - - known_state = Some(current_state); + Err(CommunicationError::NoSpDiscovered) => { + slog::info!( + &self.log, + "our SP seems to no longer be present; giving up." 
+ ); + return Ok(()); + } + Err(error) => { + slog::warn!( + &self.log, + "failed to poll SP, will try again momentarily..."; + "error" => %error, + ); + // TODO(eliza): we should probably have a metric for failed + // SP polls. + } } + } + } - let mut samples = Vec::with_capacity(self.components.len()); - for (c, ComponentMetrics { target, sensor_errors, poll_errors }) in - &mut self.components - { - let details = match sp.component_details(*c).await { - Ok(deets) => deets, - Err(error) => { + async fn poll( + &mut self, + sp: &SingleSp, + ) -> Result, CommunicationError> { + // Check if the SP's state has changed. If it has, we need to make sure + // we still know what all of its sensors are. + let current_state = sp.state().await?; + if Some(¤t_state) != self.known_state.as_ref() { + // The SP's state appears to have changed. Time to make sure our + // understanding of its devices and identity is up to date! + slog::debug!( + &self.log, + "our little friend seems to have changed in some kind of way"; + "current_state" => ?current_state, + "known_state" => ?self.known_state, + ); + let inv_devices = sp.inventory().await?.devices; + + // Clear out any previously-known devices, and preallocate capacity + // for all the new ones. + self.components.clear(); + self.components.reserve(inv_devices.len()); + + // Reimplement this ourselves because we don't really care about + // reading the RoT state at present. This is unfortunately copied + // from `gateway_messages`. + fn stringify_byte_string(bytes: &[u8]) -> String { + // We expect serial and model numbers to be ASCII and 0-padded: find the first 0 + // byte and convert to a string. If that fails, hexlify the entire slice. 
+ let first_zero = + bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len()); + + std::str::from_utf8(&bytes[..first_zero]) + .map(|s| s.to_string()) + .unwrap_or_else(|_err| hex::encode(bytes)) + } + let (model, serial, hubris_archive_id, revision) = + match current_state { + VersionedSpState::V1(ref v) => ( + stringify_byte_string(&v.model), + stringify_byte_string(&v.serial_number[..]), + hex::encode(v.hubris_archive_id), + v.revision, + ), + VersionedSpState::V2(ref v) => ( + stringify_byte_string(&v.model), + stringify_byte_string(&v.serial_number[..]), + hex::encode(v.hubris_archive_id), + v.revision, + ), + VersionedSpState::V3(ref v) => ( + stringify_byte_string(&v.model), + stringify_byte_string(&v.serial_number[..]), + hex::encode(v.hubris_archive_id), + v.revision, + ), + }; + for dev in inv_devices { + // Skip devices which have nothing interesting for us. + if !dev + .capabilities + .contains(DeviceCapabilities::HAS_MEASUREMENT_CHANNELS) + { + continue; + } + let component = match dev.component.as_str() { + Some(c) => Cow::Owned(c.to_string()), + None => { + // These are supposed to always be strings. But, if we + // see one that's not a string, fall back to the hex + // representation rather than panicking. + let hex = hex::encode(dev.component.id); slog::warn!( &self.log, - "failed to read details on SP component"; - "sp_component" => %c, - "error" => %error, + "a SP component ID was not a string! this isn't \ + supposed to happen!"; + "component" => %hex, + "device" => ?dev, ); - // TODO(eliza): we should increment a metric here... - continue; + Cow::Owned(hex) } }; - if details.entries.is_empty() { - slog::warn!( - &self.log, - "a component which claimed to have measurement channels \ - had empty details. this seems weird..."; - "sp_component" => %c, - ); - } - for d in details.entries { - let ComponentDetails::Measurement(m) = d else { - // If the component details are switch port details rather - // than measurement channels, ignore it for now. 
- continue; - }; - let name = Cow::Owned(m.name); - let sample = match (m.value, m.kind) { - (Ok(datum), MeasurementKind::Temperature) => { - Sample::new( - target, - &component::Temperature { name, datum }, - ) - } - (Ok(datum), MeasurementKind::Current) => Sample::new( - target, - &component::Current { name, datum }, - ), - (Ok(datum), MeasurementKind::Voltage) => Sample::new( - target, - &component::Voltage { name, datum }, - ), - (Ok(datum), MeasurementKind::Power) => Sample::new( + // TODO(eliza): i hate having to clone all these strings for + // every device on the SP...it would be cool if Oximeter let us + // reference count them... + let target = component::Component { + chassis_type: Cow::Borrowed(match self.spid.typ { + SpType::Sled => "sled", + SpType::Switch => "switch", + SpType::Power => "power", + }), + slot: self.spid.slot as u32, + component, + device: Cow::Owned(dev.device), + model: Cow::Owned(model.clone()), + revision, + serial: Cow::Owned(serial.clone()), + rack_id: self.rack_id, + gateway_id: self.mgs_id, + hubris_archive_id: Cow::Owned(hubris_archive_id.clone()), + }; + match self.components.entry(dev.component) { + // Found a new device! + hash_map::Entry::Vacant(entry) => { + slog::debug!( + &self.log, + "discovered a new component!"; + "component" => ?dev.component, + "device" => ?target.device, + ); + entry.insert(ComponentMetrics { target, - &component::Power { name, datum }, - ), - (Ok(datum), MeasurementKind::InputCurrent) => { - Sample::new( - target, - &component::InputCurrent { name, datum }, - ) - } - (Ok(datum), MeasurementKind::InputVoltage) => { - Sample::new( - target, - &component::InputVoltage { name, datum }, - ) - } - (Ok(datum), MeasurementKind::Speed) => Sample::new( + sensor_errors: HashMap::new(), + poll_errors: HashMap::new(), + }); + } + // We previously had a known device for this thing, but + // the metrics target has changed, so we should reset + // its cumulative metrics. 
+ hash_map::Entry::Occupied(mut entry) + if entry.get().target != target => + { + slog::trace!( + &self.log, + "target has changed, resetting cumulative metrics \ + for component"; + "component" => ?dev.component, + ); + entry.insert(ComponentMetrics { target, - &component::FanSpeed { name, datum }, - ), - (Err(e), kind) => { - let kind = match kind { - MeasurementKind::Temperature => "temperature", - MeasurementKind::Current => "current", - MeasurementKind::Voltage => "voltage", - MeasurementKind::Power => "power", - MeasurementKind::InputCurrent => { - "input_current" - } - MeasurementKind::InputVoltage => { - "input_voltage" - } - MeasurementKind::Speed => "fan_speed", - }; - let error = match e { - MeasurementError::InvalidSensor => { - "invalid_sensor" - } - MeasurementError::NoReading => "no_reading", - MeasurementError::NotPresent => "not_present", - MeasurementError::DeviceError => "device_error", - MeasurementError::DeviceUnavailable => { - "device_unavailable" - } - MeasurementError::DeviceTimeout => { - "device_timeout" - } - MeasurementError::DeviceOff => "device_off", - }; - let datum = sensor_errors - .entry(SensorErrorKey { - name: name.clone(), - kind, - error, - }) - .or_insert(Cumulative::new(0)); - // TODO(eliza): perhaps we should treat this as - // "level-triggered" and only increment the counter - // when the sensor has *changed* to an errored - // state after we have seen at least one good - // measurement from it since the last time the error - // was observed? - datum.increment(); - Sample::new( - target, - &component::SensorErrorCount { - error: Cow::Borrowed(error), - name, - datum: *datum, - sensor_kind: Cow::Borrowed(kind), - }, - ) - } - }?; - samples.push(sample); + sensor_errors: HashMap::new(), + poll_errors: HashMap::new(), + }); + } + + // The target for this device hasn't changed, don't reset it. + hash_map::Entry::Occupied(_) => {} } } - // No sense cluttering the ringbuffer with empty vecs... 
- if samples.is_empty() { - continue; + + self.known_state = Some(current_state); + } + + let mut samples = Vec::with_capacity(self.components.len()); + for (c, metrics) in &mut self.components { + // Metrics samples *should* always be well-formed. If we ever emit a + // messed up one, this is a programmer error, and therefore should + // fail in test, but should probably *not* take down the whole + // management gateway in a real-life rack, especially because it's + // probably going to happen again if we were to get restarted. + const BAD_SAMPLE: &str = + "we emitted a bad metrics sample! this should never happen"; + macro_rules! try_sample { + ($sample:expr) => { + match $sample { + Ok(sample) => samples.push(sample), + + Err(err) => { + slog::error!( + &self.log, + "{BAD_SAMPLE}!"; + "error" => %err, + ); + #[cfg(debug_assertions)] + unreachable!("{BAD_SAMPLE}: {err}"); + } + } + } } - if let Err(_) = self.sample_tx.send(samples) { - slog::info!( + let details = match sp.component_details(*c).await { + Ok(deets) => deets, + // SP seems gone! + Err(CommunicationError::NoSpDiscovered) => { + return Err(CommunicationError::NoSpDiscovered) + } + Err(error) => { + slog::warn!( + &self.log, + "failed to read details on SP component"; + "sp_component" => %c, + "error" => %error, + ); + try_sample!(metrics.poll_error(comms_error_str(error))); + continue; + } + }; + if details.entries.is_empty() { + slog::warn!( &self.log, - "all sample receiver handles have been dropped! presumably we are shutting down..."; + "a component which claimed to have measurement channels \ + had empty details. this seems weird..."; + "sp_component" => %c, ); - return Ok(()); + try_sample!(metrics.poll_error("no_measurement_channels")); + continue; + } + let ComponentMetrics { sensor_errors, target, .. 
} = metrics; + for d in details.entries { + let ComponentDetails::Measurement(m) = d else { + // If the component details are switch port details rather + // than measurement channels, ignore it for now. + continue; + }; + let name = Cow::Owned(m.name); + let sample = match (m.value, m.kind) { + (Ok(datum), MeasurementKind::Temperature) => Sample::new( + target, + &component::Temperature { name, datum }, + ), + (Ok(datum), MeasurementKind::Current) => { + Sample::new(target, &component::Current { name, datum }) + } + (Ok(datum), MeasurementKind::Voltage) => { + Sample::new(target, &component::Voltage { name, datum }) + } + (Ok(datum), MeasurementKind::Power) => { + Sample::new(target, &component::Power { name, datum }) + } + (Ok(datum), MeasurementKind::InputCurrent) => Sample::new( + target, + &component::InputCurrent { name, datum }, + ), + (Ok(datum), MeasurementKind::InputVoltage) => Sample::new( + target, + &component::InputVoltage { name, datum }, + ), + (Ok(datum), MeasurementKind::Speed) => Sample::new( + target, + &component::FanSpeed { name, datum }, + ), + (Err(e), kind) => { + let kind = match kind { + MeasurementKind::Temperature => "temperature", + MeasurementKind::Current => "current", + MeasurementKind::Voltage => "voltage", + MeasurementKind::Power => "power", + MeasurementKind::InputCurrent => "input_current", + MeasurementKind::InputVoltage => "input_voltage", + MeasurementKind::Speed => "fan_speed", + }; + let error = match e { + MeasurementError::InvalidSensor => "invalid_sensor", + MeasurementError::NoReading => "no_reading", + MeasurementError::NotPresent => "not_present", + MeasurementError::DeviceError => "device_error", + MeasurementError::DeviceUnavailable => { + "device_unavailable" + } + MeasurementError::DeviceTimeout => "device_timeout", + MeasurementError::DeviceOff => "device_off", + }; + let datum = sensor_errors + .entry(SensorErrorKey { + name: name.clone(), + kind, + error, + }) + .or_insert(Cumulative::new(0)); + // 
TODO(eliza): perhaps we should treat this as + // "level-triggered" and only increment the counter + // when the sensor has *changed* to an errored + // state after we have seen at least one good + // measurement from it since the last time the error + // was observed? + datum.increment(); + Sample::new( + target, + &component::SensorErrorCount { + error: Cow::Borrowed(error), + name, + datum: *datum, + sensor_kind: Cow::Borrowed(kind), + }, + ) + } + }; + try_sample!(sample); } } + Ok(samples) } } @@ -785,3 +846,62 @@ impl ServerManager { } } } + +impl ComponentMetrics { + fn poll_error( + &mut self, + error_str: &'static str, + ) -> Result { + let datum = self + .poll_errors + .entry(error_str) + .or_insert_with(|| Cumulative::new(0)); + datum.increment(); + Sample::new( + &self.target, + &component::PollErrorCount { + error: Cow::Borrowed(error_str), + datum: *datum, + }, + ) + } +} + +fn comms_error_str(error: CommunicationError) -> &'static str { + // TODO(eliza): a bunch of these probably can't be returned by the specific + // operations we try to do. It could be good to make the methods this code + // calls return a smaller enum of just the errors it might actually + // encounter? Figure this out later. + match error { + CommunicationError::NoSpDiscovered => "no_sp_discovered", + CommunicationError::InterfaceError(_) => "interface", + CommunicationError::ScopeIdChangingFrequently { .. } => { + "scope_id_changing_frequently" + } + CommunicationError::JoinMulticast { .. } => "join_multicast", + CommunicationError::UdpSendTo { .. } => "udp_send_to", + CommunicationError::UdpRecv(_) => "udp_recv", + CommunicationError::Deserialize { .. } => "deserialize", + CommunicationError::ExhaustedNumAttempts(_) => "exhausted_num_attempts", + CommunicationError::BadResponseType { .. } => "bad_response_type", + CommunicationError::SpError { .. } => "sp_error", + CommunicationError::BogusSerialConsoleState { .. 
} => { + "bogus_serial_console_state" + } + CommunicationError::VersionMismatch { .. } => { + "protocol_version_mismatch" + } + CommunicationError::TlvDeserialize { .. } => "tlv_deserialize", + CommunicationError::TlvDecode(_) => "tlv_decode", + CommunicationError::TlvPagination { .. } => "tlv_pagination", + CommunicationError::IpccKeyLookupValueTooLarge => { + "ipcc_key_lookup_value_too_large" + } + CommunicationError::UnexpectedTrailingData(_) => { + "unexpected_trailing_data" + } + CommunicationError::BadTrailingDataSize { .. } => { + "bad_trailing_data_size" + } + } +} From 7f5c1e266498dc14334cfeea7f038017c5214d55 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sun, 18 Aug 2024 11:29:24 -0700 Subject: [PATCH 32/77] move metrics stuff to the config file --- Cargo.lock | 1 + dev-tools/mgs-dev/Cargo.toml | 1 + dev-tools/mgs-dev/src/main.rs | 32 +- gateway-test-utils/configs/config.test.toml | 11 + gateway-test-utils/src/setup.rs | 23 +- gateway/examples/config.toml | 9 + gateway/src/bin/mgs.rs | 6 +- gateway/src/config.rs | 3 + gateway/src/lib.rs | 10 +- gateway/src/metrics.rs | 335 ++++++++++++++------ smf/mgs-sim/config.toml | 9 + smf/mgs/config.toml | 9 + 12 files changed, 303 insertions(+), 146 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fbd3e406ff..9aa93be005 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4720,6 +4720,7 @@ dependencies = [ "gateway-messages", "gateway-test-utils", "libc", + "omicron-gateway", "omicron-workspace-hack", "signal-hook-tokio", "tokio", diff --git a/dev-tools/mgs-dev/Cargo.toml b/dev-tools/mgs-dev/Cargo.toml index d5f61f4b96..70382c0469 100644 --- a/dev-tools/mgs-dev/Cargo.toml +++ b/dev-tools/mgs-dev/Cargo.toml @@ -14,6 +14,7 @@ futures.workspace = true gateway-messages.workspace = true gateway-test-utils.workspace = true libc.workspace = true +omicron-gateway.workspace = true omicron-workspace-hack.workspace = true signal-hook-tokio.workspace = true tokio.workspace = true diff --git a/dev-tools/mgs-dev/src/main.rs 
b/dev-tools/mgs-dev/src/main.rs index 76fe3f0750..9c280337b8 100644 --- a/dev-tools/mgs-dev/src/main.rs +++ b/dev-tools/mgs-dev/src/main.rs @@ -8,6 +8,7 @@ use clap::{Args, Parser, Subcommand}; use futures::StreamExt; use libc::SIGINT; use signal_hook_tokio::Signals; +use std::net::SocketAddr; #[tokio::main] async fn main() -> anyhow::Result<()> { @@ -37,8 +38,10 @@ enum MgsDevCmd { #[derive(Clone, Debug, Args)] struct MgsRunArgs { - #[clap(flatten)] - mgs_metrics_args: gateway_test_utils::setup::MgsMetricsArgs, + /// Override the address of the Nexus instance to use when registering the + /// Oximeter producer. + #[clap(long)] + nexus_address: Option, } impl MgsRunArgs { @@ -49,13 +52,24 @@ impl MgsRunArgs { let mut signal_stream = signals.fuse(); println!("mgs-dev: setting up MGS ... "); - let gwtestctx = - gateway_test_utils::setup::test_setup_with_metrics_args( - "mgs-dev", - gateway_messages::SpPort::One, - self.mgs_metrics_args, - ) - .await; + let (mut mgs_config, sp_sim_config) = + gateway_test_utils::setup::load_test_config(); + if let Some(addr) = self.nexus_address { + mgs_config.metrics.dev = + Some(omicron_gateway::metrics::DevConfig { + bind_loopback: true, + nexus_address: Some(addr), + }); + } + + let gwtestctx = gateway_test_utils::setup::test_setup_with_config( + "mgs-dev", + gateway_messages::SpPort::One, + mgs_config, + &sp_sim_config, + None, + ) + .await; println!("mgs-dev: MGS is running."); let addr = gwtestctx.client.bind_address; diff --git a/gateway-test-utils/configs/config.test.toml b/gateway-test-utils/configs/config.test.toml index 79975f4611..82ac60da7d 100644 --- a/gateway-test-utils/configs/config.test.toml +++ b/gateway-test-utils/configs/config.test.toml @@ -88,6 +88,17 @@ addr = "[::1]:0" ignition-target = 3 location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } +# +# Configuration for SP sensor metrics polling +# +[metrics] +# Bryan wants to try polling SP sensors at 1Hz. 
+sp_poll_interval_ms = 1000 +# Tell Oximeter to collect our metrics every 10 seconds. +oximeter_collection_interval_secs = 10 +# Allow binding the metrics server on localhost. +dev = { bind_loopback = true } + # # NOTE: for the test suite, if mode = "file", the file path MUST be the sentinel # string "UNUSED". The actual path will be generated by the test suite for each diff --git a/gateway-test-utils/src/setup.rs b/gateway-test-utils/src/setup.rs index a66d26d046..46bc55805a 100644 --- a/gateway-test-utils/src/setup.rs +++ b/gateway-test-utils/src/setup.rs @@ -8,7 +8,6 @@ use camino::Utf8Path; use dropshot::test_util::ClientTestContext; use dropshot::test_util::LogContext; use gateway_messages::SpPort; -pub use omicron_gateway::metrics::Args as MgsMetricsArgs; use omicron_gateway::MgsArguments; use omicron_gateway::SpType; use omicron_gateway::SwitchPortConfig; @@ -70,24 +69,6 @@ pub async fn test_setup( server_config, &sp_sim_config, None, - Default::default(), - ) - .await -} - -pub async fn test_setup_with_metrics_args( - test_name: &str, - sp_port: SpPort, - metrics_args: MgsMetricsArgs, -) -> GatewayTestContext { - let (server_config, sp_sim_config) = load_test_config(); - test_setup_with_config( - test_name, - sp_port, - server_config, - &sp_sim_config, - None, - metrics_args, ) .await } @@ -118,7 +99,6 @@ pub async fn test_setup_with_config( mut server_config: omicron_gateway::Config, sp_sim_config: &sp_sim::Config, listen_addr: Option, - metrics_args: MgsMetricsArgs, ) -> GatewayTestContext { // Can't be `const` because `SocketAddrV6::new()` isn't const yet let localhost_port_0 = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0); @@ -164,8 +144,7 @@ pub async fn test_setup_with_config( // Start gateway server let rack_id = Some(Uuid::parse_str(RACK_UUID).unwrap()); - let args = - MgsArguments { id: Uuid::new_v4(), addresses, rack_id, metrics_args }; + let args = MgsArguments { id: Uuid::new_v4(), addresses, rack_id }; let server = 
omicron_gateway::Server::start( server_config.clone(), args, diff --git a/gateway/examples/config.toml b/gateway/examples/config.toml index d29d9508b9..71048ff487 100644 --- a/gateway/examples/config.toml +++ b/gateway/examples/config.toml @@ -71,6 +71,15 @@ addr = "[::1]:33320" ignition-target = 3 location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } +# +# Configuration for SP sensor metrics polling +# +[metrics] +# Bryan wants to try polling SP sensors at 1Hz. +sp_poll_interval_ms = 1000 +# Tell Oximeter to collect our metrics every 10 seconds. +oximeter_collection_interval_secs = 10 + [log] # Show log messages of this level and more severe level = "debug" diff --git a/gateway/src/bin/mgs.rs b/gateway/src/bin/mgs.rs index 505f41f7a5..91290bffae 100644 --- a/gateway/src/bin/mgs.rs +++ b/gateway/src/bin/mgs.rs @@ -47,9 +47,6 @@ enum Args { required_unless_present = "id_and_address_from_smf" )] address: Option, - - #[clap(flatten)] - metrics_args: omicron_gateway::metrics::Args, }, } @@ -76,7 +73,6 @@ async fn do_run() -> Result<(), CmdError> { id_and_address_from_smf, id, address, - metrics_args, } => { let config = Config::from_file(&config_file_path) .map_err(anyhow::Error::new) @@ -96,7 +92,7 @@ async fn do_run() -> Result<(), CmdError> { // `id_and_address_from_smf` is false, so we can safely unwrap. (id.unwrap(), vec![address.unwrap()], rack_id) }; - let args = MgsArguments { id, addresses, rack_id, metrics_args }; + let args = MgsArguments { id, addresses, rack_id }; let mut server = start_server(config, args) .await .map_err(|e| CmdError::Failure(anyhow!(e)))?; diff --git a/gateway/src/config.rs b/gateway/src/config.rs index afdb046881..ba9818ff2c 100644 --- a/gateway/src/config.rs +++ b/gateway/src/config.rs @@ -6,6 +6,7 @@ //! 
configuration use crate::management_switch::SwitchConfig; +use crate::metrics::MetricsConfig; use camino::Utf8Path; use camino::Utf8PathBuf; use dropshot::ConfigLogging; @@ -25,6 +26,8 @@ pub struct Config { pub switch: SwitchConfig, /// Server-wide logging configuration. pub log: ConfigLogging, + /// Configuration for SP sensor metrics. + pub metrics: MetricsConfig, } impl Config { diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs index 8e09b09729..0566c79ea2 100644 --- a/gateway/src/lib.rs +++ b/gateway/src/lib.rs @@ -49,7 +49,6 @@ pub struct MgsArguments { pub id: Uuid, pub addresses: Vec, pub rack_id: Option, - pub metrics_args: metrics::Args, } type HttpServer = dropshot::HttpServer>; @@ -155,10 +154,11 @@ impl Server { let mut http_servers = HashMap::with_capacity(args.addresses.len()); let all_servers_shutdown = FuturesUnordered::new(); - let metrics = metrics::Metrics::new(&log, &args, apictx.clone()) - .map_err(|err| { - format!("failed to initialize metrics subsystem: {err}") - })?; + let metrics = + metrics::Metrics::new(&log, &args, config.metrics, apictx.clone()) + .map_err(|err| { + format!("failed to initialize metrics subsystem: {err}") + })?; for addr in args.addresses { start_dropshot_server( diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 14ed2782d2..b1fdffb34c 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -17,6 +17,7 @@ use gateway_sp_comms::SpComponent; use gateway_sp_comms::VersionedSpState; use omicron_common::api::internal::nexus::ProducerEndpoint; use omicron_common::api::internal::nexus::ProducerKind; +use omicron_common::backoff; use oximeter::types::Cumulative; use oximeter::types::ProducerRegistry; use oximeter::types::Sample; @@ -37,32 +38,48 @@ use uuid::Uuid; oximeter::use_timeseries!("sensor-measurement.toml"); -/// Handle to the metrics task. +/// Handle to the metrics tasks. 
pub struct Metrics { addrs_tx: watch::Sender>, rack_id_tx: Option>, - manager: JoinHandle>, - poller: JoinHandle>, + server: JoinHandle>, + pollers: JoinHandle>, } -/// CLI arguments for configuring metrics. -#[derive(Copy, Clone, Debug, Default, clap::Parser)] -#[clap(next_help_heading = "SP Metrics Development Configuration")] -pub struct Args { +/// Configuration for metrics. +#[derive(Clone, Debug, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +#[serde(deny_unknown_fields)] +pub struct MetricsConfig { + /// Collection interval to request from Oximeter, in seconds. + /// + /// This is the frequency with which Oximeter will collect samples the + /// metrics producer endpoint, *not* the frequency with which sensor + /// measurements are polled from service processors. + oximeter_collection_interval_secs: usize, + + /// The interval at which service processors are polled for sensor readings, + /// in milliseconds + sp_poll_interval_ms: usize, + + /// Configuration settings for testing and development use. + pub dev: Option, +} + +#[derive(Clone, Debug, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +#[serde(deny_unknown_fields)] +pub struct DevConfig { /// Override the Nexus address used to register the SP metrics Oximeter /// producer. This is intended for use in development and testing. /// /// If this argument is not present, Nexus is discovered through DNS. - #[clap(long = "dev-nexus-address")] - nexus_address: Option, + pub nexus_address: Option, /// Allow the metrics producer endpoint to bind on loopback. /// /// This should be disabled in production, as Nexus will not be able to /// reach the loopback interface, but is necessary for local development and /// test purposes. - #[clap(long = "dev-metrics-bind-loopback")] - bind_loopback: bool, + pub bind_loopback: bool, } /// Manages SP pollers, making sure that every SP has a poller task. 
@@ -75,6 +92,7 @@ struct PollerManager { /// The manager doesn't actually produce samples, but it needs to be able to /// clone a sender for every poller task it spawns. sample_tx: broadcast::Sender>, + poll_interval: Duration, } /// Polls sensor readings from an individual SP. @@ -92,7 +110,8 @@ struct ComponentMetrics { target: component::Component, /// Counts of errors reported by sensors on this component. sensor_errors: HashMap>, - /// Counts of errors that occurred whilst polling the d + /// Counts of errors that occurred whilst polling the SP for measurements + /// from this component. poll_errors: HashMap<&'static str, Cumulative>, } @@ -108,19 +127,19 @@ struct ServerManager { log: slog::Logger, addrs: watch::Receiver>, registry: ProducerRegistry, - args: Args, + cfg: MetricsConfig, } #[derive(Debug)] -struct Producer(broadcast::Receiver>); - -/// The interval on which we ask `oximeter` to poll us for metric data. -// N.B.: I picked this pretty arbitrarily... -const METRIC_COLLECTION_INTERVAL: Duration = Duration::from_secs(10); - -/// The interval at which we poll sensor readings from SPs. Bryan wants to try -/// 1Hz and see if the SP can handle it. -const POLL_INTERVAL: Duration = Duration::from_secs(1); +struct Producer { + /// Receiver for samples produced by SP pollers. + sample_rx: broadcast::Receiver>, + /// Logging context. + /// + /// We stick this on the producer because we would like to be able to log + /// when stale samples are dropped. + log: slog::Logger, +} /// The maximum Dropshot request size for the metrics server. const METRIC_REQUEST_MAX_SIZE: usize = 10 * 1024 * 1024; @@ -143,34 +162,14 @@ const NORMAL_NUMBER_OF_SPS: usize = + 2 // two power shelves, someday. ; -/// Number of sample vectors from individual SPs to buffer. -const SAMPLE_CHANNEL_CAPACITY: usize = { - // Roughly how many times will we poll SPs for each metrics collection - // interval? 
- let polls_per_metrics_interval = (METRIC_COLLECTION_INTERVAL.as_secs() - / POLL_INTERVAL.as_secs()) - as usize; - // How many sample collection intervals do we want to allow to elapse before - // we start putting stuff on the floor? - // - // Let's say 16. Chosen totally arbitrarily but seems reasonable-ish. - let sloppiness = 16; - let capacity = - NORMAL_NUMBER_OF_SPS * polls_per_metrics_interval * sloppiness; - // Finally, the buffer capacity will probably be allocated in a power of two - // anyway, so let's make sure our thing is a power of two so we don't waste - // the allocation we're gonna get anyway. - capacity.next_power_of_two() -}; - impl Metrics { pub fn new( log: &slog::Logger, args: &MgsArguments, + cfg: MetricsConfig, apictx: Arc, ) -> anyhow::Result { - let &MgsArguments { id, rack_id, ref addresses, metrics_args } = args; - let registry = ProducerRegistry::with_id(id); + let &MgsArguments { id, rack_id, ref addresses } = args; // Create a channel for the SP poller tasks to send samples to the // Oximeter producer endpoint. @@ -183,12 +182,9 @@ impl Metrics { // is what we want, as we would prefer a full buffer to result in // clobbering the oldest measurements, rather than leaving the newest // ones on the floor. 
+ let max_buffered_sample_chunks = cfg.sample_channel_capacity(); let (sample_tx, sample_rx) = - broadcast::channel(SAMPLE_CHANNEL_CAPACITY); - - registry - .register_producer(Producer(sample_rx)) - .context("failed to register metrics producer")?; + broadcast::channel(max_buffered_sample_chunks); // Using a channel for this is, admittedly, a bit of an end-run around // the `OnceLock` on the `ServerContext` that *also* stores the rack ID, @@ -205,29 +201,44 @@ impl Metrics { } else { Some(rack_id_tx) }; - let poller = tokio::spawn( - PollerManager { - sample_tx, - apictx, - tasks: tokio::task::JoinSet::new(), - log: log.new(slog::o!("component" => "sensor-poller")), - mgs_id: id, - } - .run(rack_id_rx), - ); + let pollers = { + let log = log.new(slog::o!("component" => "sensor-poller")); + let poll_interval = + Duration::from_millis(cfg.sp_poll_interval_ms as u64); + slog::info!( + &log, + "SP sensor metrics configured"; + "poll_interval" => ?poll_interval, + "max_buffered_sample_chunks" => max_buffered_sample_chunks, + ); + + tokio::spawn( + PollerManager { + sample_tx, + apictx, + poll_interval, + tasks: tokio::task::JoinSet::new(), + log, + mgs_id: id, + } + .run(rack_id_rx), + ) + }; let (addrs_tx, addrs_rx) = tokio::sync::watch::channel(addresses.clone()); - let manager = tokio::spawn( - ServerManager { - log: log.new(slog::o!("component" => "producer-server")), - addrs: addrs_rx, - registry, - args: metrics_args, - } - .run(), - ); - Ok(Self { addrs_tx, rack_id_tx, manager, poller }) + let server = { + let log = log.new(slog::o!("component" => "producer-server")); + let registry = ProducerRegistry::with_id(id); + registry + .register_producer(Producer { sample_rx, log: log.clone() }) + .context("failed to register metrics producer")?; + + tokio::spawn( + ServerManager { log, addrs: addrs_rx, registry, cfg }.run(), + ) + }; + Ok(Self { addrs_tx, rack_id_tx, server, pollers }) } pub fn set_rack_id(&mut self, rack_id: Uuid) { @@ -261,8 +272,40 @@ impl Metrics { 
impl Drop for Metrics { fn drop(&mut self) { // Clean up our children on drop. - self.manager.abort(); - self.poller.abort(); + self.server.abort(); + self.pollers.abort(); + } +} + +impl MetricsConfig { + fn oximeter_collection_interval(&self) -> Duration { + Duration::from_secs(self.oximeter_collection_interval_secs as u64) + } + + /// Returns the number of sample chunks from individual SPs to buffer. + fn sample_channel_capacity(&self) -> usize { + // Roughly how many times will we poll SPs for each metrics collection + // interval? + let polls_per_metrics_interval = { + let collection_interval_ms: usize = self + .oximeter_collection_interval() + .as_millis() + .try_into() + .expect("your oximeter collection interval is way too big..."); + collection_interval_ms / self.sp_poll_interval_ms + }; + + // How many sample collection intervals do we want to allow to elapse before + // we start putting stuff on the floor? + // + // Let's say 16. Chosen totally arbitrarily but seems reasonable-ish. + let sloppiness = 16; + let capacity = + NORMAL_NUMBER_OF_SPS * polls_per_metrics_interval * sloppiness; + // Finally, the buffer capacity will probably be allocated in a power of two + // anyway, so let's make sure our thing is a power of two so we don't waste + // the allocation we're gonna get anyway. + capacity.next_power_of_two() } } @@ -278,19 +321,54 @@ impl oximeter::Producer for Producer { // `resubscribe` function creates a receiver at the current *tail* of // the ringbuffer, so it won't see any samples produced *before* now. // Which is the opposite of what we want! - let mut samples = Vec::with_capacity(self.0.len()); + let mut samples = Vec::with_capacity(self.sample_rx.len()); + // Because we recieve the individual samples in a `Vec` of all samples + // produced by a poller, let's also sum the length of each of those + // `Vec`s here, so we can log it later. 
+ let mut total_samples = 0; + // Also, track whether any sample chunks were dropped off the end of the + // ring buffer. + let mut dropped_chunks = 0; + use broadcast::error::TryRecvError; loop { - match self.0.try_recv() { - Ok(sample_chunk) => samples.push(sample_chunk), + match self.sample_rx.try_recv() { + Ok(sample_chunk) => { + total_samples += sample_chunk.len(); + samples.push(sample_chunk) + } // This error indicates that an old ringbuffer entry was // overwritten. That's fine, just get the next one. - Err(TryRecvError::Lagged(_)) => continue, + Err(TryRecvError::Lagged(dropped)) => { + dropped_chunks += dropped; + } // We've drained all currently available samples! We're done here! - Err(TryRecvError::Empty) | Err(TryRecvError::Closed) => break, + Err(TryRecvError::Empty) => break, + // This should only happen when shutting down. + Err(TryRecvError::Closed) => { + slog::debug!(&self.log, "sample producer channel closed"); + break; + } } } + if dropped_chunks > 0 { + slog::info!( + &self.log, + "produced metric samples. some old sample chunks were dropped!"; + "samples" => total_samples, + "sample_chunks" => samples.len(), + "dropped_chunks" => dropped_chunks, + ); + } else { + slog::debug!( + &self.log, + "produced metric samples"; + "samples" => total_samples, + "sample_chunks" => samples.len(), + ); + } + // There you go, that's all I've got. Ok(Box::new(samples.into_iter().flatten())) } @@ -308,35 +386,71 @@ impl PollerManager { "rack ID sender has gone away...we must be shutting down", )?; - let mut poll_interval = tokio::time::interval(POLL_INTERVAL); let mut known_sps: HashMap = HashMap::with_capacity(NORMAL_NUMBER_OF_SPS); // Wait for SP discovery to complete, if it hasn't already. // TODO(eliza): presently, we busy-poll here. 
It would be nicer to // replace the `OnceLock` in `ManagementSwitch` // with a `tokio::sync::watch` - while !switch.is_discovery_complete() { - poll_interval.tick().await; - } + backoff::retry_notify_ext( + backoff::retry_policy_local(), + || async { + if switch.is_discovery_complete() { + Ok(()) + } else { + Err(backoff::BackoffError::transient(())) + } + }, + |_, _, elapsed| { + let secs = elapsed.as_secs(); + if secs < 30 { + slog::debug!( + &self.log, + "waiting for SP discovery to complete..."; + "elapsed" => ?elapsed, + ); + } else if secs < 180 { + slog::info!( + &self.log, + "still waiting for SP discovery to complete..."; + "elapsed" => ?elapsed, + ) + } else { + slog::warn!( + &self.log, + "we have been waiting for SP discovery to complete \ + for a pretty long time!"; + "elapsed" => ?elapsed, + ) + } + }, + ) + .await + .expect("we should never return a fatal error here"); slog::info!( &self.log, - "SP discovery complete! starting to poll sensors..." + "starting to polling SP sensor data every {:?}", self.poll_interval; ); loop { - let sps = match switch.all_sps() { - Ok(sps) => sps, - Err(e) => { + let sps = backoff::retry_notify_ext( + backoff::retry_policy_internal_service(), + || async { + switch.all_sps().map_err(backoff::BackoffError::transient) + }, + |error, attempts, elapsed| { slog::warn!( &self.log, - "failed to enumerate service processors! will try again in a bit"; - "error" => %e, - ); - poll_interval.tick().await; - continue; - } - }; + "failed to list SPs! we'll try again in a little bit."; + "error" => error, + "elapsed" => ?elapsed, + "attempts" => attempts, + ) + }, + ) + .await + .expect("we never return a permanent error here"); for (spid, _) in sps { // Do we know about this li'l guy already? 
@@ -376,7 +490,7 @@ impl PollerManager { }; let poller_handle = self .tasks - .spawn(poller.run(POLL_INTERVAL, self.apictx.clone())); + .spawn(poller.run(self.poll_interval, self.apictx.clone())); let _prev_poller = known_sps.insert(spid, poller_handle); debug_assert!( _prev_poller.map(|p| p.is_finished()).unwrap_or(true), @@ -756,14 +870,21 @@ impl SpPoller { impl ServerManager { async fn run(mut self) -> anyhow::Result<()> { - if self.args.nexus_address.is_some() || self.args.bind_loopback { - slog::warn!( - &self.log, - "using development metrics configuration overrides!"; - "nexus_address" => ?self.args.nexus_address, - "bind_loopback" => self.args.bind_loopback, - ); - } + let (registration_address, bind_loopback) = + if let Some(ref dev) = self.cfg.dev { + slog::warn!( + &self.log, + "using development metrics configuration overrides!"; + "nexus_address" => ?dev.nexus_address, + "bind_loopback" => dev.bind_loopback, + ); + (dev.nexus_address, dev.bind_loopback) + } else { + (None, false) + }; + let interval = self.cfg.oximeter_collection_interval(); + let id = self.registry.producer_id(); + let mut current_server: Option = None; loop { let current_ip = current_server.as_ref().map(|s| s.address().ip()); @@ -771,7 +892,7 @@ impl ServerManager { for addr in self.addrs.borrow_and_update().iter() { let &ip = addr.ip(); // Don't bind the metrics endpoint on ::1 - if ip.is_loopback() && !self.args.bind_loopback { + if ip.is_loopback() && !bind_loopback { continue; } // If our current address is contained in the new addresses, @@ -785,25 +906,27 @@ impl ServerManager { } if let Some(ip) = new_ip { - slog::info!( + slog::debug!( &self.log, "rebinding producer server on new IP"; "new_ip" => ?ip, "current_ip" => ?current_ip, + "collection_interval" => ?interval, + "producer_id" => ?id, ); let server = { // Listen on any available socket, using the provided underlay IP. 
let address = SocketAddr::new(ip.into(), 0); let server_info = ProducerEndpoint { - id: self.registry.producer_id(), + id, kind: ProducerKind::ManagementGateway, address, - interval: METRIC_COLLECTION_INTERVAL, + interval, }; let config = oximeter_producer::Config { server_info, - registration_address: self.args.nexus_address, + registration_address, request_body_max_bytes: METRIC_REQUEST_MAX_SIZE, log: oximeter_producer::LogConfig::Logger( self.log.clone(), @@ -819,6 +942,8 @@ impl ServerManager { slog::info!( &self.log, "bound metrics producer server"; + "collection_interval" => ?interval, + "producer_id" => ?id, "address" => %server.address(), ); diff --git a/smf/mgs-sim/config.toml b/smf/mgs-sim/config.toml index 511524137a..6d323ce04b 100644 --- a/smf/mgs-sim/config.toml +++ b/smf/mgs-sim/config.toml @@ -71,6 +71,15 @@ addr = "[::1]:33320" ignition-target = 3 location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } +# +# Configuration for SP sensor metrics polling +# +[metrics] +# Bryan wants to try polling SP sensors at 1Hz. +sp_poll_interval_ms = 1000 +# Tell Oximeter to collect our metrics every 10 seconds. +oximeter_collection_interval_secs = 10 + [log] # Show log messages of this level and more severe level = "debug" diff --git a/smf/mgs/config.toml b/smf/mgs/config.toml index fa1232b1b2..a7e2e27aa7 100644 --- a/smf/mgs/config.toml +++ b/smf/mgs/config.toml @@ -286,6 +286,15 @@ interface = "gimlet31" ignition-target = 18 location = { switch0 = ["sled", 31], switch1 = ["sled", 31] } +# +# Configuration for SP sensor metrics polling +# +[metrics] +# Bryan wants to try polling SP sensors at 1Hz. +sp_poll_interval_ms = 1000 +# Tell Oximeter to collect our metrics every 10 seconds. 
+oximeter_collection_interval_secs = 10 + [log] level = "info" mode = "file" From 8b25a894c3314b01701630d3a3bec25deb95462e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 19 Aug 2024 09:46:52 -0700 Subject: [PATCH 33/77] whoops fix nexus-test --- nexus/test-utils/src/lib.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 90114fb6ca..acee46ce10 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -510,8 +510,6 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { mgs_config, &sp_sim_config, mgs_addr, - // TODO(eliza): pass the nexus address here... - Default::default(), ) .await; self.gateway.insert(switch_location, gateway); From 52b8d7463b376bc4c17692688b099887bd60d3ce Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 19 Aug 2024 15:13:42 -0700 Subject: [PATCH 34/77] update omdb output --- dev-tools/omdb/tests/successes.out | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 2a9c9c8051..b3f5eb916f 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -141,9 +141,16 @@ SP DETAILS: type "Sled" slot 0 COMPONENTS - NAME DESCRIPTION DEVICE PRESENCE SERIAL - sp3-host-cpu FAKE host cpu sp3-host-cpu Present None - dev-0 FAKE temperature sensor fake-tmp-sensor Failed None + NAME DESCRIPTION DEVICE PRESENCE SERIAL + sp3-host-cpu FAKE host cpu sp3-host-cpu Present None + dev-0 FAKE temperature sensor fake-tmp-sensor Failed None + dev-1 FAKE temperature sensor tmp117 Present None + dev-2 FAKE Southeast temperature sensor tmp117 Present None + dev-6 FAKE U.2 Sharkfin A VPD at24csw080 Present None + dev-7 FAKE U.2 Sharkfin A hot swap controller max5970 Present None + dev-8 FAKE U.2 A NVMe Basic Management Command nvme_bmc Present None + dev-39 FAKE T6 temperature sensor tmp451 Present None + dev-53 FAKE 
Fan controller max31790 Present None CABOOSES: none found @@ -167,8 +174,17 @@ SP DETAILS: type "Sled" slot 1 COMPONENTS - NAME DESCRIPTION DEVICE PRESENCE SERIAL - sp3-host-cpu FAKE host cpu sp3-host-cpu Present None + NAME DESCRIPTION DEVICE PRESENCE SERIAL + sp3-host-cpu FAKE host cpu sp3-host-cpu Present None + sp3-host-cpu FAKE host cpu sp3-host-cpu Present None + dev-0 FAKE temperature sensor tmp117 Present None + dev-1 FAKE temperature sensor tmp117 Present None + dev-2 FAKE Southeast temperature sensor tmp117 Present None + dev-6 FAKE U.2 Sharkfin A VPD at24csw080 Present None + dev-7 FAKE U.2 Sharkfin A hot swap controller max5970 Present None + dev-8 FAKE U.2 A NVMe Basic Management Command nvme_bmc Present None + dev-39 FAKE T6 temperature sensor tmp451 Present None + dev-53 FAKE Fan controller max31790 Present None CABOOSES: none found From 21597a3308c1dcc0a4d40aa3cfeeb835be17eac8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 19 Aug 2024 15:15:01 -0700 Subject: [PATCH 35/77] add more sim components to test --- .../tests/integration_tests/component_list.rs | 57 ++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/gateway/tests/integration_tests/component_list.rs b/gateway/tests/integration_tests/component_list.rs index ec876c0783..0d0396eb37 100644 --- a/gateway/tests/integration_tests/component_list.rs +++ b/gateway/tests/integration_tests/component_list.rs @@ -57,7 +57,62 @@ async fn component_list() { capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS .bits(), presence: SpComponentPresence::Failed, - } + }, + SpComponentInfo { + component: "dev-1".to_string(), + device: "tmp117".to_string(), + serial_number: None, + description: "FAKE Southeast temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-6".to_string(), + device: "at24csw080".to_string(), + serial_number: None, + 
description: "FAKE U.2 Sharkfin A VPD".to_string(), + capabilities: 0, + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-7".to_string(), + device: "max5970".to_string(), + serial_number: None, + description: "FAKE U.2 Sharkfin A hot swap controller" + .to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-8".to_string(), + device: "nvme_bmc".to_string(), + serial_number: None, + description: "FAKE U.2 A NVMe Basic Management Command" + .to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-39".to_string(), + device: "tmp451".to_string(), + serial_number: None, + description: "FAKE T6 temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-53".to_string(), + device: "max31790".to_string(), + serial_number: None, + description: "FAKE Fan controller".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, ] ); From 2869fb06a860973fb784477bf5a9b3f93404a8cd Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 19 Aug 2024 15:21:49 -0700 Subject: [PATCH 36/77] lol oops --- gateway/tests/integration_tests/component_list.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/gateway/tests/integration_tests/component_list.rs b/gateway/tests/integration_tests/component_list.rs index 0d0396eb37..851e7d8a37 100644 --- a/gateway/tests/integration_tests/component_list.rs +++ b/gateway/tests/integration_tests/component_list.rs @@ -62,6 +62,15 @@ async fn component_list() { component: "dev-1".to_string(), device: "tmp117".to_string(), serial_number: None, + description: "FAKE temperature 
sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-2".to_string(), + device: "tmp117".to_string(), + serial_number: None, description: "FAKE Southeast temperature sensor".to_string(), capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS .bits(), From 5b56dbd4b0ec7b893e0cdf3d00fe10acec9475f3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 19 Aug 2024 15:33:14 -0700 Subject: [PATCH 37/77] blergh --- .../configs/sp_sim_config.test.toml | 8 -- .../tests/integration_tests/component_list.rs | 91 +++++++++++++++++-- 2 files changed, 83 insertions(+), 16 deletions(-) diff --git a/gateway-test-utils/configs/sp_sim_config.test.toml b/gateway-test-utils/configs/sp_sim_config.test.toml index 6be27c28e6..6abe513de3 100644 --- a/gateway-test-utils/configs/sp_sim_config.test.toml +++ b/gateway-test-utils/configs/sp_sim_config.test.toml @@ -155,14 +155,6 @@ presence = "Present" serial_console = "[::1]:0" -[[simulated_sps.gimlet.components]] -id = "sp3-host-cpu" -device = "sp3-host-cpu" -description = "FAKE host cpu" -capabilities = 0 -presence = "Present" -# serial_console = "[::1]:0" - [[simulated_sps.gimlet.components]] id = "dev-0" device = "tmp117" diff --git a/gateway/tests/integration_tests/component_list.rs b/gateway/tests/integration_tests/component_list.rs index 851e7d8a37..993dcc9e93 100644 --- a/gateway/tests/integration_tests/component_list.rs +++ b/gateway/tests/integration_tests/component_list.rs @@ -131,14 +131,89 @@ async fn component_list() { assert_eq!( resp.components, - &[SpComponentInfo { - component: SpComponent::SP3_HOST_CPU.const_as_str().to_string(), - device: SpComponent::SP3_HOST_CPU.const_as_str().to_string(), - serial_number: None, - description: "FAKE host cpu".to_string(), - capabilities: 0, - presence: SpComponentPresence::Present, - },] + &[ + SpComponentInfo { + component: 
SpComponent::SP3_HOST_CPU.const_as_str().to_string(), + device: SpComponent::SP3_HOST_CPU.const_as_str().to_string(), + serial_number: None, + description: "FAKE host cpu".to_string(), + capabilities: 0, + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-0".to_string(), + device: "tmp117".to_string(), + serial_number: None, + description: "FAKE temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-1".to_string(), + device: "tmp117".to_string(), + serial_number: None, + description: "FAKE temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-2".to_string(), + device: "tmp117".to_string(), + serial_number: None, + description: "FAKE Southeast temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-6".to_string(), + device: "at24csw080".to_string(), + serial_number: None, + description: "FAKE U.2 Sharkfin A VPD".to_string(), + capabilities: 0, + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-7".to_string(), + device: "max5970".to_string(), + serial_number: None, + description: "FAKE U.2 Sharkfin A hot swap controller" + .to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-8".to_string(), + device: "nvme_bmc".to_string(), + serial_number: None, + description: "FAKE U.2 A NVMe Basic Management Command" + .to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: 
"dev-39".to_string(), + device: "tmp451".to_string(), + serial_number: None, + description: "FAKE T6 temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-53".to_string(), + device: "max31790".to_string(), + serial_number: None, + description: "FAKE Fan controller".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + ] ); // Get the component list for switch 0. From 8fa9ad83722bf118efef95080f62b9a24a4681f5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 19 Aug 2024 17:12:17 -0700 Subject: [PATCH 38/77] don't churn restarting pollers for non-present SPs --- gateway/src/metrics.rs | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index b1fdffb34c..ee30e8e16f 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -540,10 +540,15 @@ impl SpPoller { poll_interval: Duration, apictx: Arc, ) -> Result<(), SpCommsError> { + /// How long to wait when a SP is not present. + /// + /// I picked this arbitrarily. It would be much nicer if we could + /// instead recieve a notification when the discovery state changes... + const NO_SP_BACKOFF: Duration = Duration::from_secs(30); let mut interval = tokio::time::interval(poll_interval); let switch = &apictx.mgmt_switch; let sp = switch.sp(self.spid)?; - + let mut not_present_message_logged = false; loop { interval.tick().await; slog::trace!(&self.log, "interval elapsed, polling SP..."); @@ -565,12 +570,23 @@ impl SpPoller { return Ok(()); } } + // Err(CommunicationError::NoSpDiscovered) => { - slog::info!( - &self.log, - "our SP seems to no longer be present; giving up." 
- ); - return Ok(()); + if !not_present_message_logged { + not_present_message_logged = true; + slog::info!( + &self.log, + "our SP seems to not be present, waiting to see if it \ + appears..." + ); + } else { + slog::debug!( + &self.log, + "SP is still not there, checking again in a little bit." + ) + } + + tokio::time::sleep(NO_SP_BACKOFF).await; } Err(error) => { slog::warn!( From b01d0fb1f6f1c109dca27f20923e0179b97aed42 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 20 Aug 2024 08:37:42 -0700 Subject: [PATCH 39/77] use `sp_addr_watch` to wait for SPs to appear --- gateway/src/metrics.rs | 47 ++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index ee30e8e16f..c5f8592065 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -540,15 +540,9 @@ impl SpPoller { poll_interval: Duration, apictx: Arc, ) -> Result<(), SpCommsError> { - /// How long to wait when a SP is not present. - /// - /// I picked this arbitrarily. It would be much nicer if we could - /// instead recieve a notification when the discovery state changes... - const NO_SP_BACKOFF: Duration = Duration::from_secs(30); let mut interval = tokio::time::interval(poll_interval); let switch = &apictx.mgmt_switch; let sp = switch.sp(self.spid)?; - let mut not_present_message_logged = false; loop { interval.tick().await; slog::trace!(&self.log, "interval elapsed, polling SP..."); @@ -570,23 +564,40 @@ impl SpPoller { return Ok(()); } } - // + // No SP is currently present for this ID. This may change in + // the future: a cubby that is not populated at present may have + // a sled added to it in the future. So, let's wait until it + // changes. 
Err(CommunicationError::NoSpDiscovered) => { - if !not_present_message_logged { - not_present_message_logged = true; + let mut watch = sp.sp_addr_watch().clone(); + loop { + if let Some((addr, port)) = *watch.borrow_and_update() { + // Ladies and gentlemen...we got him! + slog::info!( + &self.log, + "found a SP, resuming polling."; + "sp_addr" => ?addr, + "sp_port" => ?port, + ); + break; + } + slog::info!( &self.log, - "our SP seems to not be present, waiting to see if it \ - appears..." + "no SP is present for this slot. waiting for a \ + little buddy to appear..."; ); - } else { - slog::debug!( - &self.log, - "SP is still not there, checking again in a little bit." - ) - } - tokio::time::sleep(NO_SP_BACKOFF).await; + // Wait for an address to be discovered. + if watch.changed().await.is_err() { + slog::debug!( + &self.log, + "SP address watch has been closed, presumably \ + we are shutting down"; + ); + return Ok(()); + } + } } Err(error) => { slog::warn!( From 8f3eae67efc8b67cfd681b37ab438776b783cb5c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 20 Aug 2024 08:41:14 -0700 Subject: [PATCH 40/77] smallish logging tweaks --- gateway/src/metrics.rs | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index c5f8592065..78a3a9b38a 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -550,13 +550,21 @@ impl SpPoller { match self.poll(sp).await { // No sense cluttering the ringbuffer with empty vecs... 
Ok(samples) if samples.is_empty() => { - slog::trace!(&self.log, "polled SP, no samples returned"; "num_samples" => 0usize); + slog::trace!( + &self.log, + "polled SP, no samples returned"; + "num_samples" => 0usize + ); } Ok(samples) => { - slog::trace!(&self.log, "polled SP successfully"; "num_samples" => samples.len()); + slog::trace!( + &self.log, + "polled SP successfully"; + "num_samples" => samples.len(), + ); if let Err(_) = self.sample_tx.send(samples) { - slog::info!( + slog::debug!( &self.log, "all sample receiver handles have been dropped! \ presumably we are shutting down..."; @@ -569,6 +577,11 @@ impl SpPoller { // a sled added to it in the future. So, let's wait until it // changes. Err(CommunicationError::NoSpDiscovered) => { + slog::info!( + &self.log, + "no SP is present for this slot. waiting for a \ + little buddy to appear..."; + ); let mut watch = sp.sp_addr_watch().clone(); loop { if let Some((addr, port)) = *watch.borrow_and_update() { @@ -582,13 +595,8 @@ impl SpPoller { break; } - slog::info!( - &self.log, - "no SP is present for this slot. waiting for a \ - little buddy to appear..."; - ); - // Wait for an address to be discovered. 
+ slog::debug!(&self.log, "waiting for a SP to appear."); if watch.changed().await.is_err() { slog::debug!( &self.log, From 6efcdf7d8599afc3930d966d3a3ede3973f983c6 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 20 Aug 2024 10:21:00 -0700 Subject: [PATCH 41/77] i forgot to add the new producer kind to the db --- nexus/db-model/src/schema_versions.rs | 3 ++- schema/crdb/add-management-gateway-producer-kind/up.sql | 2 ++ schema/crdb/dbinit.sql | 4 +++- 3 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 schema/crdb/add-management-gateway-producer-kind/up.sql diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index d0542874fb..aef95e6d53 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(90, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(91, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(91, "add-management-gateway-producer-kind"), KnownVersion::new(90, "lookup-bgp-config-by-asn"), KnownVersion::new(89, "collapse_lldp_settings"), KnownVersion::new(88, "route-local-pref"), diff --git a/schema/crdb/add-management-gateway-producer-kind/up.sql b/schema/crdb/add-management-gateway-producer-kind/up.sql new file mode 100644 index 0000000000..e872278e2f --- /dev/null +++ b/schema/crdb/add-management-gateway-producer-kind/up.sql @@ -0,0 +1,2 @@ +ALTER TYPE omicron.public.producer_kind + ADD VALUE IF NOT EXISTS 'management_gateway' AFTER 'instance'; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index baef38e44f..551c51262c 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1335,6 +1335,8 @@ CREATE TYPE IF NOT EXISTS omicron.public.producer_kind AS ENUM ( 'service', -- A Propolis VMM for an instance in the omicron.public.instance table 'instance' + -- A management gateway service on a scrimlet. 
+ 'management_gateway' ); /* @@ -4212,7 +4214,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '90.0.0', NULL) + (TRUE, NOW(), NOW(), '91.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From 2c5f83b11bd865175c74495c134b734aa120901c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 20 Aug 2024 10:25:29 -0700 Subject: [PATCH 42/77] post rebase remove explicit simulator sensor IDs --- .../configs/sp_sim_config.test.toml | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/gateway-test-utils/configs/sp_sim_config.test.toml b/gateway-test-utils/configs/sp_sim_config.test.toml index 6abe513de3..4f370a167c 100644 --- a/gateway-test-utils/configs/sp_sim_config.test.toml +++ b/gateway-test-utils/configs/sp_sim_config.test.toml @@ -21,7 +21,7 @@ description = "FAKE temperature sensor 1" capabilities = 0x2 presence = "Present" sensors = [ - {name = "Southwest", kind = "Temperature", sensor_id = 0, last_data.value = 41.7890625, last_data.timestamp = 1234 }, + {name = "Southwest", kind = "Temperature", last_data.value = 41.7890625, last_data.timestamp = 1234 }, ] [[simulated_sps.sidecar.components]] @@ -31,7 +31,7 @@ description = "FAKE temperature sensor 2" capabilities = 0x2 presence = "Failed" sensors = [ - { name = "South", kind = "Temperature", sensor_id = 1, last_error.value = "DeviceError", last_error.timestamp = 1234 }, + { name = "South", kind = "Temperature", last_error.value = "DeviceError", last_error.timestamp = 1234 }, ] [[simulated_sps.sidecar]] @@ -63,7 +63,7 @@ description = "FAKE temperature sensor" capabilities = 0x2 presence = "Failed" sensors = [ - { name = "Southwest", kind = "Temperature", sensor_id = 0, last_error.value = "DeviceError", last_error.timestamp = 1234 }, + { name = "Southwest", kind = "Temperature", last_error.value = "DeviceError", last_error.timestamp = 1234 }, ] [[simulated_sps.gimlet.components]] id = "dev-1" @@ -72,7 +72,7 @@ description = "FAKE 
temperature sensor" capabilities = 0x2 presence = "Present" sensors = [ - { name = "South", kind = "Temperature", sensor_id = 1, last_data.value = 42.5625, last_data.timestamp = 1234 }, + { name = "South", kind = "Temperature", last_data.value = 42.5625, last_data.timestamp = 1234 }, ] [[simulated_sps.gimlet.components]] @@ -82,7 +82,7 @@ description = "FAKE Southeast temperature sensor" capabilities = 0x2 presence = "Present" sensors = [ - { name = "Southeast", kind = "Temperature", sensor_id = 2, last_data.value = 41.570313, last_data.timestamp = 1234 }, + { name = "Southeast", kind = "Temperature", last_data.value = 41.570313, last_data.timestamp = 1234 }, ] [[simulated_sps.gimlet.components]] @@ -99,10 +99,10 @@ description = "FAKE U.2 Sharkfin A hot swap controller" capabilities = 0x2 presence = "Present" sensors = [ - { name = "V12_U2A_A0", kind = "Current", sensor_id = 3, last_data.value = 0.45898438, last_data.timestamp = 1234 }, - { name = "V3P3_U2A_A0", kind = "Current", sensor_id = 4, last_data.value = 0.024414063, last_data.timestamp = 1234 }, - { name = "V12_U2A_A0", kind = "Voltage", sensor_id = 5, last_data.value = 12.03125, last_data.timestamp = 1234 }, - { name = "V3P3_U2A_A0", kind = "Voltage", sensor_id = 6, last_data.value = 3.328125, last_data.timestamp = 1234 }, + { name = "V12_U2A_A0", kind = "Current", last_data.value = 0.45898438, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Current", last_data.value = 0.024414063, last_data.timestamp = 1234 }, + { name = "V12_U2A_A0", kind = "Voltage", last_data.value = 12.03125, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Voltage", last_data.value = 3.328125, last_data.timestamp = 1234 }, ] [[simulated_sps.gimlet.components]] @@ -112,7 +112,7 @@ description = "FAKE U.2 A NVMe Basic Management Command" capabilities = 0x2 presence = "Present" sensors = [ - { name = "U2_N0", kind = "Temperature", sensor_id = 7, last_data.value = 56.0, last_data.timestamp = 1234 }, + { 
name = "U2_N0", kind = "Temperature", last_data.value = 56.0, last_data.timestamp = 1234 }, ] [[simulated_sps.gimlet.components]] id = "dev-39" @@ -121,7 +121,7 @@ description = "FAKE T6 temperature sensor" capabilities = 0x2 presence = "Present" sensors = [ - { name = "t6", kind = "Temperature", sensor_id = 9, last_data.value = 70.625, last_data.timestamp = 1234 }, + { name = "t6", kind = "Temperature", last_data.value = 70.625, last_data.timestamp = 1234 }, ] [[simulated_sps.gimlet.components]] id = "dev-53" @@ -130,12 +130,12 @@ description = "FAKE Fan controller" capabilities = 0x2 presence = "Present" sensors = [ - { name = "Southeast", kind = "Speed", sensor_id = 10, last_data.value = 2607.0, last_data.timestamp = 1234 }, - { name = "Northeast", kind = "Speed", sensor_id = 11, last_data.value = 2476.0, last_data.timestamp = 1234 }, - { name = "South", kind = "Speed", sensor_id = 12, last_data.value = 2553.0, last_data.timestamp = 1234 }, - { name = "North", kind = "Speed", sensor_id = 13, last_data.value = 2265.0, last_data.timestamp = 1234 }, - { name = "Southwest", kind = "Speed", sensor_id = 14, last_data.value = 2649.0, last_data.timestamp = 1234 }, - { name = "Northwest", kind = "Speed", sensor_id = 15, last_data.value = 2275.0, last_data.timestamp = 1234 }, + { name = "Southeast", kind = "Speed", last_data.value = 2607.0, last_data.timestamp = 1234 }, + { name = "Northeast", kind = "Speed", last_data.value = 2476.0, last_data.timestamp = 1234 }, + { name = "South", kind = "Speed", last_data.value = 2553.0, last_data.timestamp = 1234 }, + { name = "North", kind = "Speed", last_data.value = 2265.0, last_data.timestamp = 1234 }, + { name = "Southwest", kind = "Speed", last_data.value = 2649.0, last_data.timestamp = 1234 }, + { name = "Northwest", kind = "Speed", last_data.value = 2275.0, last_data.timestamp = 1234 }, ] @@ -162,7 +162,7 @@ description = "FAKE temperature sensor" capabilities = 0x2 presence = "Present" sensors = [ - { name = "Southwest", 
kind = "Temperature", sensor_id = 0, last_data.value = 41.3629, last_data.timestamp = 1234 }, + { name = "Southwest", kind = "Temperature", last_data.value = 41.3629, last_data.timestamp = 1234 }, ] [[simulated_sps.gimlet.components]] id = "dev-1" @@ -171,7 +171,7 @@ description = "FAKE temperature sensor" capabilities = 0x2 presence = "Present" sensors = [ - { name = "South", kind = "Temperature", sensor_id = 1, last_data.value = 42.5625, last_data.timestamp = 1234 }, + { name = "South", kind = "Temperature", last_data.value = 42.5625, last_data.timestamp = 1234 }, ] [[simulated_sps.gimlet.components]] @@ -181,7 +181,7 @@ description = "FAKE Southeast temperature sensor" capabilities = 0x2 presence = "Present" sensors = [ - { name = "Southeast", kind = "Temperature", sensor_id = 2, last_data.value = 41.570313, last_data.timestamp = 1234 }, + { name = "Southeast", kind = "Temperature", last_data.value = 41.570313, last_data.timestamp = 1234 }, ] [[simulated_sps.gimlet.components]] @@ -198,10 +198,10 @@ description = "FAKE U.2 Sharkfin A hot swap controller" capabilities = 0x2 presence = "Present" sensors = [ - { name = "V12_U2A_A0", kind = "Current", sensor_id = 3, last_data.value = 0.41893438, last_data.timestamp = 1234 }, - { name = "V3P3_U2A_A0", kind = "Current", sensor_id = 4, last_data.value = 0.025614603, last_data.timestamp = 1234 }, - { name = "V12_U2A_A0", kind = "Voltage", sensor_id = 5, last_data.value = 12.02914, last_data.timestamp = 1234 }, - { name = "V3P3_U2A_A0", kind = "Voltage", sensor_id = 6, last_data.value = 3.2618, last_data.timestamp = 1234 }, + { name = "V12_U2A_A0", kind = "Current", last_data.value = 0.41893438, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Current", last_data.value = 0.025614603, last_data.timestamp = 1234 }, + { name = "V12_U2A_A0", kind = "Voltage", last_data.value = 12.02914, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Voltage", last_data.value = 3.2618, last_data.timestamp = 
1234 }, ] [[simulated_sps.gimlet.components]] @@ -211,7 +211,7 @@ description = "FAKE U.2 A NVMe Basic Management Command" capabilities = 0x2 presence = "Present" sensors = [ - { name = "U2_N0", kind = "Temperature", sensor_id = 7, last_data.value = 56.0, last_data.timestamp = 1234 }, + { name = "U2_N0", kind = "Temperature", last_data.value = 56.0, last_data.timestamp = 1234 }, ] [[simulated_sps.gimlet.components]] id = "dev-39" @@ -220,7 +220,7 @@ description = "FAKE T6 temperature sensor" capabilities = 0x2 presence = "Present" sensors = [ - { name = "t6", kind = "Temperature", sensor_id = 9, last_data.value = 70.625, last_data.timestamp = 1234 }, + { name = "t6", kind = "Temperature", last_data.value = 70.625, last_data.timestamp = 1234 }, ] [[simulated_sps.gimlet.components]] id = "dev-53" @@ -229,12 +229,12 @@ description = "FAKE Fan controller" capabilities = 0x2 presence = "Present" sensors = [ - { name = "Southeast", kind = "Speed", sensor_id = 10, last_data.value = 2510.0, last_data.timestamp = 1234 }, - { name = "Northeast", kind = "Speed", sensor_id = 11, last_data.value = 2390.0, last_data.timestamp = 1234 }, - { name = "South", kind = "Speed", sensor_id = 12, last_data.value = 2467.0, last_data.timestamp = 1234 }, - { name = "North", kind = "Speed", sensor_id = 13, last_data.value = 2195.0, last_data.timestamp = 1234 }, - { name = "Southwest", kind = "Speed", sensor_id = 14, last_data.value = 2680.0, last_data.timestamp = 1234 }, - { name = "Northwest", kind = "Speed", sensor_id = 15, last_data.value = 2212.0, last_data.timestamp = 1234 }, + { name = "Southeast", kind = "Speed", last_data.value = 2510.0, last_data.timestamp = 1234 }, + { name = "Northeast", kind = "Speed", last_data.value = 2390.0, last_data.timestamp = 1234 }, + { name = "South", kind = "Speed", last_data.value = 2467.0, last_data.timestamp = 1234 }, + { name = "North", kind = "Speed", last_data.value = 2195.0, last_data.timestamp = 1234 }, + { name = "Southwest", kind = "Speed", 
last_data.value = 2680.0, last_data.timestamp = 1234 }, + { name = "Northwest", kind = "Speed", last_data.value = 2212.0, last_data.timestamp = 1234 }, ] From 72ccdd89fc42d94be4115d62add09621803d9099 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 20 Aug 2024 10:45:21 -0700 Subject: [PATCH 43/77] GAH i hate sql syntax --- schema/crdb/dbinit.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 551c51262c..1457532c49 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1334,7 +1334,7 @@ CREATE TYPE IF NOT EXISTS omicron.public.producer_kind AS ENUM ( -- removed). 'service', -- A Propolis VMM for an instance in the omicron.public.instance table - 'instance' + 'instance', -- A management gateway service on a scrimlet. 'management_gateway' ); From 5443e65d95b4775ac3f7360c13d901264e2f0e57 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 20 Aug 2024 13:56:05 -0700 Subject: [PATCH 44/77] update OMDB success cases again --- dev-tools/omdb/tests/successes.out | 1 - 1 file changed, 1 deletion(-) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index b3f5eb916f..e939bfa864 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -176,7 +176,6 @@ SP DETAILS: type "Sled" slot 1 NAME DESCRIPTION DEVICE PRESENCE SERIAL sp3-host-cpu FAKE host cpu sp3-host-cpu Present None - sp3-host-cpu FAKE host cpu sp3-host-cpu Present None dev-0 FAKE temperature sensor tmp117 Present None dev-1 FAKE temperature sensor tmp117 Present None dev-2 FAKE Southeast temperature sensor tmp117 Present None From 4fd919b58abc42c092841610a0a73de904a159fc Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 20 Aug 2024 14:23:11 -0700 Subject: [PATCH 45/77] rename most of the schema fields Initially, I tried to reuse the same names from the `gateway_sp_comms` RPC interface for the metrics fields, since I thought it was nice to keep the 
naming consistent throughout the whole stack. However, some of these names are kind of confusing or unclear to the reader. Based on @bnaecker's [comments][1] and some things I thought seemed kind of unclear, I've given the metric schema fields names that I hope are more readily understandable. In particular: - `component` is now `hardware_component`, since `component` is way too generic - `device` is now `component_kind` --- in the `gateway_sp_comms` API, a "component" is the name of the individual component, while a "device" is the _type_ of component, e.g. a TMP117 sensor. I found this kind of unclear. - Changed the metrics' `name` field to `sensor`. - Made the use of `kind`/`type` consistent by using `kind` everywhere, instead of having `chassis_type` and `sensor_kind`. - Added a `chassis_` prefix to `serial`, `model`, and `revision`, to make it clear that those fields describe the sled/switch/power shelf, rather than the individual component on that chassis. --- gateway/src/metrics.rs | 52 +++++++++---------- .../oximeter/schema/sensor-measurement.toml | 52 +++++++++---------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 78a3a9b38a..355ccd96ce 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -37,6 +37,7 @@ use tokio::task::JoinHandle; use uuid::Uuid; oximeter::use_timeseries!("sensor-measurement.toml"); +use hardware_component as metric; /// Handle to the metrics tasks. pub struct Metrics { @@ -107,7 +108,7 @@ struct SpPoller { } struct ComponentMetrics { - target: component::Component, + target: metric::HardwareComponent, /// Counts of errors reported by sensors on this component. sensor_errors: HashMap>, /// Counts of errors that occurred whilst polling the SP for measurements @@ -705,21 +706,21 @@ impl SpPoller { // TODO(eliza): i hate having to clone all these strings for // every device on the SP...it would be cool if Oximeter let us // reference count them... 
- let target = component::Component { - chassis_type: Cow::Borrowed(match self.spid.typ { + let target = metric::HardwareComponent { + rack_id: self.rack_id, + gateway_id: self.mgs_id, + chassis_model: Cow::Owned(model.clone()), + chassis_revision: revision, + chassis_kind: Cow::Borrowed(match self.spid.typ { SpType::Sled => "sled", SpType::Switch => "switch", SpType::Power => "power", }), + chassis_serial: Cow::Owned(serial.clone()), + hubris_archive_id: Cow::Owned(hubris_archive_id.clone()), slot: self.spid.slot as u32, + component_kind: Cow::Owned(dev.device), component, - device: Cow::Owned(dev.device), - model: Cow::Owned(model.clone()), - revision, - serial: Cow::Owned(serial.clone()), - rack_id: self.rack_id, - gateway_id: self.mgs_id, - hubris_archive_id: Cow::Owned(hubris_archive_id.clone()), }; match self.components.entry(dev.component) { // Found a new device! @@ -728,7 +729,7 @@ impl SpPoller { &self.log, "discovered a new component!"; "component" => ?dev.component, - "device" => ?target.device, + "device" => ?target.component_kind, ); entry.insert(ComponentMetrics { target, @@ -823,33 +824,32 @@ impl SpPoller { // than measurement channels, ignore it for now. 
continue; }; - let name = Cow::Owned(m.name); + let sensor = Cow::Owned(m.name); let sample = match (m.value, m.kind) { (Ok(datum), MeasurementKind::Temperature) => Sample::new( target, - &component::Temperature { name, datum }, + &metric::Temperature { sensor, datum }, ), (Ok(datum), MeasurementKind::Current) => { - Sample::new(target, &component::Current { name, datum }) + Sample::new(target, &metric::Current { sensor, datum }) } (Ok(datum), MeasurementKind::Voltage) => { - Sample::new(target, &component::Voltage { name, datum }) + Sample::new(target, &metric::Voltage { sensor, datum }) } (Ok(datum), MeasurementKind::Power) => { - Sample::new(target, &component::Power { name, datum }) + Sample::new(target, &metric::Power { sensor, datum }) } (Ok(datum), MeasurementKind::InputCurrent) => Sample::new( target, - &component::InputCurrent { name, datum }, + &metric::InputCurrent { sensor, datum }, ), (Ok(datum), MeasurementKind::InputVoltage) => Sample::new( target, - &component::InputVoltage { name, datum }, - ), - (Ok(datum), MeasurementKind::Speed) => Sample::new( - target, - &component::FanSpeed { name, datum }, + &metric::InputVoltage { sensor, datum }, ), + (Ok(datum), MeasurementKind::Speed) => { + Sample::new(target, &metric::FanSpeed { sensor, datum }) + } (Err(e), kind) => { let kind = match kind { MeasurementKind::Temperature => "temperature", @@ -873,7 +873,7 @@ impl SpPoller { }; let datum = sensor_errors .entry(SensorErrorKey { - name: name.clone(), + name: sensor.clone(), kind, error, }) @@ -887,9 +887,9 @@ impl SpPoller { datum.increment(); Sample::new( target, - &component::SensorErrorCount { + &metric::SensorErrorCount { error: Cow::Borrowed(error), - name, + sensor, datum: *datum, sensor_kind: Cow::Borrowed(kind), }, @@ -1019,7 +1019,7 @@ impl ComponentMetrics { datum.increment(); Sample::new( &self.target, - &component::PollErrorCount { + &metric::PollErrorCount { error: Cow::Borrowed(error_str), datum: *datum, }, diff --git 
a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index bba1e3667f..d2e3400da2 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -1,21 +1,21 @@ format_version = 1 [target] -name = "component" -description = "A component which reports sensor measurements" +name = "hardware_component" +description = "A hardware component on a compute sled, switch, or power shelf" authz_scope = "fleet" versions = [ { version = 1, fields = [ "rack_id", "slot", - "chassis_type", - "serial", - "model", - "revision", + "chassis_kind", + "chassis_serial", + "chassis_model", + "chassis_revision", "hubris_archive_id", "gateway_id", + "component_kind", "component", - "device", ]} ] @@ -29,15 +29,15 @@ description = """ The cubby number or switch slot of the service processor reporting the \ measurement""" -[fields.model] +[fields.chassis_model] type = "string" description = "Model number of the sled, switch, or power shelf" -[fields.revision] +[fields.chassis_revision] type = "u32" description = "Revision number of the sled, switch, or power shelf" -[fields.serial] +[fields.chassis_serial] type = "string" description = "Serial number of the sled, switch, or power shelf" @@ -47,13 +47,12 @@ description = """ Hubris firmware archive ID of the service processor when the measurement \ was recorded.""" - [fields.gateway_id] type = "uuid" description = """ ID of the Management Gateway Service process which recorded the measurement.""" -[fields.chassis_type] +[fields.chassis_kind] type = "string" description = """ What kind of thing the component resides on. 
This will be one of 'sled'\ @@ -63,16 +62,17 @@ or 'power', for components on power shelves.""" [fields.component] type = "string" description = """ -The service processor component ID uniquely identifying the component on -the sled, switch, or power shelf.""" +The service processor component ID uniquely identifying the hardware \ +component on the sled, switch, or power shelf.""" -[fields.device] +[fields.component_kind] type = "string" -description = "The name of the device which recorded a sensor reading" +description = "What type of hardware component this thing is." -[fields.name] +[fields.sensor] type = "string" -description = "A name identifying the quantity measured by a sensor measurement" +description = """ +A string identifying the name of a sensor that recorded a sensor reading.""" [fields.error] type = "string" @@ -92,7 +92,7 @@ description = "Temperature reading in degrees Celcius" units = "degrees_celcius" datum_type = "f32" versions = [ - { added_in = 1, fields = ["name"]} + { added_in = 1, fields = ["sensor"]} ] [[metrics]] @@ -101,7 +101,7 @@ description = "Output current reading in amperes" units = "amps" datum_type = "f32" versions = [ - { added_in = 1, fields = ["name"]} + { added_in = 1, fields = ["sensor"]} ] [[metrics]] @@ -110,7 +110,7 @@ description = "Power reading, in watts" units = "watts" datum_type = "f32" versions = [ - { added_in = 1, fields = ["name"]} + { added_in = 1, fields = ["sensor"]} ] [[metrics]] @@ -119,7 +119,7 @@ description = "Output voltage reading, in volts" units = "volts" datum_type = "f32" versions = [ - { added_in = 1, fields = ["name"]} + { added_in = 1, fields = ["sensor"]} ] [[metrics]] @@ -128,7 +128,7 @@ description = "Input electric current reading in amperes" units = "amps" datum_type = "f32" versions = [ - { added_in = 1, fields = ["name"]} + { added_in = 1, fields = ["sensor"]} ] [[metrics]] @@ -137,7 +137,7 @@ description = "Input electric voltage reading, in volts" units = "volts" datum_type = "f32" 
versions = [ - { added_in = 1, fields = ["name"]} + { added_in = 1, fields = ["sensor"]} ] @@ -147,7 +147,7 @@ description = "A fan speed measurement, in rotations per minute" units = "rpm" datum_type = "f32" versions = [ - { added_in = 1, fields = ["name"]} + { added_in = 1, fields = ["sensor"]} ] [[metrics]] @@ -156,7 +156,7 @@ description = "Cumulative count of errors reported by a sensor" units = "count" datum_type = "cumulative_u64" versions = [ - { added_in = 1, fields = ["name", "error", "sensor_kind"]} + { added_in = 1, fields = ["sensor", "error", "sensor_kind"]} ] [[metrics]] From ce9817d4ee336e5e7ad93f9bcf180985f497ddd4 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 21 Aug 2024 10:51:36 -0700 Subject: [PATCH 46/77] add component descriptions to target --- gateway/src/metrics.rs | 10 ++++++---- oximeter/oximeter/schema/sensor-measurement.toml | 12 ++++++++++-- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 355ccd96ce..c631532fba 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -686,7 +686,7 @@ impl SpPoller { { continue; } - let component = match dev.component.as_str() { + let component_id = match dev.component.as_str() { Some(c) => Cow::Owned(c.to_string()), None => { // These are supposed to always be strings. But, if we @@ -720,7 +720,8 @@ impl SpPoller { hubris_archive_id: Cow::Owned(hubris_archive_id.clone()), slot: self.spid.slot as u32, component_kind: Cow::Owned(dev.device), - component, + component_id, + description: Cow::Owned(dev.description), }; match self.components.entry(dev.component) { // Found a new device! 
@@ -728,8 +729,9 @@ impl SpPoller { slog::debug!( &self.log, "discovered a new component!"; - "component" => ?dev.component, - "device" => ?target.component_kind, + "component_id" => %target.component_id, + "component_kind" => %target.component_kind, + "description" => %target.component_id, ); entry.insert(ComponentMetrics { target, diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index d2e3400da2..43cc91244c 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -15,7 +15,8 @@ versions = [ "hubris_archive_id", "gateway_id", "component_kind", - "component", + "component_id", + "description", ]} ] @@ -59,7 +60,7 @@ What kind of thing the component resides on. This will be one of 'sled'\ for components on compute sled; 'switch', for components on rack switches; \ or 'power', for components on power shelves.""" -[fields.component] +[fields.component_id] type = "string" description = """ The service processor component ID uniquely identifying the hardware \ @@ -69,6 +70,13 @@ component on the sled, switch, or power shelf.""" type = "string" description = "What type of hardware component this thing is." +[fields.description] +type = "string" +description = """ +A human-readable description of the hardware component. This may include \ +its location or role in the system (e.g. a DIMM's number, or a temperature \ +sensor's location).""" + [fields.sensor] type = "string" description = """ From 5b69235d25ef55e2900984adc47d847ca056a527 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 21 Aug 2024 14:19:47 -0700 Subject: [PATCH 47/77] discard samples if SP state changes mid-poll Thanks to @jgallagher for pointing out that a SP's firmware may be updated while we are in the middle of scraping its sensors (see [this comment][1]). 
Now, we'll re-check that the SP's state hasn't changed since the start of the poll, and throw away the current batch of samples and retry if it has. This avoids reporting samples to Oximeter that may have incorrect fields. [1]: https://github.com/oxidecomputer/omicron/pull/6354#issuecomment-2302953239 --- gateway/src/metrics.rs | 613 ++++++++++++++++++++++++----------------- 1 file changed, 355 insertions(+), 258 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index c631532fba..c4b097263b 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -99,7 +99,7 @@ struct PollerManager { /// Polls sensor readings from an individual SP. struct SpPoller { spid: SpIdentifier, - known_state: Option, + known_state: Option, components: HashMap, log: slog::Logger, rack_id: Uuid, @@ -625,286 +625,383 @@ impl SpPoller { &mut self, sp: &SingleSp, ) -> Result, CommunicationError> { - // Check if the SP's state has changed. If it has, we need to make sure - // we still know what all of its sensors are. - let current_state = sp.state().await?; - if Some(¤t_state) != self.known_state.as_ref() { - // The SP's state appears to have changed. Time to make sure our - // understanding of its devices and identity is up to date! - slog::debug!( - &self.log, - "our little friend seems to have changed in some kind of way"; - "current_state" => ?current_state, - "known_state" => ?self.known_state, - ); - let inv_devices = sp.inventory().await?.devices; - - // Clear out any previously-known devices, and preallocate capacity - // for all the new ones. - self.components.clear(); - self.components.reserve(inv_devices.len()); - - // Reimplement this ourselves because we don't really care about - // reading the RoT state at present. This is unfortunately copied - // from `gateway_messages`. - fn stringify_byte_string(bytes: &[u8]) -> String { - // We expect serial and model numbers to be ASCII and 0-padded: find the first 0 - // byte and convert to a string. 
If that fails, hexlify the entire slice. - let first_zero = - bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len()); - - std::str::from_utf8(&bytes[..first_zero]) - .map(|s| s.to_string()) - .unwrap_or_else(|_err| hex::encode(bytes)) - } - let (model, serial, hubris_archive_id, revision) = - match current_state { - VersionedSpState::V1(ref v) => ( - stringify_byte_string(&v.model), - stringify_byte_string(&v.serial_number[..]), - hex::encode(v.hubris_archive_id), - v.revision, - ), - VersionedSpState::V2(ref v) => ( - stringify_byte_string(&v.model), - stringify_byte_string(&v.serial_number[..]), - hex::encode(v.hubris_archive_id), - v.revision, - ), - VersionedSpState::V3(ref v) => ( - stringify_byte_string(&v.model), - stringify_byte_string(&v.serial_number[..]), - hex::encode(v.hubris_archive_id), - v.revision, - ), - }; - for dev in inv_devices { - // Skip devices which have nothing interesting for us. - if !dev - .capabilities - .contains(DeviceCapabilities::HAS_MEASUREMENT_CHANNELS) - { - continue; - } - let component_id = match dev.component.as_str() { - Some(c) => Cow::Owned(c.to_string()), - None => { - // These are supposed to always be strings. But, if we - // see one that's not a string, fall back to the hex - // representation rather than panicking. - let hex = hex::encode(dev.component.id); - slog::warn!( - &self.log, - "a SP component ID was not a string! this isn't \ - supposed to happen!"; - "component" => %hex, - "device" => ?dev, - ); - Cow::Owned(hex) - } - }; - // TODO(eliza): i hate having to clone all these strings for - // every device on the SP...it would be cool if Oximeter let us - // reference count them... 
- let target = metric::HardwareComponent { - rack_id: self.rack_id, - gateway_id: self.mgs_id, - chassis_model: Cow::Owned(model.clone()), - chassis_revision: revision, - chassis_kind: Cow::Borrowed(match self.spid.typ { - SpType::Sled => "sled", - SpType::Switch => "switch", - SpType::Power => "power", - }), - chassis_serial: Cow::Owned(serial.clone()), - hubris_archive_id: Cow::Owned(hubris_archive_id.clone()), - slot: self.spid.slot as u32, - component_kind: Cow::Owned(dev.device), - component_id, - description: Cow::Owned(dev.description), + let mut current_state = SpUnderstanding::from(sp.state().await?); + let mut samples = Vec::new(); + // If the SP's state changes dramatically *during* a poll, it may be + // necessary to re-do the metrics scrape, thus the loop. Normally, we + // will only loop a single time, but may retry if necessary. + loop { + // Check if the SP's state has changed. If it has, we need to make sure + // we still know what all of its sensors are. + if Some(¤t_state) != self.known_state.as_ref() { + // The SP's state appears to have changed. Time to make sure our + // understanding of its devices and identity is up to date! + + let chassis_kind = match self.spid.typ { + SpType::Sled => "sled", + SpType::Switch => "switch", + SpType::Power => "power", }; - match self.components.entry(dev.component) { - // Found a new device! - hash_map::Entry::Vacant(entry) => { - slog::debug!( - &self.log, - "discovered a new component!"; - "component_id" => %target.component_id, - "component_kind" => %target.component_kind, - "description" => %target.component_id, - ); - entry.insert(ComponentMetrics { - target, - sensor_errors: HashMap::new(), - poll_errors: HashMap::new(), - }); - } - // We previously had a known device for this thing, but - // the metrics target has changed, so we should reset - // its cumulative metrics. 
- hash_map::Entry::Occupied(mut entry) - if entry.get().target != target => - { - slog::trace!( - &self.log, - "target has changed, resetting cumulative metrics \ - for component"; - "component" => ?dev.component, - ); - entry.insert(ComponentMetrics { - target, - sensor_errors: HashMap::new(), - poll_errors: HashMap::new(), - }); - } + let model = stringify_byte_string(¤t_state.model[..]); + let serial = + stringify_byte_string(¤t_state.serial_number[..]); + let hubris_archive_id = + hex::encode(¤t_state.hubris_archive_id); - // The target for this device hasn't changed, don't reset it. - hash_map::Entry::Occupied(_) => {} - } - } + slog::debug!( + &self.log, + "our little friend seems to have changed in some kind of way"; + "current_state" => ?current_state, + "known_state" => ?self.known_state, + "new_model" => %model, + "new_serial" => %serial, + "new_hubris_archive_id" => %hubris_archive_id, + ); - self.known_state = Some(current_state); - } + let inv_devices = sp.inventory().await?.devices; - let mut samples = Vec::with_capacity(self.components.len()); - for (c, metrics) in &mut self.components { - // Metrics samples *should* always be well-formed. If we ever emit a - // messed up one, this is a programmer error, and therefore should - // fail in test, but should probably *not* take down the whole - // management gateway in a real-life rack, especially because it's - // probably going to happen again if we were to get restarted. - const BAD_SAMPLE: &str = - "we emitted a bad metrics sample! this should never happen"; - macro_rules! try_sample { - ($sample:expr) => { - match $sample { - Ok(sample) => samples.push(sample), - - Err(err) => { - slog::error!( + // Clear out any previously-known devices, and preallocate capacity + // for all the new ones. + self.components.clear(); + self.components.reserve(inv_devices.len()); + + for dev in inv_devices { + // Skip devices which have nothing interesting for us. 
+ if !dev + .capabilities + .contains(DeviceCapabilities::HAS_MEASUREMENT_CHANNELS) + { + continue; + } + let component_id = match dev.component.as_str() { + Some(c) => Cow::Owned(c.to_string()), + None => { + // These are supposed to always be strings. But, if we + // see one that's not a string, fall back to the hex + // representation rather than panicking. + let hex = hex::encode(dev.component.id); + slog::warn!( &self.log, - "{BAD_SAMPLE}!"; - "error" => %err, + "a SP component ID was not a string! this isn't \ + supposed to happen!"; + "component" => %hex, + "device" => ?dev, ); - #[cfg(debug_assertions)] - unreachable!("{BAD_SAMPLE}: {err}"); + Cow::Owned(hex) } + }; + + // TODO(eliza): i hate having to clone all these strings for + // every device on the SP...it would be cool if Oximeter let us + // reference count them... + let target = metric::HardwareComponent { + rack_id: self.rack_id, + gateway_id: self.mgs_id, + chassis_model: Cow::Owned(model.clone()), + chassis_revision: current_state.revision, + chassis_kind: Cow::Borrowed(chassis_kind), + chassis_serial: Cow::Owned(serial.clone()), + hubris_archive_id: Cow::Owned( + hubris_archive_id.clone(), + ), + slot: self.spid.slot as u32, + component_kind: Cow::Owned(dev.device), + component_id, + description: Cow::Owned(dev.description), + }; + match self.components.entry(dev.component) { + // Found a new device! + hash_map::Entry::Vacant(entry) => { + slog::debug!( + &self.log, + "discovered a new component!"; + "component_id" => %target.component_id, + "component_kind" => %target.component_kind, + "description" => %target.component_id, + ); + entry.insert(ComponentMetrics { + target, + sensor_errors: HashMap::new(), + poll_errors: HashMap::new(), + }); + } + // We previously had a known device for this thing, but + // the metrics target has changed, so we should reset + // its cumulative metrics. 
+ hash_map::Entry::Occupied(mut entry) + if entry.get().target != target => + { + slog::trace!( + &self.log, + "target has changed, resetting cumulative metrics \ + for component"; + "component" => ?dev.component, + ); + entry.insert(ComponentMetrics { + target, + sensor_errors: HashMap::new(), + poll_errors: HashMap::new(), + }); + } + + // The target for this device hasn't changed, don't reset it. + hash_map::Entry::Occupied(_) => {} } } + + self.known_state = Some(current_state); } - let details = match sp.component_details(*c).await { - Ok(deets) => deets, - // SP seems gone! - Err(CommunicationError::NoSpDiscovered) => { - return Err(CommunicationError::NoSpDiscovered) + + // We will need capacity for *at least* the number of components on the + // SP --- it will probably be more, as several components have multiple + // measurement channels which will produce independent samples (e.g. a + // power rail will likely have both voltage and current measurements, + // and a device may have multiple rails...) but, this way, we can avoid + // *some* amount of reallocating... + samples.reserve(self.components.len()); + for (c, metrics) in &mut self.components { + // Metrics samples *should* always be well-formed. If we ever emit a + // messed up one, this is a programmer error, and therefore should + // fail in test, but should probably *not* take down the whole + // management gateway in a real-life rack, especially because it's + // probably going to happen again if we were to get restarted. + const BAD_SAMPLE: &str = + "we emitted a bad metrics sample! this should never happen"; + macro_rules! try_sample { + ($sample:expr) => { + match $sample { + Ok(sample) => samples.push(sample), + + Err(err) => { + slog::error!( + &self.log, + "{BAD_SAMPLE}!"; + "error" => %err, + ); + #[cfg(debug_assertions)] + unreachable!("{BAD_SAMPLE}: {err}"); + } + } + } } - Err(error) => { + let details = match sp.component_details(*c).await { + Ok(deets) => deets, + // SP seems gone! 
+ Err(CommunicationError::NoSpDiscovered) => { + return Err(CommunicationError::NoSpDiscovered) + } + Err(error) => { + slog::warn!( + &self.log, + "failed to read details on SP component"; + "sp_component" => %c, + "error" => %error, + ); + try_sample!(metrics.poll_error(comms_error_str(error))); + continue; + } + }; + if details.entries.is_empty() { slog::warn!( &self.log, - "failed to read details on SP component"; + "a component which claimed to have measurement channels \ + had empty details. this seems weird..."; "sp_component" => %c, - "error" => %error, ); - try_sample!(metrics.poll_error(comms_error_str(error))); + try_sample!(metrics.poll_error("no_measurement_channels")); continue; } - }; - if details.entries.is_empty() { - slog::warn!( - &self.log, - "a component which claimed to have measurement channels \ - had empty details. this seems weird..."; - "sp_component" => %c, - ); - try_sample!(metrics.poll_error("no_measurement_channels")); - continue; - } - let ComponentMetrics { sensor_errors, target, .. } = metrics; - for d in details.entries { - let ComponentDetails::Measurement(m) = d else { - // If the component details are switch port details rather - // than measurement channels, ignore it for now. 
- continue; - }; - let sensor = Cow::Owned(m.name); - let sample = match (m.value, m.kind) { - (Ok(datum), MeasurementKind::Temperature) => Sample::new( - target, - &metric::Temperature { sensor, datum }, - ), - (Ok(datum), MeasurementKind::Current) => { - Sample::new(target, &metric::Current { sensor, datum }) - } - (Ok(datum), MeasurementKind::Voltage) => { - Sample::new(target, &metric::Voltage { sensor, datum }) - } - (Ok(datum), MeasurementKind::Power) => { - Sample::new(target, &metric::Power { sensor, datum }) - } - (Ok(datum), MeasurementKind::InputCurrent) => Sample::new( - target, - &metric::InputCurrent { sensor, datum }, - ), - (Ok(datum), MeasurementKind::InputVoltage) => Sample::new( - target, - &metric::InputVoltage { sensor, datum }, - ), - (Ok(datum), MeasurementKind::Speed) => { - Sample::new(target, &metric::FanSpeed { sensor, datum }) - } - (Err(e), kind) => { - let kind = match kind { - MeasurementKind::Temperature => "temperature", - MeasurementKind::Current => "current", - MeasurementKind::Voltage => "voltage", - MeasurementKind::Power => "power", - MeasurementKind::InputCurrent => "input_current", - MeasurementKind::InputVoltage => "input_voltage", - MeasurementKind::Speed => "fan_speed", - }; - let error = match e { - MeasurementError::InvalidSensor => "invalid_sensor", - MeasurementError::NoReading => "no_reading", - MeasurementError::NotPresent => "not_present", - MeasurementError::DeviceError => "device_error", - MeasurementError::DeviceUnavailable => { - "device_unavailable" - } - MeasurementError::DeviceTimeout => "device_timeout", - MeasurementError::DeviceOff => "device_off", - }; - let datum = sensor_errors - .entry(SensorErrorKey { - name: sensor.clone(), - kind, - error, - }) - .or_insert(Cumulative::new(0)); - // TODO(eliza): perhaps we should treat this as - // "level-triggered" and only increment the counter - // when the sensor has *changed* to an errored - // state after we have seen at least one good - // measurement from it 
since the last time the error - // was observed? - datum.increment(); - Sample::new( + let ComponentMetrics { sensor_errors, target, .. } = metrics; + for d in details.entries { + let ComponentDetails::Measurement(m) = d else { + // If the component details are switch port details rather + // than measurement channels, ignore it for now. + continue; + }; + let sensor = Cow::Owned(m.name); + let sample = match (m.value, m.kind) { + (Ok(datum), MeasurementKind::Temperature) => { + Sample::new( + target, + &metric::Temperature { sensor, datum }, + ) + } + (Ok(datum), MeasurementKind::Current) => Sample::new( target, - &metric::SensorErrorCount { - error: Cow::Borrowed(error), - sensor, - datum: *datum, - sensor_kind: Cow::Borrowed(kind), - }, - ) - } - }; - try_sample!(sample); + &metric::Current { sensor, datum }, + ), + (Ok(datum), MeasurementKind::Voltage) => Sample::new( + target, + &metric::Voltage { sensor, datum }, + ), + (Ok(datum), MeasurementKind::Power) => Sample::new( + target, + &metric::Power { sensor, datum }, + ), + (Ok(datum), MeasurementKind::InputCurrent) => { + Sample::new( + target, + &metric::InputCurrent { sensor, datum }, + ) + } + (Ok(datum), MeasurementKind::InputVoltage) => { + Sample::new( + target, + &metric::InputVoltage { sensor, datum }, + ) + } + (Ok(datum), MeasurementKind::Speed) => Sample::new( + target, + &metric::FanSpeed { sensor, datum }, + ), + (Err(e), kind) => { + let kind = match kind { + MeasurementKind::Temperature => "temperature", + MeasurementKind::Current => "current", + MeasurementKind::Voltage => "voltage", + MeasurementKind::Power => "power", + MeasurementKind::InputCurrent => { + "input_current" + } + MeasurementKind::InputVoltage => { + "input_voltage" + } + MeasurementKind::Speed => "fan_speed", + }; + let error = match e { + MeasurementError::InvalidSensor => { + "invalid_sensor" + } + MeasurementError::NoReading => "no_reading", + MeasurementError::NotPresent => "not_present", + MeasurementError::DeviceError => 
"device_error", + MeasurementError::DeviceUnavailable => { + "device_unavailable" + } + MeasurementError::DeviceTimeout => { + "device_timeout" + } + MeasurementError::DeviceOff => "device_off", + }; + let datum = sensor_errors + .entry(SensorErrorKey { + name: sensor.clone(), + kind, + error, + }) + .or_insert(Cumulative::new(0)); + // TODO(eliza): perhaps we should treat this as + // "level-triggered" and only increment the counter + // when the sensor has *changed* to an errored + // state after we have seen at least one good + // measurement from it since the last time the error + // was observed? + datum.increment(); + Sample::new( + target, + &metric::SensorErrorCount { + error: Cow::Borrowed(error), + sensor, + datum: *datum, + sensor_kind: Cow::Borrowed(kind), + }, + ) + } + }; + try_sample!(sample); + } } + + // Now, fetch the SP's state *again*. It is possible that, while we + // were scraping the SP's samples, the SP's identity changed in some + // way: perhaps its version was updated during the poll, or it + // was removed from the rack and replaced with an entirely different + // chassis! If that's the case, some of the samples we collected may + // have a metrics target describing the wrong thing (e.g. they could + // still have the previous firmware's `hubris_archive_id`, if the SP + // was updated). In that case, we need to throw away the samples we + // collected and try again, potentially rebuilding our understanding + // of the SP's inventory. + let state = SpUnderstanding::from(sp.state().await?); + if state == current_state { + // All good, the SP is still who we thought it was! We can + // "commit" this batch of samples + return Ok(samples); + } + + slog::info!( + &self.log, + "SP's state changed mid-poll! discarding current samples and \ + starting over!"; + "new_state" => ?state, + "current_state" => ?current_state, + ); + // Let's reuse the buffer we already have for the next batch of + // samples. 
+ samples.clear(); + //...and try again with the new state. + current_state = state; } - Ok(samples) } } +/// The fields of the `gateway_messages` `VersionedSpState` and +/// `SpStateV1`/`SpStateV2`/`SpStateV3` that we actually care about for purposes +/// of determining whether our understanding of the SP's components are still +/// valid. +/// +/// In particular, we throw out the RoT state and the SP's power state, because +/// those changing won't actually invalidate our understanding of the SP's +/// components. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +struct SpUnderstanding { + hubris_archive_id: [u8; 8], + serial_number: [u8; 32], + model: [u8; 32], + revision: u32, +} + +impl From for SpUnderstanding { + fn from(v: VersionedSpState) -> Self { + match v { + VersionedSpState::V1(gateway_messages::SpStateV1 { + hubris_archive_id, + serial_number, + model, + revision, + .. + }) => Self { hubris_archive_id, serial_number, model, revision }, + VersionedSpState::V2(gateway_messages::SpStateV2 { + hubris_archive_id, + serial_number, + model, + revision, + .. + }) => Self { hubris_archive_id, serial_number, model, revision }, + VersionedSpState::V3(gateway_messages::SpStateV3 { + hubris_archive_id, + serial_number, + model, + revision, + .. + }) => Self { hubris_archive_id, serial_number, model, revision }, + } + } +} + +// Reimplement this ourselves because we don't really care about +// reading the RoT state at present. This is unfortunately copied +// from `gateway_messages`. +fn stringify_byte_string(bytes: &[u8]) -> String { + // We expect serial and model numbers to be ASCII and 0-padded: find the first 0 + // byte and convert to a string. If that fails, hexlify the entire slice. 
+ let first_zero = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len()); + + std::str::from_utf8(&bytes[..first_zero]) + .map(|s| s.to_string()) + .unwrap_or_else(|_err| hex::encode(bytes)) +} + impl ServerManager { async fn run(mut self) -> anyhow::Result<()> { let (registration_address, bind_loopback) = From c11992a3da645b80940b738e59338e826f32123d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 21 Aug 2024 14:54:29 -0700 Subject: [PATCH 48/77] add a bit more config validation --- gateway/src/metrics.rs | 101 ++++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 37 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index c4b097263b..a0a86e2c67 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -83,6 +83,15 @@ pub struct DevConfig { pub bind_loopback: bool, } +struct ValidatedMetricsConfig { + sp_poll_interval: Duration, + oximeter_collection_interval: Duration, + /// Capacity for the channel of samples from poller tasks to the Oximeter + /// producer. + max_buffered_sample_chunks: usize, + dev_config: Option, +} + /// Manages SP pollers, making sure that every SP has a poller task. struct PollerManager { log: slog::Logger, @@ -128,7 +137,6 @@ struct ServerManager { log: slog::Logger, addrs: watch::Receiver>, registry: ProducerRegistry, - cfg: MetricsConfig, } #[derive(Debug)] @@ -171,6 +179,7 @@ impl Metrics { apictx: Arc, ) -> anyhow::Result { let &MgsArguments { id, rack_id, ref addresses } = args; + let cfg = cfg.validate()?; // Create a channel for the SP poller tasks to send samples to the // Oximeter producer endpoint. @@ -183,9 +192,8 @@ impl Metrics { // is what we want, as we would prefer a full buffer to result in // clobbering the oldest measurements, rather than leaving the newest // ones on the floor. 
- let max_buffered_sample_chunks = cfg.sample_channel_capacity(); let (sample_tx, sample_rx) = - broadcast::channel(max_buffered_sample_chunks); + broadcast::channel(cfg.max_buffered_sample_chunks); // Using a channel for this is, admittedly, a bit of an end-run around // the `OnceLock` on the `ServerContext` that *also* stores the rack ID, @@ -204,20 +212,18 @@ impl Metrics { }; let pollers = { let log = log.new(slog::o!("component" => "sensor-poller")); - let poll_interval = - Duration::from_millis(cfg.sp_poll_interval_ms as u64); slog::info!( &log, "SP sensor metrics configured"; - "poll_interval" => ?poll_interval, - "max_buffered_sample_chunks" => max_buffered_sample_chunks, + "poll_interval" => ?cfg.sp_poll_interval, + "max_buffered_sample_chunks" => cfg.max_buffered_sample_chunks, ); tokio::spawn( PollerManager { sample_tx, apictx, - poll_interval, + poll_interval: cfg.sp_poll_interval, tasks: tokio::task::JoinSet::new(), log, mgs_id: id, @@ -236,7 +242,7 @@ impl Metrics { .context("failed to register metrics producer")?; tokio::spawn( - ServerManager { log, addrs: addrs_rx, registry, cfg }.run(), + ServerManager { log, addrs: addrs_rx, registry }.run(cfg), ) }; Ok(Self { addrs_tx, rack_id_tx, server, pollers }) @@ -279,34 +285,55 @@ impl Drop for Metrics { } impl MetricsConfig { - fn oximeter_collection_interval(&self) -> Duration { - Duration::from_secs(self.oximeter_collection_interval_secs as u64) - } + fn validate(self) -> anyhow::Result { + anyhow::ensure!( + self.oximeter_collection_interval_secs > 0, + "`metrics.oximeter_collection_interval_secs` probably shouldn't \ + be 0 seconds", + ); + let oximeter_collection_interval = + Duration::from_secs(self.oximeter_collection_interval_secs as u64); - /// Returns the number of sample chunks from individual SPs to buffer. - fn sample_channel_capacity(&self) -> usize { - // Roughly how many times will we poll SPs for each metrics collection - // interval? 
- let polls_per_metrics_interval = { - let collection_interval_ms: usize = self - .oximeter_collection_interval() - .as_millis() - .try_into() - .expect("your oximeter collection interval is way too big..."); - collection_interval_ms / self.sp_poll_interval_ms + anyhow::ensure!( + self.sp_poll_interval_ms > 0, + "`metrics.sp_poll_interval_ms` probably shouldn't be 0 ms", + ); + let sp_poll_interval = + Duration::from_secs(self.sp_poll_interval_ms as u64); + + let max_buffered_sample_chunks = { + // Roughly how many times will we poll SPs for each metrics collection + // interval? + let polls_per_metrics_interval = { + let collection_interval_ms: usize = oximeter_collection_interval + .as_millis() + .try_into() + .with_context(|| format!( + "configured Oximeter collection interval ({:?}) is way too big...", + oximeter_collection_interval, + ))?; + collection_interval_ms / self.sp_poll_interval_ms + }; + + // How many sample collection intervals do we want to allow to elapse before + // we start putting stuff on the floor? + // + // Let's say 16. Chosen totally arbitrarily but seems reasonable-ish. + let sloppiness = 16; + let capacity = + NORMAL_NUMBER_OF_SPS * polls_per_metrics_interval * sloppiness; + // Finally, the buffer capacity will probably be allocated in a power of two + // anyway, so let's make sure our thing is a power of two so we don't waste + // the allocation we're gonna get anyway. + capacity.next_power_of_two() }; - // How many sample collection intervals do we want to allow to elapse before - // we start putting stuff on the floor? - // - // Let's say 16. Chosen totally arbitrarily but seems reasonable-ish. - let sloppiness = 16; - let capacity = - NORMAL_NUMBER_OF_SPS * polls_per_metrics_interval * sloppiness; - // Finally, the buffer capacity will probably be allocated in a power of two - // anyway, so let's make sure our thing is a power of two so we don't waste - // the allocation we're gonna get anyway. 
- capacity.next_power_of_two() + Ok(ValidatedMetricsConfig { + oximeter_collection_interval, + sp_poll_interval, + max_buffered_sample_chunks, + dev_config: self.dev, + }) } } @@ -1003,9 +1030,9 @@ fn stringify_byte_string(bytes: &[u8]) -> String { } impl ServerManager { - async fn run(mut self) -> anyhow::Result<()> { + async fn run(mut self, cfg: ValidatedMetricsConfig) -> anyhow::Result<()> { let (registration_address, bind_loopback) = - if let Some(ref dev) = self.cfg.dev { + if let Some(ref dev) = cfg.dev_config { slog::warn!( &self.log, "using development metrics configuration overrides!"; @@ -1016,8 +1043,8 @@ impl ServerManager { } else { (None, false) }; - let interval = self.cfg.oximeter_collection_interval(); let id = self.registry.producer_id(); + let interval = cfg.oximeter_collection_interval; let mut current_server: Option = None; loop { From 830bb209d35db033aa52fdcedd1e3b3d3715dfea Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 21 Aug 2024 15:19:47 -0700 Subject: [PATCH 49/77] way less complex poller manager --- gateway/src/metrics.rs | 134 ++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 75 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index a0a86e2c67..9ef7ebfba3 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -414,34 +414,31 @@ impl PollerManager { "rack ID sender has gone away...we must be shutting down", )?; - let mut known_sps: HashMap = - HashMap::with_capacity(NORMAL_NUMBER_OF_SPS); // Wait for SP discovery to complete, if it hasn't already. // TODO(eliza): presently, we busy-poll here. 
It would be nicer to // replace the `OnceLock` in `ManagementSwitch` // with a `tokio::sync::watch` - backoff::retry_notify_ext( + let sps = backoff::retry_notify_ext( backoff::retry_policy_local(), || async { - if switch.is_discovery_complete() { - Ok(()) - } else { - Err(backoff::BackoffError::transient(())) - } + switch.all_sps().map_err(backoff::BackoffError::transient) }, - |_, _, elapsed| { + |err, _, elapsed| { let secs = elapsed.as_secs(); if secs < 30 { slog::debug!( &self.log, "waiting for SP discovery to complete..."; "elapsed" => ?elapsed, + "error" => err, ); } else if secs < 180 { slog::info!( &self.log, "still waiting for SP discovery to complete..."; "elapsed" => ?elapsed, + + "error" => err, ) } else { slog::warn!( @@ -449,6 +446,7 @@ impl PollerManager { "we have been waiting for SP discovery to complete \ for a pretty long time!"; "elapsed" => ?elapsed, + "error" => err, ) } }, @@ -461,73 +459,24 @@ impl PollerManager { "starting to polling SP sensor data every {:?}", self.poll_interval; ); - loop { - let sps = backoff::retry_notify_ext( - backoff::retry_policy_internal_service(), - || async { - switch.all_sps().map_err(backoff::BackoffError::transient) - }, - |error, attempts, elapsed| { - slog::warn!( - &self.log, - "failed to list SPs! we'll try again in a little bit."; - "error" => error, - "elapsed" => ?elapsed, - "attempts" => attempts, - ) - }, - ) - .await - .expect("we never return a permanent error here"); - - for (spid, _) in sps { - // Do we know about this li'l guy already? - match known_sps.get(&spid) { - // Okay, and has it got someone checking up on it? Right? - Some(poller) if poller.is_finished() => { - // Welp. - slog::info!( - &self.log, - "uh-oh! a known SP's poller task has gone AWOL. 
restarting it..."; - "sp_slot" => ?spid.slot, - "chassis_type" => ?spid.typ, - ); - } - Some(_) => continue, - None => { - slog::info!( - &self.log, - "found a new little friend!"; - "sp_slot" => ?spid.slot, - "chassis_type" => ?spid.typ, - ); - } - } - - let poller = SpPoller { - spid, - rack_id, - mgs_id: self.mgs_id, - log: self.log.new(slog::o!( - "sp_slot" => spid.slot, - "chassis_type" => format!("{:?}", spid.typ), - )), - components: HashMap::new(), - known_state: None, - sample_tx: self.sample_tx.clone(), - }; - let poller_handle = self - .tasks - .spawn(poller.run(self.poll_interval, self.apictx.clone())); - let _prev_poller = known_sps.insert(spid, poller_handle); - debug_assert!( - _prev_poller.map(|p| p.is_finished()).unwrap_or(true), - "if we clobbered an existing poller task, it better have \ - been because it was dead..." - ); - } + let mut known_sps = HashMap::with_capacity(NORMAL_NUMBER_OF_SPS); + for (spid, _) in sps { + slog::info!( + &self.log, + "found a new little friend!"; + "sp_slot" => ?spid.slot, + "chassis_type" => ?spid.typ, + ); + let task = self.tasks.spawn(self.make_poller(spid, rack_id)); + let _prev = known_sps.insert(spid, task); + debug_assert!( + _prev.is_none(), + "we should not have clobbered an existing SP poller!" + ); + } - // All pollers started! Now wait to see if any of them have died... + // All pollers started! Now wait to see if any of them have died... + loop { let mut joined = self.tasks.join_next().await; while let Some(result) = joined { if let Err(e) = result { @@ -550,8 +499,43 @@ impl PollerManager { // drain any remaining errors joined = self.tasks.try_join_next(); } + + for (spid, poller) in &mut known_sps { + // There's still someone checking up on this SP, right? + if poller.is_finished() { + // Welp. + slog::info!( + &self.log, + "uh-oh! a known SP's poller task has gone AWOL. 
restarting it..."; + "sp_slot" => ?spid.slot, + "chassis_type" => ?spid.typ, + ); + *poller = + self.tasks.spawn(self.make_poller(*spid, rack_id)); + } + } } } + + fn make_poller( + &self, + spid: SpIdentifier, + rack_id: Uuid, + ) -> impl std::future::Future> { + let poller = SpPoller { + spid, + rack_id, + mgs_id: self.mgs_id, + log: self.log.new(slog::o!( + "sp_slot" => spid.slot, + "chassis_type" => format!("{:?}", spid.typ), + )), + components: HashMap::new(), + known_state: None, + sample_tx: self.sample_tx.clone(), + }; + poller.run(self.poll_interval, self.apictx.clone()) + } } impl Drop for PollerManager { From 23ab449a730d5eb280b110881e4549a90a6f6ff4 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 21 Aug 2024 16:49:27 -0700 Subject: [PATCH 50/77] add a simple test that metrics make it to oximeter --- gateway-test-utils/src/setup.rs | 10 ++- nexus/tests/integration_tests/metrics.rs | 96 +++++++++++++++++++++++- 2 files changed, 102 insertions(+), 4 deletions(-) diff --git a/gateway-test-utils/src/setup.rs b/gateway-test-utils/src/setup.rs index 46bc55805a..6aff29d3de 100644 --- a/gateway-test-utils/src/setup.rs +++ b/gateway-test-utils/src/setup.rs @@ -8,6 +8,9 @@ use camino::Utf8Path; use dropshot::test_util::ClientTestContext; use dropshot::test_util::LogContext; use gateway_messages::SpPort; +pub use omicron_gateway::metrics::{ + DevConfig as MetricsDevConfig, MetricsConfig, +}; use omicron_gateway::MgsArguments; use omicron_gateway::SpType; use omicron_gateway::SwitchPortConfig; @@ -33,6 +36,7 @@ pub struct GatewayTestContext { pub server: omicron_gateway::Server, pub simrack: SimRack, pub logctx: LogContext, + pub gateway_id: Uuid, } impl GatewayTestContext { @@ -143,8 +147,8 @@ pub async fn test_setup_with_config( // Start gateway server let rack_id = Some(Uuid::parse_str(RACK_UUID).unwrap()); - - let args = MgsArguments { id: Uuid::new_v4(), addresses, rack_id }; + let gateway_id = Uuid::new_v4(); + let args = MgsArguments { id: gateway_id, 
addresses, rack_id }; let server = omicron_gateway::Server::start( server_config.clone(), args, @@ -206,5 +210,5 @@ pub async fn test_setup_with_config( log.new(o!("component" => "client test context")), ); - GatewayTestContext { client, server, simrack, logctx } + GatewayTestContext { client, server, simrack, logctx, gateway_id } } diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 3b808984ae..8b44d67048 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -23,8 +23,10 @@ use nexus_types::external_api::views::OxqlQueryResult; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use oximeter::types::Datum; +use oximeter::types::FieldValue; use oximeter::types::Measurement; use oximeter::TimeseriesSchema; +use std::borrow::Borrow; use uuid::Uuid; pub async fn query_for_metrics( @@ -344,7 +346,6 @@ async fn test_instance_watcher_metrics( ); }}; } - use oximeter::types::FieldValue; const INSTANCE_ID_FIELD: &str = "instance_id"; const STATE_FIELD: &str = "state"; const STATE_STARTING: &str = "starting"; @@ -589,6 +590,99 @@ async fn test_instance_watcher_metrics( assert_gte!(ts2_running, 2); } +#[nexus_test] +async fn test_mgs_metrics( + cptestctx: &ControlPlaneTestContext, +) { + // Make a MGS + let mgs = { + let (mut mgs_config, sp_sim_config) = + gateway_test_utils::setup::load_test_config(); + // munge the already-parsed MGS config file to point it at the test + // Nexus' address. + mgs_config.metrics.dev = + Some(gateway_test_utils::setup::MetricsDevConfig { + bind_loopback: true, + nexus_address: Some(cptestctx.internal_client.bind_address), + }); + gateway_test_utils::setup::test_setup_with_config( + "test_mgs_metrics", + gateway_messages::SpPort::One, + mgs_config, + &sp_sim_config, + None, + ) + .await + }; + + // Wait until the MGS registers as a producer with Oximeter. 
+ wait_for_producer(&cptestctx.oximeter, &mgs.gateway_id).await; + cptestctx.oximeter.force_collect().await; + + async fn get_timeseries( + cptestctx: &ControlPlaneTestContext, + name: &str, + ) -> oxql_types::Table { + let table = timeseries_query(&cptestctx, &format!("get {name}")) + .await + .into_iter() + .find(|t| t.name() == name); + match table { + Some(table) => table, + None => panic!("missing table for {name}"), + } + } + + #[track_caller] + fn check_all_serials_present(table: oxql_types::Table) { + let mut sim_gimlet_00 = 0; + let mut sim_gimlet_01 = 0; + for timeseries in table.timeseries() { + let fields = ×eries.fields; + let n_points = timeseries.points.len(); + eprintln!("found timeseries: {fields:?} ({n_points} points)"); + assert!(n_points > 0, "timeseries {fields:?} should have points"); + let serial_str = match timeseries.fields.get("chassis_serial") { + Some(FieldValue::String(s)) => s.borrow(), + Some(x) => panic!( + "`chassis_serial` field should be a string, but got: {x:?}" + ), + None => { + panic!("timeseries should have a `chassis_serial` field") + } + }; + match serial_str { + "SimGimlet00" => sim_gimlet_00 += 1, + "SimGimlet01" => sim_gimlet_01 += 1, + // if someone adds sensor readings to the fake sidecar later, + // that's okay... 
+ _ => eprintln!("bonus simulated chassis serial {serial_str:?}"), + } + } + + assert!( + sim_gimlet_00 > 0, + "expected at least one timeseries from SimGimlet00 in {table:#?}" + ); + assert!( + sim_gimlet_01 > 0, + "expected at least one timeseries from SimGimlet01 in {table:#?}" + ); + } + + let temp_metrics = + get_timeseries(&cptestctx, "hardware_component:temperature").await; + check_all_serials_present(temp_metrics); + + let voltage_metrics = + get_timeseries(&cptestctx, "hardware_component:voltage").await; + check_all_serials_present(voltage_metrics); + + let current_metrics = + get_timeseries(&cptestctx, "hardware_component:current").await; + check_all_serials_present(current_metrics); +} + /// Wait until a producer is registered with Oximeter. /// /// This blocks until the producer is registered, for up to 60s. It panics if From 3c766b3bec7870cc66ae41863df66ba76fd47e48 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 08:42:09 -0700 Subject: [PATCH 51/77] that was supposed to be milliseconds --- gateway/src/metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 9ef7ebfba3..0070e353d6 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -299,7 +299,7 @@ impl MetricsConfig { "`metrics.sp_poll_interval_ms` probably shouldn't be 0 ms", ); let sp_poll_interval = - Duration::from_secs(self.sp_poll_interval_ms as u64); + Duration::from_millis(self.sp_poll_interval_ms as u64); let max_buffered_sample_chunks = { // Roughly how many times will we poll SPs for each metrics collection From 8a6252ab979d79d5297bda9b2d1b786874359e5b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 11:03:19 -0700 Subject: [PATCH 52/77] SP poller tasks should never need to be restarted Per [this conversation][1] with @jgallagher, it turns out that the fairly complex code for restarting poller tasks is basically entirely unnecessary. 
The `ManagementSwitch::sp` function actually only returns an error if discovery hasn't run yet, or if the SP identifier is invalid. Because we wait to start poller tasks until discovery has run, and we use SP identifiers returned by the management switch, our call to that function should never fail, so we can make the poller tasks never return an error and not have to worry about restarting them. This makes the code a lot simpler. [1]: https://github.com/oxidecomputer/omicron/pull/6354#discussion_r1725727261 --- gateway/src/metrics.rs | 250 +++++++++++++++-------------------------- 1 file changed, 92 insertions(+), 158 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 0070e353d6..9a07a634eb 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -2,7 +2,6 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. use crate::error::CommunicationError; -use crate::error::SpCommsError; use crate::management_switch::SpIdentifier; use crate::management_switch::SpType; use crate::MgsArguments; @@ -44,7 +43,6 @@ pub struct Metrics { addrs_tx: watch::Sender>, rack_id_tx: Option>, server: JoinHandle>, - pollers: JoinHandle>, } /// Configuration for metrics. @@ -92,19 +90,6 @@ struct ValidatedMetricsConfig { dev_config: Option, } -/// Manages SP pollers, making sure that every SP has a poller task. -struct PollerManager { - log: slog::Logger, - apictx: Arc, - mgs_id: Uuid, - /// Poller tasks - tasks: tokio::task::JoinSet>, - /// The manager doesn't actually produce samples, but it needs to be able to - /// clone a sender for every poller task it spawns. - sample_tx: broadcast::Sender>, - poll_interval: Duration, -} - /// Polls sensor readings from an individual SP. 
struct SpPoller { spid: SpIdentifier, @@ -210,7 +195,8 @@ impl Metrics { } else { Some(rack_id_tx) }; - let pollers = { + + tokio::spawn({ let log = log.new(slog::o!("component" => "sensor-poller")); slog::info!( &log, @@ -219,18 +205,15 @@ impl Metrics { "max_buffered_sample_chunks" => cfg.max_buffered_sample_chunks, ); - tokio::spawn( - PollerManager { - sample_tx, - apictx, - poll_interval: cfg.sp_poll_interval, - tasks: tokio::task::JoinSet::new(), - log, - mgs_id: id, - } - .run(rack_id_rx), + start_pollers( + log, + apictx.clone(), + rack_id_rx, + id, + sample_tx, + cfg.sp_poll_interval, ) - }; + }); let (addrs_tx, addrs_rx) = tokio::sync::watch::channel(addresses.clone()); @@ -245,7 +228,7 @@ impl Metrics { ServerManager { log, addrs: addrs_rx, registry }.run(cfg), ) }; - Ok(Self { addrs_tx, rack_id_tx, server, pollers }) + Ok(Self { addrs_tx, rack_id_tx, server }) } pub fn set_rack_id(&mut self, rack_id: Uuid) { @@ -280,7 +263,6 @@ impl Drop for Metrics { fn drop(&mut self) { // Clean up our children on drop. self.server.abort(); - self.pollers.abort(); } } @@ -402,148 +384,88 @@ impl oximeter::Producer for Producer { } } -impl PollerManager { - async fn run( - mut self, - rack_id: oneshot::Receiver, - ) -> anyhow::Result<()> { - let switch = &self.apictx.mgmt_switch; - - // First, wait until we know what the rack ID is... - let rack_id = rack_id.await.context( - "rack ID sender has gone away...we must be shutting down", - )?; - - // Wait for SP discovery to complete, if it hasn't already. - // TODO(eliza): presently, we busy-poll here. 
It would be nicer to - // replace the `OnceLock` in `ManagementSwitch` - // with a `tokio::sync::watch` - let sps = backoff::retry_notify_ext( - backoff::retry_policy_local(), - || async { - switch.all_sps().map_err(backoff::BackoffError::transient) - }, - |err, _, elapsed| { - let secs = elapsed.as_secs(); - if secs < 30 { - slog::debug!( - &self.log, - "waiting for SP discovery to complete..."; - "elapsed" => ?elapsed, - "error" => err, - ); - } else if secs < 180 { - slog::info!( - &self.log, - "still waiting for SP discovery to complete..."; - "elapsed" => ?elapsed, +async fn start_pollers( + log: slog::Logger, + apictx: Arc, + rack_id: oneshot::Receiver, + mgs_id: Uuid, + sample_tx: broadcast::Sender>, + poll_interval: Duration, +) -> anyhow::Result<()> { + let switch = &apictx.mgmt_switch; - "error" => err, - ) - } else { - slog::warn!( - &self.log, - "we have been waiting for SP discovery to complete \ - for a pretty long time!"; - "elapsed" => ?elapsed, - "error" => err, - ) - } - }, - ) + // First, wait until we know what the rack ID is known... + let rack_id = rack_id .await - .expect("we should never return a fatal error here"); - - slog::info!( - &self.log, - "starting to polling SP sensor data every {:?}", self.poll_interval; - ); - - let mut known_sps = HashMap::with_capacity(NORMAL_NUMBER_OF_SPS); - for (spid, _) in sps { - slog::info!( - &self.log, - "found a new little friend!"; - "sp_slot" => ?spid.slot, - "chassis_type" => ?spid.typ, - ); - let task = self.tasks.spawn(self.make_poller(spid, rack_id)); - let _prev = known_sps.insert(spid, task); - debug_assert!( - _prev.is_none(), - "we should not have clobbered an existing SP poller!" - ); - } + .context("rack ID sender has gone away...we must be shutting down")?; + + // Wait for SP discovery to complete, if it hasn't already. + // TODO(eliza): presently, we busy-poll here. 
It would be nicer to + // replace the `OnceLock` in `ManagementSwitch` + // with a `tokio::sync::watch` + let sps = backoff::retry_notify_ext( + backoff::retry_policy_local(), + || async { switch.all_sps().map_err(backoff::BackoffError::transient) }, + |err, _, elapsed| { + let secs = elapsed.as_secs(); + if secs < 30 { + slog::debug!( + &log, + "waiting for SP discovery to complete..."; + "elapsed" => ?elapsed, + "error" => err, + ); + } else if secs < 180 { + slog::info!( + &log, + "still waiting for SP discovery to complete..."; + "elapsed" => ?elapsed, - // All pollers started! Now wait to see if any of them have died... - loop { - let mut joined = self.tasks.join_next().await; - while let Some(result) = joined { - if let Err(e) = result { - if cfg!(debug_assertions) { - unreachable!( - "we compile with `panic=\"abort\"`, so a spawned task \ - panicking should abort the whole process..." - ); - } else { - slog::error!( - &self.log, - "a spawned SP poller task panicked! this should \ - never happen: we compile with `panic=\"abort\"`, so \ - a spawned task panicking should abort the whole \ - process..."; - "error" => %e, - ); - } - } - // drain any remaining errors - joined = self.tasks.try_join_next(); + "error" => err, + ) + } else { + slog::warn!( + &log, + "we have been waiting for SP discovery to complete \ + for a pretty long time!"; + "elapsed" => ?elapsed, + "error" => err, + ) } + }, + ) + .await + .expect("we should never return a fatal error here"); - for (spid, poller) in &mut known_sps { - // There's still someone checking up on this SP, right? - if poller.is_finished() { - // Welp. - slog::info!( - &self.log, - "uh-oh! a known SP's poller task has gone AWOL. 
restarting it..."; - "sp_slot" => ?spid.slot, - "chassis_type" => ?spid.typ, - ); - *poller = - self.tasks.spawn(self.make_poller(*spid, rack_id)); - } - } - } - } + slog::info!( + &log, + "starting to polling SP sensor data every {poll_interval:?}"; + ); + + for (spid, _) in sps { + slog::info!( + &log, + "found a new little friend!"; + "sp_slot" => ?spid.slot, + "chassis_type" => ?spid.typ, + ); - fn make_poller( - &self, - spid: SpIdentifier, - rack_id: Uuid, - ) -> impl std::future::Future> { let poller = SpPoller { spid, rack_id, - mgs_id: self.mgs_id, - log: self.log.new(slog::o!( + mgs_id, + log: log.new(slog::o!( "sp_slot" => spid.slot, "chassis_type" => format!("{:?}", spid.typ), )), components: HashMap::new(), known_state: None, - sample_tx: self.sample_tx.clone(), + sample_tx: sample_tx.clone(), }; - poller.run(self.poll_interval, self.apictx.clone()) + tokio::spawn(poller.run(poll_interval, apictx.clone())); } -} -impl Drop for PollerManager { - fn drop(&mut self) { - // This is why the `JoinSet` is a field on the `PollerManager` struct - // rather than a local variable in `async fn run()`! - self.tasks.abort_all(); - } + Ok(()) } impl SpPoller { @@ -551,10 +473,22 @@ impl SpPoller { mut self, poll_interval: Duration, apictx: Arc, - ) -> Result<(), SpCommsError> { + ) { let mut interval = tokio::time::interval(poll_interval); let switch = &apictx.mgmt_switch; - let sp = switch.sp(self.spid)?; + let sp = match switch.sp(self.spid) { + Ok(sp) => sp, + Err(e) => { + unreachable!( + "the `SpPoller::run` function is only called after \ + discovery completes successfully, and the `SpIdentifier` \ + used was returned by the management switch, so it \ + should be valid. however, we saw a {e:?} error when \ + looking up {:?}", + self.spid + ); + } + }; loop { interval.tick().await; slog::trace!(&self.log, "interval elapsed, polling SP..."); @@ -581,7 +515,7 @@ impl SpPoller { "all sample receiver handles have been dropped! 
\ presumably we are shutting down..."; ); - return Ok(()); + return; } } // No SP is currently present for this ID. This may change in @@ -615,7 +549,7 @@ impl SpPoller { "SP address watch has been closed, presumably \ we are shutting down"; ); - return Ok(()); + return; } } } From 7aa3f77fd3481d046c082e754301bea35eaa52cc Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 11:18:28 -0700 Subject: [PATCH 53/77] schema grammar/wording suggestions from @bnaeker --- .../oximeter/schema/sensor-measurement.toml | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/sensor-measurement.toml index 43cc91244c..3209723f69 100644 --- a/oximeter/oximeter/schema/sensor-measurement.toml +++ b/oximeter/oximeter/schema/sensor-measurement.toml @@ -56,9 +56,10 @@ ID of the Management Gateway Service process which recorded the measurement.""" [fields.chassis_kind] type = "string" description = """ -What kind of thing the component resides on. This will be one of 'sled'\ -for components on compute sled; 'switch', for components on rack switches; \ -or 'power', for components on power shelves.""" +What kind of thing the component resides on. + +This will be one of 'sled', for components on compute sleds; 'switch', for \ +components on rack switches; or 'power', for components on power shelves.""" [fields.component_id] type = "string" @@ -79,24 +80,24 @@ sensor's location).""" [fields.sensor] type = "string" -description = """ -A string identifying the name of a sensor that recorded a sensor reading.""" +description = """The name of a sensor that recorded a sensor reading.""" [fields.error] type = "string" -description = "A string identifying the type of sensor error that occurred" +description = "The kind of sensor error that occurred" [fields.sensor_kind] type = "string" description = """ -A string identifying which sensor could not be read. 
This will be one of \ -'temperature', 'current', 'power', 'voltage', 'input_current', \ -'input_voltage', or 'fan_speed' (the same names as the metrics emitted by \ -these sensors when they are read successfully).""" +Which kind of sensor could not be read due to a sensor error. + +This will be one of 'temperature', 'current', 'power', 'voltage', \ +'input_current', 'input_voltage', or 'fan_speed' (the same names as \ +the metrics emitted by these sensors when they are read successfully).""" [[metrics]] name = "temperature" -description = "Temperature reading in degrees Celcius" +description = "A temperature reading from a hardware component." units = "degrees_celcius" datum_type = "f32" versions = [ @@ -170,7 +171,11 @@ versions = [ [[metrics]] name = "poll_error_count" description = """ -Cumulative count of errors encountered whilst polling a component's sensors.""" +Cumulative count of errors encountered whilst polling a component's sensors. + +Unlike the `sensor_error_count` metric, this counts errors encountered by \ +the management gateway while polling the component, rather than errors \ +reported by the component itself.""" units = "count" datum_type = "cumulative_u64" versions = [ From 90f950a5621fcb98d9f802766ebf13fe082d28da Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 11:20:32 -0700 Subject: [PATCH 54/77] comment edits --- gateway/src/metrics.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 9a07a634eb..42e0fb8c55 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -51,7 +51,7 @@ pub struct Metrics { pub struct MetricsConfig { /// Collection interval to request from Oximeter, in seconds. 
/// - /// This is the frequency with which Oximeter will collect samples the + /// This is the frequency with which Oximeter will collect samples from the /// metrics producer endpoint, *not* the frequency with which sensor /// measurements are polled from service processors. oximeter_collection_interval_secs: usize, @@ -420,7 +420,6 @@ async fn start_pollers( &log, "still waiting for SP discovery to complete..."; "elapsed" => ?elapsed, - "error" => err, ) } else { @@ -439,7 +438,7 @@ async fn start_pollers( slog::info!( &log, - "starting to polling SP sensor data every {poll_interval:?}"; + "starting to poll SP sensor data every {poll_interval:?}" ); for (spid, _) in sps { @@ -706,7 +705,7 @@ impl SpPoller { samples.reserve(self.components.len()); for (c, metrics) in &mut self.components { // Metrics samples *should* always be well-formed. If we ever emit a - // messed up one, this is a programmer error, and therefore should + // messed up one, this is a programmer error, and therefore should // fail in test, but should probably *not* take down the whole // management gateway in a real-life rack, especially because it's // probably going to happen again if we were to get restarted. From 6b457329db04aa738ae7b163f2465d176673cee2 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 11:22:40 -0700 Subject: [PATCH 55/77] i before e, except after c --- gateway/src/metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 42e0fb8c55..27fff189bd 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -332,7 +332,7 @@ impl oximeter::Producer for Producer { // the ringbuffer, so it won't see any samples produced *before* now. // Which is the opposite of what we want! 
let mut samples = Vec::with_capacity(self.sample_rx.len()); - // Because we recieve the individual samples in a `Vec` of all samples + // Because we receive the individual samples in a `Vec` of all samples // produced by a poller, let's also sum the length of each of those // `Vec`s here, so we can log it later. let mut total_samples = 0; From 0d33578762625fe25d2d8193681d0858a347d23c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 11:26:38 -0700 Subject: [PATCH 56/77] rename schema file to match the name of the target It seems like we might, conceivably, want to add other metrics with this target that are not, strictly speaking, sensor measurements --- like per-component error counts. Also, it seems like most of the Oximeter schema files have the same name as the target. --- gateway/src/metrics.rs | 2 +- .../schema/{sensor-measurement.toml => hardware-component.toml} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename oximeter/oximeter/schema/{sensor-measurement.toml => hardware-component.toml} (100%) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 27fff189bd..c81a4c7b43 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -35,7 +35,7 @@ use tokio::sync::watch; use tokio::task::JoinHandle; use uuid::Uuid; -oximeter::use_timeseries!("sensor-measurement.toml"); +oximeter::use_timeseries!("hardware-component.toml"); use hardware_component as metric; /// Handle to the metrics tasks. 
diff --git a/oximeter/oximeter/schema/sensor-measurement.toml b/oximeter/oximeter/schema/hardware-component.toml similarity index 100% rename from oximeter/oximeter/schema/sensor-measurement.toml rename to oximeter/oximeter/schema/hardware-component.toml From 084d4659a2e0d05e05a15e662e826b08d622e943 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 12:06:50 -0700 Subject: [PATCH 57/77] record missing samples on sensor errors --- gateway/src/metrics.rs | 154 +++++++++++++++++++++++++++-------------- 1 file changed, 103 insertions(+), 51 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index c81a4c7b43..40ff6b978d 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -755,6 +755,7 @@ impl SpPoller { try_sample!(metrics.poll_error("no_measurement_channels")); continue; } + let ComponentMetrics { sensor_errors, target, .. } = metrics; for d in details.entries { let ComponentDetails::Measurement(m) = d else { @@ -762,7 +763,68 @@ impl SpPoller { // than measurement channels, ignore it for now. continue; }; - let sensor = Cow::Owned(m.name); + let sensor: Cow<'static, str> = Cow::Owned(m.name); + + // First, if there's a measurement error, increment the + // error count metric. We will synthesize a missing sample + // for the sensor's metric as well, after we produce the + // measurement error sample. + // + // We do this first so that we only have to clone the + // sensor's name if there's an error, rather than always + // cloning it in *case* there's an error. 
+ if let Err(error) = m.value { + let kind = match m.kind { + MeasurementKind::Temperature => "temperature", + MeasurementKind::Current => "current", + MeasurementKind::Voltage => "voltage", + MeasurementKind::Power => "power", + MeasurementKind::InputCurrent => "input_current", + MeasurementKind::InputVoltage => "input_voltage", + MeasurementKind::Speed => "fan_speed", + }; + let error = match error { + MeasurementError::InvalidSensor => "invalid_sensor", + MeasurementError::NoReading => "no_reading", + MeasurementError::NotPresent => "not_present", + MeasurementError::DeviceError => "device_error", + MeasurementError::DeviceUnavailable => { + "device_unavailable" + } + MeasurementError::DeviceTimeout => "device_timeout", + MeasurementError::DeviceOff => "device_off", + }; + let datum = sensor_errors + .entry(SensorErrorKey { + name: sensor.clone(), + kind, + error, + }) + .or_insert(Cumulative::new(0)); + // TODO(eliza): perhaps we should treat this as + // "level-triggered" and only increment the counter + // when the sensor has *changed* to an errored + // state after we have seen at least one good + // measurement from it since the last time the error + // was observed? + datum.increment(); + try_sample!(Sample::new( + target, + &metric::SensorErrorCount { + error: Cow::Borrowed(error), + sensor: sensor.clone(), + datum: *datum, + sensor_kind: Cow::Borrowed(kind), + }, + )); + } + + // I don't love this massive `match`, but because the + // `Sample::new_missing` constructor is a different function + // from `Sample::new`, we need separate branches for the + // error and not-error cases, rather than just doing + // something to produce a datum from both the `Ok` and + // `Error` cases... 
let sample = match (m.value, m.kind) { (Ok(datum), MeasurementKind::Temperature) => { Sample::new( @@ -770,85 +832,75 @@ impl SpPoller { &metric::Temperature { sensor, datum }, ) } + (Err(_), MeasurementKind::Temperature) => { + Sample::new_missing( + target, + &metric::Temperature { sensor, datum: 0.0 }, + ) + } (Ok(datum), MeasurementKind::Current) => Sample::new( target, &metric::Current { sensor, datum }, ), + (Err(_), MeasurementKind::Current) => { + Sample::new_missing( + target, + &metric::Current { sensor, datum: 0.0 }, + ) + } (Ok(datum), MeasurementKind::Voltage) => Sample::new( target, &metric::Voltage { sensor, datum }, ), + + (Err(_), MeasurementKind::Voltage) => { + Sample::new_missing( + target, + &metric::Voltage { sensor, datum: 0.0 }, + ) + } (Ok(datum), MeasurementKind::Power) => Sample::new( target, &metric::Power { sensor, datum }, ), + (Err(_), MeasurementKind::Power) => { + Sample::new_missing( + target, + &metric::Power { sensor, datum: 0.0 }, + ) + } (Ok(datum), MeasurementKind::InputCurrent) => { Sample::new( target, &metric::InputCurrent { sensor, datum }, ) } + (Err(_), MeasurementKind::InputCurrent) => { + Sample::new_missing( + target, + &metric::InputCurrent { sensor, datum: 0.0 }, + ) + } (Ok(datum), MeasurementKind::InputVoltage) => { Sample::new( target, &metric::InputVoltage { sensor, datum }, ) } + (Err(_), MeasurementKind::InputVoltage) => { + Sample::new_missing( + target, + &metric::InputVoltage { sensor, datum: 0.0 }, + ) + } (Ok(datum), MeasurementKind::Speed) => Sample::new( target, &metric::FanSpeed { sensor, datum }, ), - (Err(e), kind) => { - let kind = match kind { - MeasurementKind::Temperature => "temperature", - MeasurementKind::Current => "current", - MeasurementKind::Voltage => "voltage", - MeasurementKind::Power => "power", - MeasurementKind::InputCurrent => { - "input_current" - } - MeasurementKind::InputVoltage => { - "input_voltage" - } - MeasurementKind::Speed => "fan_speed", - }; - let error = match e { - 
MeasurementError::InvalidSensor => { - "invalid_sensor" - } - MeasurementError::NoReading => "no_reading", - MeasurementError::NotPresent => "not_present", - MeasurementError::DeviceError => "device_error", - MeasurementError::DeviceUnavailable => { - "device_unavailable" - } - MeasurementError::DeviceTimeout => { - "device_timeout" - } - MeasurementError::DeviceOff => "device_off", - }; - let datum = sensor_errors - .entry(SensorErrorKey { - name: sensor.clone(), - kind, - error, - }) - .or_insert(Cumulative::new(0)); - // TODO(eliza): perhaps we should treat this as - // "level-triggered" and only increment the counter - // when the sensor has *changed* to an errored - // state after we have seen at least one good - // measurement from it since the last time the error - // was observed? - datum.increment(); - Sample::new( + (Err(_), MeasurementKind::Speed) => { + Sample::new_missing( target, - &metric::SensorErrorCount { - error: Cow::Borrowed(error), - sensor, - datum: *datum, - sensor_kind: Cow::Borrowed(kind), - }, + &metric::FanSpeed { sensor, datum: 0.0 }, ) } }; From 817e25730883ba814c16b8c296ab914883371cdd Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 12:31:03 -0700 Subject: [PATCH 58/77] don't panic if the runtime is going away --- gateway/src/metrics.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 40ff6b978d..5d72a33614 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -233,9 +233,11 @@ impl Metrics { pub fn set_rack_id(&mut self, rack_id: Uuid) { if let Some(tx) = self.rack_id_tx.take() { - tx.send(rack_id).expect("why has the sensor-poller task gone away?") + // If the task that starts sensor pollers has gone away already, + // we're probably shutting down, and shouldn't panic. + let _ = tx.send(rack_id); } - // ignoring duplicate attempt to set the rack ID... + // Ignoring duplicate attempt to set the rack ID... 
} pub async fn update_server_addrs(&self, new_addrs: &[SocketAddrV6]) { From 3b897117f49001e0a2aa1aa962d8c0e257d47661 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 12:54:20 -0700 Subject: [PATCH 59/77] pretty-print toml parse errors when loading MGS config --- gateway-test-utils/src/setup.rs | 13 +++++++++---- gateway/src/config.rs | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/gateway-test-utils/src/setup.rs b/gateway-test-utils/src/setup.rs index 6aff29d3de..3a4e1e354a 100644 --- a/gateway-test-utils/src/setup.rs +++ b/gateway-test-utils/src/setup.rs @@ -52,13 +52,18 @@ pub fn load_test_config() -> (omicron_gateway::Config, sp_sim::Config) { let manifest_dir = Utf8Path::new(env!("CARGO_MANIFEST_DIR")); let server_config_file_path = manifest_dir.join("configs/config.test.toml"); let server_config = - omicron_gateway::Config::from_file(&server_config_file_path) - .expect("failed to load config.test.toml"); + match omicron_gateway::Config::from_file(&server_config_file_path) { + Ok(config) => config, + Err(e) => panic!("failed to load MGS config: {e}"), + }; let sp_sim_config_file_path = manifest_dir.join("configs/sp_sim_config.test.toml"); - let sp_sim_config = sp_sim::Config::from_file(&sp_sim_config_file_path) - .expect("failed to load sp_sim_config.test.toml"); + let sp_sim_config = + match sp_sim::Config::from_file(&sp_sim_config_file_path) { + Ok(config) => config, + Err(e) => panic!("failed to load SP simulator config: {e}"), + }; (server_config, sp_sim_config) } diff --git a/gateway/src/config.rs b/gateway/src/config.rs index ba9818ff2c..67ee480f34 100644 --- a/gateway/src/config.rs +++ b/gateway/src/config.rs @@ -50,13 +50,13 @@ pub struct PartialDropshotConfig { #[derive(Debug, Error, SlogInlineError)] pub enum LoadError { - #[error("error reading \"{path}\"")] + #[error("error reading \"{path}\": {err}")] Io { path: Utf8PathBuf, #[source] err: std::io::Error, }, - #[error("error parsing \"{path}\"")] + 
#[error("error parsing \"{path}\": {err}")] Parse { path: Utf8PathBuf, #[source] From ac86ecff57c31750d6e8532411cd321919eae656 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 12:55:38 -0700 Subject: [PATCH 60/77] just use serde's duration parser in the config file --- gateway-test-utils/configs/config.test.toml | 4 +- gateway/examples/config.toml | 4 +- gateway/src/metrics.rs | 120 ++++++++------------ smf/mgs-sim/config.toml | 4 +- 4 files changed, 54 insertions(+), 78 deletions(-) diff --git a/gateway-test-utils/configs/config.test.toml b/gateway-test-utils/configs/config.test.toml index 82ac60da7d..e6830ef9f2 100644 --- a/gateway-test-utils/configs/config.test.toml +++ b/gateway-test-utils/configs/config.test.toml @@ -93,9 +93,9 @@ location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } # [metrics] # Bryan wants to try polling SP sensors at 1Hz. -sp_poll_interval_ms = 1000 +sp_poll_interval = { secs = 10, nanos = 0 } # Tell Oximeter to collect our metrics every 10 seconds. -oximeter_collection_interval_secs = 10 +oximeter_collection_interval = { secs = 10, nanos = 0 } # Allow binding the metrics server on localhost. dev = { bind_loopback = true } diff --git a/gateway/examples/config.toml b/gateway/examples/config.toml index 71048ff487..c9f64fac11 100644 --- a/gateway/examples/config.toml +++ b/gateway/examples/config.toml @@ -76,9 +76,9 @@ location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } # [metrics] # Bryan wants to try polling SP sensors at 1Hz. -sp_poll_interval_ms = 1000 +sp_poll_interval = { secs = 10, nanos = 0 } # Tell Oximeter to collect our metrics every 10 seconds. 
-oximeter_collection_interval_secs = 10 +oximeter_collection_interval = { secs = 10, nanos = 0 } [log] # Show log messages of this level and more severe diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 5d72a33614..f5c072606d 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -54,11 +54,11 @@ pub struct MetricsConfig { /// This is the frequency with which Oximeter will collect samples from the /// metrics producer endpoint, *not* the frequency with which sensor /// measurements are polled from service processors. - oximeter_collection_interval_secs: usize, + oximeter_collection_interval: Duration, /// The interval at which service processors are polled for sensor readings, /// in milliseconds - sp_poll_interval_ms: usize, + sp_poll_interval: Duration, /// Configuration settings for testing and development use. pub dev: Option, @@ -81,15 +81,6 @@ pub struct DevConfig { pub bind_loopback: bool, } -struct ValidatedMetricsConfig { - sp_poll_interval: Duration, - oximeter_collection_interval: Duration, - /// Capacity for the channel of samples from poller tasks to the Oximeter - /// producer. - max_buffered_sample_chunks: usize, - dev_config: Option, -} - /// Polls sensor readings from an individual SP. 
struct SpPoller { spid: SpIdentifier, @@ -164,7 +155,45 @@ impl Metrics { apictx: Arc, ) -> anyhow::Result { let &MgsArguments { id, rack_id, ref addresses } = args; - let cfg = cfg.validate()?; + + anyhow::ensure!( + cfg.sp_poll_interval >= Duration::from_millis(50), + "configured SP poll interval ({:?}) should probably be at least 50ms", + cfg.sp_poll_interval, + ); + anyhow::ensure!( + cfg.oximeter_collection_interval >= cfg.sp_poll_interval, + "there is no sense in having an Oximeter collection interval that's \ + shorter than the SP poll interval, because there will be no new \ + samples ready (the SP poll interval is {:?}, but the Oximeter \ + collection interval is {:?})", + cfg.sp_poll_interval, + cfg.oximeter_collection_interval, + ); + + let max_buffered_sample_chunks = { + // Roughly how many times will we poll SPs for each metrics collection + // interval? + let polls_per_metrics_interval = { + let collection_interval_ms = + cfg.oximeter_collection_interval.as_millis(); + let poll_interval_ms = cfg.sp_poll_interval.as_millis(); + + (collection_interval_ms / poll_interval_ms) as usize + }; + + // How many sample collection intervals do we want to allow to elapse before + // we start putting stuff on the floor? + // + // Let's say 16. Chosen totally arbitrarily but seems reasonable-ish. + let sloppiness = 16; + let capacity = + NORMAL_NUMBER_OF_SPS * polls_per_metrics_interval * sloppiness; + // Finally, the buffer capacity will probably be allocated in a power of two + // anyway, so let's make sure our thing is a power of two so we don't waste + // the allocation we're gonna get anyway. + capacity.next_power_of_two() + }; // Create a channel for the SP poller tasks to send samples to the // Oximeter producer endpoint. @@ -178,7 +207,7 @@ impl Metrics { // clobbering the oldest measurements, rather than leaving the newest // ones on the floor. 
let (sample_tx, sample_rx) = - broadcast::channel(cfg.max_buffered_sample_chunks); + broadcast::channel(max_buffered_sample_chunks); // Using a channel for this is, admittedly, a bit of an end-run around // the `OnceLock` on the `ServerContext` that *also* stores the rack ID, @@ -202,7 +231,7 @@ impl Metrics { &log, "SP sensor metrics configured"; "poll_interval" => ?cfg.sp_poll_interval, - "max_buffered_sample_chunks" => cfg.max_buffered_sample_chunks, + "max_buffered_sample_chunks" => max_buffered_sample_chunks, ); start_pollers( @@ -268,59 +297,6 @@ impl Drop for Metrics { } } -impl MetricsConfig { - fn validate(self) -> anyhow::Result { - anyhow::ensure!( - self.oximeter_collection_interval_secs > 0, - "`metrics.oximeter_collection_interval_secs` probably shouldn't \ - be 0 seconds", - ); - let oximeter_collection_interval = - Duration::from_secs(self.oximeter_collection_interval_secs as u64); - - anyhow::ensure!( - self.sp_poll_interval_ms > 0, - "`metrics.sp_poll_interval_ms` probably shouldn't be 0 ms", - ); - let sp_poll_interval = - Duration::from_millis(self.sp_poll_interval_ms as u64); - - let max_buffered_sample_chunks = { - // Roughly how many times will we poll SPs for each metrics collection - // interval? - let polls_per_metrics_interval = { - let collection_interval_ms: usize = oximeter_collection_interval - .as_millis() - .try_into() - .with_context(|| format!( - "configured Oximeter collection interval ({:?}) is way too big...", - oximeter_collection_interval, - ))?; - collection_interval_ms / self.sp_poll_interval_ms - }; - - // How many sample collection intervals do we want to allow to elapse before - // we start putting stuff on the floor? - // - // Let's say 16. Chosen totally arbitrarily but seems reasonable-ish. 
- let sloppiness = 16; - let capacity = - NORMAL_NUMBER_OF_SPS * polls_per_metrics_interval * sloppiness; - // Finally, the buffer capacity will probably be allocated in a power of two - // anyway, so let's make sure our thing is a power of two so we don't waste - // the allocation we're gonna get anyway. - capacity.next_power_of_two() - }; - - Ok(ValidatedMetricsConfig { - oximeter_collection_interval, - sp_poll_interval, - max_buffered_sample_chunks, - dev_config: self.dev, - }) - } -} - impl oximeter::Producer for Producer { fn produce( &mut self, @@ -1001,16 +977,16 @@ fn stringify_byte_string(bytes: &[u8]) -> String { } impl ServerManager { - async fn run(mut self, cfg: ValidatedMetricsConfig) -> anyhow::Result<()> { + async fn run(mut self, cfg: MetricsConfig) -> anyhow::Result<()> { let (registration_address, bind_loopback) = - if let Some(ref dev) = cfg.dev_config { + if let Some(ref dev_config) = cfg.dev { slog::warn!( &self.log, "using development metrics configuration overrides!"; - "nexus_address" => ?dev.nexus_address, - "bind_loopback" => dev.bind_loopback, + "nexus_address" => ?dev_config.nexus_address, + "bind_loopback" => dev_config.bind_loopback, ); - (dev.nexus_address, dev.bind_loopback) + (dev_config.nexus_address, dev_config.bind_loopback) } else { (None, false) }; diff --git a/smf/mgs-sim/config.toml b/smf/mgs-sim/config.toml index 6d323ce04b..447bfe0627 100644 --- a/smf/mgs-sim/config.toml +++ b/smf/mgs-sim/config.toml @@ -76,9 +76,9 @@ location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } # [metrics] # Bryan wants to try polling SP sensors at 1Hz. -sp_poll_interval_ms = 1000 +sp_poll_interval = { secs = 10, nanos = 0 } # Tell Oximeter to collect our metrics every 10 seconds. 
-oximeter_collection_interval_secs = 10 +oximeter_collection_interval = { secs = 10, nanos = 0 } [log] # Show log messages of this level and more severe From 7f552778a1af02577dbe719a63cbe601205e5e9e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 14:29:55 -0700 Subject: [PATCH 61/77] get rid of poll interval configurability --- gateway-test-utils/configs/config.test.toml | 4 - gateway/examples/config.toml | 9 -- gateway/src/metrics.rs | 138 ++++++++------------ smf/mgs-sim/config.toml | 9 -- 4 files changed, 55 insertions(+), 105 deletions(-) diff --git a/gateway-test-utils/configs/config.test.toml b/gateway-test-utils/configs/config.test.toml index e6830ef9f2..7ca87a032b 100644 --- a/gateway-test-utils/configs/config.test.toml +++ b/gateway-test-utils/configs/config.test.toml @@ -92,10 +92,6 @@ location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } # Configuration for SP sensor metrics polling # [metrics] -# Bryan wants to try polling SP sensors at 1Hz. -sp_poll_interval = { secs = 10, nanos = 0 } -# Tell Oximeter to collect our metrics every 10 seconds. -oximeter_collection_interval = { secs = 10, nanos = 0 } # Allow binding the metrics server on localhost. dev = { bind_loopback = true } diff --git a/gateway/examples/config.toml b/gateway/examples/config.toml index c9f64fac11..d29d9508b9 100644 --- a/gateway/examples/config.toml +++ b/gateway/examples/config.toml @@ -71,15 +71,6 @@ addr = "[::1]:33320" ignition-target = 3 location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } -# -# Configuration for SP sensor metrics polling -# -[metrics] -# Bryan wants to try polling SP sensors at 1Hz. -sp_poll_interval = { secs = 10, nanos = 0 } -# Tell Oximeter to collect our metrics every 10 seconds. 
-oximeter_collection_interval = { secs = 10, nanos = 0 } - [log] # Show log messages of this level and more severe level = "debug" diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index f5c072606d..d80faaf6db 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -46,20 +46,14 @@ pub struct Metrics { } /// Configuration for metrics. +/// +/// In order to reduce the risk of a bad config file taking down the whole +/// management network, we try to keep the metrics-specific portion of the +/// config file as minimal as possible. At present, it only includes development +/// configurations that shouldn't be present in production configs. #[derive(Clone, Debug, PartialEq, Eq, serde::Deserialize, serde::Serialize)] #[serde(deny_unknown_fields)] pub struct MetricsConfig { - /// Collection interval to request from Oximeter, in seconds. - /// - /// This is the frequency with which Oximeter will collect samples from the - /// metrics producer endpoint, *not* the frequency with which sensor - /// measurements are polled from service processors. - oximeter_collection_interval: Duration, - - /// The interval at which service processors are polled for sensor readings, - /// in milliseconds - sp_poll_interval: Duration, - /// Configuration settings for testing and development use. pub dev: Option, } @@ -129,6 +123,16 @@ struct Producer { /// The maximum Dropshot request size for the metrics server. const METRIC_REQUEST_MAX_SIZE: usize = 10 * 1024 * 1024; +/// Poll interval for requesting sensor readings from SPs. +/// +/// Bryan wants to try polling at 1Hz, so let's do that for now. +const SP_POLL_INTERVAL: Duration = Duration::from_secs(1); + +///The interval at which we will ask Oximeter to collect our metric samples. +/// +/// Every ten seconds seems good. +const OXIMETER_COLLECTION_INTERVAL: Duration = Duration::from_secs(10); + /// The expected number of SPs in a fully-loaded rack. /// /// N.B. 
that there *might* be more than this; we shouldn't ever panic or @@ -147,6 +151,31 @@ const NORMAL_NUMBER_OF_SPS: usize = + 2 // two power shelves, someday. ; +/// What size should we make the +const MAX_BUFFERED_SAMPLE_CHUNKS: usize = { + // Roughly how many times will we poll SPs for each metrics collection + // interval? + let polls_per_metrics_interval = { + let collection_interval_secs: usize = + OXIMETER_COLLECTION_INTERVAL.as_secs() as usize; + let poll_interval_secs: usize = SP_POLL_INTERVAL.as_secs() as usize; + + collection_interval_secs / poll_interval_secs + }; + + // How many sample collection intervals do we want to allow to elapse before + // we start putting stuff on the floor? + // + // Let's say 16. Chosen totally arbitrarily but seems reasonable-ish. + let sloppiness = 16; + let capacity = + NORMAL_NUMBER_OF_SPS * polls_per_metrics_interval * sloppiness; + // Finally, the buffer capacity will probably be allocated in a power of two + // anyway, so let's make sure our thing is a power of two so we don't waste + // the allocation we're gonna get anyway. + capacity.next_power_of_two() +}; + impl Metrics { pub fn new( log: &slog::Logger, @@ -155,46 +184,6 @@ impl Metrics { apictx: Arc, ) -> anyhow::Result { let &MgsArguments { id, rack_id, ref addresses } = args; - - anyhow::ensure!( - cfg.sp_poll_interval >= Duration::from_millis(50), - "configured SP poll interval ({:?}) should probably be at least 50ms", - cfg.sp_poll_interval, - ); - anyhow::ensure!( - cfg.oximeter_collection_interval >= cfg.sp_poll_interval, - "there is no sense in having an Oximeter collection interval that's \ - shorter than the SP poll interval, because there will be no new \ - samples ready (the SP poll interval is {:?}, but the Oximeter \ - collection interval is {:?})", - cfg.sp_poll_interval, - cfg.oximeter_collection_interval, - ); - - let max_buffered_sample_chunks = { - // Roughly how many times will we poll SPs for each metrics collection - // interval? 
- let polls_per_metrics_interval = { - let collection_interval_ms = - cfg.oximeter_collection_interval.as_millis(); - let poll_interval_ms = cfg.sp_poll_interval.as_millis(); - - (collection_interval_ms / poll_interval_ms) as usize - }; - - // How many sample collection intervals do we want to allow to elapse before - // we start putting stuff on the floor? - // - // Let's say 16. Chosen totally arbitrarily but seems reasonable-ish. - let sloppiness = 16; - let capacity = - NORMAL_NUMBER_OF_SPS * polls_per_metrics_interval * sloppiness; - // Finally, the buffer capacity will probably be allocated in a power of two - // anyway, so let's make sure our thing is a power of two so we don't waste - // the allocation we're gonna get anyway. - capacity.next_power_of_two() - }; - // Create a channel for the SP poller tasks to send samples to the // Oximeter producer endpoint. // @@ -207,7 +196,7 @@ impl Metrics { // clobbering the oldest measurements, rather than leaving the newest // ones on the floor. 
let (sample_tx, sample_rx) = - broadcast::channel(max_buffered_sample_chunks); + broadcast::channel(MAX_BUFFERED_SAMPLE_CHUNKS); // Using a channel for this is, admittedly, a bit of an end-run around // the `OnceLock` on the `ServerContext` that *also* stores the rack ID, @@ -225,24 +214,13 @@ impl Metrics { Some(rack_id_tx) }; - tokio::spawn({ - let log = log.new(slog::o!("component" => "sensor-poller")); - slog::info!( - &log, - "SP sensor metrics configured"; - "poll_interval" => ?cfg.sp_poll_interval, - "max_buffered_sample_chunks" => max_buffered_sample_chunks, - ); - - start_pollers( - log, - apictx.clone(), - rack_id_rx, - id, - sample_tx, - cfg.sp_poll_interval, - ) - }); + tokio::spawn(start_pollers( + log.new(slog::o!("component" => "sensor-poller")), + apictx.clone(), + rack_id_rx, + id, + sample_tx, + )); let (addrs_tx, addrs_rx) = tokio::sync::watch::channel(addresses.clone()); @@ -368,7 +346,6 @@ async fn start_pollers( rack_id: oneshot::Receiver, mgs_id: Uuid, sample_tx: broadcast::Sender>, - poll_interval: Duration, ) -> anyhow::Result<()> { let switch = &apictx.mgmt_switch; @@ -416,7 +393,7 @@ async fn start_pollers( slog::info!( &log, - "starting to poll SP sensor data every {poll_interval:?}" + "starting to poll SP sensor data every {SP_POLL_INTERVAL:?}" ); for (spid, _) in sps { @@ -439,19 +416,15 @@ async fn start_pollers( known_state: None, sample_tx: sample_tx.clone(), }; - tokio::spawn(poller.run(poll_interval, apictx.clone())); + tokio::spawn(poller.run(apictx.clone())); } Ok(()) } impl SpPoller { - async fn run( - mut self, - poll_interval: Duration, - apictx: Arc, - ) { - let mut interval = tokio::time::interval(poll_interval); + async fn run(mut self, apictx: Arc) { + let mut interval = tokio::time::interval(SP_POLL_INTERVAL); let switch = &apictx.mgmt_switch; let sp = match switch.sp(self.spid) { Ok(sp) => sp, @@ -991,7 +964,6 @@ impl ServerManager { (None, false) }; let id = self.registry.producer_id(); - let interval = 
cfg.oximeter_collection_interval; let mut current_server: Option = None; loop { @@ -1019,7 +991,7 @@ impl ServerManager { "rebinding producer server on new IP"; "new_ip" => ?ip, "current_ip" => ?current_ip, - "collection_interval" => ?interval, + "collection_interval" => ?OXIMETER_COLLECTION_INTERVAL, "producer_id" => ?id, ); let server = { @@ -1030,7 +1002,7 @@ impl ServerManager { id, kind: ProducerKind::ManagementGateway, address, - interval, + interval: OXIMETER_COLLECTION_INTERVAL, }; let config = oximeter_producer::Config { server_info, @@ -1050,7 +1022,7 @@ impl ServerManager { slog::info!( &self.log, "bound metrics producer server"; - "collection_interval" => ?interval, + "collection_interval" => ?OXIMETER_COLLECTION_INTERVAL, "producer_id" => ?id, "address" => %server.address(), ); diff --git a/smf/mgs-sim/config.toml b/smf/mgs-sim/config.toml index 447bfe0627..511524137a 100644 --- a/smf/mgs-sim/config.toml +++ b/smf/mgs-sim/config.toml @@ -71,15 +71,6 @@ addr = "[::1]:33320" ignition-target = 3 location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } -# -# Configuration for SP sensor metrics polling -# -[metrics] -# Bryan wants to try polling SP sensors at 1Hz. -sp_poll_interval = { secs = 10, nanos = 0 } -# Tell Oximeter to collect our metrics every 10 seconds. 
-oximeter_collection_interval = { secs = 10, nanos = 0 } - [log] # Show log messages of this level and more severe level = "debug" From 64ae3a0baba38ee04e9aa40457bffb6d6b2b0bc0 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 14:55:04 -0700 Subject: [PATCH 62/77] reduce panickiness --- gateway/src/metrics.rs | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index d80faaf6db..5535306a6f 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -229,7 +229,13 @@ impl Metrics { let registry = ProducerRegistry::with_id(id); registry .register_producer(Producer { sample_rx, log: log.clone() }) - .context("failed to register metrics producer")?; + // TODO(ben): when you change `register_producer` to not return + // a `Result`, delete this `expect`. thanks in advance! :) + .expect( + "`ProducerRegistry::register_producer()` will never \ + actually return an `Err`, so this shouldn't ever \ + happen...", + ); tokio::spawn( ServerManager { log, addrs: addrs_rx, registry }.run(cfg), @@ -389,7 +395,7 @@ async fn start_pollers( }, ) .await - .expect("we should never return a fatal error here"); + .context("we should never return a fatal error here")?; slog::info!( &log, @@ -429,14 +435,28 @@ impl SpPoller { let sp = match switch.sp(self.spid) { Ok(sp) => sp, Err(e) => { - unreachable!( + // This should never happen, but it's not worth taking down the + // entire management network over that... + const MSG: &'static str = "the `SpPoller::run` function is only called after \ discovery completes successfully, and the `SpIdentifier` \ - used was returned by the management switch, so it \ - should be valid. 
however, we saw a {e:?} error when \ - looking up {:?}", - self.spid - ); + used was returned by the management switch, \ + so it should be valid."; + if cfg!(debug_assertions) { + unreachable!( + "{MSG} nonetheless, we saw a {e:?} error when looking \ + up {:?}", + self.spid + ); + } else { + slog::error!( + &self.log, + "THIS SHOULDN'T HAPPEN: {MSG}"; + "error" => e, + "sp" => ?self.spid, + ); + return; + } } }; loop { @@ -573,7 +593,7 @@ impl SpPoller { None => { // These are supposed to always be strings. But, if we // see one that's not a string, fall back to the hex - // representation rather than panicking. + // representation rather than panicking. let hex = hex::encode(dev.component.id); slog::warn!( &self.log, From a62b2d48a999cd77351aa23781cc804895fbf6cc Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 15:01:19 -0700 Subject: [PATCH 63/77] oh, right: you have to actually remove the Result i nearly pulled a @bnaecker with this one! --- gateway/src/lib.rs | 5 +---- gateway/src/metrics.rs | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs index 0566c79ea2..8e764dc63f 100644 --- a/gateway/src/lib.rs +++ b/gateway/src/lib.rs @@ -155,10 +155,7 @@ impl Server { let all_servers_shutdown = FuturesUnordered::new(); let metrics = - metrics::Metrics::new(&log, &args, config.metrics, apictx.clone()) - .map_err(|err| { - format!("failed to initialize metrics subsystem: {err}") - })?; + metrics::Metrics::new(&log, &args, config.metrics, apictx.clone()); for addr in args.addresses { start_dropshot_server( diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 5535306a6f..92ed67e1cd 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -182,7 +182,7 @@ impl Metrics { args: &MgsArguments, cfg: MetricsConfig, apictx: Arc, - ) -> anyhow::Result { + ) -> Self { let &MgsArguments { id, rack_id, ref addresses } = args; // Create a channel for the SP poller tasks to send 
samples to the // Oximeter producer endpoint. @@ -241,7 +241,7 @@ impl Metrics { ServerManager { log, addrs: addrs_rx, registry }.run(cfg), ) }; - Ok(Self { addrs_tx, rack_id_tx, server }) + Self { addrs_tx, rack_id_tx, server } } pub fn set_rack_id(&mut self, rack_id: Uuid) { From d130a5be80e6f41eb3b415a06bc4bba3dc422e85 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 15:23:44 -0700 Subject: [PATCH 64/77] i guess we need to delete the MGS logs --- nexus/tests/integration_tests/metrics.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 8b44d67048..88f21bc9fc 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -681,6 +681,10 @@ async fn test_mgs_metrics( let current_metrics = get_timeseries(&cptestctx, "hardware_component:current").await; check_all_serials_present(current_metrics); + + // Because the `ControlPlaneTestContext` isn't managing the MGS we made for + // this test, we are responsible for removing its logs. + mgs.logctx.cleanup_successful(); } /// Wait until a producer is registered with Oximeter. From f49c0ec4a18e2463e07373a379ae7d8dd84f1128 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 15:51:30 -0700 Subject: [PATCH 65/77] if we're removing metrics stuff from the config, it needs to be optional --- gateway/src/config.rs | 2 +- gateway/src/metrics.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gateway/src/config.rs b/gateway/src/config.rs index 67ee480f34..edf895ef59 100644 --- a/gateway/src/config.rs +++ b/gateway/src/config.rs @@ -27,7 +27,7 @@ pub struct Config { /// Server-wide logging configuration. pub log: ConfigLogging, /// Configuration for SP sensor metrics. 
- pub metrics: MetricsConfig, + pub metrics: Option, } impl Config { diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 92ed67e1cd..2d80f202c6 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -180,7 +180,7 @@ impl Metrics { pub fn new( log: &slog::Logger, args: &MgsArguments, - cfg: MetricsConfig, + cfg: Option, apictx: Arc, ) -> Self { let &MgsArguments { id, rack_id, ref addresses } = args; @@ -970,9 +970,9 @@ fn stringify_byte_string(bytes: &[u8]) -> String { } impl ServerManager { - async fn run(mut self, cfg: MetricsConfig) -> anyhow::Result<()> { + async fn run(mut self, cfg: Option) -> anyhow::Result<()> { let (registration_address, bind_loopback) = - if let Some(ref dev_config) = cfg.dev { + if let Some(ref dev_config) = cfg.and_then(|cfg| cfg.dev) { slog::warn!( &self.log, "using development metrics configuration overrides!"; From b49acd9886282bfdafe04d8716f56fb4352f38ce Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 15:55:51 -0700 Subject: [PATCH 66/77] bleh --- dev-tools/mgs-dev/src/main.rs | 10 ++++++---- nexus/tests/integration_tests/metrics.rs | 7 ++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/dev-tools/mgs-dev/src/main.rs b/dev-tools/mgs-dev/src/main.rs index 9c280337b8..2f12f633db 100644 --- a/dev-tools/mgs-dev/src/main.rs +++ b/dev-tools/mgs-dev/src/main.rs @@ -55,10 +55,12 @@ impl MgsRunArgs { let (mut mgs_config, sp_sim_config) = gateway_test_utils::setup::load_test_config(); if let Some(addr) = self.nexus_address { - mgs_config.metrics.dev = - Some(omicron_gateway::metrics::DevConfig { - bind_loopback: true, - nexus_address: Some(addr), + mgs_config.metrics = + Some(omicron_gateway::metrics::MetricsConfig { + dev: Some(omicron_gateway::metrics::DevConfig { + bind_loopback: true, + nexus_address: Some(addr), + }), }); } diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 88f21bc9fc..eea04d3521 100644 --- 
a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -600,11 +600,12 @@ async fn test_mgs_metrics( gateway_test_utils::setup::load_test_config(); // munge the already-parsed MGS config file to point it at the test // Nexus' address. - mgs_config.metrics.dev = - Some(gateway_test_utils::setup::MetricsDevConfig { + mgs_config.metrics = Some(gateway_test_utils::setup::MetricsConfig { + dev: Some(gateway_test_utils::setup::MetricsDevConfig { bind_loopback: true, nexus_address: Some(cptestctx.internal_client.bind_address), - }); + }), + }); gateway_test_utils::setup::test_setup_with_config( "test_mgs_metrics", gateway_messages::SpPort::One, From 67e2a71d739cf7673792512f23015c0ff6cd452d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 16:29:10 -0700 Subject: [PATCH 67/77] you have to delete it here too --- smf/mgs/config.toml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/smf/mgs/config.toml b/smf/mgs/config.toml index a7e2e27aa7..fa1232b1b2 100644 --- a/smf/mgs/config.toml +++ b/smf/mgs/config.toml @@ -286,15 +286,6 @@ interface = "gimlet31" ignition-target = 18 location = { switch0 = ["sled", 31], switch1 = ["sled", 31] } -# -# Configuration for SP sensor metrics polling -# -[metrics] -# Bryan wants to try polling SP sensors at 1Hz. -sp_poll_interval_ms = 1000 -# Tell Oximeter to collect our metrics every 10 seconds. 
-oximeter_collection_interval_secs = 10 - [log] level = "info" mode = "file" From 681e4a2d578c490da58c5efb0aa571869a124cff Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 23 Aug 2024 10:05:40 -0700 Subject: [PATCH 68/77] disable metrics in `test_sp_updater_delivers_progress` As suggested by @jgallagher in https://github.com/oxidecomputer/omicron/pull/6354#issuecomment-2307232906 --- gateway/src/metrics.rs | 65 ++++++++++++++------- nexus/tests/integration_tests/sp_updater.rs | 21 ++++++- 2 files changed, 63 insertions(+), 23 deletions(-) diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 2d80f202c6..7b15ce9c1f 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -40,6 +40,11 @@ use hardware_component as metric; /// Handle to the metrics tasks. pub struct Metrics { + /// If the metrics subsystem is disabled, this is `None`. + inner: Option, +} + +struct Handles { addrs_tx: watch::Sender>, rack_id_tx: Option>, server: JoinHandle>, @@ -51,9 +56,18 @@ pub struct Metrics { /// management network, we try to keep the metrics-specific portion of the /// config file as minimal as possible. At present, it only includes development /// configurations that shouldn't be present in production configs. -#[derive(Clone, Debug, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +#[derive( + Clone, Debug, Default, PartialEq, Eq, serde::Deserialize, serde::Serialize, +)] #[serde(deny_unknown_fields)] pub struct MetricsConfig { + /// Completely disable the metrics subsystem. + /// + /// If `disabled = true`, sensor data metrics will not be collected, and the + /// metrics polling tasks will not be started. + #[serde(default)] + pub disabled: bool, + /// Configuration settings for testing and development use. 
pub dev: Option, } @@ -184,6 +198,12 @@ impl Metrics { apictx: Arc, ) -> Self { let &MgsArguments { id, rack_id, ref addresses } = args; + + if cfg.as_ref().map(|c| c.disabled).unwrap_or(false) { + slog::warn!(&log, "metrics subsystem disabled by config"); + return Self { inner: None }; + } + // Create a channel for the SP poller tasks to send samples to the // Oximeter producer endpoint. // @@ -241,11 +261,12 @@ impl Metrics { ServerManager { log, addrs: addrs_rx, registry }.run(cfg), ) }; - Self { addrs_tx, rack_id_tx, server } + Self { inner: Some(Handles { addrs_tx, rack_id_tx, server }) } } pub fn set_rack_id(&mut self, rack_id: Uuid) { - if let Some(tx) = self.rack_id_tx.take() { + let tx = self.inner.as_mut().and_then(|i| i.rack_id_tx.take()); + if let Some(tx) = tx { // If the task that starts sensor pollers has gone away already, // we're probably shutting down, and shouldn't panic. let _ = tx.send(rack_id); @@ -254,30 +275,34 @@ impl Metrics { } pub async fn update_server_addrs(&self, new_addrs: &[SocketAddrV6]) { - self.addrs_tx.send_if_modified(|current_addrs| { - if current_addrs.len() == new_addrs.len() - // N.B. that we could make this "faster" with a `HashSet`, - // but...the size of this Vec of addresses is probably going to - // two or three items, max, so the linear scan actually probably - // outperforms it... - && current_addrs.iter().all(|addr| new_addrs.contains(addr)) - { - return false; - } + if let Some(ref inner) = self.inner { + inner.addrs_tx.send_if_modified(|current_addrs| { + if current_addrs.len() == new_addrs.len() + // N.B. that we could make this "faster" with a `HashSet`, + // but...the size of this Vec of addresses is probably going to + // two or three items, max, so the linear scan actually probably + // outperforms it... 
+ && current_addrs.iter().all(|addr| new_addrs.contains(addr)) + { + return false; + } - // Reuse existing `Vec` capacity if possible.This is almost - // certainly not performance-critical, but it makes me feel happy. - current_addrs.clear(); - current_addrs.extend_from_slice(new_addrs); - true - }); + // Reuse existing `Vec` capacity if possible.This is almost + // certainly not performance-critical, but it makes me feel happy. + current_addrs.clear(); + current_addrs.extend_from_slice(new_addrs); + true + }); + } } } impl Drop for Metrics { fn drop(&mut self) { // Clean up our children on drop. - self.server.abort(); + if let Some(ref mut inner) = self.inner { + inner.server.abort(); + } } } diff --git a/nexus/tests/integration_tests/sp_updater.rs b/nexus/tests/integration_tests/sp_updater.rs index 8314d22173..2f2c1bfccf 100644 --- a/nexus/tests/integration_tests/sp_updater.rs +++ b/nexus/tests/integration_tests/sp_updater.rs @@ -434,9 +434,24 @@ async fn test_sp_updater_switches_mgs_instances_on_failure() { #[tokio::test] async fn test_sp_updater_delivers_progress() { // Start MGS + Sim SP. - let mgstestctx = - mgs_setup::test_setup("test_sp_updater_delivers_progress", SpPort::One) - .await; + let mgstestctx = { + let (mut mgs_config, sp_sim_confg) = mgs_setup::load_test_config(); + // Enabling SP metrics collection makes this alread-flaky test even + // flakier, so let's just turn it off. + // TODO(eliza): it would be nice if we didn't have to disable metrics in + // this test, so that we can better catch regressions that could be + // introduced by the metrics subsystem... + mgs_config.metrics.get_or_insert_with(Default::default()).disabled = + true; + mgs_setup::test_setup_with_config( + "test_sp_updater_delivers_progress", + SpPort::One, + mgs_config, + sp_sim_config, + None, + ) + .await + }; // Configure an MGS client. 
let mut mgs_clients = From d497472389517f501f7f33bb32103c9033155b50 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 23 Aug 2024 10:12:12 -0700 Subject: [PATCH 69/77] `get_or_insert_with` is nicer --- dev-tools/mgs-dev/src/main.rs | 10 ++++------ nexus/tests/integration_tests/metrics.rs | 7 +++---- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/dev-tools/mgs-dev/src/main.rs b/dev-tools/mgs-dev/src/main.rs index 2f12f633db..dee8334f38 100644 --- a/dev-tools/mgs-dev/src/main.rs +++ b/dev-tools/mgs-dev/src/main.rs @@ -55,12 +55,10 @@ impl MgsRunArgs { let (mut mgs_config, sp_sim_config) = gateway_test_utils::setup::load_test_config(); if let Some(addr) = self.nexus_address { - mgs_config.metrics = - Some(omicron_gateway::metrics::MetricsConfig { - dev: Some(omicron_gateway::metrics::DevConfig { - bind_loopback: true, - nexus_address: Some(addr), - }), + mgs_config.metrics.get_or_insert_with(Default::default).dev = + Some(gateway_test_utils::setup::MetricsDevConfig { + nexus_address: Some(addr), + bind_loopback: true, }); } diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index eea04d3521..58ff6c6f64 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -600,12 +600,11 @@ async fn test_mgs_metrics( gateway_test_utils::setup::load_test_config(); // munge the already-parsed MGS config file to point it at the test // Nexus' address. 
- mgs_config.metrics = Some(gateway_test_utils::setup::MetricsConfig { - dev: Some(gateway_test_utils::setup::MetricsDevConfig { + mgs_config.metrics.get_or_insert_with(Default::default).dev = + Some(gateway_test_utils::setup::MetricsDevConfig { bind_loopback: true, nexus_address: Some(cptestctx.internal_client.bind_address), - }), - }); + }); gateway_test_utils::setup::test_setup_with_config( "test_mgs_metrics", gateway_messages::SpPort::One, From d13aff52988c9be3b9097c0aede2a27ec5644e68 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 23 Aug 2024 10:36:54 -0700 Subject: [PATCH 70/77] gah, typo --- nexus/tests/integration_tests/sp_updater.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/tests/integration_tests/sp_updater.rs b/nexus/tests/integration_tests/sp_updater.rs index 2f2c1bfccf..570192407f 100644 --- a/nexus/tests/integration_tests/sp_updater.rs +++ b/nexus/tests/integration_tests/sp_updater.rs @@ -435,7 +435,7 @@ async fn test_sp_updater_switches_mgs_instances_on_failure() { async fn test_sp_updater_delivers_progress() { // Start MGS + Sim SP. let mgstestctx = { - let (mut mgs_config, sp_sim_confg) = mgs_setup::load_test_config(); + let (mut mgs_config, sp_sim_config) = mgs_setup::load_test_config(); // Enabling SP metrics collection makes this alread-flaky test even // flakier, so let's just turn it off. 
// TODO(eliza): it would be nice if we didn't have to disable metrics in From 66b8d1143cd14cfa00ea88a84cffc7957337b861 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 23 Aug 2024 10:40:20 -0700 Subject: [PATCH 71/77] GAH borrow --- nexus/tests/integration_tests/sp_updater.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/tests/integration_tests/sp_updater.rs b/nexus/tests/integration_tests/sp_updater.rs index 570192407f..172ce83441 100644 --- a/nexus/tests/integration_tests/sp_updater.rs +++ b/nexus/tests/integration_tests/sp_updater.rs @@ -447,7 +447,7 @@ async fn test_sp_updater_delivers_progress() { "test_sp_updater_delivers_progress", SpPort::One, mgs_config, - sp_sim_config, + &sp_sim_config, None, ) .await From 8ea3a29762fc8c00e5fcb11d9a0b6c27d993baa9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 23 Aug 2024 10:42:55 -0700 Subject: [PATCH 72/77] gah typechecker --- nexus/tests/integration_tests/sp_updater.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nexus/tests/integration_tests/sp_updater.rs b/nexus/tests/integration_tests/sp_updater.rs index 172ce83441..6e482bc1ad 100644 --- a/nexus/tests/integration_tests/sp_updater.rs +++ b/nexus/tests/integration_tests/sp_updater.rs @@ -441,8 +441,7 @@ async fn test_sp_updater_delivers_progress() { // TODO(eliza): it would be nice if we didn't have to disable metrics in // this test, so that we can better catch regressions that could be // introduced by the metrics subsystem... 
- mgs_config.metrics.get_or_insert_with(Default::default()).disabled = - true; + mgs_config.metrics.get_or_insert_with(Default::default).disabled = true; mgs_setup::test_setup_with_config( "test_sp_updater_delivers_progress", SpPort::One, From 8705d657d0de1cd7fcc3464b1a26906dec6c403b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 23 Aug 2024 11:22:32 -0700 Subject: [PATCH 73/77] make the test way fancier I realized the test could read the actual SP simulator config file, and figure out how many metrics should be present. That way, if someone changes the config file later, it'll still work. --- nexus/tests/integration_tests/metrics.rs | 169 +++++++++++++++++------ 1 file changed, 125 insertions(+), 44 deletions(-) diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 58ff6c6f64..6fac010d3c 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -27,6 +27,7 @@ use oximeter::types::FieldValue; use oximeter::types::Measurement; use oximeter::TimeseriesSchema; use std::borrow::Borrow; +use std::collections::HashMap; use uuid::Uuid; pub async fn query_for_metrics( @@ -595,9 +596,9 @@ async fn test_mgs_metrics( cptestctx: &ControlPlaneTestContext, ) { // Make a MGS + let (mut mgs_config, sp_sim_config) = + gateway_test_utils::setup::load_test_config(); let mgs = { - let (mut mgs_config, sp_sim_config) = - gateway_test_utils::setup::load_test_config(); // munge the already-parsed MGS config file to point it at the test // Nexus' address. mgs_config.metrics.get_or_insert_with(Default::default).dev = @@ -615,72 +616,152 @@ async fn test_mgs_metrics( .await }; - // Wait until the MGS registers as a producer with Oximeter. 
- wait_for_producer(&cptestctx.oximeter, &mgs.gateway_id).await; - cptestctx.oximeter.force_collect().await; + // Let's look at all the simulated SP components in the config file which + // have sensor readings, so we can assert that there are timeseries for all + // of them. + let all_sp_configs = { + let gimlet_configs = + sp_sim_config.simulated_sps.gimlet.iter().map(|g| &g.common); + let sidecar_configs = + sp_sim_config.simulated_sps.sidecar.iter().map(|s| &s.common); + gimlet_configs.chain(sidecar_configs) + }; + // XXX(eliza): yes, this code is repetitive. We could probably make it a + // little elss ugly with nested hash maps, but like...I already wrote it, so + // you don't have to. :) + // + // TODO(eliza): presently, we just expect that the number of timeseries for + // each serial number and sensor type lines up. If we wanted to be *really* + // fancy, we could also assert that all the component IDs, component kinds, + // and measurement values line up with the config. But, honestly, it's + // pretty unlikely that a bug in MGS' sensor metrics subsystem would mess + // that up --- the most important thing is just to make sure that the sensor + // data is *present*, as that should catch most regressions. 
+ let mut temp_sensors = HashMap::new(); + let mut current_sensors = HashMap::new(); + let mut voltage_sensors = HashMap::new(); + let mut power_sensors = HashMap::new(); + let mut input_voltage_sensors = HashMap::new(); + let mut input_current_sensors = HashMap::new(); + let mut fan_speed_sensors = HashMap::new(); + for sp in all_sp_configs { + let mut temp = 0; + let mut current = 0; + let mut voltage = 0; + let mut input_voltage = 0; + let mut input_current = 0; + let mut power = 0; + let mut speed = 0; + for component in &sp.components { + for sensor in &component.sensors { + use gateway_messages::measurement::MeasurementKind as Kind; + match sensor.def.kind { + Kind::Temperature => temp += 1, + Kind::Current => current += 1, + Kind::Voltage => voltage += 1, + Kind::InputVoltage => input_voltage += 1, + Kind::InputCurrent => input_current += 1, + Kind::Speed => speed += 1, + Kind::Power => power += 1, + } + } + } + temp_sensors.insert(sp.serial_number.clone(), temp); + current_sensors.insert(sp.serial_number.clone(), current); + voltage_sensors.insert(sp.serial_number.clone(), voltage); + input_voltage_sensors.insert(sp.serial_number.clone(), input_voltage); + input_current_sensors.insert(sp.serial_number.clone(), input_current); + fan_speed_sensors.insert(sp.serial_number.clone(), speed); + power_sensors.insert(sp.serial_number.clone(), power); + } - async fn get_timeseries( + async fn check_all_timeseries_present( cptestctx: &ControlPlaneTestContext, name: &str, - ) -> oxql_types::Table { - let table = timeseries_query(&cptestctx, &format!("get {name}")) + expected: HashMap, + ) { + let metric_name = format!("hardware_component:{name}"); + eprintln!("\n=== checking timeseries for {metric_name} ===\n"); + + if expected.values().all(|&v| v == 0) { + eprintln!( + "-> SP sim config contains no {name} sensors, skipping it" + ); + return; + } + + let table = timeseries_query(&cptestctx, &format!("get {metric_name}")) .await .into_iter() - .find(|t| t.name() == 
name); - match table { + .find(|t| t.name() == metric_name); + let table = match table { Some(table) => table, - None => panic!("missing table for {name}"), - } - } + None => panic!("missing table for {metric_name}"), + }; - #[track_caller] - fn check_all_serials_present(table: oxql_types::Table) { - let mut sim_gimlet_00 = 0; - let mut sim_gimlet_01 = 0; + let mut found = expected + .keys() + .map(|serial| (serial.clone(), 0)) + .collect::>(); for timeseries in table.timeseries() { let fields = ×eries.fields; let n_points = timeseries.points.len(); - eprintln!("found timeseries: {fields:?} ({n_points} points)"); - assert!(n_points > 0, "timeseries {fields:?} should have points"); - let serial_str = match timeseries.fields.get("chassis_serial") { + assert!( + n_points > 0, + "{metric_name} timeseries {fields:?} should have points" + ); + let serial_str: &str = match timeseries.fields.get("chassis_serial") + { Some(FieldValue::String(s)) => s.borrow(), Some(x) => panic!( - "`chassis_serial` field should be a string, but got: {x:?}" + "{metric_name} `chassis_serial` field should be a string, but got: {x:?}" ), None => { - panic!("timeseries should have a `chassis_serial` field") + panic!("{metric_name} timeseries should have a `chassis_serial` field") } }; - match serial_str { - "SimGimlet00" => sim_gimlet_00 += 1, - "SimGimlet01" => sim_gimlet_01 += 1, - // if someone adds sensor readings to the fake sidecar later, - // that's okay... 
- _ => eprintln!("bonus simulated chassis serial {serial_str:?}"), + if let Some(count) = found.get_mut(serial_str) { + *count += 1; + } else { + panic!( + "{metric_name} timeseries had an unexpected chassis serial \ + number {serial_str:?} (not in the config file)", + ); } } - assert!( - sim_gimlet_00 > 0, - "expected at least one timeseries from SimGimlet00 in {table:#?}" - ); - assert!( - sim_gimlet_01 > 0, - "expected at least one timeseries from SimGimlet01 in {table:#?}" + eprintln!("-> {metric_name}: found timeseries: {found:#?}"); + assert_eq!( + found, expected, + "number of {metric_name} timeseries didn't match expected in {table:#?}", ); + eprintln!("-> okay, looks good!"); } - let temp_metrics = - get_timeseries(&cptestctx, "hardware_component:temperature").await; - check_all_serials_present(temp_metrics); + // Wait until the MGS registers as a producer with Oximeter. + wait_for_producer(&cptestctx.oximeter, &mgs.gateway_id).await; - let voltage_metrics = - get_timeseries(&cptestctx, "hardware_component:voltage").await; - check_all_serials_present(voltage_metrics); + // ...and collect its samples. 
+ cptestctx.oximeter.force_collect().await; - let current_metrics = - get_timeseries(&cptestctx, "hardware_component:current").await; - check_all_serials_present(current_metrics); + check_all_timeseries_present(&cptestctx, "temperature", temp_sensors).await; + check_all_timeseries_present(&cptestctx, "voltage", voltage_sensors).await; + check_all_timeseries_present(&cptestctx, "current", current_sensors).await; + check_all_timeseries_present(&cptestctx, "power", power_sensors).await; + check_all_timeseries_present( + &cptestctx, + "input_voltage", + input_voltage_sensors, + ) + .await; + check_all_timeseries_present( + &cptestctx, + "input_current", + input_current_sensors, + ) + .await; + check_all_timeseries_present(&cptestctx, "fan_speed", fan_speed_sensors) + .await; // Because the `ControlPlaneTestContext` isn't managing the MGS we made for // this test, we are responsible for removing its logs. From d4d305b61222b4853c021329e37a1ef0d1e72d31 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 23 Aug 2024 11:43:20 -0700 Subject: [PATCH 74/77] simplify metrics config there's really no reason to have a separate `dev` struct inside another layer of Option... 
--- dev-tools/mgs-dev/src/main.rs | 9 +++--- gateway-test-utils/src/setup.rs | 4 +-- gateway/src/metrics.rs | 36 +++++++++++++----------- nexus/tests/integration_tests/metrics.rs | 10 +++---- 4 files changed, 30 insertions(+), 29 deletions(-) diff --git a/dev-tools/mgs-dev/src/main.rs b/dev-tools/mgs-dev/src/main.rs index dee8334f38..77947999d9 100644 --- a/dev-tools/mgs-dev/src/main.rs +++ b/dev-tools/mgs-dev/src/main.rs @@ -55,10 +55,11 @@ impl MgsRunArgs { let (mut mgs_config, sp_sim_config) = gateway_test_utils::setup::load_test_config(); if let Some(addr) = self.nexus_address { - mgs_config.metrics.get_or_insert_with(Default::default).dev = - Some(gateway_test_utils::setup::MetricsDevConfig { - nexus_address: Some(addr), - bind_loopback: true, + mgs_config.metrics = + Some(gateway_test_utils::setup::MetricsConfig { + disabled: false, + dev_nexus_address: Some(addr), + dev_bind_loopback: true, }); } diff --git a/gateway-test-utils/src/setup.rs b/gateway-test-utils/src/setup.rs index 3a4e1e354a..056bb451f7 100644 --- a/gateway-test-utils/src/setup.rs +++ b/gateway-test-utils/src/setup.rs @@ -8,9 +8,7 @@ use camino::Utf8Path; use dropshot::test_util::ClientTestContext; use dropshot::test_util::LogContext; use gateway_messages::SpPort; -pub use omicron_gateway::metrics::{ - DevConfig as MetricsDevConfig, MetricsConfig, -}; +pub use omicron_gateway::metrics::MetricsConfig; use omicron_gateway::MgsArguments; use omicron_gateway::SpType; use omicron_gateway::SwitchPortConfig; diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index 7b15ce9c1f..d4e0795ae0 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -68,25 +68,20 @@ pub struct MetricsConfig { #[serde(default)] pub disabled: bool, - /// Configuration settings for testing and development use. 
- pub dev: Option, -} - -#[derive(Clone, Debug, PartialEq, Eq, serde::Deserialize, serde::Serialize)] -#[serde(deny_unknown_fields)] -pub struct DevConfig { /// Override the Nexus address used to register the SP metrics Oximeter /// producer. This is intended for use in development and testing. /// /// If this argument is not present, Nexus is discovered through DNS. - pub nexus_address: Option, + #[serde(default)] + pub dev_nexus_address: Option, /// Allow the metrics producer endpoint to bind on loopback. /// /// This should be disabled in production, as Nexus will not be able to /// reach the loopback interface, but is necessary for local development and /// test purposes. - pub bind_loopback: bool, + #[serde(default)] + pub dev_bind_loopback: bool, } /// Polls sensor readings from an individual SP. @@ -997,14 +992,21 @@ fn stringify_byte_string(bytes: &[u8]) -> String { impl ServerManager { async fn run(mut self, cfg: Option) -> anyhow::Result<()> { let (registration_address, bind_loopback) = - if let Some(ref dev_config) = cfg.and_then(|cfg| cfg.dev) { - slog::warn!( - &self.log, - "using development metrics configuration overrides!"; - "nexus_address" => ?dev_config.nexus_address, - "bind_loopback" => dev_config.bind_loopback, - ); - (dev_config.nexus_address, dev_config.bind_loopback) + if let Some(MetricsConfig { + dev_bind_loopback, + dev_nexus_address, + .. 
+ }) = cfg + { + if dev_bind_loopback || dev_nexus_address.is_some() { + slog::warn!( + &self.log, + "using development metrics configuration overrides!"; + "nexus_address" => ?dev_nexus_address, + "bind_loopback" => dev_bind_loopback, + ); + } + (dev_nexus_address, dev_bind_loopback) } else { (None, false) }; diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 6fac010d3c..3fdb5dfd64 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -601,11 +601,11 @@ async fn test_mgs_metrics( let mgs = { // munge the already-parsed MGS config file to point it at the test // Nexus' address. - mgs_config.metrics.get_or_insert_with(Default::default).dev = - Some(gateway_test_utils::setup::MetricsDevConfig { - bind_loopback: true, - nexus_address: Some(cptestctx.internal_client.bind_address), - }); + mgs_config.metrics = Some(gateway_test_utils::setup::MetricsCOnfig { + disabled: false, + dev_bind_loopback: true, + dev_nexus_address: Some(cptestctx.internal_client.bind_address), + }); gateway_test_utils::setup::test_setup_with_config( "test_mgs_metrics", gateway_messages::SpPort::One, From aa68ed94c9c6596b76f36bb32226c8220f9699e0 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 23 Aug 2024 12:24:35 -0700 Subject: [PATCH 75/77] if you change the config format, you have to change the config files --- gateway-test-utils/configs/config.test.toml | 6 ++++-- gateway/examples/config.toml | 9 +++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/gateway-test-utils/configs/config.test.toml b/gateway-test-utils/configs/config.test.toml index 7ca87a032b..4e3e9c6e6e 100644 --- a/gateway-test-utils/configs/config.test.toml +++ b/gateway-test-utils/configs/config.test.toml @@ -92,8 +92,10 @@ location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } # Configuration for SP sensor metrics polling # [metrics] -# Allow binding the metrics server on localhost. 
-dev = { bind_loopback = true } +# Allow the Oximeter metrics endpoint to bind on the loopback IP. This is +# useful in local testing and development, when the gateway service is not +# given a "real" underlay network IP. +dev_bind_loopback = true # # NOTE: for the test suite, if mode = "file", the file path MUST be the sentinel diff --git a/gateway/examples/config.toml b/gateway/examples/config.toml index d29d9508b9..a76edcd7b5 100644 --- a/gateway/examples/config.toml +++ b/gateway/examples/config.toml @@ -71,6 +71,15 @@ addr = "[::1]:33320" ignition-target = 3 location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } +# +# Configuration for SP sensor metrics polling +# +[metrics] +# Allow the Oximeter metrics endpoint to bind on the loopback IP. This is +# useful in local testing and development, when the gateway service is not +# given a "real" underlay network IP. +dev_bind_loopback = true + [log] # Show log messages of this level and more severe level = "debug" From 55490d1b78012bf9a727e9e8d06ecf48f8dcf498 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 23 Aug 2024 12:33:33 -0700 Subject: [PATCH 76/77] learn to type good, idiot --- nexus/tests/integration_tests/metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 3fdb5dfd64..9f4652c2da 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -601,7 +601,7 @@ async fn test_mgs_metrics( let mgs = { // munge the already-parsed MGS config file to point it at the test // Nexus' address. 
- mgs_config.metrics = Some(gateway_test_utils::setup::MetricsCOnfig { + mgs_config.metrics = Some(gateway_test_utils::setup::MetricsConfig { disabled: false, dev_bind_loopback: true, dev_nexus_address: Some(cptestctx.internal_client.bind_address), From e7d2430f1f0f7d16cc82ed47e6ee9d6ddb343082 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 24 Aug 2024 09:20:23 -0700 Subject: [PATCH 77/77] celsius --- oximeter/oximeter/schema/hardware-component.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oximeter/oximeter/schema/hardware-component.toml b/oximeter/oximeter/schema/hardware-component.toml index 3209723f69..30a1d6510f 100644 --- a/oximeter/oximeter/schema/hardware-component.toml +++ b/oximeter/oximeter/schema/hardware-component.toml @@ -98,7 +98,7 @@ the metrics emitted by these sensors when they are read successfully).""" [[metrics]] name = "temperature" description = "A temperature reading from a hardware component." -units = "degrees_celcius" +units = "degrees_celsius" datum_type = "f32" versions = [ { added_in = 1, fields = ["sensor"]}