Skip to content

Commit

Permalink
[gateway] Add Oximeter HTTP service metrics (#6432)
Browse files Browse the repository at this point in the history
Now that #6354 has added an Oximeter producer endpoint to MGS for
publishing SP sensor metrics, it seemed like a nice idea to also
instrument the MGS HTTP server, similar to the existing instrumentation
for other control plane services. I don't think we'll be doing a lot of
tuning of MGS performance, but the metrics seem like they could still be
useful because they also include the distribution of HTTP status codes,
and in many cases, the latency measurements also serve as a proxy for
how long it takes the *SP* to perform a certain operation, which could
be a valuable signal.

This commit adds an `oximeter_instruments::http::LatencyTracker` to the
MGS HTTP servers.

To test that it works, I started a local ClickHouse and a standalone
Oximeter, and ran MGS and the SP simulator using `cargo xtask mgs-dev
run`.
Then, I made a few HTTP requests to various MGS APIs using `curl`,
most of which were expected to succeed, and a few of which targeted SP
slots that the simulator wasn't configured to simulate an SP in (to
ensure that the request would fail). We can see the new metrics in OxQL:

```
0x〉\d
hardware_component:current
hardware_component:fan_speed
hardware_component:sensor_error_count
hardware_component:temperature
hardware_component:voltage
http_service:request_latency_histogram
oximeter_collector:collections
0x〉get http_service:request_latency_histogram | last 1

http_service:request_latency_histogram

 id: 1ac73746-2d3b-46d8-ac7c-44512c5f2263
 name: management-gateway-service
 operation_id: sp_get
 status_code: 200
   [2024-08-24 18:54:47.978590056, 2024-08-24 18:58:18.125731231]: [-179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000: 0, 0.000001: 0, 0.000002: 0, 0.000003: 0, 0.000004: 0, 0.000005: 0, 0.0000059999999999999985: 0, 0.000007: 0, 0.000008: 0, 0.000009: 0, 0.00001: 0, 0.00002: 0, 0.000030000000000000004: 0, 0.00004: 0, 0.00005: 0, 0.00006: 0, 0.00007000000000000001: 0, 0.00008: 0, 0.00009: 0, 0.0001: 0, 0.0002: 0, 0.0003: 0, 0.0004: 0, 0.0005: 1, 0.0006000000000000001: 1, 0.0007: 0, 0.0007999999999999999: 0, 0.0009: 0, 0.001: 0, 0.002: 0, 0.003: 0, 0.004: 0, 0.005: 0, 0.006: 0, 0.007: 0, 0.008: 0, 0.009000000000000001: 0, 0.01: 0, 0.020000000000000004: 0, 0.03000000000000001: 0, 0.04000000000000001: 0, 0.05000000000000001: 0, 0.06000000000000001: 0, 0.07: 0, 0.08: 0, 0.09000000000000001: 0, 0.1: 0, 0.2: 0, 0.30000000000000004: 0, 0.4: 0, 0.5: 0, 0.6: 0, 0.7000000000000001: 0, 0.8: 0, 0.9: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 20: 0, 30: 0, 40: 0, 50: 0, 60: 0, 70: 0, 80: 0, 90: 0, 100: 0, 200: 0, 300: 0, 400: 0, 500: 0, 600: 0, 700: 0, 800: 0, 900: 0, 1000: 0, min: 0.000556233, max: 0.000603704, mean: 0.0005799685000000001, std_dev: 0.00002373549999999997, p50: 0, p90: 0.000603704, p99: 0.000603704]

 id: 1ac73746-2d3b-46d8-ac7c-44512c5f2263
 name: management-gateway-service
 operation_id: ignition_list
 status_code: 200
   [2024-08-24 18:54:47.978590056, 2024-08-24 18:58:18.125290346]: [-179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000: 0, 0.000001: 0, 0.000002: 0, 0.000003: 0, 0.000004: 0, 0.000005: 0, 0.0000059999999999999985: 0, 0.000007: 0, 0.000008: 0, 0.000009: 0, 0.00001: 0, 0.00002: 0, 0.000030000000000000004: 0, 0.00004: 0, 0.00005: 0, 0.00006: 0, 0.00007000000000000001: 0, 0.00008: 0, 0.00009: 0, 0.0001: 0, 0.0002: 0, 0.0003: 0, 0.0004: 1, 0.0005: 0, 0.0006000000000000001: 0, 0.0007: 0, 0.0007999999999999999: 0, 0.0009: 0, 0.001: 0, 0.002: 0, 0.003: 0, 0.004: 0, 0.005: 0, 0.006: 0, 0.007: 0, 0.008: 0, 0.009000000000000001: 0, 0.01: 0, 0.020000000000000004: 0, 0.03000000000000001: 0, 0.04000000000000001: 0, 0.05000000000000001: 0, 0.06000000000000001: 0, 0.07: 0, 0.08: 0, 0.09000000000000001: 0, 0.1: 0, 0.2: 0, 0.30000000000000004: 0, 0.4: 0, 0.5: 0, 0.6: 0, 0.7000000000000001: 0, 0.8: 0, 0.9: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 20: 0, 30: 0, 40: 0, 50: 0, 60: 0, 70: 0, 80: 0, 90: 0, 100: 0, 200: 0, 300: 0, 400: 0, 500: 0, 600: 0, 700: 0, 800: 0, 900: 0, 1000: 0, min: 0.000427249, max: 0.000427249, mean: 0.000427249, std_dev: 0, p50: 0, p90: 0.000427249, p99: 0.000427249]

 id: 1ac73746-2d3b-46d8-ac7c-44512c5f2263
 name: management-gateway-service
 operation_id: sp_get
 status_code: 400
   [2024-08-24 18:54:47.978590056, 2024-08-24 18:58:18.126114126]: [-179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000: 0, 0.000001: 0, 0.000002: 0, 0.000003: 0, 0.000004: 0, 0.000005: 0, 0.0000059999999999999985: 0, 0.000007: 0, 0.000008: 0, 0.000009: 0, 0.00001: 0, 0.00002: 2, 0.000030000000000000004: 0, 0.00004: 0, 0.00005: 0, 0.00006: 0, 0.00007000000000000001: 0, 0.00008: 0, 0.00009: 0, 0.0001: 0, 0.0002: 0, 0.0003: 0, 0.0004: 0, 0.0005: 0, 0.0006000000000000001: 0, 0.0007: 0, 0.0007999999999999999: 0, 0.0009: 0, 0.001: 0, 0.002: 0, 0.003: 0, 0.004: 0, 0.005: 0, 0.006: 0, 0.007: 0, 0.008: 0, 0.009000000000000001: 0, 0.01: 0, 0.020000000000000004: 0, 0.03000000000000001: 0, 0.04000000000000001: 0, 0.05000000000000001: 0, 0.06000000000000001: 0, 0.07: 0, 0.08: 0, 0.09000000000000001: 0, 0.1: 0, 0.2: 0, 0.30000000000000004: 0, 0.4: 0, 0.5: 0, 0.6: 0, 0.7000000000000001: 0, 0.8: 0, 0.9: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 20: 0, 30: 0, 40: 0, 50: 0, 60: 0, 70: 0, 80: 0, 90: 0, 100: 0, 200: 0, 300: 0, 400: 0, 500: 0, 600: 0, 700: 0, 800: 0, 900: 0, 1000: 0, min: 0.000020368, max: 0.000021581, mean: 0.0000209745, std_dev: 0.0000006064999999999992, p50: 0, p90: 0.000021581, p99: 0.000021581]
0x〉exit
```
  • Loading branch information
hawkw authored Aug 26, 2024
1 parent a032d2a commit 6e0bf12
Show file tree
Hide file tree
Showing 6 changed files with 478 additions and 345 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions gateway/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ uuid.workspace = true
omicron-workspace-hack.workspace = true
oximeter.workspace = true
oximeter-producer.workspace = true
oximeter-instruments = { workspace = true, features = ["http-instruments"] }

[dev-dependencies]
expectorate.workspace = true
Expand Down
16 changes: 16 additions & 0 deletions gateway/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@ pub struct ServerContext {
pub mgmt_switch: ManagementSwitch,
pub host_phase2_provider: Arc<InMemoryHostPhase2Provider>,
pub rack_id: OnceLock<Uuid>,
pub latencies: oximeter_instruments::http::LatencyTracker,
pub log: Logger,
}

impl ServerContext {
pub async fn new(
id: Uuid,
host_phase2_provider: Arc<InMemoryHostPhase2Provider>,
switch_config: SwitchConfig,
rack_id_config: Option<Uuid>,
Expand All @@ -37,7 +39,21 @@ impl ServerContext {
OnceLock::new()
};

const START_LATENCY_DECADE: i16 = -6;
const END_LATENCY_DECADE: i16 = 3;
let latencies =
oximeter_instruments::http::LatencyTracker::with_latency_decades(
oximeter_instruments::http::HttpService {
name: "management-gateway-service".into(),
id,
},
START_LATENCY_DECADE,
END_LATENCY_DECADE,
)
.expect("start and end decades are hardcoded and should be valid");

Ok(Arc::new(ServerContext {
latencies,
mgmt_switch,
host_phase2_provider,
rack_id,
Expand Down
Loading

0 comments on commit 6e0bf12

Please sign in to comment.