diff --git a/Cargo.lock b/Cargo.lock index 249b7c5cea..7074e40993 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5983,6 +5983,7 @@ dependencies = [ "omicron-workspace-hack", "once_cell", "oximeter", + "oximeter-instruments", "oximeter-producer", "schemars", "serde", diff --git a/gateway/Cargo.toml b/gateway/Cargo.toml index 2dce15892d..bdf4a911af 100644 --- a/gateway/Cargo.toml +++ b/gateway/Cargo.toml @@ -42,6 +42,7 @@ uuid.workspace = true omicron-workspace-hack.workspace = true oximeter.workspace = true oximeter-producer.workspace = true +oximeter-instruments = { workspace = true, features = ["http-instruments"] } [dev-dependencies] expectorate.workspace = true diff --git a/gateway/src/context.rs b/gateway/src/context.rs index 939bb9b6b9..15592145cf 100644 --- a/gateway/src/context.rs +++ b/gateway/src/context.rs @@ -16,11 +16,13 @@ pub struct ServerContext { pub mgmt_switch: ManagementSwitch, pub host_phase2_provider: Arc, pub rack_id: OnceLock, + pub latencies: oximeter_instruments::http::LatencyTracker, pub log: Logger, } impl ServerContext { pub async fn new( + id: Uuid, host_phase2_provider: Arc, switch_config: SwitchConfig, rack_id_config: Option, @@ -37,7 +39,21 @@ impl ServerContext { OnceLock::new() }; + const START_LATENCY_DECADE: i16 = -6; + const END_LATENCY_DECADE: i16 = 3; + let latencies = + oximeter_instruments::http::LatencyTracker::with_latency_decades( + oximeter_instruments::http::HttpService { + name: "management-gateway-service".into(), + id, + }, + START_LATENCY_DECADE, + END_LATENCY_DECADE, + ) + .expect("start and end decades are hardcoded and should be valid"); + Ok(Arc::new(ServerContext { + latencies, mgmt_switch, host_phase2_provider, rack_id, diff --git a/gateway/src/http_entrypoints.rs b/gateway/src/http_entrypoints.rs index 332f50ed8a..c10e71ad61 100644 --- a/gateway/src/http_entrypoints.rs +++ b/gateway/src/http_entrypoints.rs @@ -81,18 +81,22 @@ impl GatewayApi for GatewayImpl { ) -> Result, HttpError> { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; - let state = sp.state().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let state = sp.state().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + let rot_state = sp + .rot_state(gateway_messages::RotBootInfo::HIGHEST_KNOWN_VERSION) + .await; - let rot_state = sp - .rot_state(gateway_messages::RotBootInfo::HIGHEST_KNOWN_VERSION) - .await; + let final_state = sp_state_from_comms(state, rot_state); - let final_state = sp_state_from_comms(state, rot_state); - Ok(HttpResponseOk(final_state)) + Ok(HttpResponseOk(final_state)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_startup_options_get( @@ -100,15 +104,18 @@ impl GatewayApi for GatewayImpl { path: Path, ) -> Result, HttpError> { let apictx = rqctx.context(); - let mgmt_switch = &apictx.mgmt_switch; - let sp_id = path.into_inner().sp.into(); - let sp = mgmt_switch.sp(sp_id)?; + let handler = async { + let mgmt_switch = &apictx.mgmt_switch; + let sp_id = path.into_inner().sp.into(); + let sp = mgmt_switch.sp(sp_id)?; - let options = sp.get_startup_options().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let options = sp.get_startup_options().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(options.into())) + Ok(HttpResponseOk(options.into())) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_startup_options_set( @@ -119,13 +126,16 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let mgmt_switch = &apictx.mgmt_switch; let sp_id = path.into_inner().sp.into(); - let sp = mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = mgmt_switch.sp(sp_id)?; - sp.set_startup_options(body.into_inner().into()).await.map_err( - |err| SpCommsError::SpCommunicationFailed { sp: sp_id, err }, - )?; + sp.set_startup_options(body.into_inner().into()).await.map_err( + |err| SpCommsError::SpCommunicationFailed { sp: sp_id, err }, + )?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_sensor_read_value( @@ -135,12 +145,17 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpSensorId { sp, sensor_id } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let value = sp.read_sensor_value(sensor_id).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let value = + sp.read_sensor_value(sensor_id).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + Ok(HttpResponseOk(value.into())) + }; - Ok(HttpResponseOk(value.into())) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_list( @@ -149,12 +164,15 @@ impl GatewayApi for GatewayImpl { ) -> Result, HttpError> { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let inventory = sp.inventory().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let inventory = sp.inventory().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(sp_component_list_from_comms(inventory))) + Ok(HttpResponseOk(sp_component_list_from_comms(inventory))) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_get( @@ -164,16 +182,21 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; - - let details = sp.component_details(component).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + + let details = + sp.component_details(component).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + Ok(HttpResponseOk( + details.entries.into_iter().map(Into::into).collect(), + )) + }; - Ok(HttpResponseOk( - details.entries.into_iter().map(Into::into).collect(), - )) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } // Implementation notes: @@ -198,66 +221,79 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let ComponentCabooseSlot { firmware_slot } = query_params.into_inner(); - let component = component_from_str(&component)?; - let from_utf8 = |key: &[u8], bytes| { - // This helper closure is only called with the ascii-printable [u8; 4] - // key constants we define above, so we can unwrap this conversion. - let key = str::from_utf8(key).unwrap(); - String::from_utf8(bytes).map_err(|_| { - http_err_with_message( - http::StatusCode::SERVICE_UNAVAILABLE, - "InvalidCaboose", - format!("non-utf8 data returned for caboose key {key}"), + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let ComponentCabooseSlot { firmware_slot } = + query_params.into_inner(); + let component = component_from_str(&component)?; + + let from_utf8 = |key: &[u8], bytes| { + // This helper closure is only called with the ascii-printable [u8; 4] + // key constants we define above, so we can unwrap this conversion. + let key = str::from_utf8(key).unwrap(); + String::from_utf8(bytes).map_err(|_| { + http_err_with_message( + http::StatusCode::SERVICE_UNAVAILABLE, + "InvalidCaboose", + format!("non-utf8 data returned for caboose key {key}"), + ) + }) + }; + + let git_commit = + sp.read_component_caboose( + component, + firmware_slot, + CABOOSE_KEY_GIT_COMMIT, ) - }) - }; + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + let board = + sp.read_component_caboose( + component, + firmware_slot, + CABOOSE_KEY_BOARD, + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + let name = + sp.read_component_caboose( + component, + firmware_slot, + CABOOSE_KEY_NAME, + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + let version = + sp.read_component_caboose( + component, + firmware_slot, + CABOOSE_KEY_VERSION, + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - let git_commit = - sp.read_component_caboose( - component, - firmware_slot, - CABOOSE_KEY_GIT_COMMIT, - ) - .await - .map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; - let board = sp - .read_component_caboose(component, firmware_slot, CABOOSE_KEY_BOARD) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; - let name = sp - .read_component_caboose(component, firmware_slot, CABOOSE_KEY_NAME) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; - let version = - sp.read_component_caboose( - component, - firmware_slot, - CABOOSE_KEY_VERSION, - ) - .await - .map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let git_commit = from_utf8(&CABOOSE_KEY_GIT_COMMIT, git_commit)?; + let board = from_utf8(&CABOOSE_KEY_BOARD, board)?; + let name = from_utf8(&CABOOSE_KEY_NAME, name)?; + let version = from_utf8(&CABOOSE_KEY_VERSION, version)?; - let git_commit = from_utf8(&CABOOSE_KEY_GIT_COMMIT, git_commit)?; - let board = from_utf8(&CABOOSE_KEY_BOARD, board)?; - let name = from_utf8(&CABOOSE_KEY_NAME, name)?; - let version = from_utf8(&CABOOSE_KEY_VERSION, version)?; + let caboose = + SpComponentCaboose { git_commit, board, name, version }; - let caboose = SpComponentCaboose { git_commit, board, name, version }; + Ok(HttpResponseOk(caboose)) + }; - Ok(HttpResponseOk(caboose)) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_clear_status( @@ -267,14 +303,18 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; - sp.component_clear_status(component).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + sp.component_clear_status(component).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + Ok(HttpResponseUpdatedNoContent {}) + }; - Ok(HttpResponseUpdatedNoContent {}) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_active_slot_get( @@ -284,15 +324,18 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; - let slot = - sp.component_active_slot(component).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let slot = + sp.component_active_slot(component).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(SpComponentFirmwareSlot { slot })) + Ok(HttpResponseOk(SpComponentFirmwareSlot { slot })) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_active_slot_set( @@ -304,16 +347,22 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; - let slot = body.into_inner().slot; - let persist = query_params.into_inner().persist; - - sp.set_component_active_slot(component, slot, persist).await.map_err( - |err| SpCommsError::SpCommunicationFailed { sp: sp_id, err }, - )?; - - Ok(HttpResponseUpdatedNoContent {}) + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + let slot = body.into_inner().slot; + let persist = query_params.into_inner().persist; + + sp.set_component_active_slot(component, slot, persist) + .await + .map_err(|err| SpCommsError::SpCommunicationFailed { + sp: sp_id, + err, + })?; + + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_serial_console_attach( @@ -321,6 +370,10 @@ impl GatewayApi for GatewayImpl { path: Path, websocket: WebsocketUpgrade, ) -> WebsocketEndpointResult { + // TODO(eliza): I'm not sure whether there's a way to make + // `oximeter_instruments`'s HTTP latency tracker work with websockets + // requests? It would be nice to get the latency and any error returned + // prior to actually returning the websocket stream... let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); @@ -356,13 +409,15 @@ impl GatewayApi for GatewayImpl { // we don't use it at all to detach. let PathSpComponent { sp, component: _ } = path.into_inner(); let sp_id = sp.into(); + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + sp.serial_console_detach().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - let sp = apictx.mgmt_switch.sp(sp_id)?; - sp.serial_console_detach().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; - - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_reset( @@ -372,20 +427,23 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; - - sp.reset_component_prepare(component) - // We always want to run with the watchdog when resetting as - // disabling the watchdog should be considered a debug only feature - .and_then(|()| sp.reset_component_trigger(component, false)) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; - - Ok(HttpResponseUpdatedNoContent {}) + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + + sp.reset_component_prepare(component) + // We always want to run with the watchdog when resetting as + // disabling the watchdog should be considered a debug only feature + .and_then(|()| sp.reset_component_trigger(component, false)) + .await + .map_err(|err| SpCommsError::SpCommunicationFailed { + sp: sp_id, + err, + })?; + + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_update( @@ -398,19 +456,22 @@ impl GatewayApi for GatewayImpl { let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; - let ComponentUpdateIdSlot { id, firmware_slot } = - query_params.into_inner(); + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + let ComponentUpdateIdSlot { id, firmware_slot } = + query_params.into_inner(); - // TODO-performance: this makes a full copy of the uploaded data - let image = body.as_bytes().to_vec(); + // TODO-performance: this makes a full copy of the uploaded data + let image = body.as_bytes().to_vec(); - sp.start_update(component, id, firmware_slot, image) - .await - .map_err(|err| SpCommsError::UpdateFailed { sp: sp_id, err })?; + sp.start_update(component, id, firmware_slot, image) + .await + .map_err(|err| SpCommsError::UpdateFailed { sp: sp_id, err })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_update_status( @@ -421,14 +482,17 @@ impl GatewayApi for GatewayImpl { let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; - let status = sp.update_status(component).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let status = sp.update_status(component).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(status.into())) + Ok(HttpResponseOk(status.into())) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_update_abort( @@ -440,15 +504,18 @@ impl GatewayApi for GatewayImpl { let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; - let UpdateAbortBody { id } = body.into_inner(); - sp.update_abort(component, id).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let UpdateAbortBody { id } = body.into_inner(); + sp.update_abort(component, id).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_rot_cmpa_get( @@ -459,24 +526,26 @@ impl GatewayApi for GatewayImpl { let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); + let handler = async { + // Ensure the caller knows they're asking for the RoT + if component_from_str(&component)? != SpComponent::ROT { + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "Only the RoT has a CFPA".into(), + )); + } + + let sp = apictx.mgmt_switch.sp(sp_id)?; + let data = sp.read_rot_cmpa().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - // Ensure the caller knows they're asking for the RoT - if component_from_str(&component)? != SpComponent::ROT { - return Err(HttpError::for_bad_request( - Some("RequestUnsupportedForComponent".to_string()), - "Only the RoT has a CFPA".into(), - )); - } - - let sp = apictx.mgmt_switch.sp(sp_id)?; - let data = sp.read_rot_cmpa().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; - - let base64_data = - base64::engine::general_purpose::STANDARD.encode(data); + let base64_data = + base64::engine::general_purpose::STANDARD.encode(data); - Ok(HttpResponseOk(RotCmpa { base64_data })) + Ok(HttpResponseOk(RotCmpa { base64_data })) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_rot_cfpa_get( @@ -490,29 +559,32 @@ impl GatewayApi for GatewayImpl { let GetCfpaParams { slot } = params.into_inner(); let sp_id = sp.into(); - // Ensure the caller knows they're asking for the RoT - if component_from_str(&component)? != SpComponent::ROT { - return Err(HttpError::for_bad_request( - Some("RequestUnsupportedForComponent".to_string()), - "Only the RoT has a CFPA".into(), - )); - } + let handler = async { + // Ensure the caller knows they're asking for the RoT + if component_from_str(&component)? != SpComponent::ROT { + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "Only the RoT has a CFPA".into(), + )); + } + + let sp = apictx.mgmt_switch.sp(sp_id)?; + let data = match slot { + RotCfpaSlot::Active => sp.read_rot_active_cfpa().await, + RotCfpaSlot::Inactive => sp.read_rot_inactive_cfpa().await, + RotCfpaSlot::Scratch => sp.read_rot_scratch_cfpa().await, + } + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - let sp = apictx.mgmt_switch.sp(sp_id)?; - let data = match slot { - RotCfpaSlot::Active => sp.read_rot_active_cfpa().await, - RotCfpaSlot::Inactive => sp.read_rot_inactive_cfpa().await, - RotCfpaSlot::Scratch => sp.read_rot_scratch_cfpa().await, - } - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; + let base64_data = + base64::engine::general_purpose::STANDARD.encode(data); - let base64_data = - base64::engine::general_purpose::STANDARD.encode(data); + Ok(HttpResponseOk(RotCfpa { base64_data, slot })) + }; - Ok(HttpResponseOk(RotCfpa { base64_data, slot })) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_rot_boot_info( @@ -526,20 +598,24 @@ impl GatewayApi for GatewayImpl { let GetRotBootInfoParams { version } = params.into_inner(); let sp_id = sp.into(); - // Ensure the caller knows they're asking for the RoT - if component_from_str(&component)? != SpComponent::ROT { - return Err(HttpError::for_bad_request( - Some("RequestUnsupportedForComponent".to_string()), - "rot_boot_info only makes sent for a RoT".into(), - )); - } + let handler = async { + // Ensure the caller knows they're asking for the RoT + if component_from_str(&component)? != SpComponent::ROT { + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "rot_boot_info only makes sent for a RoT".into(), + )); + } + + let sp = apictx.mgmt_switch.sp(sp_id)?; + let state = sp.rot_state(version).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - let sp = apictx.mgmt_switch.sp(sp_id)?; - let state = sp.rot_state(version).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + Ok(HttpResponseOk(state.into())) + }; - Ok(HttpResponseOk(state.into())) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn ignition_list( @@ -547,17 +623,19 @@ impl GatewayApi for GatewayImpl { ) -> Result>, HttpError> { let apictx = rqctx.context(); let mgmt_switch = &apictx.mgmt_switch; - - let out = mgmt_switch - .bulk_ignition_state() - .await? - .map(|(id, state)| SpIgnitionInfo { - id: id.into(), - details: state.into(), - }) - .collect(); - - Ok(HttpResponseOk(out)) + let handler = async { + let out = mgmt_switch + .bulk_ignition_state() + .await? + .map(|(id, state)| SpIgnitionInfo { + id: id.into(), + details: state.into(), + }) + .collect(); + + Ok(HttpResponseOk(out)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn ignition_get( @@ -568,19 +646,23 @@ impl GatewayApi for GatewayImpl { let mgmt_switch = &apictx.mgmt_switch; let sp_id = path.into_inner().sp.into(); - let ignition_target = mgmt_switch.ignition_target(sp_id)?; - - let state = mgmt_switch - .ignition_controller() - .ignition_state(ignition_target) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; - - let info = SpIgnitionInfo { id: sp_id.into(), details: state.into() }; - Ok(HttpResponseOk(info)) + let handler = async { + let ignition_target = mgmt_switch.ignition_target(sp_id)?; + + let state = mgmt_switch + .ignition_controller() + .ignition_state(ignition_target) + .await + .map_err(|err| SpCommsError::SpCommunicationFailed { + sp: sp_id, + err, + })?; + + let info = + SpIgnitionInfo { id: sp_id.into(), details: state.into() }; + Ok(HttpResponseOk(info)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn ignition_command( @@ -591,18 +673,22 @@ impl GatewayApi for GatewayImpl { let mgmt_switch = &apictx.mgmt_switch; let PathSpIgnitionCommand { sp, command } = path.into_inner(); let sp_id = sp.into(); - let ignition_target = mgmt_switch.ignition_target(sp_id)?; - mgmt_switch - .ignition_controller() - .ignition_command(ignition_target, command.into()) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; + let handler = async { + let ignition_target = mgmt_switch.ignition_target(sp_id)?; - Ok(HttpResponseUpdatedNoContent {}) + mgmt_switch + .ignition_controller() + .ignition_command(ignition_target, command.into()) + .await + .map_err(|err| SpCommsError::SpCommunicationFailed { + sp: sp_id, + err, + })?; + + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_power_state_get( @@ -611,13 +697,16 @@ impl GatewayApi for GatewayImpl { ) -> Result, HttpError> { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; - let power_state = sp.power_state().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let power_state = sp.power_state().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(power_state.into())) + Ok(HttpResponseOk(power_state.into())) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_power_state_set( @@ -627,14 +716,17 @@ impl GatewayApi for GatewayImpl { ) -> Result { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let power_state = body.into_inner(); + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let power_state = body.into_inner(); - sp.set_power_state(power_state.into()).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + sp.set_power_state(power_state.into()).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_installinator_image_id_set( @@ -646,21 +738,23 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; - let image_id = ipcc::InstallinatorImageId::from(body.into_inner()); + let image_id = ipcc::InstallinatorImageId::from(body.into_inner()); - sp.set_ipcc_key_lookup_value( - Key::InstallinatorImageId as u8, - image_id.serialize(), - ) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; + sp.set_ipcc_key_lookup_value( + Key::InstallinatorImageId as u8, + image_id.serialize(), + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_installinator_image_id_delete( @@ -671,20 +765,22 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; - // We clear the image ID by setting it to a 0-length vec. - sp.set_ipcc_key_lookup_value( - Key::InstallinatorImageId as u8, - Vec::new(), - ) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; + // We clear the image ID by setting it to a 0-length vec. + sp.set_ipcc_key_lookup_value( + Key::InstallinatorImageId as u8, + Vec::new(), + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_host_phase2_progress_get( @@ -692,37 +788,41 @@ impl GatewayApi for GatewayImpl { path: Path, ) -> Result, HttpError> { let apictx = rqctx.context(); - let sp = apictx.mgmt_switch.sp(path.into_inner().sp.into())?; - - let Some(progress) = sp.most_recent_host_phase2_request().await else { - return Ok(HttpResponseOk(HostPhase2Progress::None)); - }; - - // Our `host_phase2_provider` is using an in-memory cache, so the only way - // we can fail to get the total size is if we no longer have the image that - // this SP most recently requested. We'll treat that as "no progress - // information", since it almost certainly means our progress info on this - // SP is very stale. - let Ok(total_size) = - apictx.host_phase2_provider.total_size(progress.hash).await - else { - return Ok(HttpResponseOk(HostPhase2Progress::None)); - }; - - let image_id = HostPhase2RecoveryImageId { - sha256_hash: ArtifactHash(progress.hash), + let handler = async { + let sp = apictx.mgmt_switch.sp(path.into_inner().sp.into())?; + + let Some(progress) = sp.most_recent_host_phase2_request().await + else { + return Ok(HttpResponseOk(HostPhase2Progress::None)); + }; + + // Our `host_phase2_provider` is using an in-memory cache, so the only way + // we can fail to get the total size is if we no longer have the image that + // this SP most recently requested. We'll treat that as "no progress + // information", since it almost certainly means our progress info on this + // SP is very stale. + let Ok(total_size) = + apictx.host_phase2_provider.total_size(progress.hash).await + else { + return Ok(HttpResponseOk(HostPhase2Progress::None)); + }; + + let image_id = HostPhase2RecoveryImageId { + sha256_hash: ArtifactHash(progress.hash), + }; + + // `progress` tells us the offset the SP requested and the amount of data we + // sent starting at that offset; report the end of that chunk to our caller. + let offset = progress.offset.saturating_add(progress.data_sent); + + Ok(HttpResponseOk(HostPhase2Progress::Available { + image_id, + offset, + total_size, + age: progress.received.elapsed(), + })) }; - - // `progress` tells us the offset the SP requested and the amount of data we - // sent starting at that offset; report the end of that chunk to our caller. - let offset = progress.offset.saturating_add(progress.data_sent); - - Ok(HttpResponseOk(HostPhase2Progress::Available { - image_id, - offset, - total_size, - age: progress.received.elapsed(), - })) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_host_phase2_progress_delete( @@ -730,11 +830,14 @@ impl GatewayApi for GatewayImpl { path: Path, ) -> Result { let apictx = rqctx.context(); - let sp = apictx.mgmt_switch.sp(path.into_inner().sp.into())?; + let handler = async { + let sp = apictx.mgmt_switch.sp(path.into_inner().sp.into())?; - sp.clear_most_recent_host_phase2_request().await; + sp.clear_most_recent_host_phase2_request().await; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn recovery_host_phase2_upload( @@ -742,44 +845,55 @@ impl GatewayApi for GatewayImpl { body: UntypedBody, ) -> Result, HttpError> { let apictx = rqctx.context(); - - // TODO: this makes a full copy of the host image, potentially unnecessarily - // if it's malformed. - let image = body.as_bytes().to_vec(); - - let sha256_hash = - apictx.host_phase2_provider.insert(image).await.map_err(|err| { - // Any cache-insertion failure indicates a malformed image; map them - // to bad requests. - HttpError::for_bad_request( - Some("BadHostPhase2Image".to_string()), - err.to_string(), - ) - })?; - let sha256_hash = ArtifactHash(sha256_hash); - - Ok(HttpResponseOk(HostPhase2RecoveryImageId { sha256_hash })) + let handler = async { + // TODO: this makes a full copy of the host image, potentially unnecessarily + // if it's malformed. + let image = body.as_bytes().to_vec(); + + let sha256_hash = + apictx.host_phase2_provider.insert(image).await.map_err( + |err| { + // Any cache-insertion failure indicates a malformed image; map them + // to bad requests. + HttpError::for_bad_request( + Some("BadHostPhase2Image".to_string()), + err.to_string(), + ) + }, + )?; + let sha256_hash = ArtifactHash(sha256_hash); + + Ok(HttpResponseOk(HostPhase2RecoveryImageId { sha256_hash })) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_local_switch_id( rqctx: RequestContext, ) -> Result, HttpError> { let apictx = rqctx.context(); + let handler = async { + let id = apictx.mgmt_switch.local_switch()?; - let id = apictx.mgmt_switch.local_switch()?; - - Ok(HttpResponseOk(id.into())) + Ok(HttpResponseOk(id.into())) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_all_ids( rqctx: RequestContext, ) -> Result>, HttpError> { let apictx = rqctx.context(); - - let all_ids = - apictx.mgmt_switch.all_sps()?.map(|(id, _)| id.into()).collect(); - - Ok(HttpResponseOk(all_ids)) + let handler = async { + let all_ids = apictx + .mgmt_switch + .all_sps()? + .map(|(id, _)| id.into()) + .collect(); + + Ok(HttpResponseOk(all_ids)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } } diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs index 8e764dc63f..e07df0cfb9 100644 --- a/gateway/src/lib.rs +++ b/gateway/src/lib.rs @@ -143,6 +143,7 @@ impl Server { config.host_phase2_recovery_image_cache_max_images, )); let apictx = ServerContext::new( + args.id, host_phase2_provider, config.switch, args.rack_id, @@ -306,15 +307,6 @@ impl Server { warn!(self.apictx.log, "SMF refresh called without a rack id"); } } - - // TODO does MGS register itself with oximeter? - // Register the Nexus server as a metric producer with `oximeter. - // pub async fn register_as_producer(&self) { - // self.apictx - // .nexus - // .register_as_producer(self.http_server_internal.local_addr()) - // .await; - // } } /// Start an instance of the [Server]. @@ -337,6 +329,5 @@ pub async fn start_server( debug!(log, "registered DTrace probes"); } let server = Server::start(config, args, log).await?; - // server.register_as_producer().await; Ok(server) } diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs index d4e0795ae0..7c133f5d97 100644 --- a/gateway/src/metrics.rs +++ b/gateway/src/metrics.rs @@ -242,6 +242,7 @@ impl Metrics { let server = { let log = log.new(slog::o!("component" => "producer-server")); let registry = ProducerRegistry::with_id(id); + // Register the producer for SP sensor metrics. registry .register_producer(Producer { sample_rx, log: log.clone() }) // TODO(ben): when you change `register_producer` to not return @@ -251,6 +252,15 @@ impl Metrics { actually return an `Err`, so this shouldn't ever \ happen...", ); + // Also, register the producer for the HTTP API metrics. + registry + .register_producer(apictx.latencies.clone()) + // TODO(ben): do this one too pls + .expect( + "`ProducerRegistry::register_producer()` will never \ + actually return an `Err`, so this shouldn't ever \ + happen...", + ); tokio::spawn( ServerManager { log, addrs: addrs_rx, registry }.run(cfg),