From 39ea9703b8e6d4292f8af84123919e4b7edf157e Mon Sep 17 00:00:00 2001 From: Davide Miola Date: Tue, 22 Aug 2023 12:15:58 +0000 Subject: [PATCH] Add io_uring io thread accounting --- main/Cargo.toml | 1 + main/src/actors/trace_analyzer.rs | 3 ++- main/src/bpf/prog.bpf.c | 40 +++++++++++++++++++++++-------- main/src/bpf/prog.bpf.h | 9 ++++++- main/src/common.rs | 35 +++++++++++++++++++++++---- main/src/ksyms.rs | 2 +- web-frontend/src/plot.rs | 1 + 7 files changed, 73 insertions(+), 18 deletions(-) diff --git a/main/Cargo.toml b/main/Cargo.toml index 394afeb..1b2f21c 100644 --- a/main/Cargo.toml +++ b/main/Cargo.toml @@ -23,4 +23,5 @@ libbpf-cargo = "0.20" bindgen = "0.64" [features] +#default = ["save-traces"] save-traces = [] diff --git a/main/src/actors/trace_analyzer.rs b/main/src/actors/trace_analyzer.rs index 612555c..7c699fc 100644 --- a/main/src/actors/trace_analyzer.rs +++ b/main/src/actors/trace_analyzer.rs @@ -4,7 +4,7 @@ use anyhow::anyhow; use libbpf_rs::MapFlags; use powercap::{IntelRapl, PowerCap}; use tokio::sync::mpsc::Sender; -use crate::{ksyms::{Counts, KSyms}, common::{event_types_EVENT_MAX, self, event_types_EVENT_SOCK_SENDMSG, event_types_EVENT_NET_TX_SOFTIRQ, event_types_EVENT_NET_RX_SOFTIRQ, event_types_EVENT_SOCK_RECVMSG}, bpf::ProgSkel}; +use crate::{ksyms::{Counts, KSyms}, common::{event_types_EVENT_MAX, self, event_types_EVENT_SOCK_SENDMSG, event_types_EVENT_NET_TX_SOFTIRQ, event_types_EVENT_NET_RX_SOFTIRQ, event_types_EVENT_SOCK_RECVMSG, event_types_EVENT_IO_WORKER}, bpf::ProgSkel}; use libc::{mmap, PROT_READ, MAP_SHARED, sysconf, _SC_CLK_TCK}; use super::{metrics_collector::MetricsCollector, MetricUpdate, SubmitUpdate}; #[cfg(feature = "save-traces")] @@ -206,6 +206,7 @@ impl TraceAnalyzer { event_types_EVENT_SOCK_SENDMSG => "TX syscalls", event_types_EVENT_SOCK_RECVMSG => "RX syscalls", event_types_EVENT_NET_TX_SOFTIRQ => "TX softirq", + event_types_EVENT_IO_WORKER => "IO workers", event_types_EVENT_NET_RX_SOFTIRQ => { // Update sub-events let denominator = counts[cpuid].net_rx_action.max(1) as f64; diff --git a/main/src/bpf/prog.bpf.c b/main/src/bpf/prog.bpf.c index 138cc27..315a558 100644 --- a/main/src/bpf/prog.bpf.c +++ b/main/src/bpf/prog.bpf.c @@ -75,8 +75,14 @@ u32 stack_traces_slot_off = 0; const u64 event_max = EVENT_MAX; inline void stop_event(u64 per_task_events, struct per_cpu_data* per_cpu_data, u64 now) { - if (per_task_events < EVENT_MAX) - per_cpu_data->per_event_total_time[per_task_events] += now - per_cpu_data->entry_ts; + u64 t; + + if (per_task_events < EVENT_MAX) { + t = now - per_cpu_data->entry_ts; + + per_cpu_data->per_event_total_time[per_task_events] += t; + per_cpu_data->sched_switch_accounted_time += t; + } } SEC("fentry/sock_sendmsg") @@ -100,14 +106,17 @@ SEC("fexit/sock_sendmsg") int BPF_PROG(sock_sendmsg_exit) { u32 zero = 0; struct per_cpu_data* per_cpu_data; - u64* per_task_events, now = bpf_ktime_get_ns(); + u64* per_task_events, now = bpf_ktime_get_ns(), t; if ( likely((per_task_events = bpf_task_storage_get(&traced_pids, bpf_get_current_task_btf(), NULL, 0)) != NULL) && likely((per_cpu_data = bpf_map_lookup_elem(&per_cpu, &zero)) != NULL) ) { + t = now - per_cpu_data->entry_ts; + *per_task_events = EVENT_MAX; - per_cpu_data->per_event_total_time[EVENT_SOCK_SENDMSG] += now - per_cpu_data->entry_ts; + per_cpu_data->per_event_total_time[EVENT_SOCK_SENDMSG] += t; + per_cpu_data->sched_switch_accounted_time += t; } return 0; @@ -134,14 +143,17 @@ SEC("fexit/sock_recvmsg") int BPF_PROG(sock_recvmsg_exit) { u32 zero = 0; struct per_cpu_data* per_cpu_data; - u64* per_task_events, now = bpf_ktime_get_ns(); + u64* per_task_events, now = bpf_ktime_get_ns(), t; if ( likely((per_task_events = bpf_task_storage_get(&traced_pids, bpf_get_current_task_btf(), NULL, 0)) != NULL) && likely((per_cpu_data = bpf_map_lookup_elem(&per_cpu, &zero)) != NULL) ) { + t = now - per_cpu_data->entry_ts; + *per_task_events = EVENT_MAX; - per_cpu_data->per_event_total_time[EVENT_SOCK_RECVMSG] += now - per_cpu_data->entry_ts; + per_cpu_data->per_event_total_time[EVENT_SOCK_RECVMSG] += t; + per_cpu_data->sched_switch_accounted_time += t; } return 0; @@ -170,25 +182,28 @@ SEC("tp_btf/softirq_exit") int BPF_PROG(net_rx_softirq_exit, unsigned int vec) { u32 zero = 0; struct per_cpu_data* per_cpu_data; - u64* per_task_events, now = bpf_ktime_get_ns(); + u64* per_task_events, now = bpf_ktime_get_ns(), t; if ( (vec == NET_TX_SOFTIRQ || vec == NET_RX_SOFTIRQ) && likely((per_task_events = bpf_task_storage_get(&traced_pids, bpf_get_current_task_btf(), NULL, 0)) != NULL) && likely((per_cpu_data = bpf_map_lookup_elem(&per_cpu, &zero)) != NULL) ) { + t = now - per_cpu_data->entry_ts; + // Convoluted expression makes the verifier happy switch (vec) { case NET_TX_SOFTIRQ: - per_cpu_data->per_event_total_time[EVENT_NET_TX_SOFTIRQ] += now - per_cpu_data->entry_ts; + per_cpu_data->per_event_total_time[EVENT_NET_TX_SOFTIRQ] += t; break; default: case NET_RX_SOFTIRQ: - per_cpu_data->per_event_total_time[EVENT_NET_RX_SOFTIRQ] += now - per_cpu_data->entry_ts; + per_cpu_data->per_event_total_time[EVENT_NET_RX_SOFTIRQ] += t; per_cpu_data->enable_stack_trace = 0; } - + + per_cpu_data->sched_switch_accounted_time += t; if (*per_task_events != EVENT_MAX) per_cpu_data->entry_ts = now; } @@ -208,6 +223,11 @@ int BPF_PROG(tp_sched_switch, bool preempt, struct task_struct* prev, struct tas if (likely(per_cpu_data != NULL)) { if (prev_task_events != NULL) stop_event(*prev_task_events, per_cpu_data, now); if (next_task_events != NULL && *next_task_events != EVENT_MAX) per_cpu_data->entry_ts = now; + + if (prev->flags & 0x10 /* PF_IO_WORKER */) + per_cpu_data->per_event_total_time[EVENT_IO_WORKER] += now - per_cpu_data->sched_switch_ts - per_cpu_data->sched_switch_accounted_time; + per_cpu_data->sched_switch_ts = now; + per_cpu_data->sched_switch_accounted_time = 0; } return 0; diff --git a/main/src/bpf/prog.bpf.h b/main/src/bpf/prog.bpf.h index d435211..460b97d 100644 --- a/main/src/bpf/prog.bpf.h +++ b/main/src/bpf/prog.bpf.h @@ -8,14 +8,21 @@ enum event_types { EVENT_SOCK_RECVMSG = 1, EVENT_NET_TX_SOFTIRQ = 2, EVENT_NET_RX_SOFTIRQ = 3, + EVENT_IO_WORKER = 4, - EVENT_MAX = 4 + EVENT_MAX = 5 }; struct per_cpu_data { /// @brief Latest entry timestamp to any event in ns u64 entry_ts; + /// @brief Latest scheduler switch timestamp + u64 sched_switch_ts; + + /// @brief Total CPU time accounted to various events since the last scheduler switch + u64 sched_switch_accounted_time; + /// @brief Total time in ns registered for each event u64 per_event_total_time[EVENT_MAX]; diff --git a/main/src/common.rs b/main/src/common.rs index 66631d1..56c9a96 100644 --- a/main/src/common.rs +++ b/main/src/common.rs @@ -8,15 +8,20 @@ pub const event_types_EVENT_SOCK_SENDMSG: event_types = 0; pub const event_types_EVENT_SOCK_RECVMSG: event_types = 1; pub const event_types_EVENT_NET_TX_SOFTIRQ: event_types = 2; pub const event_types_EVENT_NET_RX_SOFTIRQ: event_types = 3; -pub const event_types_EVENT_MAX: event_types = 4; +pub const event_types_EVENT_IO_WORKER: event_types = 4; +pub const event_types_EVENT_MAX: event_types = 5; pub type event_types = ::std::os::raw::c_uint; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct per_cpu_data { #[doc = " @brief Latest entry timestamp to any event in ns"] pub entry_ts: u64_, + #[doc = " @brief Latest scheduler switch timestamp"] + pub sched_switch_ts: u64_, + #[doc = " @brief Total CPU time accounted to various events since the last scheduler switch"] + pub sched_switch_accounted_time: u64_, #[doc = " @brief Total time in ns registered for each event"] - pub per_event_total_time: [u64_; 4usize], + pub per_event_total_time: [u64_; 5usize], #[doc = " @brief When non-zero, stack traces by the perf event prog are enabled"] pub enable_stack_trace: u8_, } @@ -26,7 +31,7 @@ fn bindgen_test_layout_per_cpu_data() { let ptr = UNINIT.as_ptr(); assert_eq!( ::std::mem::size_of::(), - 48usize, + 72usize, concat!("Size of: ", stringify!(per_cpu_data)) ); assert_eq!( @@ -45,8 +50,28 @@ fn bindgen_test_layout_per_cpu_data() { ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).per_event_total_time) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).sched_switch_ts) as usize - ptr as usize }, 8usize, + concat!( + "Offset of field: ", + stringify!(per_cpu_data), + "::", + stringify!(sched_switch_ts) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).sched_switch_accounted_time) as usize - ptr as usize }, + 16usize, + concat!( + "Offset of field: ", + stringify!(per_cpu_data), + "::", + stringify!(sched_switch_accounted_time) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).per_event_total_time) as usize - ptr as usize }, + 24usize, concat!( "Offset of field: ", stringify!(per_cpu_data), @@ -56,7 +81,7 @@ fn bindgen_test_layout_per_cpu_data() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).enable_stack_trace) as usize - ptr as usize }, - 40usize, + 64usize, concat!( "Offset of field: ", stringify!(per_cpu_data), diff --git a/main/src/ksyms.rs b/main/src/ksyms.rs index ffee601..658d02b 100644 --- a/main/src/ksyms.rs +++ b/main/src/ksyms.rs @@ -75,7 +75,7 @@ impl KSyms { "__napi_poll" => Option::::Some(Box::new( |cnt, _| Some(&mut cnt.__napi_poll) )), - "netif_receive_skb" | "netif_receive_skb_core" | "netif_receive_skb_list_internal" => Option::::Some(Box::new( + "netif_receive_skb" | "netif_receive_skb_core" | "netif_receive_skb_list_internal" | "__netif_receive_skb" => Option::::Some(Box::new( |cnt, PerFrameProps { in_nf_hook, .. }| { cnt.nf_netdev_ingress = cnt.nf_netdev_ingress.max(std::mem::take(in_nf_hook)); Some(&mut cnt.netif_receive_skb) diff --git a/web-frontend/src/plot.rs b/web-frontend/src/plot.rs index 67c3e71..b9b4540 100644 --- a/web-frontend/src/plot.rs +++ b/web-frontend/src/plot.rs @@ -144,6 +144,7 @@ fn draw_bar1( (palette::GREEN_800.into(), WHITE.into()), // RX syscalls (CYAN.into(), BLACK.into()), // TX softirq (palette::PURPLE.into(), WHITE.into()), // RX softirq + (palette::PINK_A200.into(), WHITE.into()), // IO workers (BLACK.mix(0.2), BLACK.into()) // other ]; let stack = metrics.top_level_metrics