Skip to content

Commit

Permalink
Add io_uring io thread accounting
Browse files Browse the repository at this point in the history
  • Loading branch information
miolad committed Aug 22, 2023
1 parent 589feae commit 39ea970
Show file tree
Hide file tree
Showing 7 changed files with 73 additions and 18 deletions.
1 change: 1 addition & 0 deletions main/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,5 @@ libbpf-cargo = "0.20"
bindgen = "0.64"

[features]
#default = ["save-traces"]
save-traces = []
3 changes: 2 additions & 1 deletion main/src/actors/trace_analyzer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use anyhow::anyhow;
use libbpf_rs::MapFlags;
use powercap::{IntelRapl, PowerCap};
use tokio::sync::mpsc::Sender;
use crate::{ksyms::{Counts, KSyms}, common::{event_types_EVENT_MAX, self, event_types_EVENT_SOCK_SENDMSG, event_types_EVENT_NET_TX_SOFTIRQ, event_types_EVENT_NET_RX_SOFTIRQ, event_types_EVENT_SOCK_RECVMSG}, bpf::ProgSkel};
use crate::{ksyms::{Counts, KSyms}, common::{event_types_EVENT_MAX, self, event_types_EVENT_SOCK_SENDMSG, event_types_EVENT_NET_TX_SOFTIRQ, event_types_EVENT_NET_RX_SOFTIRQ, event_types_EVENT_SOCK_RECVMSG, event_types_EVENT_IO_WORKER}, bpf::ProgSkel};
use libc::{mmap, PROT_READ, MAP_SHARED, sysconf, _SC_CLK_TCK};
use super::{metrics_collector::MetricsCollector, MetricUpdate, SubmitUpdate};
#[cfg(feature = "save-traces")]
Expand Down Expand Up @@ -206,6 +206,7 @@ impl TraceAnalyzer {
event_types_EVENT_SOCK_SENDMSG => "TX syscalls",
event_types_EVENT_SOCK_RECVMSG => "RX syscalls",
event_types_EVENT_NET_TX_SOFTIRQ => "TX softirq",
event_types_EVENT_IO_WORKER => "IO workers",
event_types_EVENT_NET_RX_SOFTIRQ => {
// Update sub-events
let denominator = counts[cpuid].net_rx_action.max(1) as f64;
Expand Down
40 changes: 30 additions & 10 deletions main/src/bpf/prog.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,14 @@ u32 stack_traces_slot_off = 0;
const u64 event_max = EVENT_MAX;

/* Close the event window opened at per_cpu_data->entry_ts.
 *
 * Charges the elapsed time to the per-event total AND records it in
 * sched_switch_accounted_time, so the sched_switch hook can later
 * subtract time already attributed to explicit events when charging the
 * remainder of a task's slice to io_uring workers (EVENT_IO_WORKER).
 *
 * per_task_events == EVENT_MAX means "no event in progress"; nothing is
 * accounted in that case. */
inline void stop_event(u64 per_task_events, struct per_cpu_data* per_cpu_data, u64 now) {
    u64 t;

    if (per_task_events < EVENT_MAX) {
        t = now - per_cpu_data->entry_ts;

        per_cpu_data->per_event_total_time[per_task_events] += t;
        per_cpu_data->sched_switch_accounted_time += t;
    }
}

SEC("fentry/sock_sendmsg")
Expand All @@ -100,14 +106,17 @@ SEC("fexit/sock_sendmsg")
/* fexit probe for sock_sendmsg: closes the TX-syscall window for the
 * current traced task. Only tasks present in the traced_pids task-storage
 * map are accounted. */
int BPF_PROG(sock_sendmsg_exit) {
    u32 zero = 0;
    struct per_cpu_data* per_cpu_data;
    u64* per_task_events, now = bpf_ktime_get_ns(), t;

    if (
        likely((per_task_events = bpf_task_storage_get(&traced_pids, bpf_get_current_task_btf(), NULL, 0)) != NULL) &&
        likely((per_cpu_data = bpf_map_lookup_elem(&per_cpu, &zero)) != NULL)
    ) {
        t = now - per_cpu_data->entry_ts;

        /* Mark "no event in progress", charge the elapsed time to the
         * TX-syscall bucket, and record it as already accounted so the
         * sched_switch io_uring-worker bookkeeping doesn't count it twice. */
        *per_task_events = EVENT_MAX;
        per_cpu_data->per_event_total_time[EVENT_SOCK_SENDMSG] += t;
        per_cpu_data->sched_switch_accounted_time += t;
    }

    return 0;
}
Expand All @@ -134,14 +143,17 @@ SEC("fexit/sock_recvmsg")
/* fexit probe for sock_recvmsg: closes the RX-syscall window for the
 * current traced task. Mirror of sock_sendmsg_exit for the receive path. */
int BPF_PROG(sock_recvmsg_exit) {
    u32 zero = 0;
    struct per_cpu_data* per_cpu_data;
    u64* per_task_events, now = bpf_ktime_get_ns(), t;

    if (
        likely((per_task_events = bpf_task_storage_get(&traced_pids, bpf_get_current_task_btf(), NULL, 0)) != NULL) &&
        likely((per_cpu_data = bpf_map_lookup_elem(&per_cpu, &zero)) != NULL)
    ) {
        t = now - per_cpu_data->entry_ts;

        /* Mark "no event in progress", charge the elapsed time to the
         * RX-syscall bucket, and record it as already accounted for the
         * io_uring-worker bookkeeping in the sched_switch hook. */
        *per_task_events = EVENT_MAX;
        per_cpu_data->per_event_total_time[EVENT_SOCK_RECVMSG] += t;
        per_cpu_data->sched_switch_accounted_time += t;
    }

    return 0;
}
Expand Down Expand Up @@ -170,25 +182,28 @@ SEC("tp_btf/softirq_exit")
/* tp_btf/softirq_exit: accounts NET_TX/NET_RX softirq time executed on
 * a CPU whose current task is traced. */
int BPF_PROG(net_rx_softirq_exit, unsigned int vec) {
    u32 zero = 0;
    struct per_cpu_data* per_cpu_data;
    u64* per_task_events, now = bpf_ktime_get_ns(), t;

    if (
        (vec == NET_TX_SOFTIRQ || vec == NET_RX_SOFTIRQ) &&
        likely((per_task_events = bpf_task_storage_get(&traced_pids, bpf_get_current_task_btf(), NULL, 0)) != NULL) &&
        likely((per_cpu_data = bpf_map_lookup_elem(&per_cpu, &zero)) != NULL)
    ) {
        t = now - per_cpu_data->entry_ts;

        // Convoluted expression makes the verifier happy
        switch (vec) {
            case NET_TX_SOFTIRQ:
                per_cpu_data->per_event_total_time[EVENT_NET_TX_SOFTIRQ] += t;
                break;

            default:
            case NET_RX_SOFTIRQ:
                per_cpu_data->per_event_total_time[EVENT_NET_RX_SOFTIRQ] += t;
                per_cpu_data->enable_stack_trace = 0;
        }

        per_cpu_data->sched_switch_accounted_time += t;
        /* The softirq interrupted the task's own event window: restart
         * entry_ts so the interrupted event isn't charged the softirq time. */
        if (*per_task_events != EVENT_MAX) per_cpu_data->entry_ts = now;
    }

    return 0;
}
Expand All @@ -208,6 +223,11 @@ int BPF_PROG(tp_sched_switch, bool preempt, struct task_struct* prev, struct tas
if (likely(per_cpu_data != NULL)) {
// Close the outgoing task's event window (if any) and open one for the
// incoming task if it has a traced event in progress.
if (prev_task_events != NULL) stop_event(*prev_task_events, per_cpu_data, now);
if (next_task_events != NULL && *next_task_events != EVENT_MAX) per_cpu_data->entry_ts = now;

// If the task being switched out is an io_uring worker thread, charge
// its slice (time since the previous sched switch, minus time already
// attributed to explicit events via sched_switch_accounted_time) to the
// IO-worker bucket.
// NOTE(review): 0x10 is assumed to equal the kernel's PF_IO_WORKER flag
// — confirm against include/linux/sched.h for the targeted kernels.
if (prev->flags & 0x10 /* PF_IO_WORKER */)
per_cpu_data->per_event_total_time[EVENT_IO_WORKER] += now - per_cpu_data->sched_switch_ts - per_cpu_data->sched_switch_accounted_time;
// Start a fresh accounting slice for the incoming task.
per_cpu_data->sched_switch_ts = now;
per_cpu_data->sched_switch_accounted_time = 0;
}

return 0;
Expand Down
9 changes: 8 additions & 1 deletion main/src/bpf/prog.bpf.h
Original file line number Diff line number Diff line change
/* Event classes accounted by the BPF programs. Values are mirrored by
 * hand-maintained bindings in main/src/common.rs — keep the two in sync. */
enum event_types {
    EVENT_SOCK_SENDMSG = 0,
    EVENT_SOCK_RECVMSG = 1,
    EVENT_NET_TX_SOFTIRQ = 2,
    EVENT_NET_RX_SOFTIRQ = 3,
    EVENT_IO_WORKER = 4,

    /* Number of event classes; also used as the "no event" sentinel. */
    EVENT_MAX = 5
};

struct per_cpu_data {
/// @brief Latest entry timestamp to any event in ns
u64 entry_ts;

/// @brief Latest scheduler switch timestamp
u64 sched_switch_ts;

/// @brief Total CPU time accounted to various events since the last scheduler switch
u64 sched_switch_accounted_time;

/// @brief Total time in ns registered for each event
u64 per_event_total_time[EVENT_MAX];

Expand Down
35 changes: 30 additions & 5 deletions main/src/common.rs
Original file line number Diff line number Diff line change
// Event identifiers mirrored by hand from the BPF side (prog.bpf.h) —
// keep in sync with the C enum event_types.
pub const event_types_EVENT_SOCK_SENDMSG: event_types = 0;
pub const event_types_EVENT_SOCK_RECVMSG: event_types = 1;
pub const event_types_EVENT_NET_TX_SOFTIRQ: event_types = 2;
pub const event_types_EVENT_NET_RX_SOFTIRQ: event_types = 3;
pub const event_types_EVENT_IO_WORKER: event_types = 4;
/// Number of event classes; also used as the "no event" sentinel.
pub const event_types_EVENT_MAX: event_types = 5;
pub type event_types = ::std::os::raw::c_uint;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct per_cpu_data {
#[doc = " @brief Latest entry timestamp to any event in ns"]
pub entry_ts: u64_,
#[doc = " @brief Latest scheduler switch timestamp"]
pub sched_switch_ts: u64_,
#[doc = " @brief Total CPU time accounted to various events since the last scheduler switch"]
pub sched_switch_accounted_time: u64_,
#[doc = " @brief Total time in ns registered for each event"]
pub per_event_total_time: [u64_; 4usize],
pub per_event_total_time: [u64_; 5usize],
#[doc = " @brief When non-zero, stack traces by the perf event prog are enabled"]
pub enable_stack_trace: u8_,
}
Expand All @@ -26,7 +31,7 @@ fn bindgen_test_layout_per_cpu_data() {
let ptr = UNINIT.as_ptr();
assert_eq!(
::std::mem::size_of::<per_cpu_data>(),
48usize,
72usize,
concat!("Size of: ", stringify!(per_cpu_data))
);
assert_eq!(
Expand All @@ -45,8 +50,28 @@ fn bindgen_test_layout_per_cpu_data() {
)
);
assert_eq!(
unsafe { ::std::ptr::addr_of!((*ptr).per_event_total_time) as usize - ptr as usize },
unsafe { ::std::ptr::addr_of!((*ptr).sched_switch_ts) as usize - ptr as usize },
8usize,
concat!(
"Offset of field: ",
stringify!(per_cpu_data),
"::",
stringify!(sched_switch_ts)
)
);
assert_eq!(
unsafe { ::std::ptr::addr_of!((*ptr).sched_switch_accounted_time) as usize - ptr as usize },
16usize,
concat!(
"Offset of field: ",
stringify!(per_cpu_data),
"::",
stringify!(sched_switch_accounted_time)
)
);
assert_eq!(
unsafe { ::std::ptr::addr_of!((*ptr).per_event_total_time) as usize - ptr as usize },
24usize,
concat!(
"Offset of field: ",
stringify!(per_cpu_data),
Expand All @@ -56,7 +81,7 @@ fn bindgen_test_layout_per_cpu_data() {
);
assert_eq!(
unsafe { ::std::ptr::addr_of!((*ptr).enable_stack_trace) as usize - ptr as usize },
40usize,
64usize,
concat!(
"Offset of field: ",
stringify!(per_cpu_data),
Expand Down
2 changes: 1 addition & 1 deletion main/src/ksyms.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ impl KSyms {
"__napi_poll" => Option::<SymbolFun>::Some(Box::new(
|cnt, _| Some(&mut cnt.__napi_poll)
)),
"netif_receive_skb" | "netif_receive_skb_core" | "netif_receive_skb_list_internal" => Option::<SymbolFun>::Some(Box::new(
"netif_receive_skb" | "netif_receive_skb_core" | "netif_receive_skb_list_internal" | "__netif_receive_skb" => Option::<SymbolFun>::Some(Box::new(
|cnt, PerFrameProps { in_nf_hook, .. }| {
cnt.nf_netdev_ingress = cnt.nf_netdev_ingress.max(std::mem::take(in_nf_hook));
Some(&mut cnt.netif_receive_skb)
Expand Down
1 change: 1 addition & 0 deletions web-frontend/src/plot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ fn draw_bar1<DB: DrawingBackend>(
(palette::GREEN_800.into(), WHITE.into()), // RX syscalls
(CYAN.into(), BLACK.into()), // TX softirq
(palette::PURPLE.into(), WHITE.into()), // RX softirq
(palette::PINK_A200.into(), WHITE.into()), // IO workers
(BLACK.mix(0.2), BLACK.into()) // other
];
let stack = metrics.top_level_metrics
Expand Down

0 comments on commit 39ea970

Please sign in to comment.