Skip to content

Commit

Permalink
tracing: add "power-tracer": C/P state tracer to help power optimization
Browse files Browse the repository at this point in the history
Impact: new "power-tracer" ftrace plugin

This patch adds a C/P-state ftrace plugin that will generate
detailed statistics about the C/P-states that are being used,
so that we can look at detailed decisions that the C/P-state
code is making, rather than the too high level "average"
that we have today.

An example way of using this is:

 mount -t debugfs none /sys/kernel/debug
 echo cstate > /sys/kernel/debug/tracing/current_tracer
 echo 1 > /sys/kernel/debug/tracing/tracing_enabled
 sleep 1
 echo 0 > /sys/kernel/debug/tracing/tracing_enabled
 cat /sys/kernel/debug/tracing/trace | perl scripts/trace/cstate.pl > out.svg

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
  • Loading branch information
Arjan van de Ven authored and Ingo Molnar committed Nov 26, 2008
1 parent 509dcee commit f3f47a6
Show file tree
Hide file tree
Showing 8 changed files with 355 additions and 0 deletions.
4 changes: 4 additions & 0 deletions arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <linux/cpufreq.h>
#include <linux/compiler.h>
#include <linux/dmi.h>
#include <linux/ftrace.h>

#include <linux/acpi.h>
#include <acpi/processor.h>
Expand Down Expand Up @@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
unsigned int next_perf_state = 0; /* Index into perf table */
unsigned int i;
int result = 0;
struct power_trace it;

dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);

Expand Down Expand Up @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
}
}

trace_power_mark(&it, POWER_PSTATE, next_perf_state);

switch (data->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
Expand Down
16 changes: 16 additions & 0 deletions arch/x86/kernel/process.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
#include <linux/ftrace.h>
#include <asm/system.h>

unsigned long idle_halt;
Expand Down Expand Up @@ -100,6 +101,9 @@ static inline int hlt_use_halt(void)
void default_idle(void)
{
if (hlt_use_halt()) {
struct power_trace it;

trace_power_start(&it, POWER_CSTATE, 1);
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
Expand All @@ -112,6 +116,7 @@ void default_idle(void)
else
local_irq_enable();
current_thread_info()->status |= TS_POLLING;
trace_power_end(&it);
} else {
local_irq_enable();
/* loop is done by the caller */
Expand Down Expand Up @@ -154,24 +159,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
struct power_trace it;

trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
if (!need_resched()) {
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__mwait(ax, cx);
}
trace_power_end(&it);
}

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
struct power_trace it;
if (!need_resched()) {
trace_power_start(&it, POWER_CSTATE, 1);
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__sti_mwait(0, 0);
else
local_irq_enable();
trace_power_end(&it);
} else
local_irq_enable();
}
Expand All @@ -183,9 +195,13 @@ static void mwait_idle(void)
*/
static void poll_idle(void)
{
struct power_trace it;

trace_power_start(&it, POWER_CSTATE, 0);
local_irq_enable();
while (!need_resched())
cpu_relax();
trace_power_end(&it);
}

/*
Expand Down
29 changes: 29 additions & 0 deletions include/linux/ftrace.h
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,35 @@ ftrace_init_module(struct module *mod,
unsigned long *start, unsigned long *end) { }
#endif

enum {
POWER_NONE = 0,
POWER_CSTATE = 1,
POWER_PSTATE = 2,
};

struct power_trace {
#ifdef CONFIG_POWER_TRACER
ktime_t stamp;
ktime_t end;
int type;
int state;
#endif
};

#ifdef CONFIG_POWER_TRACER
extern void trace_power_start(struct power_trace *it, unsigned int type,
unsigned int state);
extern void trace_power_mark(struct power_trace *it, unsigned int type,
unsigned int state);
extern void trace_power_end(struct power_trace *it);
#else
static inline void trace_power_start(struct power_trace *it, unsigned int type,
unsigned int state) { }
static inline void trace_power_mark(struct power_trace *it, unsigned int type,
unsigned int state) { }
static inline void trace_power_end(struct power_trace *it) { }
#endif


/*
* Structure that defines a return function trace.
Expand Down
11 changes: 11 additions & 0 deletions kernel/trace/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,17 @@ config BRANCH_TRACER

Say N if unsure.

config POWER_TRACER
bool "Trace power consumption behavior"
depends on DEBUG_KERNEL
depends on X86
select TRACING
help
This tracer helps developers to analyze and optimize the kernels
power management decisions, specifically the C-state and P-state
behavior.


config STACK_TRACER
bool "Trace max stack"
depends on HAVE_FUNCTION_TRACER
Expand Down
1 change: 1 addition & 0 deletions kernel/trace/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
obj-$(CONFIG_BTS_TRACER) += trace_bts.o
obj-$(CONFIG_POWER_TRACER) += trace_power.o

libftrace-y := ftrace.o
7 changes: 7 additions & 0 deletions kernel/trace/trace.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ enum trace_type {
TRACE_FN_RET,
TRACE_USER_STACK,
TRACE_BTS,
TRACE_POWER,

__TRACE_LAST_TYPE
};
Expand Down Expand Up @@ -160,6 +161,11 @@ struct bts_entry {
unsigned long to;
};

struct trace_power {
struct trace_entry ent;
struct power_trace state_data;
};

/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
Expand Down Expand Up @@ -266,6 +272,7 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\
IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\
IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
__ftrace_bad_type(); \
} while (0)

Expand Down
179 changes: 179 additions & 0 deletions kernel/trace/trace_power.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
/*
* ring buffer based C-state tracer
*
* Arjan van de Ven <arjan@linux.intel.com>
* Copyright (C) 2008 Intel Corporation
*
* Much is borrowed from trace_boot.c which is
* Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
*
*/

#include <linux/init.h>
#include <linux/debugfs.h>
#include <linux/ftrace.h>
#include <linux/kallsyms.h>
#include <linux/module.h>

#include "trace.h"

static struct trace_array *power_trace;
static int __read_mostly trace_power_enabled;


static void start_power_trace(struct trace_array *tr)
{
trace_power_enabled = 1;
}

static void stop_power_trace(struct trace_array *tr)
{
trace_power_enabled = 0;
}


static int power_trace_init(struct trace_array *tr)
{
int cpu;
power_trace = tr;

trace_power_enabled = 1;

for_each_cpu_mask(cpu, cpu_possible_map)
tracing_reset(tr, cpu);
return 0;
}

static enum print_line_t power_print_line(struct trace_iterator *iter)
{
int ret = 0;
struct trace_entry *entry = iter->ent;
struct trace_power *field ;
struct power_trace *it;
struct trace_seq *s = &iter->seq;
struct timespec stamp;
struct timespec duration;

trace_assign_type(field, entry);
it = &field->state_data;
stamp = ktime_to_timespec(it->stamp);
duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));

if (entry->type == TRACE_POWER) {
if (it->type == POWER_CSTATE)
ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
stamp.tv_sec,
stamp.tv_nsec,
it->state, iter->cpu,
duration.tv_sec,
duration.tv_nsec);
if (it->type == POWER_PSTATE)
ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
stamp.tv_sec,
stamp.tv_nsec,
it->state, iter->cpu);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
}
return TRACE_TYPE_UNHANDLED;
}

static struct tracer power_tracer __read_mostly =
{
.name = "power",
.init = power_trace_init,
.start = start_power_trace,
.stop = stop_power_trace,
.reset = stop_power_trace,
.print_line = power_print_line,
};

static int init_power_trace(void)
{
return register_tracer(&power_tracer);
}
device_initcall(init_power_trace);

void trace_power_start(struct power_trace *it, unsigned int type,
unsigned int level)
{
if (!trace_power_enabled)
return;

memset(it, 0, sizeof(struct power_trace));
it->state = level;
it->type = type;
it->stamp = ktime_get();
}
EXPORT_SYMBOL_GPL(trace_power_start);


void trace_power_end(struct power_trace *it)
{
struct ring_buffer_event *event;
struct trace_power *entry;
struct trace_array_cpu *data;
unsigned long irq_flags;
struct trace_array *tr = power_trace;

if (!trace_power_enabled)
return;

preempt_disable();
it->end = ktime_get();
data = tr->data[smp_processor_id()];

event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, 0);
entry->ent.type = TRACE_POWER;
entry->state_data = *it;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);

trace_wake_up();

out:
preempt_enable();
}
EXPORT_SYMBOL_GPL(trace_power_end);

void trace_power_mark(struct power_trace *it, unsigned int type,
unsigned int level)
{
struct ring_buffer_event *event;
struct trace_power *entry;
struct trace_array_cpu *data;
unsigned long irq_flags;
struct trace_array *tr = power_trace;

if (!trace_power_enabled)
return;

memset(it, 0, sizeof(struct power_trace));
it->state = level;
it->type = type;
it->stamp = ktime_get();
preempt_disable();
it->end = it->stamp;
data = tr->data[smp_processor_id()];

event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, 0);
entry->ent.type = TRACE_POWER;
entry->state_data = *it;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);

trace_wake_up();

out:
preempt_enable();
}
EXPORT_SYMBOL_GPL(trace_power_mark);
Loading

0 comments on commit f3f47a6

Please sign in to comment.