diff --git a/README.md b/README.md index 3842a46..ee4ec1f 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ Using ftrace: - [iolatency](iolatency): summarize disk I/O latency as a histogram. [Examples](examples/iolatency_example.txt). - [execsnoop](execsnoop): trace process exec() with command line argument details. [Examples](examples/execsnoop_example.txt). - [opensnoop](opensnoop): trace open() syscalls showing filenames. [Examples](examples/opensnoop_example.txt). +- system/[tpoint](system/tpoint): trace a given tracepoint. [Examples](examples/tpoint_example.txt). - kernel/[funccount](kernel/funccount): count kernel functions that match a string. [Examples](examples/funccount_example.txt). - kernel/[functrace](kernel/functrace): trace kernel functions that match a string. [Examples](examples/functrace_example.txt). - kernel/[funcgraph](kernel/funcgraph): trace kernel function graph showing children and times. [Examples](examples/funcgraph_example.txt). diff --git a/bin/tpoint b/bin/tpoint new file mode 120000 index 0000000..6be1316 --- /dev/null +++ b/bin/tpoint @@ -0,0 +1 @@ +../system/tpoint \ No newline at end of file diff --git a/examples/tpoint_example.txt b/examples/tpoint_example.txt new file mode 100644 index 0000000..27be53b --- /dev/null +++ b/examples/tpoint_example.txt @@ -0,0 +1,180 @@ +Demonstrations of tpoint, the Linux ftrace version. + +Let's trace block:block_rq_issue, to see block device (disk) I/O requests: + +# ./tpoint block:block_rq_issue +Tracing block:block_rq_issue. Ctrl-C to end. + supervise-1692 [001] d... 7269912.982162: block_rq_issue: 202,1 W 0 () 17039656 + 8 [supervise] + supervise-1696 [000] d... 7269912.982243: block_rq_issue: 202,1 W 0 () 12862264 + 8 [supervise] + cksum-12994 [000] d... 7269913.317924: block_rq_issue: 202,1 R 0 () 9357056 + 72 [cksum] + cksum-12994 [000] d... 7269913.319013: block_rq_issue: 202,1 R 0 () 2977536 + 144 [cksum] + cksum-12994 [000] d... 7269913.320217: block_rq_issue: 202,1 R 0 () 2986240 + 216 [cksum] + cksum-12994 [000] d... 7269913.321677: block_rq_issue: 202,1 R 0 () 620344 + 56 [cksum] + cksum-12994 [001] d... 7269913.329309: block_rq_issue: 202,1 R 0 () 9107912 + 88 [cksum] + cksum-12994 [001] d... 7269913.340133: block_rq_issue: 202,1 R 0 () 3147008 + 248 [cksum] + cksum-12994 [001] d... 7269913.354551: block_rq_issue: 202,1 R 0 () 11583488 + 256 [cksum] + cksum-12994 [001] d... 7269913.379904: block_rq_issue: 202,1 R 0 () 11583744 + 256 [cksum] +[...] +^C +Ending tracing... + +Great, that was easy! + +perf_events can do this as well, and is better in many ways, including a more +efficient buffering strategy, and multi-user access. It's not that easy to do +this one-liner in perf_events, however. An equivalent for recent kernels is: + +perf record --no-buffer -e block:block_rq_issue -a -o - | PAGER=cat stdbuf -oL perf script -i - + +Older kernels, use -D instead of --no-buffer. Even better is to set the buffer +page size to a sufficient grouping (using -m), to minimize overheads, at the +expense of liveliness of updates. Note that stack traces (-g) don't work on +my systems with this perf one-liner, however, they do work with tpoint -s. + + +Column headings can be printed using -H: + +# ./tpoint -H block:block_rq_issue +Tracing block:block_rq_issue. Ctrl-C to end. +# tracer: nop +# +# entries-in-buffer/entries-written: 0/0 #P:2 +# +# _-----=> irqs-off +# / _----=> need-resched +# | / _---=> hardirq/softirq +# || / _--=> preempt-depth +# ||| / delay +# TASK-PID CPU# |||| TIMESTAMP FUNCTION +# | | | |||| | | + supervise-1697 [000] d... 7270545.340856: block_rq_issue: 202,1 W 0 () 12862464 + 8 [supervise] + supervise-1697 [000] d... 7270545.341256: block_rq_issue: 202,1 W 0 () 12862472 + 8 [supervise] + supervise-1690 [000] d... 7270545.342363: block_rq_issue: 202,1 W 0 () 17040368 + 8 [supervise] +[...] + +They are also documented in the Linux kernel source under: +Documentation/trace/ftrace.txt. + + +How about stacks traces for those block_rq_issue events? Adding -s: + +# ./tpoint -s block:block_rq_issue +Tracing block:block_rq_issue. Ctrl-C to end. + supervise-1691 [000] d... 7269511.079179: block_rq_issue: 202,1 W 0 () 17040232 + 8 [supervise] + supervise-1691 [000] d... 7269511.079188: + => blk_peek_request + => do_blkif_request + => __blk_run_queue + => queue_unplugged + => blk_flush_plug_list + => blk_finish_plug + => ext4_writepages + => do_writepages + => __filemap_fdatawrite_range + => filemap_flush + => ext4_alloc_da_blocks + => ext4_rename + => vfs_rename + => SYSC_renameat2 + => SyS_renameat2 + => SyS_rename + => system_call_fastpath + cksum-7428 [000] d... 7269511.331778: block_rq_issue: 202,1 R 0 () 9006848 + 208 [cksum] + cksum-7428 [000] d... 7269511.331784: + => blk_peek_request + => do_blkif_request + => __blk_run_queue + => queue_unplugged + => blk_flush_plug_list + => blk_finish_plug + => __do_page_cache_readahead + => ondemand_readahead + => page_cache_async_readahead + => generic_file_read_iter + => new_sync_read + => vfs_read + => SyS_read + => system_call_fastpath + cksum-7428 [000] d... 7269511.332631: block_rq_issue: 202,1 R 0 () 620992 + 200 [cksum] + cksum-7428 [000] d... 7269511.332639: + => blk_peek_request + => do_blkif_request + => __blk_run_queue + => queue_unplugged + => blk_flush_plug_list + => blk_finish_plug + => __do_page_cache_readahead + => ondemand_readahead + => page_cache_sync_readahead + => generic_file_read_iter + => new_sync_read + => vfs_read + => SyS_read + => system_call_fastpath +^C +Ending tracing... + +Easy. Now I can read the ancestry to understand what actually lead to issuing +a block device (disk) I/O. + + +Here's insertion onto the block I/O queue (better matches processes): + +# ./tpoint -s block:block_rq_insert +Tracing block:block_rq_insert. Ctrl-C to end. + cksum-11908 [000] d... 7269834.882517: block_rq_insert: 202,1 R 0 () 736304 + 256 [cksum] + cksum-11908 [000] d... 7269834.882528: + => __elv_add_request + => blk_flush_plug_list + => blk_finish_plug + => __do_page_cache_readahead + => ondemand_readahead + => page_cache_sync_readahead + => generic_file_read_iter + => new_sync_read + => vfs_read + => SyS_read + => system_call_fastpath +[...] + + +You can also add tracepoint filters. To see what variables you can use, use -v: + +# ./tpoint -v block:block_rq_issue +name: block_rq_issue +ID: 942 +format: + field:unsigned short common_type; offset:0; size:2; signed:0; + field:unsigned char common_flags; offset:2; size:1; signed:0; + field:unsigned char common_preempt_count; offset:3; size:1; signed:0; + field:int common_pid; offset:4; size:4; signed:1; + + field:dev_t dev; offset:8; size:4; signed:0; + field:sector_t sector; offset:16; size:8; signed:0; + field:unsigned int nr_sector; offset:24; size:4; signed:0; + field:unsigned int bytes; offset:28; size:4; signed:0; + field:char rwbs[8]; offset:32; size:8; signed:1; + field:char comm[16]; offset:40; size:16; signed:1; + field:__data_loc char[] cmd; offset:56; size:4; signed:1; + +print fmt: "%d,%d %s %u (%s) %llu + %u [%s]", ((unsigned int) ((REC->dev) >> 20)), ((unsigned int) ((REC->dev) & ((1U << 20) - 1))), REC->rwbs, REC->bytes, __get_str(cmd), (unsigned long long)REC->sector, REC->nr_sector, REC->comm + + +Now I'll add a filter to check that the rwbs field (I/O type) includes an "R", +making it a read: + +# ./tpoint -s block:block_rq_insert 'rwbs ~ "*R*"' + cksum-11908 [000] d... 7269839.919098: block_rq_insert: 202,1 R 0 () 736560 + 136 [cksum] + cksum-11908 [000] d... 7269839.919107: + => __elv_add_request + => blk_flush_plug_list + => blk_finish_plug + => __do_page_cache_readahead + => ondemand_readahead + => page_cache_async_readahead + => generic_file_read_iter + => new_sync_read + => vfs_read + => SyS_read + => system_call_fastpath +[...] diff --git a/man/man8/tpoint.8 b/man/man8/tpoint.8 new file mode 100644 index 0000000..5c1c161 --- /dev/null +++ b/man/man8/tpoint.8 @@ -0,0 +1,129 @@ +.TH tpoint 8 "2014-07-20" "USER COMMANDS" +.SH NAME +tpoint \- trace a given tracepoint. Static tracing. Uses Linux ftrace. +.SH SYNOPSIS +.B tpoint +[\-hHsv] [\-d secs] [\-p PID] tracepoint [filter] + +.B tpoint +\-l +.SH DESCRIPTION +This will enable a given tracepoint, print events, then disable the tracepoint +when the program ends. This is like a simple version of the "perf" command for +printing live tracepoint events only. Wildcards are currently not supported. +If for any reason tpoint(8) is insufficient, use the more powerful perf +command for tracing tracepoints instead. + +Beware of feedback loops: tracing tcp functions over an ssh session, +or writing ext4 events to an ext4 file system. For the former, tcp +trace data could be redirected to a file (as in the usage message). For +the latter, trace to the screen or a different file system. + +Since this uses ftrace, only the root user can use this tool. +.SH REQUIREMENTS +FTRACE CONFIG and tracepoints, which you may already have enabled and available +on recent kernels. +.SH OPTIONS +.TP +\-d seconds +Set the duration of tracing, in seconds. Trace output will be buffered and +printed at the end. This also reduces overheads by buffering in-kernel, +instead of printing events as they occur. + +The ftrace buffer has a fixed size per-CPU (see +/sys/kernel/debug/tracing/buffer_size_kb). If you think events are missing, +try increasing that size. +.TP +\-h +Print usage message. +.TP +\-H +Print column headers. +.TP +\-l +List tracepoints only. +.TP +\-s +Print kernel stack traces after each event. +.TP +\-v +Show the tpoint format file only (do not trace), identifying possible variables +for use in a custom filter. +.TP +\-p PID +Only trace kernel functions when this process ID is on-CPU. +.TP +tracepoint +A tracepoint name. Eg, block:block_rq_issue. See the EXAMPLES section. +.TP +filter +An ftrace filter definition. +.SH EXAMPLES +.TP +List tracepoints containing "open": +# +.B tpoint -l | grep open +.TP +Trace open() syscall entry: +# +.B tpoint syscalls:sys_enter_open +.TP +Trace open() syscall entry, showing column headers: +# +.B tpoint -H syscalls:sys_enter_open +.TP +Trace block I/O issue: +# +.B tpoint block:block_rq_issue +.TP +Trace block I/O issue with stack traces: +# +.B tpoint \-s block:block_rq_issue +.SH FIELDS +The output format depends on the kernel version, and headings can be printed +using \-H. The format is the same as the ftrace function trace format, described +in the kernel source under Documentation/trace/ftrace.txt. + +Typical fields are: +.TP +TASK-PID +The process name (which could include dashes), a dash, and the process ID. +.TP +CPU# +The CPU ID, in brackets. +.TP +|||| +Kernel state flags. For example, on Linux 3.16 these are for irqs-off, +need-resched, hardirq/softirq, and preempt-depth. +.TP +TIMESTAMP +Time of event, in seconds. +.TP +FUNCTION +Kernel function name. +.SH OVERHEAD +This can generate a lot of trace data quickly, depending on the +frequency of the traced events. Such data will cause performance overheads. +This also works without buffering by default, printing function events +as they happen (uses trace_pipe), context switching and consuming CPU to do +so. If needed, you can try the "\-d secs" option, which buffers events +instead, reducing overhead. If you think the buffer option is losing events, +try increasing the buffer size (buffer_size_kb). + +Consider using perf_events, which manages buffers differently and more +efficiently, for higher frequency applications. +.SH SOURCE +This is from the perf-tools collection: +.IP +https://github.com/brendangregg/perf-tools +.PP +Also look under the examples directory for a text file containing example +usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +functrace(8), funccount(8), perf(1) diff --git a/system/tpoint b/system/tpoint new file mode 100755 index 0000000..bf25d3b --- /dev/null +++ b/system/tpoint @@ -0,0 +1,214 @@ +#!/bin/bash +# +# tpoint - trace a given tracepoint. Static tracing. +# Written using Linux ftrace. +# +# This will enable a given tracepoint, print events, then disable the tracepoint +# when the program ends. This is like a simple version of the "perf" command for +# printing live tracepoint events only. Wildcards are currently not supported. +# +# USAGE: ./tpoint [-hHsv] [-d secs] [-p pid] tracepoint [filter] +# ./tpoint -l +# +# Run "tpoint -h" for full usage. +# +# I wrote this because I often needed a quick way to dump stack traces for a +# given tracepoint. +# +# REQUIREMENTS: FTRACE and tracepoints, which you may already have on recent +# kernel versions. +# +# From perf-tools: https://github.com/brendangregg/perf-tools +# +# See the tpoint(8) man page (in perf-tools) for more info. +# +# COPYRIGHT: Copyright (c) 2014 Brendan Gregg. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# (http://www.gnu.org/copyleft/gpl.html) +# +# 22-Jul-2014 Brendan Gregg Created this. + +### default variables +tracing=/sys/kernel/debug/tracing +flock=/var/tmp/.ftrace-lock; wroteflock=0 +opt_duration=0; duration=; opt_pid=0; pid=; opt_filter=0; filter= +opt_view=0; opt_headers=0; opt_stack=0; dmesg=2 +trap ':' INT QUIT TERM PIPE HUP # sends execution to end tracing section + +function usage { + cat <<-END >&2 + USAGE: tpoint [-hHsv] [-d secs] [-p PID] tracepoint [filter] + tpoint -l + -d seconds # trace duration, and use buffers + -p PID # PID to match on I/O issue + -v # view format file (don't trace) + -H # include column headers + -l # list all tracepoints + -s # show kernel stack traces + -h # this usage message + + Note that these examples may need modification to match your kernel + version's function names and platform's register usage. + eg, + tpoint -l | grep open + # find tracepoints containing "open" + tpoint syscalls:sys_enter_open + # trace open() syscall entry + tpoint block:block_rq_issue + # trace block I/O issue + tpoint -s block:black_rq_issue + # show kernel stacks + + See the man page and example file for more info. +END + exit +} + +function warn { + if ! eval "$@"; then + echo >&2 "WARNING: command failed \"$@\"" + fi +} + +function end { + # disable tracing + echo 2>/dev/null + echo "Ending tracing..." 2>/dev/null + cd $tracing + warn "echo 0 > $tdir/enable" + if (( opt_filter )); then + warn "echo 0 > $tdir/filter" + fi + (( opt_stack )) && warn "echo 0 > options/stacktrace" + warn "echo > trace" + (( wroteflock )) && warn "rm $flock" +} + +function die { + echo >&2 "$@" + exit 1 +} + +function edie { + # die with a quiet end() + echo >&2 "$@" + exec >/dev/null 2>&1 + end + exit 1 +} + +### process options +while getopts d:hHlp:sv opt +do + case $opt in + d) opt_duration=1; duration=$OPTARG ;; + p) opt_pid=1; pid=$OPTARG ;; + H) opt_headers=1 ;; + l) opt_list=1 ;; + s) opt_stack=1 ;; + v) opt_view=1 ;; + h|?) usage ;; + esac +done +if (( !opt_list )); then + shift $(( $OPTIND - 1 )) + (( $# )) || usage + tpoint=$1 + shift + if (( $# )); then + opt_filter=1 + filter=$1 + fi +fi + +### option logic +(( opt_pid && opt_filter )) && die "ERROR: use either -p or -f." +(( opt_duration && opt_view )) && die "ERROR: use either -d or -v." +if (( opt_pid )); then + # convert to filter + opt_filter=1 + filter="common_pid == $pid" +fi +if (( !opt_view && !opt_list )); then + if (( opt_duration )); then + echo "Tracing $tpoint for $duration seconds (buffered)..." + else + echo "Tracing $tpoint. Ctrl-C to end." + fi +fi + +### check permissions +cd $tracing || die "ERROR: accessing tracing. Root user? Kernel has FTRACE? + debugfs mounted? (mount -t debugfs debugfs /sys/kernel/debug)" + +### list tracepoints +if (( opt_list )); then + cd events + for tp in */*; do + echo ${tp/\//:} + done + exit +fi + +### check tracepoints +tdir=events/${tpoint/:/\/} +[[ -e $tdir ]] || die "ERROR: tracepoint $tpoint not found. Exiting" + +### view +if (( opt_view )); then + cat $tdir/format + exit +fi + +### ftrace lock +[[ -e $flock ]] && die "ERROR: ftrace may be in use by PID $(cat $flock) $flock" +echo $$ > $flock || die "ERROR: unable to write $flock." +wroteflock=1 + +### setup and begin tracing +echo nop > current_tracer +if (( opt_filter )); then + if ! echo "$filter" > $tdir/filter; then + edie "ERROR: setting filter or -p. Exiting." + fi +fi +if (( opt_stack )); then + if ! echo 1 > options/stacktrace; then + edie "ERROR: enabling stack traces (-s). Exiting" + fi +fi +if ! echo 1 >> $tdir/enable; then + edie "ERROR: enabling tracepoint $tprobe. Exiting." +fi + +### print trace buffer +warn "echo > trace" +if (( opt_duration )); then + sleep $duration + if (( opt_headers )); then + cat trace + else + grep -v '^#' trace + fi +else + # trace_pipe lack headers, so fetch them from trace + (( opt_headers )) && cat trace + cat trace_pipe +fi + +### end tracing +end