vmstat
# vmstat 1
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
r b swpd free buff cache si so bi bo in cs us sy id wa st
8 0 2000272 2128480 483836 15791896 7 9 582 992 42 44 39 6 55 0 0
8 0 2000272 2021504 483852 15802480 0 0 136 2520 8764 3656 84 16 0 0 0
8 0 2000272 2113768 483876 15804436 0 0 76 2352 8740 3385 83 17 0 0 0
8 0 2000272 1978844 483876 15812104 0 0 164 260 8657 3070 85 15 0 0 0
^C
#
perf stat
# perf stat --interval-print 1000 -e context-switches
# time counts unit events
1.000157779 2,627 context-switches
2.000338693 3,067 context-switches
3.000500966 3,577 context-switches
4.000655707 2,976 context-switches
5.000808482 3,199 context-switches
^C 5.922384403 3,333 context-switches
#
perf targets
- system wide
- CPU
- cgroups
- PID
- TID
perf stat with some threads
# perf stat -I 1000 -e cs -t `pidof -S, bash`
# time counts unit events
1.000217556 12 cs
2.000423764 11 cs
3.000606747 1 cs
4.000817161 1 cs
5.001014081 <not counted> cs
^C 5.976337203 <not counted> cs
#
Counting SMIs with perf stat
press the mute key
# perf stat -I 1000 -e msr/smi/
# time counts unit events
1.000532552 0 msr/smi/
2.001166107 0 msr/smi/
3.001675027 8 msr/smi/
4.002176974 0 msr/smi/
5.002726317 8 msr/smi/
^C 5.637141913 0 msr/smi/
#
Before and after
Before...
# echo 1 > /proc/sys/vm/drop_caches
# perf stat find ~acme/git/linux > /dev/null
Performance counter stats for 'find /home/acme/git/linux':
318.11 msec task-clock # 0.420 CPUs utilized
4,989 context-switches # 0.016 M/sec
63 cpu-migrations # 0.198 K/sec
280 page-faults # 0.880 K/sec
417,896,513 cycles # 1.314 GHz
379,620,453 instructions # 0.91 insn per cycle
75,490,217 branches # 237.312 M/sec
2,036,356 branch-misses # 2.70% of all branches
0.758165914 seconds time elapsed
0.073100000 seconds user
0.241968000 seconds sys
#
Before and after
After...
# perf stat find ~acme/git/linux > /dev/null
Performance counter stats for 'find /home/acme/git/linux':
68.28 msec task-clock # 0.991 CPUs utilized
12 context-switches # 0.176 K/sec
0 cpu-migrations # 0.000 K/sec
274 page-faults # 0.004 M/sec
265,617,729 cycles # 3.890 GHz
310,931,753 instructions # 1.17 insn per cycle
61,838,290 branches # 905.609 M/sec
1,508,769 branch-misses # 2.44% of all branches
0.068907995 seconds time elapsed
0.018829000 seconds user
0.049568000 seconds sys
#
Shell completion
# tools/perf/perf-completion.sh
# perf stat -e power/energy-
power/energy-cores/ power/energy-gpu/ power/energy-pkg/ power/energy-psys/ power/energy-ram/
PMU events
- Try this with -I1000 - interval mode
- And then generate some load to burn power
- http://git.kernel.org/torvalds/c/4788e5b4b2338f
# perf stat -e power/energy-cores/,power/energy-gpu/,
power/energy-pkg/,power/energy-psys/,
power/energy-ram/ sleep 1
Performance counter stats for 'system wide':
0.83 Joules power/energy-cores/
0.01 Joules power/energy-gpu/
1.66 Joules power/energy-pkg/
5.98 Joules power/energy-psys/
0.54 Joules power/energy-ram/
1.001084656 seconds time elapsed
#
top
Tasks: 299 total, 1 running, 297 sleeping, 1 stopped, 0 zombie
%Cpu(s): 1.1 us, 0.5 sy, 0.0 ni, 98.0 id, 0.1 wa, 0.1 hi, 0.2 si, 0.0 st
MiB Mem : 23980.2 total, 6982.2 free, 4027.3 used, 12970.8 buff/cache
MiB Swap: 12056.0 total, 12056.0 free, 0.0 used. 18803.3 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
9137 acme 20 0 2515848 839908 120176 S 3.0 3.4 12:14.64 Web Content
8888 acme 20 0 2261904 562132 147104 S 2.6 2.3 5:24.73 Web Content
2727 acme 20 0 689068 67888 43064 S 1.3 0.3 2:50.71 gnome-terminal-
9093 acme 20 0 2221712 350912 121988 S 1.0 1.4 2:11.17 Web Content
989 avahi 20 0 32776 5116 4496 S 0.7 0.0 0:16.37 avahi-daemon
7922 root 20 0 228896 5148 4268 R 0.7 0.0 0:00.06 top
8786 acme 20 0 2661872 425496 187100 S 0.7 1.7 13:37.34 firefox
8959 acme 20 0 1581656 129824 82692 S 0.7 0.5 0:26.55 WebExtensions
1996 acme 9 -11 2092664 14264 10804 S 0.3 0.1 0:04.55 pulseaudio
2086 acme 20 0 3298140 211612 118636 S 0.3 0.9 6:36.83 gnome-shell
7995 qemu 20 0 5256108 690860 22588 S 0.3 2.8 1:22.68 qemu-system-x86
9050 acme 20 0 2239376 585160 116440 S 0.3 2.4 1:57.69 Web Content
29175 root 20 0 253012 30208 8636 S 0.3 0.1 0:12.01 sssd_kcm
perf top --hierarchy
# perf top --hierarchy
Samples: 4K of event 'cycles:ppp', 4000 Hz, Event count (approx.): 1006575623 lost: 0/0
+ 57.16% cc1
+ 19.39% [kernel]
+ 11.64% libc-2.28.so
+ 2.08% perf
+ 1.22% genksyms
+ 1.16% ccache
+ 1.05% as
+ 0.99% ld-2.28.so
+ 0.86% make
+ 0.84% libpixman-1.so.0.34.0
+ 0.65% bash
+ 0.41% libelf-0.174.so
+ 0.32% objtool
+ 0.24% libglib-2.0.so.0.5800.2
+ 0.17% gcc
+ 0.06% perf-8888.map
+ 0.03% [i915]
+ 0.02% [crc32c_intel]
perf top --hierarchy
# perf top --hierarchy
Samples: 4K of event 'cycles:ppp', 4000 Hz, Event count (approx.): 1006575623 lost: 0/0
+ 61.70% cc1
+ 16.39% [kernel]
- 11.66% libc-2.28.so
2.54% [.] _int_malloc
1.51% [.] malloc
1.39% [.] _int_free
0.72% [.] __strcmp_avx2
0.65% [.] cfree@GLIBC_2.2.5
0.51% [.] __strlen_avx2
0.45% [.] __memcmp_avx2_movbe
0.34% [.] __memset_avx2_erms
0.32% [.] malloc_consolidate
0.32% [.] __memchr_avx2
0.30% [.] __memmove_avx_unaligned_erms
0.29% [.] __strncmp_avx2
0.21% [.] getenv
0.15% [.] __dcigettext
+ 1.24% ccache
+ 0.78% ld-2.28.so
perf top --hierarchy
# perf top --hierarchy
Samples: 4K of event 'cycles:ppp', 4000 Hz, Event count (approx.): 1006575623 lost: 0/0 drop: 0/0
+ 61.70% cc1
- 16.39% [kernel]
1.13% [k] native_flush_tlb_one_user
0.56% [k] unmap_page_range
0.48% [k] native_irq_return_iret
0.46% [k] clear_page_erms
0.42% [k] filemap_map_pages
0.40% [k] copy_user_enhanced_fast_string
0.38% [k] sync_regs
0.37% [k] syscall_return_via_sysret
0.37% [k] entry_SYSCALL_64
0.35% [k] common_interrupt
0.33% [k] error_entry
0.31% [k] __list_del_entry_valid
0.30% [k] __handle_mm_fault
0.24% [k] get_page_from_freelist
0.24% [k] __d_lookup_rcu
0.20% [k] vma_interval_tree_insert
0.19% [k] release_pages
0.19% [k] handle_mm_fault
0.19% [k] update_blocked_averages
0.18% [k] avc_has_perm_noaudit
0.15% [k] page_remove_rmap
0.14% [k] __pagevec_lru_add_fn
0.14% [k] _raw_spin_lock
+ 1.29% genksyms
+ 1.24% ccache
+ 0.78% ld-2.28.so
Looking for help with substrings
# perf top -h percent
Usage: perf top [<options>]
--call-graph <record_mode[,record_size],print_type,threshold[,print_limit],order,sort_key[,branch],value>
setup and enables call-graph (stack chain/backtrace):
record_mode: call graph recording mode (fp|dwarf|lbr)
record_size: if record_mode is 'dwarf', max size of stack recording (<bytes>)
default: 8192 (bytes)
print_type: call graph printing style (graph|flat|fractal|folded|none)
threshold: minimum call graph inclusion threshold (<percent>)
print_limit: maximum number of call graph entry (<number>)
order: call graph order (caller|callee)
sort_key: call graph sort key (function|address)
branch: include last branch info to call graph (branch)
value: call graph value (percent|period|count)
Default: fp,graph,0.5,caller,function
--percent-limit <percent>
Don't show entries under that percent
--percentage
How to display percentage of filtered entries
#
gdb
- Stops arbitrary code
- kgdb for kernel, userspace with gdb
- Breakpoints
- Allows reading variable values
- Watchpoints
- Backtraces
- Debugging interactivity
- ptrace
perf probe
- Hooks arbitrary code
- Kernel or userspace
- kprobes, uprobes or ftrace hooks
- No interactivity
- Backtraces
- no ptrace
- Collects variables
perf probe + perf stat
# perf probe -x /lib64/libc-2.28.so malloc
Added new event:
probe_libc:malloc (on malloc in /usr/lib64/libc-2.28.so)
You can now use it in all perf tools, such as:
perf record -e probe_libc:malloc -aR sleep 1
# perf stat -e probe_libc:malloc sleep 1
Performance counter stats for 'sleep 1':
31 probe_libc:malloc
1.001535345 seconds time elapsed
0.001071000 seconds user
0.000000000 seconds sys
#
perf stat + perf probe
- Defines a new event
- Counts it like any other event
- Event remains available
- But inactive when not used
- Until rebooting
Where can we put probes?
# perf probe -L icmp_rcv
...
52 if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
/*
* RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
* silently ignored (we let user decide with a sysctl).
* RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
* discarded if to broadcast/multicast.
*/
59 if ((icmph->type == ICMP_ECHO ||
60 icmph->type == ICMP_TIMESTAMP) &&
net->ipv4.sysctl_icmp_echo_ignore_broadcasts) {
goto error;
}
64 if (icmph->type != ICMP_ECHO &&
icmph->type != ICMP_TIMESTAMP &&
66 icmph->type != ICMP_ADDRESS &&
icmph->type != ICMP_ADDRESSREPLY) {
goto error;
}
}
...
New probe: rx of broadcast packet
- See the source code for the kernel
- Works for userspace as well: uprobes
- With offsets in the lines
- Where probes can be put
Adding the probe
# perf probe icmp_rcv:59
Added new events:
probe:icmp_rcv (on icmp_rcv:59)
probe:icmp_rcv_1 (on icmp_rcv:59)
You can now use it in all perf tools, such as:
perf record -e probe:icmp_rcv_1 -aR sleep 1
#
Tracing those events
# On window 1:
# ping -c 2 -b 127.255.255.255
WARNING: pinging broadcast address
PING 127.255.255.255 (127.255.255.255) 56(84) bytes of data.
--- 127.255.255.255 ping statistics ---
2 packets transmitted, 0 received, 100% packet loss, time 54ms
#
# On window 2:
# perf trace -e probe:icmp_rcv*
0.000 ping/2126 probe:icmp_rcv_1:(ffffffff9186a9d2)
0.002 ping/2126 probe:icmp_rcv:(ffffffff9186a9d2)
1052.532 ping/2126 probe:icmp_rcv_1:(ffffffff9186a9d2)
1052.534 ping/2126 probe:icmp_rcv:(ffffffff9186a9d2)
^C#
# perf trace --call-graph=dwarf -e probe:icmp_rcv*
0.000 ping/28036 probe:icmp_rcv_1:(ffffffff9186a9d2)
icmp_rcv ([kernel.kallsyms])
ip_local_deliver_finish ([kernel.kallsyms])
ip_local_deliver ([kernel.kallsyms])
ip_rcv ([kernel.kallsyms])
__netif_receive_skb_one_core ([kernel.kallsyms])
process_backlog ([kernel.kallsyms])
net_rx_action ([kernel.kallsyms])
__softirqentry_text_start ([kernel.kallsyms])
do_softirq_own_stack ([kernel.kallsyms])
do_softirq.part.15 ([kernel.kallsyms])
__local_bh_enable_ip ([kernel.kallsyms])
ip_finish_output2 ([kernel.kallsyms])
ip_output ([kernel.kallsyms])
ip_send_skb ([kernel.kallsyms])
raw_sendmsg ([kernel.kallsyms])
sock_sendmsg ([kernel.kallsyms])
__sys_sendto ([kernel.kallsyms])
__x64_sys_sendto ([kernel.kallsyms])
do_syscall_64 ([kernel.kallsyms])
entry_SYSCALL_64_after_hwframe ([kernel.kallsyms])
__libc_sendto (/usr/lib64/libc-2.28.so)
ping4_send_probe (/usr/bin/ping)
pinger (/usr/bin/ping)
main_loop (/usr/bin/ping)
ping4_run (/usr/bin/ping)
main (/usr/bin/ping)
__libc_start_main (/usr/lib64/libc-2.28.so)
_start (/usr/bin/ping)
strace overhead: baseline
# dd if=/dev/zero of=/dev/null bs=1 count=5000k
5120000+0 records in
5120000+0 records out
5120000 bytes (5.1 MB, 4.9 MiB) copied, 2.98647 s, 1.7 MB/s
#
strace overhead
# strace -eaccept dd if=/dev/zero of=/dev/null bs=1 count=5000k
5120000+0 records in
5120000+0 records out
5120000 bytes (5.1 MB, 4.9 MiB) copied, 74.5371 s, 68.7 kB/s
+++ exited with 0 +++
#
strace
- 25 times slower
- Worst case
- dd emits syscalls as fast as it can
- Brendan Gregg's article mentions 442x slow down, same example (2014)
- ptrace pauses the application twice
- syscall enter and exit
- www.brendangregg.com/blog/2014-05-11/strace-wow-much-syscall.html
perf trace overhead
# perf trace -e accept dd if=/dev/zero of=/dev/null bs=1 count=5000k
5120000+0 records in
5120000+0 records out
5120000 bytes (5.1 MB, 4.9 MiB) copied, 4.22042 s, 1.2 MB/s
#
perf trace
- 1.41 times slower
- Again worst case
- dd emits syscalls as fast as it can
- No workload stop at syscall enter/exit
- Collects info at syscall enter and exit
- Mix with other tracepoints, kprobes, uprobes, etc
- But only integer args
Augmented Syscalls: eBPF
- Hooks into raw_syscalls:sys_enter
- Appends pointer contents to existing payload
- Existing beautifiers changed to use it
- When available, continue working without it
- Filters
- Extend the kernel without writing "kernel" code
stracing the open* syscalls
# perf trace -e open* --max-events=10
clang-6.0/29741 openat(AT_FDCWD, "/tmp/build/perf/util/.parse-events-flex.o.d",
O_CLOEXEC|O_CREAT|O_TRUNC|O_WRONLY, S_IRUGO|S_IWUGO) = 3
sh/29774 openat(AT_FDCWD, "/tmp/build/perf/util/.parse-events-flex.o.tmp",
O_CREAT|O_TRUNC|O_WRONLY, S_IRUGO|S_IWUGO) = 3
fixdep/29774 openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fixdep/29774 openat(AT_FDCWD, "/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
fixdep/29774 openat(AT_FDCWD, "/tmp/build/perf/util/.parse-events-flex.o.d",
O_RDONLY) = 3
rm/29775 openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
rm/29775 openat(AT_FDCWD, "/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
mv/29776 openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
mv/29776 openat(AT_FDCWD, "/lib64/libselinux.so.1", O_RDONLY|O_CLOEXEC) = 3
mv/29776 openat(AT_FDCWD, "/lib64/libacl.so.1", O_RDONLY|O_CLOEXEC) = 3
#
strace skin
# cat ~/.perfconfig
[trace]
add_events = tools/perf/examples/bpf/augmented_raw_syscalls.o
show_zeros = yes
show_duration = no
no_inherit = yes
show_timestamp = no
show_arg_names = no
args_alignment = 40
show_prefix = yes
#
Showing just failures
# perf trace --failure -e close --max-events=10
sh/20674 close(4) = -1 EBADF (Bad file descriptor)
sh/20674 close(3) = -1 EBADF (Bad file descriptor)
sh/20678 close(4) = -1 EBADF (Bad file descriptor)
sh/20678 close(3) = -1 EBADF (Bad file descriptor)
sh/20681 close(4<pipe:[1521159]>) = -1 EBADF (Bad file descriptor)
sh/20681 close(3) = -1 EBADF (Bad file descriptor)
sh/20681 close(5) = -1 EBADF (Bad file descriptor)
sh/20681 close(4) = -1 EBADF (Bad file descriptor)
sh/20690 close(6) = -1 EBADF (Bad file descriptor)
sh/20690 close(5) = -1 EBADF (Bad file descriptor)
#
Augmented Syscalls
- bpf-output event
- bpf_perf_event_output
- bpf_probe_read
- bpf_probe_read_str
Augmenting syscalls
struct bpf_map SEC("maps") __augmented__ = {
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};
int syscall_enter(syscall)(struct syscall_enter_open_args *args)
{
struct augmented_open_args augmented;
probe_read(&augmented.args, sizeof(augmented.args), args);
str_len = probe_read_str(&augmented.filename.value,
sizeof(augmented.filename.value),
args->filename_ptr);
perf_event_output(args, &__augmented__, BPF_F_CURRENT_CPU,
&augmented, sizeof(augmented));
return 0;
}
Augmented Syscalls
# cd tools/perf/examples/bpf/
# perf trace -e augmented_syscalls.c cat hello.c
#include <stdio.h>
int syscall_enter(openat)(void *args)
{
puts("Hello, world\n");
return 0;
}
license(GPL);
0.000 cat/31285 openat(dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC)
0.029 cat/31285 openat(dfd: CWD, filename: /lib64/libc.so.6, flags: CLOEXEC)
0.250 cat/31285 open(filename: /usr/lib/locale/locale-archive, flags: CLOEXEC)
0.300 cat/31285 openat(dfd: CWD, filename: hello.c)
#
Further details: maps
- syscall numbers loaded from userspace
- syscall arg types in another map
- together with how many bytes to copy
- types: kernel BTF section
- syscalls to trace: in a map too (done)
- One kernel .o