perf lab


Tips, tricks and a DB optimization using perf


Arnaldo Carvalho de Melo
Jiří Olša
Joe Mario
{acme,jolsa,jmario}@redhat.com
Red Hat Inc.

The other tools


  • sar, top, stap, etc.
  • All part of a toolchest
  • perf adds to that toolchest
  • Sometimes it does the same job
  • But with more flexibility

vmstat


# vmstat 1
procs -----------memory----------     -swap- ---io--- -system-- ------cpu-----
 r  b   swpd   free   buff  cache     si so  bi    bo  in    cs us sy id wa st
 8  0 2000272 2128480 483836 15791896 7  9   582  992   42   44 39  6 55  0  0
 8  0 2000272 2021504 483852 15802480 0  0   136 2520 8764 3656 84 16  0  0  0
 8  0 2000272 2113768 483876 15804436 0  0    76 2352 8740 3385 83 17  0  0  0
 8  0 2000272 1978844 483876 15812104 0  0   164  260 8657 3070 85 15  0  0  0
^C
#
					

perf stat


# perf stat -a --interval-print 1000 -e context-switches
#        time   counts unit events
  1.000157779    2,627      context-switches
  2.000338693    3,067      context-switches
  3.000500966    3,577      context-switches
  4.000655707    2,976      context-switches
  5.000808482    3,199      context-switches
^C 5.922384403   3,333      context-switches

#
					

perf targets


  • system wide
  • CPU
  • cgroups
  • PID
  • TID
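
Each of these targets maps to a command-line flag; a quick sketch
(the PIDs, TID and cgroup name are made up for illustration):

# perf stat -e cs -a           sleep 1   # system wide
# perf stat -e cs -C 0,2       sleep 1   # only CPUs 0 and 2
# perf stat -e cs -a -G mygrp  sleep 1   # only the "mygrp" cgroup
# perf stat -e cs -p 1234      sleep 1   # an existing PID
# perf stat -e cs -t 1235      sleep 1   # an existing TID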

perf stat with some threads


# perf stat -I 1000 -e cs -t `pidof -S, bash`
#        time          counts unit events
  1.000217556              12      cs
  2.000423764              11      cs
  3.000606747               1      cs
  4.000817161               1      cs
  5.001014081   <not counted>      cs
^C  5.976337203 <not counted>      cs
#
					

Counting SMIs with perf stat

press the mute key (on many laptops this triggers an SMI)

# perf stat -I 1000 -e msr/smi/
#           time             counts unit events
     1.000532552                  0      msr/smi/
     2.001166107                  0      msr/smi/
     3.001675027                  8      msr/smi/
     4.002176974                  0      msr/smi/
     5.002726317                  8      msr/smi/
^C     5.637141913                  0      msr/smi/

#
					

Before and after

Before (caches just dropped)...
# echo 1 > /proc/sys/vm/drop_caches 
# perf stat find ~acme/git/linux > /dev/null

 Performance counter stats for 'find /home/acme/git/linux':

            318.11 msec task-clock                #    0.420 CPUs utilized          
             4,989      context-switches          #    0.016 M/sec                  
                63      cpu-migrations            #    0.198 K/sec                  
               280      page-faults               #    0.880 K/sec                  
       417,896,513      cycles                    #    1.314 GHz                    
       379,620,453      instructions              #    0.91  insn per cycle         
        75,490,217      branches                  #  237.312 M/sec                  
         2,036,356      branch-misses             #    2.70% of all branches        

       0.758165914 seconds time elapsed

       0.073100000 seconds user
       0.241968000 seconds sys
#
					

Before and after

After (caches warm from the previous run)...
# perf stat find ~acme/git/linux > /dev/null

 Performance counter stats for 'find /home/acme/git/linux':

             68.28 msec task-clock                #    0.991 CPUs utilized          
                12      context-switches          #    0.176 K/sec                  
                 0      cpu-migrations            #    0.000 K/sec                  
               274      page-faults               #    0.004 M/sec                  
       265,617,729      cycles                    #    3.890 GHz                    
       310,931,753      instructions              #    1.17  insn per cycle         
        61,838,290      branches                  #  905.609 M/sec                  
         1,508,769      branch-misses             #    2.44% of all branches        

       0.068907995 seconds time elapsed

       0.018829000 seconds user
       0.049568000 seconds sys
#
					
Shell completion

# source tools/perf/perf-completion.sh
# perf stat -e power/energy-<TAB>
power/energy-cores/ power/energy-gpu/ power/energy-pkg/ power/energy-psys/ power/energy-ram/
					
PMU events
  • Try this with -I 1000 (interval mode); a sketch follows the example below
  • And then generate some load to burn power
  • http://git.kernel.org/torvalds/c/4788e5b4b2338f


# perf stat -e power/energy-cores/,power/energy-gpu/,\
             power/energy-pkg/,power/energy-psys/,\
             power/energy-ram/ sleep 1

 Performance counter stats for 'system wide':

              0.83 Joules power/energy-cores/
              0.01 Joules power/energy-gpu/
              1.66 Joules power/energy-pkg/
              5.98 Joules power/energy-psys/
              0.54 Joules power/energy-ram/

       1.001084656 seconds time elapsed

#
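
The -I 1000 variant from the bullet above, a sketch; generate some
CPU-burning load in another terminal and watch the Joules climb:

# perf stat -a -I 1000 -e power/energy-pkg/,power/energy-ram/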
					

top


Tasks: 299 total,   1 running, 297 sleeping,   1 stopped,   0 zombie
%Cpu(s):  1.1 us,  0.5 sy,  0.0 ni, 98.0 id,  0.1 wa,  0.1 hi,  0.2 si,  0.0 st
MiB Mem :  23980.2 total,   6982.2 free,   4027.3 used,  12970.8 buff/cache
MiB Swap:  12056.0 total,  12056.0 free,      0.0 used.  18803.3 avail Mem

  PID USER  PR  NI    VIRT    RES    SHR S %CPU %MEM    TIME+ COMMAND
 9137 acme  20   0 2515848 839908 120176 S  3.0  3.4 12:14.64 Web Content
 8888 acme  20   0 2261904 562132 147104 S  2.6  2.3  5:24.73 Web Content
 2727 acme  20   0  689068  67888  43064 S  1.3  0.3  2:50.71 gnome-terminal-
 9093 acme  20   0 2221712 350912 121988 S  1.0  1.4  2:11.17 Web Content
  989 avahi 20   0   32776   5116   4496 S  0.7  0.0  0:16.37 avahi-daemon
 7922 root  20   0  228896   5148   4268 R  0.7  0.0  0:00.06 top
 8786 acme  20   0 2661872 425496 187100 S  0.7  1.7 13:37.34 firefox
 8959 acme  20   0 1581656 129824  82692 S  0.7  0.5  0:26.55 WebExtensions
 1996 acme   9 -11 2092664  14264  10804 S  0.3  0.1  0:04.55 pulseaudio
 2086 acme  20   0 3298140 211612 118636 S  0.3  0.9  6:36.83 gnome-shell
 7995 qemu  20   0 5256108 690860  22588 S  0.3  2.8  1:22.68 qemu-system-x86
 9050 acme  20   0 2239376 585160 116440 S  0.3  2.4  1:57.69 Web Content
29175 root  20   0  253012  30208   8636 S  0.3  0.1  0:12.01 sssd_kcm
					

perf top --hierarchy


# perf top --hierarchy
Samples: 4K of event 'cycles:ppp', 4000 Hz, Event count (approx.): 1006575623 lost: 0/0
+  57.16%     cc1
+  19.39%     [kernel]
+  11.64%     libc-2.28.so
+   2.08%     perf
+   1.22%     genksyms
+   1.16%     ccache
+   1.05%     as
+   0.99%     ld-2.28.so
+   0.86%     make
+   0.84%     libpixman-1.so.0.34.0
+   0.65%     bash
+   0.41%     libelf-0.174.so
+   0.32%     objtool
+   0.24%     libglib-2.0.so.0.5800.2
+   0.17%     gcc
+   0.06%     perf-8888.map
+   0.03%     [i915]
+   0.02%     [crc32c_intel]
					

perf top --hierarchy


# perf top --hierarchy
Samples: 4K of event 'cycles:ppp', 4000 Hz, Event count (approx.): 1006575623 lost: 0/0
+  61.70%     cc1
+  16.39%     [kernel]
-  11.66%     libc-2.28.so
       2.54%     [.] _int_malloc
       1.51%     [.] malloc
       1.39%     [.] _int_free
       0.72%     [.] __strcmp_avx2
       0.65%     [.] cfree@GLIBC_2.2.5
       0.51%     [.] __strlen_avx2
       0.45%     [.] __memcmp_avx2_movbe
       0.34%     [.] __memset_avx2_erms
       0.32%     [.] malloc_consolidate
       0.32%     [.] __memchr_avx2
       0.30%     [.] __memmove_avx_unaligned_erms
       0.29%     [.] __strncmp_avx2
       0.21%     [.] getenv
       0.15%     [.] __dcigettext
+   1.24%     ccache
+   0.78%     ld-2.28.so
					

perf top --hierarchy


# perf top --hierarchy
Samples: 4K of event 'cycles:ppp', 4000 Hz, Event count (approx.): 1006575623 lost: 0/0 drop: 0/0
+  61.70%     cc1
-  16.39%     [kernel]
       1.13%     [k] native_flush_tlb_one_user
       0.56%     [k] unmap_page_range
       0.48%     [k] native_irq_return_iret
       0.46%     [k] clear_page_erms
       0.42%     [k] filemap_map_pages
       0.40%     [k] copy_user_enhanced_fast_string
       0.38%     [k] sync_regs
       0.37%     [k] syscall_return_via_sysret
       0.37%     [k] entry_SYSCALL_64
       0.35%     [k] common_interrupt
       0.33%     [k] error_entry
       0.31%     [k] __list_del_entry_valid
       0.30%     [k] __handle_mm_fault
       0.24%     [k] get_page_from_freelist
       0.24%     [k] __d_lookup_rcu
       0.20%     [k] vma_interval_tree_insert
       0.19%     [k] release_pages
       0.19%     [k] handle_mm_fault
       0.19%     [k] update_blocked_averages
       0.18%     [k] avc_has_perm_noaudit
       0.15%     [k] page_remove_rmap
       0.14%     [k] __pagevec_lru_add_fn
       0.14%     [k] _raw_spin_lock
+   1.29%     genksyms
+   1.24%     ccache
+   0.78%     ld-2.28.so
					
Looking for help with substrings
# perf top -h percent
 Usage: perf top [<options>]
    --call-graph <record_mode[,record_size],print_type,threshold[,print_limit],order,sort_key[,branch],value>
        setup and enables call-graph (stack chain/backtrace):

          record_mode: call graph recording mode (fp|dwarf|lbr)
          record_size: if record_mode is 'dwarf', max size of stack recording (<bytes>)
                               default: 8192 (bytes)
          print_type:  call graph printing style (graph|flat|fractal|folded|none)
          threshold:   minimum call graph inclusion threshold (<percent>)
          print_limit: maximum number of call graph entry (<number>)
          order:       call graph order (caller|callee)
          sort_key:    call graph sort key (function|address)
          branch:      include last branch info to call graph (branch)
          value:       call graph value (percent|period|count)

          Default: fp,graph,0.5,caller,function
    --percent-limit <float>
          Don't show entries under that percent
    --percentage <relative|absolute>
          How to display percentage of filtered entries

#
					

gdb



  • Stops at arbitrary code locations
  • kgdb for the kernel, gdb for userspace
  • Breakpoints
  • Allows reading variable values
  • Watchpoints
  • Backtraces
  • Interactive debugging
  • Uses ptrace

perf probe



  • Hooks arbitrary code locations
  • Kernel or userspace
  • kprobes, uprobes or ftrace hooks
  • No interactivity
  • Backtraces
  • No ptrace
  • Collects variables (see the sketch below)
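
"Collects variables" in practice, a sketch: assuming glibc debuginfo
is installed, this also records malloc's size argument (named "bytes"
in the glibc sources) on every hit:

# perf probe -x /lib64/libc-2.28.so 'malloc bytes'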

perf probe + perf stat


# perf probe -x /lib64/libc-2.28.so malloc
Added new event:
  probe_libc:malloc    (on malloc in /usr/lib64/libc-2.28.so)

You can now use it in all perf tools, such as:

	perf record -e probe_libc:malloc -aR sleep 1

# perf stat -e probe_libc:malloc sleep 1

 Performance counter stats for 'sleep 1':

                31      probe_libc:malloc

       1.001535345 seconds time elapsed

       0.001071000 seconds user
       0.000000000 seconds sys
#
					

perf stat + perf probe



  • Defines a new event
  • Counts it like any other event
  • The event remains available afterwards
  • But is inactive when not used
  • Until reboot, or until deleted (see below)
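
Listing and deleting them needs no reboot, e.g.:

# perf probe --list                 # events currently defined
# perf probe --del 'probe_libc:*'   # remove when no longer wanted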

Where can we put probes?


# perf probe -L icmp_rcv
...
     52         if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                        /*
                         *      RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
                         *        silently ignored (we let user decide with a sysctl).
                         *      RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
                         *        discarded if to broadcast/multicast.
                         */
     59                 if ((icmph->type == ICMP_ECHO ||
     60                      icmph->type == ICMP_TIMESTAMP) &&
                            net->ipv4.sysctl_icmp_echo_ignore_broadcasts) {
                                goto error;
                        }
     64                 if (icmph->type != ICMP_ECHO &&
                            icmph->type != ICMP_TIMESTAMP &&
     66                     icmph->type != ICMP_ADDRESS &&
                            icmph->type != ICMP_ADDRESSREPLY) {
                                goto error;
                        }
                }

...
					

New probe: rx of broadcast packet



  • perf probe -L shows the kernel source code
  • With offsets marking the lines where probes can be put
  • Works for userspace as well, via uprobes (sketch below)
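
The same annotated listing works on binaries, a sketch (assuming
glibc debuginfo is installed):

# perf probe -x /lib64/libc-2.28.so -L malloc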

Adding the probe


# perf probe icmp_rcv:59
Added new events:
  probe:icmp_rcv       (on icmp_rcv:59)
  probe:icmp_rcv_1     (on icmp_rcv:59)

You can now use it in all perf tools, such as:

	perf record -e probe:icmp_rcv_1 -aR sleep 1

#
					

Tracing those events


# On window 1:
# ping -c 2 -b 127.255.255.255
WARNING: pinging broadcast address
PING 127.255.255.255 (127.255.255.255) 56(84) bytes of data.
--- 127.255.255.255 ping statistics ---
2 packets transmitted, 0 received, 100% packet loss, time 54ms
#

# On window 2:
# perf trace -e probe:icmp_rcv*
     0.000 ping/2126 probe:icmp_rcv_1:(ffffffff9186a9d2)
     0.002 ping/2126 probe:icmp_rcv:(ffffffff9186a9d2)
  1052.532 ping/2126 probe:icmp_rcv_1:(ffffffff9186a9d2)
  1052.534 ping/2126 probe:icmp_rcv:(ffffffff9186a9d2)
^C#
					
# perf trace --call-graph=dwarf -e probe:icmp_rcv*
     0.000 ping/28036 probe:icmp_rcv_1:(ffffffff9186a9d2)
                         icmp_rcv ([kernel.kallsyms])
                         ip_local_deliver_finish ([kernel.kallsyms])
                         ip_local_deliver ([kernel.kallsyms])
                         ip_rcv ([kernel.kallsyms])
                         __netif_receive_skb_one_core ([kernel.kallsyms])
                         process_backlog ([kernel.kallsyms])
                         net_rx_action ([kernel.kallsyms])
                         __softirqentry_text_start ([kernel.kallsyms])
                         do_softirq_own_stack ([kernel.kallsyms])
                         do_softirq.part.15 ([kernel.kallsyms])
                         __local_bh_enable_ip ([kernel.kallsyms])
                         ip_finish_output2 ([kernel.kallsyms])
                         ip_output ([kernel.kallsyms])
                         ip_send_skb ([kernel.kallsyms])
                         raw_sendmsg ([kernel.kallsyms])
                         sock_sendmsg ([kernel.kallsyms])
                         __sys_sendto ([kernel.kallsyms])
                         __x64_sys_sendto ([kernel.kallsyms])
                         do_syscall_64 ([kernel.kallsyms])
                         entry_SYSCALL_64_after_hwframe ([kernel.kallsyms])
                         __libc_sendto (/usr/lib64/libc-2.28.so)
                         ping4_send_probe (/usr/bin/ping)
                         pinger (/usr/bin/ping)
                         main_loop (/usr/bin/ping)
                         ping4_run (/usr/bin/ping)
                         main (/usr/bin/ping)
                         __libc_start_main (/usr/lib64/libc-2.28.so)
                         _start (/usr/bin/ping)
					

strace overhead: baseline


# dd if=/dev/zero of=/dev/null bs=1 count=5000k
5120000+0 records in
5120000+0 records out
5120000 bytes (5.1 MB, 4.9 MiB) copied, 2.98647 s, 1.7 MB/s
#
					

strace overhead


# strace -eaccept dd if=/dev/zero of=/dev/null bs=1 count=5000k
5120000+0 records in
5120000+0 records out
5120000 bytes (5.1 MB, 4.9 MiB) copied, 74.5371 s, 68.7 kB/s
+++ exited with 0 +++
#
					

strace


  • 25 times slower (74.5 s vs. 2.99 s)
  • The worst case: dd emits syscalls as fast as it can
  • Brendan Gregg's article reports a 442x slowdown for the same example (2014)
  • ptrace stops the application twice per syscall
  • Once at syscall enter, once at exit
  • www.brendangregg.com/blog/2014-05-11/strace-wow-much-syscall.html

perf trace overhead

# perf trace -e accept dd if=/dev/zero of=/dev/null bs=1 count=5000k
5120000+0 records in
5120000+0 records out
5120000 bytes (5.1 MB, 4.9 MiB) copied, 4.22042 s, 1.2 MB/s
#
					

perf trace


  • Only 1.41 times slower (4.22 s vs. 2.99 s)
  • Again the worst case: dd emits syscalls as fast as it can
  • The workload is not stopped at syscall enter/exit
  • Info is still collected at syscall enter and exit
  • Mix with other tracepoints, kprobes, uprobes, etc. (sketch below)
  • But only integer args get decoded
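
Mixing a syscall with the uprobe defined earlier, a sketch:

# perf trace -e connect,probe_libc:malloc --max-events=8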

Augmented Syscalls: eBPF


  • Hooks into raw_syscalls:sys_enter
  • Appends pointer contents to existing payload
  • Existing beautifiers changed to use it when available
  • And to keep working without it
  • Filters
  • Extend the kernel without writing "kernel" code

stracing the open* syscalls


# perf trace -e open* --max-events=10
clang-6.0/29741 openat(AT_FDCWD, "/tmp/build/perf/util/.parse-events-flex.o.d",
                       O_CLOEXEC|O_CREAT|O_TRUNC|O_WRONLY, S_IRUGO|S_IWUGO) = 3
sh/29774 openat(AT_FDCWD, "/tmp/build/perf/util/.parse-events-flex.o.tmp",
                O_CREAT|O_TRUNC|O_WRONLY, S_IRUGO|S_IWUGO) = 3
fixdep/29774 openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fixdep/29774 openat(AT_FDCWD, "/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
fixdep/29774 openat(AT_FDCWD, "/tmp/build/perf/util/.parse-events-flex.o.d",
                    O_RDONLY) = 3
rm/29775 openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
rm/29775 openat(AT_FDCWD, "/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
mv/29776 openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
mv/29776 openat(AT_FDCWD, "/lib64/libselinux.so.1", O_RDONLY|O_CLOEXEC) = 3
mv/29776 openat(AT_FDCWD, "/lib64/libacl.so.1", O_RDONLY|O_CLOEXEC) = 3
#
					

strace skin


# cat ~/.perfconfig
[trace]
	add_events = tools/perf/examples/bpf/augmented_raw_syscalls.o
	show_zeros = yes
	show_duration = no
	no_inherit = yes
	show_timestamp = no
	show_arg_names = no
	args_alignment = 40
	show_prefix = yes
#
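
With that config in place a plain perf trace run reads like strace,
e.g. (any small workload will do):

# perf trace --max-events=8 cat /etc/passwd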
					

Showing just failures


# perf trace --failure -e close --max-events=10
sh/20674 close(4)                   = -1 EBADF (Bad file descriptor)
sh/20674 close(3)                   = -1 EBADF (Bad file descriptor)
sh/20678 close(4)                   = -1 EBADF (Bad file descriptor)
sh/20678 close(3)                   = -1 EBADF (Bad file descriptor)
sh/20681 close(4<pipe:[1521159]>)   = -1 EBADF (Bad file descriptor)
sh/20681 close(3)                   = -1 EBADF (Bad file descriptor)
sh/20681 close(5)                   = -1 EBADF (Bad file descriptor)
sh/20681 close(4)                   = -1 EBADF (Bad file descriptor)
sh/20690 close(6)                   = -1 EBADF (Bad file descriptor)
sh/20690 close(5)                   = -1 EBADF (Bad file descriptor)
#
					

Augmented Syscalls


  • bpf-output event
  • bpf_perf_event_output
  • bpf_probe_read
  • bpf_probe_read_str

Augmenting syscalls


struct bpf_map SEC("maps") __augmented__ = {
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};

/* syscall_enter() is a perf helper macro that places the function in
 * the ELF section hooking the given syscall's enter tracepoint */
int syscall_enter(open)(struct syscall_enter_open_args *args)
{
	struct augmented_open_args augmented;
	int str_len;

	/* copy the integer args the tracepoint already carries... */
	probe_read(&augmented.args, sizeof(augmented.args), args);
	/* ...then append the string the filename pointer points to */
	str_len = probe_read_str(&augmented.filename.value,
				 sizeof(augmented.filename.value),
				 args->filename_ptr);
	/* ship it to userspace via the bpf-output event, sized to what
	 * was actually copied */
	perf_event_output(args, &__augmented__, BPF_F_CURRENT_CPU, &augmented,
			  sizeof(augmented) - sizeof(augmented.filename.value) + str_len);
	return 0;
}
					

Augmented Syscalls


# cd tools/perf/examples/bpf/
# perf trace -e augmented_syscalls.c cat hello.c 
#include <stdio.h>

int syscall_enter(openat)(void *args)
{
	puts("Hello, world\n");
	return 0;
}

license(GPL);
 0.000 cat/31285 openat(dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC)
 0.029 cat/31285 openat(dfd: CWD, filename: /lib64/libc.so.6, flags: CLOEXEC)
 0.250 cat/31285 open(filename: /usr/lib/locale/locale-archive, flags: CLOEXEC)
 0.300 cat/31285 openat(dfd: CWD, filename: hello.c)
#
					

Further details: maps


  • syscall numbers are loaded from userspace into a map
  • syscall arg types go in another map
  • together with how many bytes to copy
  • types: from the kernel's BTF section
  • syscalls to trace: in a map too (done, sketch below)
  • One kernel .o for all of it
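
A minimal sketch of that "syscalls to trace" map, modeled on
tools/perf/examples/bpf/augmented_raw_syscalls.c (sizes illustrative):

struct bpf_map SEC("maps") syscalls = {
	.type	      = BPF_MAP_TYPE_ARRAY,
	.key_size     = sizeof(int),	/* syscall number... */
	.value_size   = sizeof(bool),	/* ...mapped to "trace it?" */
	.max_entries  = 512,
};

/* in the sys_enter hook, bail out early for unwanted syscalls: */
bool *enabled = bpf_map_lookup_elem(&syscalls, &args->syscall_nr);
if (enabled == NULL || !*enabled)
	return 0;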

THE END


  • http://www.brendangregg.com/perf.html