perf lab


Tips, tricks and a DB optimization using perf


Arnaldo Carvalho de Melo
Jiří Olša
Joe Mario
{acme,jolsa,jmario}@redhat.com
Red Hat Inc.

The other tools


  • sar, top, stap, etc.
  • All part of a toolchest
  • perf adds to that toolchest
  • Sometimes it does the same job
  • But with more flexibility

vmstat


# vmstat 1
procs -----------memory----------     -swap- ---io--- -system-- ------cpu-----
 r  b   swpd   free   buff  cache     si so  bi    bo  in    cs us sy id wa st
 8  0 2000272 2128480 483836 15791896 7  9   582  992   42   44 39  6 55  0  0
 8  0 2000272 2021504 483852 15802480 0  0   136 2520 8764 3656 84 16  0  0  0
 8  0 2000272 2113768 483876 15804436 0  0    76 2352 8740 3385 83 17  0  0  0
 8  0 2000272 1978844 483876 15812104 0  0   164  260 8657 3070 85 15  0  0  0
^C
#
					

perf stat


# perf stat -a --interval-print 1000 -e context-switches
#        time   counts unit events
  1.000157779    2,627      context-switches
  2.000338693    3,067      context-switches
  3.000500966    3,577      context-switches
  4.000655707    2,976      context-switches
  5.000808482    3,199      context-switches
^C 5.922384403   3,333      context-switches

#
					

perf targets


  • system wide
  • CPU
  • cgroups
  • PID
  • TID
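
Each of these targets maps to a command-line flag; a quick sketch
(the PIDs, TID and cgroup name are made up for illustration):

# perf stat -e cs -a           sleep 1   # system wide
# perf stat -e cs -C 0,2       sleep 1   # only CPUs 0 and 2
# perf stat -e cs -a -G mygrp  sleep 1   # only the "mygrp" cgroup
# perf stat -e cs -p 1234      sleep 1   # an existing PID
# perf stat -e cs -t 1235      sleep 1   # an existing TID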

perf stat with some threads


# perf stat -I 1000 -e cs -t `pidof -S, bash`
#        time          counts unit events
  1.000217556              12      cs
  2.000423764              11      cs
  3.000606747               1      cs
  4.000817161               1      cs
  5.001014081   <not counted>      cs
^C  5.976337203 <not counted>      cs
#
					

Counting SMIs with perf stat

press the mute key (on many laptops this triggers an SMI)

# perf stat -I 1000 -e msr/smi/
#           time             counts unit events
     1.000532552                  0      msr/smi/
     2.001166107                  0      msr/smi/
     3.001675027                  8      msr/smi/
     4.002176974                  0      msr/smi/
     5.002726317                  8      msr/smi/
^C     5.637141913                  0      msr/smi/

#
					

Before and after

Before (caches just dropped)...
# echo 1 > /proc/sys/vm/drop_caches 
# perf stat find ~acme/git/linux > /dev/null

 Performance counter stats for 'find /home/acme/git/linux':

            318.11 msec task-clock                #    0.420 CPUs utilized          
             4,989      context-switches          #    0.016 M/sec                  
                63      cpu-migrations            #    0.198 K/sec                  
               280      page-faults               #    0.880 K/sec                  
       417,896,513      cycles                    #    1.314 GHz                    
       379,620,453      instructions              #    0.91  insn per cycle         
        75,490,217      branches                  #  237.312 M/sec                  
         2,036,356      branch-misses             #    2.70% of all branches        

       0.758165914 seconds time elapsed

       0.073100000 seconds user
       0.241968000 seconds sys
#
					

Before and after

After (caches warm from the previous run)...
# perf stat find ~acme/git/linux > /dev/null

 Performance counter stats for 'find /home/acme/git/linux':

             68.28 msec task-clock                #    0.991 CPUs utilized          
                12      context-switches          #    0.176 K/sec                  
                 0      cpu-migrations            #    0.000 K/sec                  
               274      page-faults               #    0.004 M/sec                  
       265,617,729      cycles                    #    3.890 GHz                    
       310,931,753      instructions              #    1.17  insn per cycle         
        61,838,290      branches                  #  905.609 M/sec                  
         1,508,769      branch-misses             #    2.44% of all branches        

       0.068907995 seconds time elapsed

       0.018829000 seconds user
       0.049568000 seconds sys
#
					
Shell completion

# source tools/perf/perf-completion.sh
# perf stat -e power/energy-<TAB>
power/energy-cores/ power/energy-gpu/ power/energy-pkg/ power/energy-psys/ power/energy-ram/
					
PMU events
  • Try this with -I 1000 (interval mode); a sketch follows the example below
  • And then generate some load to burn power
  • http://git.kernel.org/torvalds/c/4788e5b4b2338f


# perf stat -e power/energy-cores/,power/energy-gpu/,\
             power/energy-pkg/,power/energy-psys/,\
             power/energy-ram/ sleep 1

 Performance counter stats for 'system wide':

              0.83 Joules power/energy-cores/
              0.01 Joules power/energy-gpu/
              1.66 Joules power/energy-pkg/
              5.98 Joules power/energy-psys/
              0.54 Joules power/energy-ram/

       1.001084656 seconds time elapsed

#
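
The -I 1000 variant from the bullet above, a sketch; generate some
CPU-burning load in another terminal and watch the Joules climb:

# perf stat -a -I 1000 -e power/energy-pkg/,power/energy-ram/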
					

top


Tasks: 299 total,   1 running, 297 sleeping,   1 stopped,   0 zombie
%Cpu(s):  1.1 us,  0.5 sy,  0.0 ni, 98.0 id,  0.1 wa,  0.1 hi,  0.2 si,  0.0 st
MiB Mem :  23980.2 total,   6982.2 free,   4027.3 used,  12970.8 buff/cache
MiB Swap:  12056.0 total,  12056.0 free,      0.0 used.  18803.3 avail Mem

  PID USER  PR  NI    VIRT    RES    SHR S %CPU %MEM    TIME+ COMMAND
 9137 acme  20   0 2515848 839908 120176 S  3.0  3.4 12:14.64 Web Content
 8888 acme  20   0 2261904 562132 147104 S  2.6  2.3  5:24.73 Web Content
 2727 acme  20   0  689068  67888  43064 S  1.3  0.3  2:50.71 gnome-terminal-
 9093 acme  20   0 2221712 350912 121988 S  1.0  1.4  2:11.17 Web Content
  989 avahi 20   0   32776   5116   4496 S  0.7  0.0  0:16.37 avahi-daemon
 7922 root  20   0  228896   5148   4268 R  0.7  0.0  0:00.06 top
 8786 acme  20   0 2661872 425496 187100 S  0.7  1.7 13:37.34 firefox
 8959 acme  20   0 1581656 129824  82692 S  0.7  0.5  0:26.55 WebExtensions
 1996 acme   9 -11 2092664  14264  10804 S  0.3  0.1  0:04.55 pulseaudio
 2086 acme  20   0 3298140 211612 118636 S  0.3  0.9  6:36.83 gnome-shell
 7995 qemu  20   0 5256108 690860  22588 S  0.3  2.8  1:22.68 qemu-system-x86
 9050 acme  20   0 2239376 585160 116440 S  0.3  2.4  1:57.69 Web Content
29175 root  20   0  253012  30208   8636 S  0.3  0.1  0:12.01 sssd_kcm
					

perf top --hierarchy


# perf top --hierarchy
Samples: 4K of event 'cycles:ppp', 4000 Hz, Event count (approx.): 1006575623 lost: 0/0
+  57.16%     cc1
+  19.39%     [kernel]
+  11.64%     libc-2.28.so
+   2.08%     perf
+   1.22%     genksyms
+   1.16%     ccache
+   1.05%     as
+   0.99%     ld-2.28.so
+   0.86%     make
+   0.84%     libpixman-1.so.0.34.0
+   0.65%     bash
+   0.41%     libelf-0.174.so
+   0.32%     objtool
+   0.24%     libglib-2.0.so.0.5800.2
+   0.17%     gcc
+   0.06%     perf-8888.map
+   0.03%     [i915]
+   0.02%     [crc32c_intel]
					

perf top --hierarchy


# perf top --hierarchy
Samples: 4K of event 'cycles:ppp', 4000 Hz, Event count (approx.): 1006575623 lost: 0/0
+  61.70%     cc1
+  16.39%     [kernel]
-  11.66%     libc-2.28.so
       2.54%     [.] _int_malloc
       1.51%     [.] malloc
       1.39%     [.] _int_free
       0.72%     [.] __strcmp_avx2
       0.65%     [.] cfree@GLIBC_2.2.5
       0.51%     [.] __strlen_avx2
       0.45%     [.] __memcmp_avx2_movbe
       0.34%     [.] __memset_avx2_erms
       0.32%     [.] malloc_consolidate
       0.32%     [.] __memchr_avx2
       0.30%     [.] __memmove_avx_unaligned_erms
       0.29%     [.] __strncmp_avx2
       0.21%     [.] getenv
       0.15%     [.] __dcigettext
+   1.24%     ccache
+   0.78%     ld-2.28.so
					

perf top --hierarchy


# perf top --hierarchy
Samples: 4K of event 'cycles:ppp', 4000 Hz, Event count (approx.): 1006575623 lost: 0/0 drop: 0/0
+  61.70%     cc1
-  16.39%     [kernel]
       1.13%     [k] native_flush_tlb_one_user
       0.56%     [k] unmap_page_range
       0.48%     [k] native_irq_return_iret
       0.46%     [k] clear_page_erms
       0.42%     [k] filemap_map_pages
       0.40%     [k] copy_user_enhanced_fast_string
       0.38%     [k] sync_regs
       0.37%     [k] syscall_return_via_sysret
       0.37%     [k] entry_SYSCALL_64
       0.35%     [k] common_interrupt
       0.33%     [k] error_entry
       0.31%     [k] __list_del_entry_valid
       0.30%     [k] __handle_mm_fault
       0.24%     [k] get_page_from_freelist
       0.24%     [k] __d_lookup_rcu
       0.20%     [k] vma_interval_tree_insert
       0.19%     [k] release_pages
       0.19%     [k] handle_mm_fault
       0.19%     [k] update_blocked_averages
       0.18%     [k] avc_has_perm_noaudit
       0.15%     [k] page_remove_rmap
       0.14%     [k] __pagevec_lru_add_fn
       0.14%     [k] _raw_spin_lock
+   1.29%     genksyms
+   1.24%     ccache
+   0.78%     ld-2.28.so
					
Looking for help with substrings
# perf top -h percent
 Usage: perf top [<options>]
    --call-graph <record_mode[,record_size],print_type,threshold[,print_limit],order,sort_key[,branch],value>
        setup and enables call-graph (stack chain/backtrace):

          record_mode: call graph recording mode (fp|dwarf|lbr)
          record_size: if record_mode is 'dwarf', max size of stack recording (<bytes>)
                               default: 8192 (bytes)
          print_type:  call graph printing style (graph|flat|fractal|folded|none)
          threshold:   minimum call graph inclusion threshold (<percent>)
          print_limit: maximum number of call graph entry (<number>)
          order:       call graph order (caller|callee)
          sort_key:    call graph sort key (function|address)
          branch:      include last branch info to call graph (branch)
          value:       call graph value (percent|period|count)

          Default: fp,graph,0.5,caller,function
    --percent-limit <float>
          Don't show entries under that percent
    --percentage <relative|absolute>
          How to display percentage of filtered entries

#
					

gdb



  • Stops at arbitrary code locations
  • kgdb for the kernel, gdb for userspace
  • Breakpoints
  • Allows reading variable values
  • Watchpoints
  • Backtraces
  • Interactive debugging
  • Uses ptrace

perf probe



  • Hooks arbitrary code locations
  • Kernel or userspace
  • kprobes, uprobes or ftrace hooks
  • No interactivity
  • Backtraces
  • No ptrace
  • Collects variables (see the sketch below)
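
"Collects variables" in practice, a sketch: assuming glibc debuginfo
is installed, this also records malloc's size argument (named "bytes"
in the glibc sources) on every hit:

# perf probe -x /lib64/libc-2.28.so 'malloc bytes'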

perf probe + perf stat


# perf probe -x /lib64/libc-2.28.so malloc
Added new event:
  probe_libc:malloc    (on malloc in /usr/lib64/libc-2.28.so)

You can now use it in all perf tools, such as:

	perf record -e probe_libc:malloc -aR sleep 1

# perf stat -e probe_libc:malloc sleep 1

 Performance counter stats for 'sleep 1':

                31      probe_libc:malloc

       1.001535345 seconds time elapsed

       0.001071000 seconds user
       0.000000000 seconds sys
#
					

perf stat + perf probe



  • Defines a new event
  • Counts it like any other event
  • The event remains available afterwards
  • But is inactive when not used
  • Until reboot, or until deleted (see below)
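
Listing and deleting them needs no reboot, e.g.:

# perf probe --list                 # events currently defined
# perf probe --del 'probe_libc:*'   # remove when no longer wanted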

Where can we put probes?


# perf probe -L icmp_rcv
...
     52         if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                        /*
                         *      RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
                         *        silently ignored (we let user decide with a sysctl).
                         *      RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
                         *        discarded if to broadcast/multicast.
                         */
     59                 if ((icmph->type == ICMP_ECHO ||
     60                      icmph->type == ICMP_TIMESTAMP) &&
                            net->ipv4.sysctl_icmp_echo_ignore_broadcasts) {
                                goto error;
                        }
     64                 if (icmph->type != ICMP_ECHO &&
                            icmph->type != ICMP_TIMESTAMP &&
     66                     icmph->type != ICMP_ADDRESS &&
                            icmph->type != ICMP_ADDRESSREPLY) {
                                goto error;
                        }
                }

...
					

New probe: rx of broadcast packet



  • perf probe -L shows the kernel source code
  • With offsets marking the lines where probes can be put
  • Works for userspace as well, via uprobes (sketch below)
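
The same annotated listing works on binaries, a sketch (assuming
glibc debuginfo is installed):

# perf probe -x /lib64/libc-2.28.so -L malloc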

Adding the probe


# perf probe icmp_rcv:59
Added new events:
  probe:icmp_rcv       (on icmp_rcv:59)
  probe:icmp_rcv_1     (on icmp_rcv:59)

You can now use it in all perf tools, such as:

	perf record -e probe:icmp_rcv_1 -aR sleep 1

#
					

Tracing those events


# On window 1:
# ping -c 2 -b 127.255.255.255
WARNING: pinging broadcast address
PING 127.255.255.255 (127.255.255.255) 56(84) bytes of data.
--- 127.255.255.255 ping statistics ---
2 packets transmitted, 0 received, 100% packet loss, time 54ms
#

# On window 2:
# perf trace -e probe:icmp_rcv*
     0.000 ping/2126 probe:icmp_rcv_1:(ffffffff9186a9d2)
     0.002 ping/2126 probe:icmp_rcv:(ffffffff9186a9d2)
  1052.532 ping/2126 probe:icmp_rcv_1:(ffffffff9186a9d2)
  1052.534 ping/2126 probe:icmp_rcv:(ffffffff9186a9d2)
^C#
					
# perf trace --call-graph=dwarf -e probe:icmp_rcv*
     0.000 ping/28036 probe:icmp_rcv_1:(ffffffff9186a9d2)
                         icmp_rcv ([kernel.kallsyms])
                         ip_local_deliver_finish ([kernel.kallsyms])
                         ip_local_deliver ([kernel.kallsyms])
                         ip_rcv ([kernel.kallsyms])
                         __netif_receive_skb_one_core ([kernel.kallsyms])
                         process_backlog ([kernel.kallsyms])
                         net_rx_action ([kernel.kallsyms])
                         __softirqentry_text_start ([kernel.kallsyms])
                         do_softirq_own_stack ([kernel.kallsyms])
                         do_softirq.part.15 ([kernel.kallsyms])
                         __local_bh_enable_ip ([kernel.kallsyms])
                         ip_finish_output2 ([kernel.kallsyms])
                         ip_output ([kernel.kallsyms])
                         ip_send_skb ([kernel.kallsyms])
                         raw_sendmsg ([kernel.kallsyms])
                         sock_sendmsg ([kernel.kallsyms])
                         __sys_sendto ([kernel.kallsyms])
                         __x64_sys_sendto ([kernel.kallsyms])
                         do_syscall_64 ([kernel.kallsyms])
                         entry_SYSCALL_64_after_hwframe ([kernel.kallsyms])
                         __libc_sendto (/usr/lib64/libc-2.28.so)
                         ping4_send_probe (/usr/bin/ping)
                         pinger (/usr/bin/ping)
                         main_loop (/usr/bin/ping)
                         ping4_run (/usr/bin/ping)
                         main (/usr/bin/ping)
                         __libc_start_main (/usr/lib64/libc-2.28.so)
                         _start (/usr/bin/ping)
					

strace overhead: baseline


# dd if=/dev/zero of=/dev/null bs=1 count=5000k
5120000+0 records in
5120000+0 records out
5120000 bytes (5.1 MB, 4.9 MiB) copied, 2.98647 s, 1.7 MB/s
#
					

strace overhead


# strace -eaccept dd if=/dev/zero of=/dev/null bs=1 count=5000k
5120000+0 records in
5120000+0 records out
5120000 bytes (5.1 MB, 4.9 MiB) copied, 74.5371 s, 68.7 kB/s
+++ exited with 0 +++
#
					

strace


  • 25 times slower (74.5 s vs. 2.99 s)
  • The worst case: dd emits syscalls as fast as it can
  • Brendan Gregg's article reports a 442x slowdown for the same example (2014)
  • ptrace stops the application twice per syscall
  • Once at syscall enter, once at exit
  • www.brendangregg.com/blog/2014-05-11/strace-wow-much-syscall.html

perf trace overhead

# perf trace -e accept dd if=/dev/zero of=/dev/null bs=1 count=5000k
5120000+0 records in
5120000+0 records out
5120000 bytes (5.1 MB, 4.9 MiB) copied, 4.22042 s, 1.2 MB/s
#
					

perf trace


  • Only 1.41 times slower (4.22 s vs. 2.99 s)
  • Again the worst case: dd emits syscalls as fast as it can
  • The workload is not stopped at syscall enter/exit
  • Info is still collected at syscall enter and exit
  • Mix with other tracepoints, kprobes, uprobes, etc. (sketch below)
  • But only integer args get decoded
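
Mixing a syscall with the uprobe defined earlier, a sketch:

# perf trace -e connect,probe_libc:malloc --max-events=8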

Augmented Syscalls: eBPF


  • Hooks into raw_syscalls:sys_enter
  • Appends pointer contents to existing payload
  • Existing beautifiers changed to use it when available
  • And to keep working without it
  • Filters
  • Extend the kernel without writing "kernel" code

stracing the open* syscalls


# perf trace -e open* --max-events=10
clang-6.0/29741 openat(AT_FDCWD, "/tmp/build/perf/util/.parse-events-flex.o.d",
                       O_CLOEXEC|O_CREAT|O_TRUNC|O_WRONLY, S_IRUGO|S_IWUGO) = 3
sh/29774 openat(AT_FDCWD, "/tmp/build/perf/util/.parse-events-flex.o.tmp",
                O_CREAT|O_TRUNC|O_WRONLY, S_IRUGO|S_IWUGO) = 3
fixdep/29774 openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fixdep/29774 openat(AT_FDCWD, "/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
fixdep/29774 openat(AT_FDCWD, "/tmp/build/perf/util/.parse-events-flex.o.d",
                    O_RDONLY) = 3
rm/29775 openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
rm/29775 openat(AT_FDCWD, "/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
mv/29776 openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
mv/29776 openat(AT_FDCWD, "/lib64/libselinux.so.1", O_RDONLY|O_CLOEXEC) = 3
mv/29776 openat(AT_FDCWD, "/lib64/libacl.so.1", O_RDONLY|O_CLOEXEC) = 3
#
					

strace skin


# cat ~/.perfconfig
[trace]
	add_events = tools/perf/examples/bpf/augmented_raw_syscalls.o
	show_zeros = yes
	show_duration = no
	no_inherit = yes
	show_timestamp = no
	show_arg_names = no
	args_alignment = 40
	show_prefix = yes
#
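
With that config in place a plain perf trace run reads like strace,
e.g. (any small workload will do):

# perf trace --max-events=8 cat /etc/passwd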
					

Showing just failures


# perf trace --failure -e close --max-events=10
sh/20674 close(4)                   = -1 EBADF (Bad file descriptor)
sh/20674 close(3)                   = -1 EBADF (Bad file descriptor)
sh/20678 close(4)                   = -1 EBADF (Bad file descriptor)
sh/20678 close(3)                   = -1 EBADF (Bad file descriptor)
sh/20681 close(4<pipe:[1521159]>)   = -1 EBADF (Bad file descriptor)
sh/20681 close(3)                   = -1 EBADF (Bad file descriptor)
sh/20681 close(5)                   = -1 EBADF (Bad file descriptor)
sh/20681 close(4)                   = -1 EBADF (Bad file descriptor)
sh/20690 close(6)                   = -1 EBADF (Bad file descriptor)
sh/20690 close(5)                   = -1 EBADF (Bad file descriptor)
#
					

Augmented Syscalls


  • bpf-output event
  • bpf_perf_event_output
  • bpf_probe_read
  • bpf_probe_read_str

Augmenting syscalls


struct bpf_map SEC("maps") __augmented__ = {
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};

/* syscall_enter() is a perf helper macro that places the function in
 * the ELF section hooking the given syscall's enter tracepoint */
int syscall_enter(open)(struct syscall_enter_open_args *args)
{
	struct augmented_open_args augmented;
	int str_len;

	/* copy the integer args the tracepoint already carries... */
	probe_read(&augmented.args, sizeof(augmented.args), args);
	/* ...then append the string the filename pointer points to */
	str_len = probe_read_str(&augmented.filename.value,
				 sizeof(augmented.filename.value),
				 args->filename_ptr);
	/* ship it to userspace via the bpf-output event, sized to what
	 * was actually copied */
	perf_event_output(args, &__augmented__, BPF_F_CURRENT_CPU, &augmented,
			  sizeof(augmented) - sizeof(augmented.filename.value) + str_len);
	return 0;
}
					

Augmented Syscalls


# cd tools/perf/examples/bpf/
# perf trace -e augmented_syscalls.c cat hello.c 
#include <stdio.h>

int syscall_enter(openat)(void *args)
{
	puts("Hello, world\n");
	return 0;
}

license(GPL);
 0.000 cat/31285 openat(dfd: CWD, filename: /etc/ld.so.cache, flags: CLOEXEC)
 0.029 cat/31285 openat(dfd: CWD, filename: /lib64/libc.so.6, flags: CLOEXEC)
 0.250 cat/31285 open(filename: /usr/lib/locale/locale-archive, flags: CLOEXEC)
 0.300 cat/31285 openat(dfd: CWD, filename: hello.c)
#
					

Further details: maps


  • syscall numbers are loaded from userspace into a map
  • syscall arg types go in another map
  • together with how many bytes to copy
  • types: from the kernel's BTF section
  • syscalls to trace: in a map too (done, sketch below)
  • One kernel .o for all of it
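
A minimal sketch of that "syscalls to trace" map, modeled on
tools/perf/examples/bpf/augmented_raw_syscalls.c (sizes illustrative):

struct bpf_map SEC("maps") syscalls = {
	.type	      = BPF_MAP_TYPE_ARRAY,
	.key_size     = sizeof(int),	/* syscall number... */
	.value_size   = sizeof(bool),	/* ...mapped to "trace it?" */
	.max_entries  = 512,
};

/* in the sys_enter hook, bail out early for unwanted syscalls: */
bool *enabled = bpf_map_lookup_elem(&syscalls, &args->syscall_nr);
if (enabled == NULL || !*enabled)
	return 0;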

THE END


  • http://www.brendangregg.com/perf.html