acme@redhat.com |
---|
Date: Mon Nov 15 11:02:37 2021 -0800 From: Eric Dumazet Move sk_bind_phc next to sk_peer_lock to fill a hole. @@ -489,5 +489,6 @@ struct sock { u16 sk_busy_poll_budget; #endif spinlock_t sk_peer_lock; + int sk_bind_phc; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; @@ -498,5 +499,4 @@ struct sock { seqlock_t sk_stamp_seq; #endif u16 sk_tsflags; - int sk_bind_phc; u8 sk_shutdown;
$ git log -2 --oneline 1ace2b4d2b4e1db8f 1ace2b4d2b4e1db8 net: shrink struct sock by 8 bytes 1b31debca8328448 ipv6: shrink struct ipcm6_cookie $
$ pahole -C list_head ~/git/build/v5.18-rc6+/vmlinux struct list_head { struct list_head * next; /* 0 8 */ struct list_head * prev; /* 8 8 */ /* size: 16, cachelines: 1, members: 2 */ /* last cacheline: 16 bytes */ }; $
# pahole -C _IO_FILE ~/bin/perf | head struct _IO_FILE { int _flags; /* 0 4 */ /* XXX 4 bytes hole, try to pack */ char * _IO_read_ptr; /* 8 8 */ char * _IO_read_end; /* 16 8 */ char * _IO_read_base; /* 24 8 */ char * _IO_write_base; /* 32 8 */ char * _IO_write_ptr; /* 40 8 */ #
$ pahole --reorganize task_struct | tail /* --- cacheline 142 boundary (9088 bytes) was 56 bytes ago --- */ struct thread_struct thread __attribute__((__aligned__(64))); /* 9144 4416 */ /* size: 13560, cachelines: 212, members: 252 */ /* sum members: 13487, holes: 2, sum holes: 57 */ /* sum bitfield members: 79 bits, bit holes: 2, sum bit holes: 49 bits */ /* paddings: 6, sum paddings: 49 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 56 */ /* last cacheline: 56 bytes */ }; /* saved 136 bytes and 2 cachelines! */ $
$ pahole spinlock_t typedef struct spinlock spinlock_t; $
$ pahole spinlock struct spinlock { union { struct raw_spinlock rlock; /* 0 4 */ }; /* 0 4 */ /* size: 4, cachelines: 1, members: 1 */ /* last cacheline: 4 bytes */ }; $
$ pahole -E spinlock struct spinlock { union { struct raw_spinlock { /* typedef arch_spinlock_t */ struct qspinlock { union { /* typedef atomic_t */ struct { int counter; /* 0 4 */ } val; /* 0 4 */ struct { /* typedef u8 -> __u8 */ unsigned char locked; /* 0 1 */ /* typedef u8 -> __u8 */ unsigned char pending; /* 1 1 */ }; /* 0 2 */ struct { /* typedef u16 -> __u16 */ short unsigned int locked_pending; /* 0 2 */ /* typedef u16 -> __u16 */ short unsigned int tail; /* 2 2 */ }; /* 0 4 */ }; /* 0 4 */ } raw_lock; /* 0 4 */ } rlock; /* 0 4 */ }; /* 0 4 */ /* size: 4, cachelines: 1, members: 1 */ /* last cacheline: 4 bytes */ }; $
bpftool btf dump file /sys/kernel/btf/vmlinux format c
Author: Namhyung Kim Date: Wed May 18 15:47:23 2022 -0700 perf record: Handle argument change in sched_switch Recently sched_switch tracepoint added a new argument for prev_state, but it's hard to handle the change in a BPF program. Instead, we can check the function prototype in BTF before loading the program.
static void check_sched_switch_args(void) { struct btf *btf = bpf_object__btf(skel->obj); struct btf_type *t1, *t2, *t3; u32 type_id = btf__find_by_name_kind(btf, "bpf_trace_sched_switch", BTF_KIND_TYPEDEF); t1 = btf__type_by_id(btf, type_id); t2 = btf__type_by_id(btf, t1->type); if (t3 && btf_is_func_proto(t3) && btf_vlen(t3) == 4) { // new format: pass prev_state as 4th arg skel->rodata->has_prev_state = true; } }
+++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c +const volatile bool has_prev_state = false; +SEC("tp_btf/sched_switch") +int on_switch(u64 *ctx) { + struct task_struct *prev, *next; + int prev_state; + + if (!enabled) return 0; + prev = (struct task_struct *)ctx[1]; + next = (struct task_struct *)ctx[2]; + + if (has_prev_state) + prev_state = (int)ctx[3]; + else + prev_state = get_task_state(prev); + return off_cpu_stat(ctx, prev, next, prev_state); +}
# perf record --off-cpu ^C[ perf record: Woken up 1924 times to write data ] [ perf record: Captured and wrote 483.936 MB perf.data (8857075 samples) ] # ls -la perf.data -rw-------. 1 root root 507845510 May 31 00:45 perf.data #
# bpftool prog | grep on_switch -A4 634: tracing name on_switch tag 3d6d5a513a933c28 gpl loaded_at 2022-05-30T22:37:17+0200 uid 0 xlated 1392B jited 913B memlock 4096B map_ids 497,498,493,494,495,490,491,492 btf_id 602 pids perf(393176) #
# bpftool prog dump jited id 634 int on_switch(u64 * ctx): bpf_prog_3d6d5a513a933c28_on_switch: ; int on_switch(u64 *ctx) 0: nopl 0x0(%rax,%rax,1) 5: xchg %ax,%ax 7: push %rbp 8: mov %rsp,%rbp b: sub $0x38,%rsp 12: push %rbx 13: push %r13 15: push %r14 17: push %r15 19: mov %rdi,%r15 ; if (!enabled) 1c: movabs $0xffffb53a400d2000,%rdi 26: mov 0x0(%rdi),%edi ; if (!enabled) 29: test %rdi,%rdi 2c: je 0x0000000000000386
; next = (struct task_struct *)ctx[2]; 32: mov 0x10(%r15),%r14 ; prev = (struct task_struct *)ctx[1]; 36: mov 0x8(%r15),%rbx ; if (has_prev_state) 3a: movabs $0xffffb53a400f6000,%rdi 44: movzbq 0x0(%rdi),%rdi ; prev_state = (int)ctx[3]; 49: mov $0x1,%edi ; if (bpf_core_field_exists(t->__state)) 4e: mov $0x18,%edi 53: mov %rbx,%rdx 56: add %rdi,%rdx 59: mov %rbp,%rdi ; 5c: add $0xffffffffffffffd8,%rdi ; return BPF_CORE_READ(t, __state); 60: mov $0x4,%esi 65: callq 0xffffffffd5f13f50 ; return BPF_CORE_READ(t, __state); 6a: mov -0x28(%rbp),%r13d
# bpftool map | grep off_cpu -A3 490: array name off_cpu_.rodata flags 0x480 key 4B value 3B max_entries 1 memlock 4096B btf_id 628 frozen pids perf(393176) #
# bpftool map dump id 490 [{ "value": { ".rodata": [{ "has_prev_state": false },{ "needs_cgroup": false },{ "uses_cgroup_v1": false } ] } } #
# perf report --stdio --call-graph=no # Childr Self Command Shared Object Symbol # ...... ...... ............... .................. ......................... 81.66% 0.00% sched-messaging libc-2.33.so [.] __libc_start_main 81.66% 0.00% sched-messaging perf [.] cmd_bench 81.66% 0.00% sched-messaging perf [.] main 81.66% 0.00% sched-messaging perf [.] run_builtin 81.43% 0.00% sched-messaging perf [.] bench_sched_messaging 40.86% 40.86% sched-messaging libpthread-2.33.so [.] __read 37.66% 37.66% sched-messaging libpthread-2.33.so [.] __write 2.91% 2.91% sched-messaging libc-2.33.so [.] __poll ... As you can see it spent most of off-cpu time in read and write in bench_sched_messaging(). The --call-graph=no was added just to make the output concise here.
LD [M] drivers/media/usb/gspca/gspca_zc3xx.o AR drivers/media/built-in.a AR drivers/built-in.a GEN .version CHK include/generated/compile.h LD vmlinux.o MODPOST vmlinux.symvers MODINFO modules.builtin.modinfo GEN modules.builtin CC .vmlinux.export.o LD .tmp_vmlinux.btf BTF .btf.vmlinux.bin.o LD .tmp_vmlinux.kallsyms1 KSYMS .tmp_vmlinux.kallsyms1.S AS .tmp_vmlinux.kallsyms1.S LD .tmp_vmlinux.kallsyms2 KSYMS .tmp_vmlinux.kallsyms2.S AS .tmp_vmlinux.kallsyms2.S LD vmlinux
LD [M] arch/x86/kvm/kvm-amd.ko BTF [M] arch/x86/kernel/cpu/mce/mce-inject.ko BTF [M] arch/x86/events/rapl.ko LD [M] arch/x86/kvm/kvm-intel.ko LD [M] arch/x86/kvm/kvm.ko BTF [M] arch/x86/kvm/kvm-amd.ko LD [M] crypto/adiantum.ko BTF [M] crypto/adiantum.ko BTF [M] arch/x86/kvm/kvm-intel.ko LD [M] crypto/aegis128.ko BTF [M] crypto/aegis128.ko LD [M] crypto/aes_ti.ko BTF [M] arch/x86/kvm/kvm.ko
$ cat scripts/pahole-flags.sh #!/bin/sh if [ "${pahole_ver}" -ge "118" ] && [ "${pahole_ver}" -le "121" ]; then # pahole 1.18 through 1.21 can't handle zero-sized per-CPU vars extra_paholeopt="${extra_paholeopt} --skip_encoding_btf_vars" fi if [ "${pahole_ver}" -ge "121" ]; then extra_paholeopt="${extra_paholeopt} --btf_gen_floats" fi if [ "${pahole_ver}" -ge "122" ]; then extra_paholeopt="${extra_paholeopt} -j" fi echo ${extra_paholeopt} $ scripts/pahole-flags.sh --btf_gen_floats -j $ pahole --version v1.23 $
static const char *languages[] = { [DW_LANG_Ada83] = "ada83", SNIP [DW_LANG_C11] = "c11", [DW_LANG_C89] = "c89", [DW_LANG_C99] = "c99", [DW_LANG_C] = "c", [DW_LANG_Cobol74] = "cobol74", SNIP [DW_LANG_C_plus_plus_14] = "c++14", [DW_LANG_C_plus_plus] = "c++", [DW_LANG_D] = "d", [DW_LANG_Dylan] = "dylan", [DW_LANG_Fortran03] = "fortran03", SNIP [DW_LANG_PLI] = "pli", [DW_LANG_Python] = "python", [DW_LANG_RenderScript] = "renderscript", [DW_LANG_Rust] = "rust", };
static struct btf_ptr b = { }; b.ptr = skb; b.type_id = __builtin_btf_type_id(struct sk_buff, 1); bpf_snprintf_btf(str, sizeof(str), &b, sizeof(b), 0, 0);
Default output looks like this: (struct sk_buff){ .transport_header = (__u16)65535, .mac_header = (__u16)65535, .end = (sk_buff_data_t)192, .head = (unsigned char *)0x000000007524fd8b, .data = (unsigned char *)0x000000007524fd8b, .truesize = (unsigned int)768, .users = (refcount_t){ .refs = (atomic_t){ .counter = (int)1, }, }, }
Flags modifying display are as follows: - BTF_F_COMPACT: no formatting around type information - BTF_F_NONAME: no struct/union member names/types - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; equivalent to %px. - BTF_F_ZERO: show zero-valued struct/union members; they are not displayed by default
$ pahole --prettify=- --header elf64_hdr < /bin/bash { .e_ident = { 127, 69, 76, 70, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, .e_type = 3, .e_machine = 62, .e_version = 1, .e_entry = 204224, .e_phoff = 64, .e_shoff = 1388016, .e_flags = 0, .e_ehsize = 64, .e_phentsize = 56, .e_phnum = 13, .e_shentsize = 64, .e_shnum = 32, .e_shstrndx = 31, }, $
$ pahole elf64_hdr struct elf64_hdr { unsigned char e_ident[16]; /* 0 16 */ Elf64_Half e_type; /* 16 2 */ Elf64_Half e_machine; /* 18 2 */ Elf64_Word e_version; /* 20 4 */ Elf64_Addr e_entry; /* 24 8 */ Elf64_Off e_phoff; /* 32 8 */ Elf64_Off e_shoff; /* 40 8 */ Elf64_Word e_flags; /* 48 4 */ Elf64_Half e_ehsize; /* 52 2 */ Elf64_Half e_phentsize; /* 54 2 */ Elf64_Half e_phnum; /* 56 2 */ Elf64_Half e_shentsize; /* 58 2 */ Elf64_Half e_shnum; /* 60 2 */ Elf64_Half e_shstrndx; /* 62 2 */ /* size: 64, cachelines: 1, members: 14 */ }; $