Always Present Type Information


Thanks to BPF: BTF


Arnaldo Carvalho de Melo
acme@redhat.com
Red Hat Inc.

What is this about?



  • BPF Type Format
  • Always present
  • BPF uses it
  • A fraction of DWARF size
  • A fraction of DWARF contents
  • Other tools use it too
  • In-kernel snprintf augmented by BTF
  • Pretty printing raw data using BTF

Where?



  • /sys/kernel/btf/vmlinux
  • Modules too: split BTF

What is in there?



  • All kernel types
  • Kernel ABI: set in stone
  • Kernel Internals: always in flux

Let's see


$ pahole rwlock_t
typedef struct {
	arch_rwlock_t              raw_lock;             /*     0     8 */

	/* size: 8, cachelines: 1, members: 1 */
	/* last cacheline: 8 bytes */
} rwlock_t;
$
					

Pahole?



  • It started as a tool for looking at struct holes
  • Let's call it by another name then

Aliases

$ alias typedef=pahole
$ alias struct=pahole 
$ alias union=pahole
$ alias enum=pahole
					
$ typedef rwlock_t
typedef struct {
	arch_rwlock_t              raw_lock;             /*     0     8 */

	/* size: 8, cachelines: 1, members: 1 */
	/* last cacheline: 8 bytes */
} rwlock_t;
					
$ struct list_head
struct list_head {
	struct list_head *         next;                 /*     0     8 */
	struct list_head *         prev;                 /*     8     8 */

	/* size: 16, cachelines: 1, members: 2 */
	/* last cacheline: 16 bytes */
};
$
					
Enumerations
$ enum perf_event_type
enum perf_event_type {
	PERF_RECORD_MMAP            = 1,
	PERF_RECORD_LOST            = 2,
	PERF_RECORD_COMM            = 3,
	PERF_RECORD_EXIT            = 4,
	PERF_RECORD_THROTTLE        = 5,
	PERF_RECORD_UNTHROTTLE      = 6,
	PERF_RECORD_FORK            = 7,
	PERF_RECORD_READ            = 8,
	PERF_RECORD_SAMPLE          = 9,
	PERF_RECORD_MMAP2           = 10,
	PERF_RECORD_AUX             = 11,
	PERF_RECORD_ITRACE_START    = 12,
	PERF_RECORD_LOST_SAMPLES    = 13,
	PERF_RECORD_SWITCH          = 14,
	PERF_RECORD_SWITCH_CPU_WIDE = 15,
	PERF_RECORD_NAMESPACES      = 16,
	PERF_RECORD_KSYMBOL         = 17,
	PERF_RECORD_BPF_EVENT       = 18,
	PERF_RECORD_CGROUP          = 19,
	PERF_RECORD_TEXT_POKE       = 20,
	PERF_RECORD_MAX             = 21,
};
$
					

All kernel types

$ typedef --hex --expand_types rwlock_t
typedef struct {
    /* typedef arch_rwlock_t */ struct qrwlock {
        union {
            /* typedef atomic_t */ struct {
                int counter;                                        /*   0  0x4 */
            } cnts;                                                 /*   0  0x4 */
            struct {
                /* typedef u8 -> __u8 */ unsigned char wlocked;     /*   0  0x1 */
                /* typedef u8 -> __u8 */ unsigned char __lstate[3]; /* 0x1  0x3 */
            };                                                      /*   0  0x4 */
        };                                                          /*   0  0x4 */
        /* typedef arch_spinlock_t */ struct qspinlock {
            union {
                /* typedef atomic_t */ struct {
                    int    counter;                                 /* 0x4  0x4 */
                } val;                                              /* 0x4  0x4 */
                struct {
                    /* typedef u8 -> __u8 */ unsigned char locked;  /* 0x4  0x1 */
                    /* typedef u8 -> __u8 */ unsigned char pending; /* 0x5  0x1 */
                };                                                  /* 0x4  0x2 */
                struct {
                    /* typedef u16 -> __u16 */ short unsigned int locked_pending; /* 0x4  0x2 */
                    /* typedef u16 -> __u16 */ short unsigned int tail; /* 0x6  0x2 */
                };                                                  /* 0x4  0x4 */
            };                                                      /* 0x4  0x4 */
        } wait_lock;                                                /* 0x4  0x4 */
    } raw_lock; /*     0   0x8 */
} rwlock_t;
$
					

Split BTF



  • For kernel modules
  • Do not duplicate types
  • module BTF refers to kernel's
  • Since pahole v1.19
  • Since kernel v5.11

Kconfig variables


$ grep BTF ~/git/build/v5.11.0-rc6+.clang/.config
CONFIG_VIDEO_SONY_BTF_MPX=m
CONFIG_DEBUG_INFO_BTF=y
CONFIG_PAHOLE_HAS_SPLIT_BTF=y
CONFIG_DEBUG_INFO_BTF_MODULES=y
$
					

Lots more files


$ uname -r
5.11.0-rc6.clang+
$ cd /sys/kernel/btf
$ ls -1 | wc -l
136
$ ls -1 | head
ac97_bus
acpi_pad
asus_wmi
bridge
cec
coretemp
crc32c_intel
crc32_pclmul
crct10dif_pclmul
dca
$
					

What is in there?


$ pahole acpi_pad
libbpf: Invalid BTF string section
pahole: file 'acpi_pad' has no supported type information.
					
$ pahole --btf_base=vmlinux acpi_pad | head -11
struct gate_struct {
	u16                        offset_low;           /*     0     2 */
	u16                        segment;              /*     2     2 */
	struct idt_bits            bits;                 /*     4     2 */
	u16                        offset_middle;        /*     6     2 */
	u32                        offset_high;          /*     8     4 */
	u32                        reserved;             /*    12     4 */

	/* size: 16, cachelines: 1, members: 6 */
	/* last cacheline: 16 bytes */
};
$
					

A shortcut


$ pahole /sys/kernel/btf/acpi_pad | head -11
struct gate_struct {
	u16                        offset_low;           /*     0     2 */
	u16                        segment;              /*     2     2 */
	struct idt_bits            bits;                 /*     4     2 */
	u16                        offset_middle;        /*     6     2 */
	u32                        offset_high;          /*     8     4 */
	u32                        reserved;             /*    12     4 */

	/* size: 16, cachelines: 1, members: 6 */
	/* last cacheline: 16 bytes */
};
$
					

Can I do more?



  • Using plain 'struct foo' is powerful
  • For developers
  • Reconstruct types
  • No need for kernel headers
  • Matches the running kernel
  • Some more?

A request from a coworker



  • Joe Lawrence
  • Hey, pahole knows about types
  • We need to extract module versioning info
  • In shell scripts
  • Related to kernel live patching
  • Can you help?

Pretty printing raw data



  • Use type information
  • Format stdin
  • Arrays
  • pahole v1.18

modversion_info


$ pahole -C modversion_info drivers/scsi/sg.ko
struct modversion_info {
	long unsigned int          crc;                  /*     0     8 */
	char                       name[56];             /*     8    56 */

	/* size: 64, cachelines: 1, members: 2 */
};
$
					

pretty print it


$ objcopy -O binary --only-section=__versions drivers/scsi/sg.ko versions
					
$ ls -la versions
-rw-rw-r--. 1 acme acme 7616 Feb 18 09:39 versions
					
$ pahole --count 3 -C modversion_info drivers/scsi/sg.ko < versions
					
{
	.crc = 148553092,
	.name = "module_layout",
},
{
	.crc = 1172595067,
	.name = "no_llseek",
},
{
	.crc = 2722082444,
	.name = "param_ops_int",
},
$
					

Another example: ELF header


$ pahole elf64_hdr
struct elf64_hdr {
	unsigned char              e_ident[16];          /*     0    16 */
	Elf64_Half                 e_type;               /*    16     2 */
	Elf64_Half                 e_machine;            /*    18     2 */
	Elf64_Word                 e_version;            /*    20     4 */
	Elf64_Addr                 e_entry;              /*    24     8 */
	Elf64_Off                  e_phoff;              /*    32     8 */
	Elf64_Off                  e_shoff;              /*    40     8 */
	Elf64_Word                 e_flags;              /*    48     4 */
	Elf64_Half                 e_ehsize;             /*    52     2 */
	Elf64_Half                 e_phentsize;          /*    54     2 */
	Elf64_Half                 e_phnum;              /*    56     2 */
	Elf64_Half                 e_shentsize;          /*    58     2 */
	Elf64_Half                 e_shnum;              /*    60     2 */
	Elf64_Half                 e_shstrndx;           /*    62     2 */

	/* size: 64, cachelines: 1, members: 14 */
};
$
					

An ELF header


$ pahole --count 1 elf64_hdr < /bin/bash
{
	.e_ident = { 127, 69, 76, 70, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
	.e_type = 3,
	.e_machine = 62,
	.e_version = 1,
	.e_entry = 199248,
	.e_phoff = 64,
	.e_shoff = 1342344,
	.e_flags = 0,
	.e_ehsize = 64,
	.e_phentsize = 56,
	.e_phnum = 13,
	.e_shentsize = 64,
	.e_shnum = 31,
	.e_shstrndx = 30,
},
					

Another ELF header


$ pahole --count 1 elf64_hdr < /bin/cp
{
	.e_ident = { 127, 69, 76, 70, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
	.e_type = 3,
	.e_machine = 62,
	.e_version = 1,
	.e_entry = 23792,
	.e_phoff = 64,
	.e_shoff = 147760,
	.e_flags = 0,
	.e_ehsize = 64,
	.e_phentsize = 56,
	.e_phnum = 13,
	.e_shentsize = 64,
	.e_shnum = 31,
	.e_shstrndx = 30,
},
$
					

Case closed, huh?



  • No
  • Let's add some more features...
--header
$ pahole --header elf64_hdr < /lib64/libc-2.32.so
{
	.e_ident = { 127, 69, 76, 70, 2, 1, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0 },
	.e_type = 3,
	.e_machine = 62,
	.e_version = 1,
	.e_entry = 164640,
	.e_phoff = 64,
	.e_shoff = 3217696,
	.e_flags = 0,
	.e_ehsize = 64,
	.e_phentsize = 56,
	.e_phnum = 14,
	.e_shentsize = 64,
	.e_shnum = 68,
	.e_shstrndx = 67,
},
					

Header variables



  • Fields in the header type
  • Can be later referenced
  • To decode ranges in a file

perf.data header


$ pahole --hex ~/bin/perf --header=perf_file_header < perf.data
{
	.magic = 0x32454c4946524550,
	.size = 0x68,
	.attr_size = 0x88,
	.attrs = {
		.offset = 0x128,
		.size = 0x88,
	},
	.data = {
		.offset = 0x1b0,
		.size = 0x3f0,
	},
	.adds_features = { 0x16717ffc, 0, 0, 0 },
},
$
					

perf event header: kernel ABI


$ pahole perf_event_header
struct perf_event_header {
	__u32                      type;                 /*     0     4 */
	__u16                      misc;                 /*     4     2 */
	__u16                      size;                 /*     6     2 */

	/* size: 8, cachelines: 1, members: 3 */
	/* last cacheline: 8 bytes */
};
$
					

perf event header: in perf tool


$ pahole -C perf_event_header ~/bin/perf
struct perf_event_header {
	__u32                      type;                 /*     0     4 */
	__u16                      misc;                 /*     4     2 */
	__u16                      size;                 /*     6     2 */

	/* size: 8, cachelines: 1, members: 3 */
	/* last cacheline: 8 bytes */
};
$
					

Variable sized record



  • Well-known member names
  • type, size
Unpolished records
$ pahole --hex ~/bin/perf --seek_bytes=0x1b0 \
			  --size_bytes=0x3f0 \
			  --count 4 \
			  -C 'perf_event_header(sizeof=size)' < perf.data
{
	.type = 0x4f,
	.misc = 0,
	.size = 0x38,
},
{
	.type = 0x49,
	.misc = 0,
	.size = 0x28,
},
{
	.type = 0x4a,
	.misc = 0,
	.size = 0x20,
},
{
	.type = 0x3,
	.misc = 0,
	.size = 0x28,
},
$
					
Using header variables
$ pahole --hex ~/bin/perf --header=perf_file_header \
			  --seek_bytes='$header.data.offset' \
			  --size_bytes='$header.data.size' \
			  --count 4 \
			  -C 'perf_event_header(sizeof=size)' < perf.data
{
	.type = 0x4f,
	.misc = 0,
	.size = 0x38,
},
{
	.type = 0x49,
	.misc = 0,
	.size = 0x28,
},
{
	.type = 0x4a,
	.misc = 0,
	.size = 0x20,
},
{
	.type = 0x3,
	.misc = 0,
	.size = 0x28,
},
$
					
Enumerations
$ enum --hex perf_event_type
enum perf_event_type {
	PERF_RECORD_MMAP            = 0x1,
	PERF_RECORD_LOST            = 0x2,
	PERF_RECORD_COMM            = 0x3,
	PERF_RECORD_EXIT            = 0x4,
	PERF_RECORD_THROTTLE        = 0x5,
	PERF_RECORD_UNTHROTTLE      = 0x6,
	PERF_RECORD_FORK            = 0x7,
	PERF_RECORD_READ            = 0x8,
	PERF_RECORD_SAMPLE          = 0x9,
	PERF_RECORD_MMAP2           = 0xa,
	PERF_RECORD_AUX             = 0xb,
	PERF_RECORD_ITRACE_START    = 0xc,
	PERF_RECORD_LOST_SAMPLES    = 0xd,
	PERF_RECORD_SWITCH          = 0xe,
	PERF_RECORD_SWITCH_CPU_WIDE = 0xf,
	PERF_RECORD_NAMESPACES      = 0x10,
	PERF_RECORD_KSYMBOL         = 0x11,
	PERF_RECORD_BPF_EVENT       = 0x12,
	PERF_RECORD_CGROUP          = 0x13,
	PERF_RECORD_TEXT_POKE       = 0x14,
	PERF_RECORD_MAX             = 0x15,
};
$
					
Some type enumeration mapped
$  pahole --seek_bytes=0x1b0 --hex ~/bin/perf \
          --size_bytes=0x3f0 --skip 1 --count 3 \
          -C 'perf_event_header(sizeof,type,type_enum=perf_event_type)' < perf.data
{
	.type = 0x49,
	.misc = 0,
	.size = 0x28,
},
{
	.type = 0x4a,
	.misc = 0,
	.size = 0x20,
},
{
	.header = {
		.type = PERF_RECORD_COMM,
		.misc = 0,
		.size = 0x28,
	},
	.pid = 0x4edf,
	.tid = 0x4edf,
	.comm = "perf",
},
$ 

					
The record types
$ pahole ~/bin/perf -C perf_event
union perf_event {
	struct perf_event_header   header;             /*     0     8 */
	struct perf_record_mmap    mmap;               /*     0  4136 */
	struct perf_record_mmap2   mmap2;              /*     0  4168 */
	struct perf_record_comm    comm;               /*     0    32 */
	struct perf_record_namespaces namespaces;      /*     0    24 */
	struct perf_record_cgroup  cgroup;             /*     0  4112 */
	struct perf_record_fork    fork;               /*     0    32 */
	struct perf_record_lost    lost;               /*     0    24 */
	struct perf_record_lost_samples lost_samples;  /*     0    16 */
	struct perf_record_read    read;               /*     0    48 */
	struct perf_record_throttle throttle;          /*     0    32 */
	struct perf_record_sample  sample;             /*     0     8 */
	struct perf_record_bpf_event bpf;              /*     0    24 */
	struct perf_record_ksymbol ksymbol;            /*     0   280 */
	struct perf_record_text_poke_event text_poke;  /*     0    24 */
	struct perf_record_header_attr attr;           /*     0   128 */
<SNIP>
	struct perf_record_time_conv time_conv;        /*     0    56 */
	struct perf_record_header_feature feat;        /*     0    16 */
	struct perf_record_compressed pack;            /*     0     8 */
};
$
					
What to 'cast' for
$ pahole ~/bin/perf -C perf_record_comm
struct perf_record_comm {
	struct perf_event_header   header;               /*     0     8 */
	__u32                      pid;                  /*     8     4 */
	__u32                      tid;                  /*    12     4 */
	char                       comm[16];             /*    16    16 */

	/* size: 32, cachelines: 1, members: 4 */
	/* last cacheline: 32 bytes */
};
$ 
					
Lots more, but we end with:
$ pahole ~/bin/perf --header=perf_file_header \
	-C 'perf_file_attr(range=attrs),
	    perf_event_header(range=data,sizeof,type,
			      type_enum=perf_event_type+perf_user_event_type)' < perf.data
					

Goals



  • Use it to document a file format
  • While providing a full pretty printer
  • perf report -D
  • New records get automagically supported

Future


  • Experiment more
  • Finish perf.data dissector
  • Features not specific to it
  • Integrate with perf's libbeauty
  • Maps integers in kernel ABIs to strings
  • Add some of these features to other tools
  • gdb?
  • crash?
  • Fix bugs found making this presentation :-)
  • Presentations at: http://vger.kernel.org/~acme/bpf/