BPF: The Status of BTF


Producers, Consumers


Arnaldo Carvalho de Melo
acme@redhat.com
Red Hat Inc.

What is this about?



  • BPF Type Format
  • Initially just data types
  • Compact
  • File/Number
  • Global variables

Spreading



  • Most new BPF features use it
  • CO-RE
  • BPF trampolines
  • struct ops
  • Dynamic re-linking
  • KRSI

Always present



  • (Kinda) Required by now
  • Compact: 100s of MBs (DWARF) down to a few MBs
  • /sys/kernel/btf/vmlinux

Compact


$ uname -a
Linux quaco 5.5.0-rc6+ #2 SMP Tue Jan 14 13:13:43 -03 2020 x86_64 GNU/Linux
$ ls -la /sys/kernel/btf/vmlinux 
-r--r--r--. 1 3234516 /sys/kernel/btf/vmlinux
$ 
					

pahole



  • First producer
  • Loads DWARF
  • Encodes BTF
  • CTF reading/encoding was there already

btfdiff



  • pahole -F btf file.o
  • pahole -F dwarf --flat_arrays file.o
  • diff them
  • Should produce the same results
  • Regression tests

fullcircle



  • pfunct --compile file.o
  • Using DWARF
  • BTF output needs some work, but feasible
  • Build resulting file
  • codiff debug info in both
  • Should match
  • Again: regression tests

pahole + kernel



  • CONFIG_DEBUG_INFO_BTF=y
  • Part of the kernel build process
  • Fast enough so far
  • Parallelization possible
  • Recent work on making elfutils thread safe
  • For the DWARF reading part
  • One thread per CU (Compile Unit)

pahole + BTF



  • Default to using /sys/kernel/btf/vmlinux
  • Lots of other options
  • To query various aspects of the kernel structs
pahole --size
$ pahole -s | sort -k2 -nr | head -10
cmp_data		290904	1
dec_data		274520	1
cpu_entry_area		217088	0
pglist_data		172928	4
saved_cmdlines_buffer	131104	1
debug_store_buffers	131072	0
hid_parser		110848	1
hid_local		110608	0
zonelist		 81936	0
e820_table		 64004	0
$
					
pahole --contains=CLASS_NAME
$ pahole -i list_head | head
task_struct
vm_area_struct
rw_semaphore
mutex
device
wait_queue_head
work_struct
address_space
file
deferred_split
$
					
pahole --contains=CLASS_NAME
$ pahole -C task_struct --hex | grep 'struct list_head'
    struct list_head  rcu_tasks_holdout_list; /* 0x388 0x10 */
    struct list_head  tasks;                  /* 0x3b8 0x10 */
    struct list_head  children;               /* 0x4d8 0x10 */
    struct list_head  sibling;                /* 0x4e8 0x10 */
    struct list_head  ptraced;                /* 0x500 0x10 */
    struct list_head  ptrace_entry;           /* 0x510 0x10 */
    struct list_head  thread_group;           /* 0x568 0x10 */
    struct list_head  thread_node;            /* 0x578 0x10 */
    struct list_head  cg_list;                /* 0x8d8 0x10 */
    struct list_head  pi_state_list;          /* 0x900 0x10 */
    struct list_head  perf_event_list;        /* 0x970 0x10 */
$
					
pahole --find_pointers_to=CLASS_NAME
$ pahole -f bpf_prog
perf_event: prog
bpf_prog_aux: linked_prog
bpf_prog_aux: prog
bpf_prog_offload: prog
bpf_prog_array_item: prog
net: flow_dissector_prog
net_device: xdp_prog
sk_filter: prog
seccomp_filter: prog
bpf_raw_tracepoint: prog
bpf_verifier_env: prog
sock_reuseport: prog
ns_get_path_bpf_prog_args: prog
bpf_prog_list: prog
xdp_attachment_info: prog
sk_psock_progs: msg_parser
sk_psock_progs: skb_parser
sk_psock_progs: skb_verdict
bpf_lwt_prog: prog
$
					
Pointers to struct bpf_prog
$ pahole -C sk_filter
struct sk_filter {
	refcount_t                 refcnt;       /*     0     4 */
	/* XXX 4 bytes hole, try to pack */
	struct callback_head       rcu;          /*     8    16 */
	struct bpf_prog *          prog;         /*    24     8 */

	/* size: 32, cachelines: 1, members: 3 */
	/* sum members: 28, holes: 1, sum holes: 4 */
	/* last cacheline: 32 bytes */
};
					
$ pahole -C xdp_attachment_info
struct xdp_attachment_info {
	struct bpf_prog *          prog;         /*     0     8 */
	u32                        flags;        /*     8     4 */

	/* size: 16, cachelines: 1, members: 2 */
	/* padding: 4 */
	/* last cacheline: 16 bytes */
};
					
More concise pahole + BTF
$ pahole bpf_insn
struct bpf_insn {
	__u8                       code;            /*     0     1 */
	__u8                       dst_reg:4;       /*     1: 0  1 */
	__u8                       src_reg:4;       /*     1: 4  1 */
	__s16                      off;             /*     2     2 */
	__s32                      imm;             /*     4     4 */

	/* size: 8, cachelines: 1, members: 5 */
	/* last cacheline: 8 bytes */
};
$
					

bpftool



  • BPF_BTF_GET_FD_BY_ID
  • Pretty prints map keys/values
  • Intermixes source code with bytecode/JITed code
bpftool btf
# bpftool version
bpftool v5.5.0-rc4
					
# bpftool btf help
Usage: bpftool btf { show | list } [id BTF_ID]
       bpftool btf dump BTF_SRC [format FORMAT]
       bpftool btf help

       BTF_SRC := { id BTF_ID | prog PROG |
                    map MAP [{key | value | kv | all}] |
		    file FILE }
       FORMAT  := { raw | c }
       MAP := { id MAP_ID | pinned FILE | name MAP_NAME }
       PROG := { id PROG_ID | pinned FILE | tag PROG_TAG |
                 name PROG_NAME }
       OPTIONS := { {-j|--json} [{-p|--pretty}] | {-f|--bpffs} |
	            {-m|--mapcompat} | {-n|--nomount} }
#
					
'perf trace' BPF maps
# bpftool prog | grep syscall_unaugme -A 100
314: tracepoint  name syscall_unaugme  tag 57cd311f2e27366b  gpl
	loaded_at 2020-01-17T14:38:26-0300  uid 0
	xlated 16B  jited 40B  memlock 4096B btf_id 81
315: tracepoint  name sys_enter_conne  tag f2131823a4275143  gpl
	loaded_at 2020-01-17T14:38:26-0300  uid 0
	xlated 272B  jited 168B  memlock 4096B  map_ids 228,223 btf_id 81
316: tracepoint  name sys_enter_sendt  tag 1962ce5cb9c415b3  gpl
	loaded_at 2020-01-17T14:38:26-0300  uid 0
	xlated 272B  jited 168B  memlock 4096B  map_ids 228,223 btf_id 81
317: tracepoint  name sys_enter_open  tag 0e59c3ac2bea5280  gpl
	loaded_at 2020-01-17T14:38:26-0300  uid 0
	xlated 304B  jited 185B  memlock 4096B  map_ids 228,223 btf_id 81
318: tracepoint  name sys_enter_opena  tag 0baf443610f59837  gpl
	loaded_at 2020-01-17T14:38:26-0300  uid 0
	xlated 304B  jited 185B  memlock 4096B  map_ids 228,223 btf_id 81
319: tracepoint  name sys_enter_renam  tag e6f565ab94fdb0b8  gpl
	loaded_at 2020-01-17T14:38:26-0300  uid 0
	xlated 520B  jited 300B  memlock 4096B  map_ids 228,223 btf_id 81
320: tracepoint  name sys_enter_renam  tag 01ad4488b2d5f42f  gpl
	loaded_at 2020-01-17T14:38:26-0300  uid 0
	xlated 520B  jited 300B  memlock 4096B  map_ids 228,223 btf_id 81
321: tracepoint  name sys_enter  tag 0bc3fc9d11754ba1  gpl
	loaded_at 2020-01-17T14:38:26-0300  uid 0
	xlated 272B  jited 231B  memlock 4096B  map_ids 227,228,225 btf_id 81
322: tracepoint  name sys_exit  tag 29c7ae234d79bd5c  gpl
	loaded_at 2020-01-17T14:38:26-0300  uid 0
	xlated 184B  jited 186B  memlock 4096B  map_ids 227,226 btf_id 81
#
					
bpftool + BTF
# bpftool btf dump map id 168 key
[2] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
					
# bpftool btf dump map id 168 value format c
typedef _Bool bool;

typedef unsigned short __u16;

typedef __u16 u16;

struct syscall {
	bool enabled;
	u16 string_args_len[6];
};

					
bpftool dumping kernel's BTF
# bpftool btf dump file /sys/kernel/btf/vmlinux format c | \
	grep 'struct fpu {' -A10
struct fpu {
	unsigned int last_cpu;
	long unsigned int avx512_timestamp;
	long: 64;
	long: 64;
	long: 64;
	long: 64;
	long: 64;
	long: 64;
	union fpregs_state state;
};
					
pahole dumping kernel's BTF
# pahole fpu
struct fpu {
	unsigned int         last_cpu;             /*  0    4 */

	/* XXX 4 bytes hole, try to pack */

	long unsigned int    avx512_timestamp;     /*  8    8 */

	/* XXX 48 bytes hole, try to pack */

	/* --- cacheline 1 boundary (64 bytes) --- */
	union fpregs_state   state;                /* 64 4096 */

	/* size: 4160, cachelines: 65, members: 3 */
	/* sum members: 4108, holes: 2, sum holes: 52 */
};
# 
					
pahole dumping kernel's DWARF
# pahole -F dwarf /lib/modules/5.5.0-rc6+/build/vmlinux -C fpu
struct fpu {
	unsigned int       last_cpu;             /*  0    4 */

	/* XXX 4 bytes hole, try to pack */

	long unsigned int  avx512_timestamp;     /*  8    8 */

	/* XXX 48 bytes hole, try to pack */

	/* --- cacheline 1 boundary (64 bytes) --- */
	union fpregs_state state __attribute__((__aligned__(64))); /* 64 4096 */

	/* size: 4160, cachelines: 65, members: 3 */
	/* sum members: 4108, holes: 2, sum holes: 52 */
	/* forced alignments: 1, forced holes: 1, sum forced holes: 48 */
} __attribute__((__aligned__(64)));
					

kernel



  • First consumer
  • BPF_BTF_LOAD
  • Validates BTF
  • BPF_BTF_GET_FD_BY_ID
  • Does not allow writes to BTF types

kernel validates BTF



  • Validates header
  • BTF_MAGIC
  • BTF_VERSION
  • flags

Some validations performed


$ grep btf_verifier_log kernel/bpf/btf.c
    btf_verifier_log(env, "Exceeded max num of types");
    btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
    btf_verifier_log_type(env, t, "nr_bits exceeds type_size");
    btf_verifier_log_type(env, t, "Unsupported encoding");
    btf_verifier_log_type(env, t, "Invalid type_id");
    btf_verifier_log_type(env, t, "Invalid name");
    btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
    btf_verifier_log_type(env, t, "Expected size:%zu",
    btf_verifier_log_type(env, t, "Loop detected");
    btf_verifier_log(env, "Unaligned type_off");
    btf_verifier_log(env, "No type found");
    btf_verifier_log(env, "String section is not at the end");
    btf_verifier_log(env, "Invalid string section");
    btf_verifier_log(env, "Section overlap found");
    btf_verifier_log(env, "Total section length too long");
    btf_verifier_log(env, "Unsupported section found");
    btf_verifier_log(env, "hdr_len not found");
    btf_verifier_log(env, "btf_header not found");
    btf_verifier_log(env, "Unsupported btf_header");
    btf_verifier_log(env, "Invalid magic");
    btf_verifier_log(env, "Unsupported version");
    btf_verifier_log(env, "Unsupported flags");
$
					

kernel validating BTF (excerpts)


# perf ftrace -G '*btf*' perf trace -e *sleep sleep 1
 7)               |  bpf_btf_load() {
 7)               |    capable() {
 7)   1.527 us    |    }
 7)               |    btf_new_fd() {
 7)   0.101 us    |      btf_sec_info_cmp();
 7)               |      btf_struct_check_meta() {
 7)   0.135 us    |        btf_name_valid_identifier.isra.12();
 7)   0.109 us    |        __btf_verifier_log_type();
 7)   0.107 us    |        btf_name_valid_identifier.isra.12();
 7)   0.108 us    |        btf_verifier_log_member();
 7)   3.642 us    |      }
 7)               |      btf_int_check_meta() {
 7)   0.100 us    |        __btf_verifier_log_type();
 7)   0.315 us    |      }
 7)               |      btf_ref_type_check_meta()
 7) + 49.743 us   |  }

					
/sys/kernel/btf/vmlinux

  • Raw BTF
  • Always available
  • All types used in vmlinux
  • All functions exported via kallsyms
  • CONFIG_DEBUG_INFO_BTF=y
  • Uses pahole to convert from DWARF
Compact
$ uname -a
Linux quaco 5.5.0-rc6+ #2 SMP Tue Jan 14 13:13:43 -03 2020 x86_64 GNU/Linux
$ ls -la /sys/kernel/btf/vmlinux 
-r--r--r--. 1 3234516 /sys/kernel/btf/vmlinux
$ 
					
$ size -Ad ~/git/build/v5.5-rc6+/vmlinux | grep -E 'BTF|debug'
.BTF                      3234488   
.debug_aranges             184000   
.debug_info             219842532   
.debug_abbrev             6409858   
.debug_line              23369539   
.debug_frame              2693968   
.debug_str                3800630   
.debug_loc               14871978   
.debug_ranges            15582016   
$ 
					

BPF CO-RE



  • Compile Once, Run Everywhere
  • Relocation field offset records
  • __builtin_preserve_access_index(x)
  • Compare kernel types
  • With the ones in the BPF prog
  • Fixup offsets
  • Check for field existence in a kernel
  • Records bitfield accesses
libbpf extern variables

  • LINUX_KERNEL_VERSION
  • CONFIG_ kconfig entries
  • Constant propagation
  • Relocation records where it is used
  • From /boot/config-$(uname -r)
  • Fallback to /proc/config.gz
  • Or via bpf_object_open_opts.kconfig
  • tools/testing/selftests/bpf/prog_tests/core_extern.c
  • tools/testing/selftests/bpf/progs/test_core_extern.c
vmlinux.h
$ bpftool btf dump file /sys/kernel/btf/vmlinux format c | head
#ifndef BPF_NO_PRESERVE_ACCESS_INDEX
#pragma clang attribute push (__attribute__((preserve_access_index)),
			      apply_to = record)
#endif

typedef signed char __s8;

typedef unsigned char __u8;

typedef short int __s16;

$
					

First struct


$ bpftool btf dump file /sys/kernel/btf/vmlinux format c | \
	grep '^struct ' -m 1 -A3 -B10
typedef phys_addr_t resource_size_t;

typedef struct {
	int counter;
} atomic_t;

typedef struct {
	s64 counter;
} atomic64_t;

struct list_head {
	struct list_head *next;
	struct list_head *prev;
};
$
					

runqslower



  • Reimplementation of BCC's runqslower
  • Traces high scheduling delays
  • BPF skeleton
  • auto-generated from BPF object file
  • memory-mapped interface to global data
  • Auto-generation of "relocatable" vmlinux.h
  • Necessary for BTF-typed raw tracepoints with direct memory access
Living on the bleeding edge
$ make -C tools/bpf/runqslower/

In file included from runqslower.bpf.c:3:
.output/vmlinux.h:2:15: error: attribute 'preserve_access_index' is
not supported by '#pragma clang attribute'
#pragma clang attribute push (__attribute__((preserve_access_index)),
                              apply_to = record)
              ^
.output/vmlinux.h:111541:15: error: '#pragma clang attribute pop' with
no matching '#pragma clang attribute push'
#pragma clang attribute pop
              ^
2 errors generated.
make: *** [Makefile:58: .output/runqslower.bpf.o] Error 1
make: Leaving directory '/home/acme/git/bpf/tools/bpf/runqslower'
$ clang -v
clang version 10.0.0 (https://git.llvm.org/git/clang.git/
	65acf43270ea2894dffa0d0b292b92402f80c8cb)
	(https://git.llvm.org/git/llvm.git/ a461b7a03cde32cd6560dcbcea23ec51dfd1e522)

					
Update your system!

  • SVN mirror abandoned after move to MONOREPO/github
  • https://llvm.org/docs/Proposals/GitHubMove.html
  • Use single repo llvm
  • https://github.com/llvm/llvm-project
  • build it to get close to llvm 11
  • To get pragma attribute preserve_access_index
  • & llvm to generate datasec and variables type info
  • Needed for runqslower args
With all that is needed in place:
$ make -C tools/bpf/runqslower/  
make: Entering directory '/home/acme/git/bpf/tools/bpf/runqslower'
  LINK     /home/acme/git/bpf/tools/bpf/runqslower/.output//libbpf/libbpf.a
  LINK     /home/acme/git/bpf/tools/bpf/runqslower/.output/bpftool
  INSTALL  bpftool
  GEN      vmlinux.h
  BPF      runqslower.bpf.o
  GEN-SKEL runqslower.skel.h
  CC       runqslower.o
  BINARY   runqslower
make: Leaving directory '/home/acme/git/bpf/tools/bpf/runqslower'
					
With all that is needed in place:
$ file tools/bpf/runqslower/.output/runqslower
tools/bpf/runqslower/.output/runqslower: ELF 64-bit LSB executable, x86-64,
version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2,
for GNU/Linux 3.2.0,
BuildID[sha1]=e7986148fa9629ef9ca8ea1fd76c3a0bbdf70b06, with debug_info,
not stripped
					
$ ls -la tools/bpf/runqslower/.output/runqslower
-rwxrwxr-x. 1 507016 tools/bpf/runqslower/.output/runqslower
$ 
					
$ strip tools/bpf/runqslower/.output/runqslower
$ ls -la tools/bpf/runqslower/.output/runqslower
-rwxrwxr-x. 1 249208 tools/bpf/runqslower/.output/runqslower
$ 
					
Running it
$ sudo tools/bpf/runqslower/.output/runqslower 500
Tracing run queue latency higher than 500 us
TIME     COMM             PID           LAT(us)
19:00:11 swapper/3        22111            1904
19:00:30 weechat          41               1244
19:00:30 weechat          41               1316
^C
$ 
					
Benefits

  • Boilerplate being reduced
  • Everything needed in one binary
  • Small
  • Runs in any kernel
  • With subset of fields used
Unroll the magic

  • Similar to BCC
  • Userspace part
  • BPF/kernel part
  • Natural struct field deref
  • No bpf_probe_read
Common userspace/BPF header
$ cat tools/bpf/runqslower/runqslower.h
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
#ifndef __RUNQSLOWER_H
#define __RUNQSLOWER_H

#define TASK_COMM_LEN 16

struct event {
	char task[TASK_COMM_LEN];
	__u64 delta_us;
	pid_t pid;
};

#endif /* __RUNQSLOWER_H */
$
					
Userspace part

  • tools/bpf/runqslower/runqslower.c
  • Uses argp.h for command line options
  • Initializes global data with those options
  • Sends the BPF bytecode to the kernel
  • Sets up perf ring buffer
  • Reads events
  • All using tools/lib/bpf (libbpf)
Userspace part: event handler
#include <bpf.h>
#include "runqslower.h"
#include "runqslower.skel.h"

void handle_event(void *ctx, int cpu, void *data, __u32 data_sz)
{
        const struct event *e = data;

	strftime(ts, sizeof(ts), "%H:%M:%S", tm);
        printf("%-8s %-16s %-6d %14llu\n", ts, e->task, e->pid, e->delta_us);
}
					
Userspace part: main loop
int main(int argc, char **argv)
{
        err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
        obj = runqslower_bpf__open();
        obj->rodata->min_us = env.min_us; // arg, BPF global var
					
        err = runqslower_bpf__load(obj);
        err = runqslower_bpf__attach(obj);
					
        printf("%-8s %-16s %-6s %14s\n", "TIME", "COMM", "PID", "LAT(us)");

        pb_opts.sample_cb = handle_event;
        pb_opts.lost_cb = handle_lost_events;
        pb = perf_buffer__new(bpf_map__fd(obj->maps.events), 64, &pb_opts);
					
        while ((err = perf_buffer__poll(pb, 100)) >= 0);
}
					
Kernel/BPF part

  • tools/bpf/runqslower/runqslower.bpf.c
  • Uses a per-pid hash map for timestamps
  • To calculate the deltas/latencies
  • Connects to BTF tracepoints
  • To use normal pointer dereference
  • Sets up events
  • Pushes to userspace via perf ring buffer
BPF/kernel program
$ cat tools/bpf/runqslower/runqslower.bpf.c 
#include "vmlinux.h"
#include <bpf_helpers.h>
#include "runqslower.h"
					
const volatile __u64 min_us = 0;
const volatile pid_t targ_pid = 0;
					
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 10240);
	__type(key, u32);
	__type(value, u64);
} start SEC(".maps");
					
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(u32));
	__uint(value_size, sizeof(u32));
} events SEC(".maps");
				
Symbols
$ readelf -sW tools/bpf/runqslower/.output/runqslower.bpf.o | \
	egrep min_us\|targ_pid
   Num:  Size  Type    Bind    Vis     Ndx  Name
    18:   8    OBJECT  GLOBAL  DEFAULT  5   min_us
    20:   4    OBJECT  GLOBAL  DEFAULT  5   targ_pid
$
					
BPF/kernel program
__always_inline static int trace_enqueue(u32 tgid, u32 pid)
{
	if (!pid || (targ_pid && targ_pid != pid))
		return 0;
	u64 ts = bpf_ktime_get_ns();
	bpf_map_update_elem(&start, &pid, &ts, 0);
	return 0;
}
					
SEC("tp_btf/sched_wakeup") int handle__sched_wakeup(u64 *ctx)
{
	// TP_PROTO(struct task_struct *p) 

	struct task_struct *p = (void *)ctx[0];
	return trace_enqueue(p->tgid, p->pid);
}
					
BPF/kernel program: validation
SEC("tp_btf/sched_switch") int handle__sched_switch(u64 *ctx)
{
	// TP_PROTO(bool preempt, struct task_struct *prev,
	//			  struct task_struct *next)

	struct task_struct *prev = (struct task_struct *)ctx[1];
	struct task_struct *next = (struct task_struct *)ctx[2];
	struct event event = {};
	u32 pid = next->pid;
					
	u64 *tstamp = bpf_map_lookup_elem(&start, &pid);
	if (!tstamp)
		return 0;   /* missed enqueue */
					
	u64 delta_us = (bpf_ktime_get_ns() - *tstamp) / 1000;
	if (min_us && delta_us <= min_us)
		return 0;
					
BPF/kernel program: push to userspace
	event.pid = pid;
	event.delta_us = delta_us;

	bpf_get_current_comm(&event.task, sizeof(event.task));
					
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
			      &event, sizeof(event));
					
	bpf_map_delete_elem(&start, &pid);
	return 0;
}
					
BPF object details
$ ls -la tools/bpf/runqslower/.output/runqslower.bpf.o
-rw-rw-r--. 1 31688 tools/bpf/runqslower/.output/runqslower.bpf.o
					
$ file tools/bpf/runqslower/.output/runqslower.bpf.o
tools/bpf/runqslower/.output/runqslower.bpf.o: ELF 64-bit LSB relocatable,
        eBPF, version 1 (SYSV), not stripped
$ 
					
$ eu-readelf -SW tools/bpf/runqslower/.output/runqslower.bpf.o | grep \.rel
Section Headers:
[Nr] Name                   Type Size    
[11] .reltp_btf/sched_wakeup REL   20
[13] .reltp_btf/sched_switch REL   60
[14] .rel.BTF                REL   50
[15] .rel.BTF.ext            REL  460
$
					

BPF_PROG_TYPE_STRUCT_OPS



  • TCP Congestion control algorithms
  • Pointer table
  • Faster turnaround for testing new algos
  • Builds on top of CO-RE infra
  • General mechanism for any kernel struct ops
TCP congestion control modules
$ cd net/ipv4/
$ grep -l 'tcp_register_congestion_control(&' tcp_*.c
tcp_bbr.c
tcp_bic.c
tcp_cdg.c
tcp_cubic.c
tcp_dctcp.c
tcp_highspeed.c
tcp_htcp.c
tcp_hybla.c
tcp_illinois.c
tcp_lp.c
tcp_nv.c
tcp_scalable.c
tcp_vegas.c
tcp_veno.c
tcp_westwood.c
tcp_yeah.c
$
					
Example: DCTCP in BPF

  • Data Center TCP
  • tools/testing/selftests/bpf/progs/bpf_dctcp.c
  • Just for testing
  • Not the same as net/ipv4/tcp_dctcp.c
  • Helpers for tcp_sock/inet_connection_sock structs/logic
  • structs with same name as in kernel
  • But with just the fields needed for DCTCP
Passing arguments
static unsigned int dctcp_shift_g = 4; /* g = 1/2^4 */
static unsigned int dctcp_alpha_on_init = DCTCP_MAX_ALPHA;
					
Subset of struct tcp_sock
// tools/testing/selftests/bpf/bpf_tcp_helpers.h
struct tcp_sock {
       struct inet_connection_sock     inet_conn;
       __u32   rcv_nxt, snd_nxt, snd_una;
       __u8    ecn_flags;
       __u32   delivered;
       __u32   delivered_ce;
       __u32   snd_cwnd, snd_cwnd_cnt, snd_cwnd_clamp;
       __u32   snd_ssthresh;
       __u8    syn_data:1,     /* SYN includes data */
               syn_fastopen:1, /* SYN includes Fast Open option */
               syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
               syn_fastopen_ch:1, /* Active TFO re-enabling probe */
               syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
               save_syn:1,     /* Save headers of SYN packet */
               is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
               syn_smc:1;      /* SYN includes SMC */
       __u32   max_packets_out;
       __u32   lsndtime;
       __u32   prior_cwnd;
} __attribute__((preserve_access_index));
					
The full 'struct tcp_sock'
$ pahole tcp_sock | grep size: -A4
	/* size: 2192, cachelines: 35, members: 135 */
	/* sum members: 2181, holes: 3, sum holes: 8 */
	/* sum bitfield members: 24 bits (3 bytes) */
	/* paddings: 3, sum paddings: 10 */
	/* last cacheline: 16 bytes */
$ 
					
BPF_MAP_TYPE_STRUCT_OPS

  • To register/unregister/introspect struct ops
  • Receives tcp_congestion_ops pointer
  • Populates map
  • bpftool map dump
  • Shows how many users (refcnt)
struct ops init()
SEC("struct_ops/dctcp_init")
void BPF_PROG(dctcp_init, struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct dctcp *ca = inet_csk_ca(sk);

	ca->prior_rcv_nxt = tp->rcv_nxt;
	ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
	ca->loss_cwnd = 0;
	ca->ce_state = 0;

	dctcp_reset(tp, ca);
}
					
struct ops ssthresh
SEC("struct_ops/dctcp_ssthresh")
__u32 BPF_PROG(dctcp_ssthresh, struct sock *sk)
{
	struct dctcp *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	ca->loss_cwnd = tp->snd_cwnd;
	return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
}
					

BPF trampolines

  • Kernel code calls into BPF with nearly zero overhead
  • Arch dependent trampoline generation
  • Converts native calling convention
  • Into BPF calling convention
  • kernel_to_BPF converts kernel args
  • btf_distill_func_proto()
  • Uses BTF to cast args into u64 array for BPF's ctx arg
  • BPF_TRACE_FENTRY/FEXIT replaces kprobe/kretprobe
  • More than tracing: XDP minus retpoline
  • http://git.kernel.org/torvalds/c/fec56f5890d93fc2e

Dynamic re-linking



  • Original motivation: XDP chaining
  • BPF_PROG_TYPE_EXT
  • Replace placeholder BPF functions
  • In rootlet XDP program
  • While they are running
  • BTF used to verify function signatures
  • New function must match replaced sig

perf



  • BPF_BTF_GET_FD_BY_ID
  • PERF_RECORD_BPF_EVENT
  • Annotates source code with JITed code
perf top
perf annotate
nospectre_v1 + nospectre_v2

gcc



  • Jose Marchesi
  • BPF target in trunk since Sep/9/2019
  • BTF not yet being produced
  • Patch in the works
  • Will use pahole for testing

gcc helper builtins


$ grep __builtin_bpf testsuite/gcc.target/bpf/*.c | \
  sed -r 's/.*(__builtin_bpf[a-z0-9_]+).*/\1/g' | sort -u | head
__builtin_bpf_helper_bind
__builtin_bpf_helper_clone_redirect
__builtin_bpf_helper_csum_diff
__builtin_bpf_helper_csum_update
__builtin_bpf_helper_current_task_under_cgroup
__builtin_bpf_helper_fib_lookup
__builtin_bpf_helper_get_cgroup_classid
__builtin_bpf_helper_get_current_cgroup_id
__builtin_bpf_helper_get_current_comm
__builtin_bpf_helper_get_current_pid_tgid
$