acme@redhat.com
commit 91b6d325635617540b6a1646ddb138bb17cbd569
Author: Eric Dumazet
Date:   Mon Nov 15 11:02:39 2021 -0800

    net: cache align tcp_memory_allocated, tcp_sockets_allocated

    tcp_memory_allocated and tcp_sockets_allocated often share a common
    cache line, source of false sharing.
+++ b/net/ipv4/tcp.c
-atomic_long_t tcp_memory_allocated;	/* Current allocated memory */
+atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;	/* Current allocated memory */

-struct percpu_counter tcp_sockets_allocated;
+struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
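The effect is easy to reproduce in userspace. Here is a minimal sketch, not the kernel code: the counter names and iteration counts are made up, and C11 alignas() plays the role of ____cacheline_aligned_in_smp, keeping each hot counter on its own cache line:

#include <pthread.h>
#include <stdatomic.h>
#include <stdalign.h>
#include <stdio.h>

#define CACHELINE 64	/* assumption: 64-byte lines, as on x86 */

/* Two counters written by different threads: without the alignas()
 * they would likely share one cache line, and each write would keep
 * invalidating the other CPU's copy of it (false sharing). */
static struct {
	alignas(CACHELINE) atomic_long memory_allocated;
	alignas(CACHELINE) atomic_long sockets_allocated;
} c;

static void *writer_a(void *arg)
{
	for (int i = 0; i < 10000000; i++)
		atomic_fetch_add_explicit(&c.memory_allocated, 1, memory_order_relaxed);
	return NULL;
}

static void *writer_b(void *arg)
{
	for (int i = 0; i < 10000000; i++)
		atomic_fetch_add_explicit(&c.sockets_allocated, 1, memory_order_relaxed);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, writer_a, NULL);
	pthread_create(&b, NULL, writer_b, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("%ld %ld\n", atomic_load(&c.memory_allocated),
			    atomic_load(&c.sockets_allocated));
	return 0;
}

Build with cc -O2 -pthread; dropping the alignas() should make the HITM numbers reported by the perf c2c workflow shown later go up noticeably on this kind of workload.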
+++ b/drivers/md/bcache/writeback.c
 	sectors_dirty = atomic_add_return(s,
 					  d->stripe_sectors_dirty + stripe);
-	if (sectors_dirty == d->stripe_size)
-		set_bit(stripe, d->full_dirty_stripes);
-	else
-		clear_bit(stripe, d->full_dirty_stripes);
+	if (sectors_dirty == d->stripe_size) {
+		if (!test_bit(stripe, d->full_dirty_stripes))
+			set_bit(stripe, d->full_dirty_stripes);
+	} else {
+		if (test_bit(stripe, d->full_dirty_stripes))
+			clear_bit(stripe, d->full_dirty_stripes);
+	}
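Note the shape of the fix: the bit is read first and only written when it actually changes, so a stripe that is repeatedly marked full dirties the cache line once instead of on every update. A generic userspace sketch of the same pattern follows; the bitmap helpers are hypothetical, non-atomic stand-ins for the kernel's test_bit()/set_bit()/clear_bit():

#include <stdbool.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Hypothetical stand-ins for the kernel bitmap API */
static inline bool bm_test(const unsigned long *map, unsigned int nr)
{
	return map[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG) & 1;
}

static inline void bm_set(unsigned long *map, unsigned int nr)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static inline void bm_clear(unsigned long *map, unsigned int nr)
{
	map[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
}

/* Read first, write only on change: a load of a Shared cache line is
 * cheap, while an unconditional store forces the line to Modified and
 * bounces it between CPUs even when the bit already had the right value. */
static void update_full_bit(unsigned long *map, unsigned int stripe, bool full)
{
	if (full) {
		if (!bm_test(map, stripe))
			bm_set(map, stripe);
	} else {
		if (bm_test(map, stripe))
			bm_clear(map, stripe);
	}
}

int main(void)
{
	static unsigned long map[4];

	update_full_bit(map, 7, true);
	update_full_bit(map, 7, true);	/* second call: reads, doesn't dirty */
	return bm_test(map, 7) ? 0 : 1;
}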
+++ b/include/linux/page_counter.h
@@ -12,7 +12,6 @@ struct page_counter {
 	unsigned long low;
 	unsigned long high;
 	unsigned long max;
-	struct page_counter *parent;

 	/* effective memory.min and memory.min usage tracking */
 	unsigned long emin;
@@ -26,6 +25,12 @@ struct page_counter {
 	unsigned long watermark;
 	unsigned long failcnt;

+	/*
+	 * 'parent' is placed here to be far from 'usage' to reduce cache
+	 * false sharing, as 'usage' is written mostly while parent is
+	 * frequently read for cgroup's hierarchical counting nature.
+	 */
+	struct page_counter *parent;
 };
$ pahole page_counter
struct page_counter {
	atomic_long_t              usage;                /*     0     8 */
	long unsigned int          min;                  /*     8     8 */
	long unsigned int          low;                  /*    16     8 */
	long unsigned int          high;                 /*    24     8 */
	long unsigned int          max;                  /*    32     8 */
	long unsigned int          emin;                 /*    40     8 */
	atomic_long_t              min_usage;            /*    48     8 */
	atomic_long_t              children_min_usage;   /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	long unsigned int          elow;                 /*    64     8 */
	atomic_long_t              low_usage;            /*    72     8 */
	atomic_long_t              children_low_usage;   /*    80     8 */
	long unsigned int          watermark;            /*    88     8 */
	long unsigned int          failcnt;              /*    96     8 */
	struct page_counter *      parent;               /*   104     8 */

	/* size: 112, cachelines: 2, members: 14 */
	/* last cacheline: 48 bytes */
};
$
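pahole confirms that parent now sits at offset 104, on the second cache line, while the write-hot usage stays at offset 0. A layout decision like this can also be locked in at build time; here is a hedged sketch with a toy struct (not the kernel's page_counter) that uses offsetof() and static_assert to catch a regression:

#include <stddef.h>
#include <assert.h>

#define CACHELINE 64	/* assumption: 64-byte cache lines */

/* Toy version of the pattern above: 'usage' is write-hot, 'parent' is
 * read-mostly, so 'parent' goes at the end of the struct. */
struct counter {
	long usage;		/* written on every charge/uncharge */
	long watermark;
	long failcnt;
	char other_fields[88];	/* stand-in for the rest of the struct */
	struct counter *parent;	/* read-mostly */
};

/* Fail the build if a later change moves 'parent' back onto the cache
 * line that 'usage' keeps dirtying. */
static_assert(offsetof(struct counter, parent) / CACHELINE !=
	      offsetof(struct counter, usage) / CACHELINE,
	      "parent must not share a cache line with usage");

int main(void) { return 0; }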
$ cd /sys/kernel/btf
$ ls -lh vmlinux
-r--r--r--. 1 root root 5.1M Sep  8 20:38 vmlinux
$
$ ls -lh i915
-r--r--r--. 1 root root 556K Sep 12 09:29 i915
$
$ ls -1 | wc -l
204
$ lsmod | wc -l
204
$ lsmod | head -2
Module                  Size  Used by
sctp                  434176  28
$
$ pahole page_pool
struct page_pool {
	struct page_pool_params    p;                    /*     0    56 */
	struct delayed_work        release_dw;           /*    56    88 */

	/* XXX last struct has 4 bytes of padding */

	/* --- cacheline 2 boundary (128 bytes) was 16 bytes ago --- */
	void                       (*disconnect)(void *); /*  144     8 */
	long unsigned int          defer_start;          /*   152     8 */
	long unsigned int          defer_warn;           /*   160     8 */
	u32                        pages_state_hold_cnt; /*   168     4 */
	unsigned int               frag_offset;          /*   172     4 */
	struct page *              frag_page;            /*   176     8 */
	long int                   frag_users;           /*   184     8 */
	/* --- cacheline 3 boundary (192 bytes) --- */
	u32                        xdp_mem_id;           /*   192     4 */

	/* XXX 60 bytes hole, try to pack */

	/* --- cacheline 4 boundary (256 bytes) --- */
	struct pp_alloc_cache      alloc __attribute__((__aligned__(64))); /*  256  1032 */

	/* XXX 56 bytes hole, try to pack */

	/* --- cacheline 21 boundary (1344 bytes) --- */
	struct ptr_ring            ring __attribute__((__aligned__(64))); /* 1344   192 */

	/* XXX last struct has 48 bytes of padding */

	/* --- cacheline 24 boundary (1536 bytes) --- */
	atomic_t                   pages_state_release_cnt; /* 1536     4 */
	refcount_t                 user_cnt;             /*  1540     4 */
	u64                        destroy_cnt;          /*  1544     8 */

	/* size: 1600, cachelines: 25, members: 15 */
	/* sum members: 1436, holes: 2, sum holes: 116 */
	/* padding: 48 */
	/* paddings: 2, sum paddings: 52 */
	/* forced alignments: 2, forced holes: 2, sum forced holes: 116 */
} __attribute__((__aligned__(64)));
$
18.8.1.2 Load Latency Performance Monitoring Facility

The load latency facility provides software a means to characterize the average load latency to different levels of cache/memory hierarchy. This facility requires processor supporting enhanced PEBS record format in the PEBS buffer, see Table 18-23. This field measures the load latency from load's first dispatch till final data writeback from the memory subsystem. The latency is reported for retired demand load operations and in core cycles (it accounts for re-dispatches).
# perf mem record -a sleep 1
#
# perf mem -t load report --sort=mem --stdio
# Total Lost Samples: 0
#
# Samples: 51K of event 'cpu/mem-loads,ldlat=30/P'
# Total weight : 4819902
# Sort order   : mem
#
# Overhead       Samples  Memory access
# ........  ............  ........................
    44.87%         20217  LFB or LFB hit
    27.30%         18618  L3 or L3 hit
    22.53%         11712  L1 or L1 hit
     4.85%           637  Local RAM or RAM hit
     0.25%             1  Uncached or N/A hit
     0.20%           188  L2 or L2 hit
     0.00%            35  L3 miss
# perf mem record sleep 1
#
# perf mem -t load report --sort=mem --stdio
# Total Lost Samples: 0
#
# Samples: 16 of event 'cpu/mem-loads,ldlat=30/P'
# Total weight : 1556
# Sort order   : mem
#
# Overhead       Samples  Memory access
# ........  ............  ........................
    64.52%             8  LFB or LFB hit
    14.07%             4  L1 or L1 hit
    11.05%             3  L3 or L3 hit
    10.35%             1  Local RAM or RAM hit
$ make -j8 O=../build/allmodconfig/
make[1]: Entering directory '/home/acme/git/build/allmodconfig'

# perf mem record sleep 1m
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.037 MB perf.data (20 samples) ]
#
# perf mem report --stdio
# Total Lost Samples: 0
#
# Samples: 11 of event 'cpu/mem-loads,ldlat=30/P'
# Total weight : 2155
# Sort order   : local_weight,mem,sym,dso,symbol_daddr,dso_daddr
#
#           Local  Mem
# Overhead  Weig   Access    Symbol               Sh Object    Data Symbol         Data Obj
# ........  ....   ........  ...................  ...........  ..................  ...........
    23.94%   516   LocalRAM  copy_page            [kernel]     0xffff8d42228ea900  [unknown]
    15.31%   330   LFB       flush_signal_handle  [kernel]     0xffff8d3f976020a0  [unknown]
    14.66%   316   LFB       strlen               [kernel]     0xffffffff9b5f4cd3  [kernel].ro
    13.36%   288   LFB       _dl_relocate_object  ld-linux.so  0x00007f6ccdc23068  libc.so.6
    11.46%   247   LFB       next_uptodate_page   [kernel]     0xffffe401957e4df4  [unknown]
     7.33%   158   LFB       copy_page            [kernel]     0xffff8d41f2dae920  [unknown]
     4.04%    87   LFB       unlock_page_memcg    [kernel]     0xffffe4019333d8b8  [unknown]
     3.06%    66   L1        check_preemption_di  [kernel]     0xffffa8e8622ffc80  [unknown]
     2.69%    58   LFB       perf_output_begin    [kernel]     0xffff8d3f52a1b01c  [unknown]
     2.13%    46   L3        task_work_run        [kernel]     0xffff8d3f4a9c802c  [unknown]
     2.00%    43   L1        kmem_cache_alloc_tr  [kernel]     0xffffa8e8622ffbc8  [unknown]
# perf c2c record -a sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 7.787 MB perf.data (2450 samples) ]
# perf evlist
cpu/mem-loads,ldlat=30/P
cpu/mem-stores/P
dummy:HG
#
# perf script --cpu 4 --pid 0 | head
swapper 0 [4] 319242.043904: 58 cpu/mem-loads,ldlat=30/P: ffff8d3e49c0e688 11868100242 |OP LOAD|LVL LFB or LFB hit|SNP None|TLB L1 or L2 hit|LCK No|BLK N/A 290 0 ffffffff9a13eb2c __update_load_avg_cfs_rq+0x9c (vmlinux) 9c0e688
swapper 0 [4] 319242.142295: 39 cpu/mem-loads,ldlat=30/P: ffff8d44865f2408 10268100142 |OP LOAD|LVL L1 or L1 hit|SNP None|TLB L1 or L2 hit|LCK No|BLK N/A 335 0 ffffffff9a13eecd update_rt_rq_load_avg+0x17d (vmlinux) 6465f2408
swapper 0 [4] 319242.143587: 99614 cpu/mem-stores/P: ffff8d4486500028 5080184 |OP STORE|LVL L1 miss|SNP N/A|TLB N/A|LCK N/A|BLK N/A 0 0 ffffffff9a001c2f __switch_to_asm+0x1f (vmlinux) 646500028
swapper 0 [4] 319242.174494: 33 cpu/mem-loads,ldlat=30/P: ffff8d3f595ddc38 11a68201042 |OP LOAD|LVL Local RAM or RAM hit|SNP Hit|TLB L1 or L2 hit|LCK No|BLK N/A 176 0 ffffffff9a13e78d __update_load_avg_se+0x1d (vmlinux) 1195ddc38
swapper 0 [4] 319242.178002: 27 cpu/mem-loads,ldlat=30/P: ffff8d44865312c0 10668100842 |OP LOAD|LVL L3 or L3 hit|SNP None|TLB L1 or L2 hit|LCK No|BLK N/A 56 0 ffffffff9a07d74f switch_mm_irqs_off+0x16f (vmlinux) 6465312c0
swapper 0 [4] 319242.212148: 23 cpu/mem-loads,ldlat=30/P: ffff8d44865322e8 10668100842 |OP LOAD|LVL L3 or L3 hit|SNP None|TLB L1 or L2 hit|LCK No|BLK N/A 55 0 ffffffff9a140c22 irqtime_account_process_tick+0xa2 (vmlinux) 6465322e8
swapper 0 [4] 319242.217357: 18 cpu/mem-loads,ldlat=30/P: ffff8d4486532490 10268100142 |OP LOAD|LVL L1 or L1 hit|SNP None|TLB L1 or L2 hit|LCK No|BLK N/A 125 0 ffffffff9a140076 update_irq_load_avg+0xf6 (vmlinux) 646532490
swapper 0 [4] 319242.220573: 15 cpu/mem-loads,ldlat=30/P: ffff8d3f4f35f218 11868100242 |OP LOAD|LVL LFB or LFB hit|SNP None|TLB L1 or L2 hit|LCK No|BLK N/A 383 0 ffffffff9a73b407 rb_erase+0x7 (vmlinux) 10f35f218
swapper 0 [4] 319242.240176: 15 cpu/mem-loads,ldlat=30/P: ffff8d3f6b617be0 10650100842 |OP LOAD|LVL L3 or L3 hit|SNP None|TLB L2 miss|LCK No|BLK N/A 184 0 ffffffff9a129fbb update_blocked_averages+0x1fb (vmlinux) 12b617be0
swapper 0 [4] 319242.243441: 8849 cpu/mem-stores/P: ffff8d3f40c2b1a4 5080144 |OP STORE|LVL L1 hit|SNP N/A|TLB N/A|LCK N/A|BLK N/A 0 0 ffffffff9ad68aed rcu_eqs_exit.constprop.0+0x3d (vmlinux) 100c2b1a4
#
# perf evlist -v | head -1
cpu/mem-loads,ldlat=30/P: type: 4, size: 128, config: 0x1cd, \
  { sample_period, sample_freq }: 4000, \
  sample_type: IP|TID|TIME|ADDR|ID|CPU|PERIOD|DATA_SRC|PHYS_ADDR|WEIGHT_STRUCT, \
  read_format: ID, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, \
  sample_id_all: 1, { bp_addr, config1 }: 0x1f
#
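Nothing magic in that attribute: the same event can be set up directly with the perf_event_open(2) syscall. A minimal sketch using the values from the perf evlist -v dump above; note that type 4 and config 0x1cd are specific to this machine's core PMU (the PEBS load-latency event, with the ldlat threshold in config1), and PERF_SAMPLE_WEIGHT_STRUCT needs a v5.12+ linux/perf_event.h:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size	 = sizeof(attr);
	attr.type	 = 4;		/* this machine's core PMU ('type: 4' above) */
	attr.config	 = 0x1cd;	/* PEBS load latency event ('config: 0x1cd') */
	attr.config1	 = 30;		/* ldlat threshold, in cycles */
	attr.sample_freq = 4000;
	attr.freq	 = 1;
	attr.precise_ip	 = 3;		/* the /P modifier: most precise PEBS mode */
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME |
			   PERF_SAMPLE_ADDR | PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD |
			   PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_PHYS_ADDR |
			   PERF_SAMPLE_WEIGHT_STRUCT;

	/* measure the calling thread, on any CPU */
	fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	printf("mem-loads event opened, fd=%d\n", fd);
	close(fd);
	return 0;
}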
# perf report --header-only
# captured on    : Fri Sep  9 11:10:04 2022
# hostname       : quaco
# os release     : 5.18.17-200.fc36.x86_64
# perf version   : 6.0.rc3.gfaf59ec8c3c3
# arch           : x86_64
# nrcpus online  : 8
# nrcpus avail   : 8
# cpudesc        : Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz
# total memory   : 24487388 kB
# cmdline : /home/acme/bin/perf c2c record -a sleep 1m
# event : name = cpu/mem-loads,ldlat=30/P, freq = 4000, sample_type = IP|TID|TIME|ADDR|ID|CPU|PERIOD|DATA_SRC|PHYS_ADDR|WEIGHT_STRUCT
# event : name = cpu/mem-stores/P, freq = 4000, sample_type = IP|TID|TIME|ADDR|ID|CPU|PERIOD|DATA_SRC|PHYS_ADDR|WEIGHT_STRUCT
# perf c2c report --stats
  Total records                     :    3223429
  Locked Load/Store Operations      :     112673
  Load Operations                   :    1387118
  Loads - uncacheable               :          1
  Loads - IO                        :          4
  Loads - Miss                      :        142
  Loads - no mapping                :       2350
  Load Fill Buffer Hit              :     455747
  Load L1D hit                      :     264355
  Load L2D hit                      :      29304
  Load LLC hit                      :     534642
  Load Local HITM                   :        629
  Load Remote HITM                  :          0
  Load Remote HIT                   :          0
# perf c2c report --stdio
=================================================
           Shared Data Cache Line Table
=================================================
#
#        ---- Cacheline ----   -- Load Hitm --    Tot   Total   Total  -- Stores --
#  Idx              Address     Hitm  Tot LclHitm rec   Loads   Stores L1Hit L1Miss
#  ...  .................      .....  ... ....... ....  ......  ...... ..... ......
     0   ffff8d449e7d6380      8.43%   53      53  510     499      11    11      0
     1   ffff8d4058209340      6.20%   39      39  371     135     236   223     13
     2   ffff8d449e7ff400      5.88%   37      37  501     479      22    22      0
     3   ffffffff9bf53980      4.93%   31      31  233     208      25    24      1
     4   ffff8d3f49ebd280      3.18%   20      20  162     153       9     9      0
     5   ffff8d3f420d4880      2.86%   18      18  126     121       5     5      0
Cacheline 0xffff8d449e7ff400
  -HITM-                            --- cycles ---  Tot  cpu
 LclHitm  CL Off  Code address      lcl hitm  load  rec  cnt  Symbol                 Object    Source:Line
  97.30%     0x0  0xffffffff9a2d293b     113    44  454    8  __mod_node_page_state  [kernel]  vmstat.c:379
   0.00%     0x8  0xffffffff9a2d29bb       0   112   40    8  __mod_node_page_state  [kernel]  atomic64_64.h:46
   2.70%    0x18  0xffffffff9a2d2be5     959   103    2    2  refresh_cpu_vm_stats   [kernel]  atomic64_64.h:46
$ perf probe -L vmstat.c:379 | head
    379  	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
    380  	s8 __percpu *p = pcp->vm_node_stat_diff + item;
    381  	long x;
         	long t;

         	if (vmstat_item_in_bytes(item)) {
         		/*
         		 * Only cgroups use subpage accounting right now; at
         		 * the global level, these items still change in
$
$ pfunct __mod_node_page_state
void __mod_node_page_state(struct pglist_data * pgdat, enum node_stat_item item, long int delta);
$
$ pahole pglist_data | grep -B2 -A6 per_cpu_nodestats
	/* --- cacheline 2704 boundary (173056 bytes) --- */
	struct zone_padding        _pad2_;               /* 173056     0 */
	struct per_cpu_nodestat *  per_cpu_nodestats __attribute__((__aligned__(64))); /* 173056     8 */
	atomic_long_t              vm_stat[41];          /* 173064   328 */

	/* size: 173440, cachelines: 2710, members: 32 */
	/* sum members: 173309, holes: 6, sum holes: 83 */
	/* padding: 48 */
	/* forced alignments: 2 */
$
# perf annotate --stdio2 refresh_cpu_vm_stats
<SNIP>
refresh_cpu_vm_stats() /usr/lib/debug/lib/modules/5.18.17-200.fc36.x86_64/vmlinux
ffffffff812d29e0:
  static int refresh_cpu_vm_stats(bool do_pagesets)
  	struct pglist_data *pgdat;
  	struct zone *zone;
<SNIP>
  	for_each_online_pgdat(pgdat) {
<SNIP>
  1f3:   cmpxchg  %r8b,%gs:(%rdx)
      ↑  jne      1f3
         movsbl   %al,%edi
  	if (v) {
         test     %edi,%edi
      ↓  je       20b
  	atomic_long_add(v, &pgdat->vm_stat[i])
 * Update the zone counters for the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access node local memory.
 * The per cpu pagesets on remote zones are placed in the memory local to
 * the processor using that pageset. So the loop over all zones will access
 * a series of cachelines local to the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the stats in
 * the remote zone struct as well as the global cachelines with the global
 * counters. These could cause remote node cache line bouncing and will have
 * to be only done when necessary.
 *
 * The function returns the number of global counters updated.
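The scheme that comment describes, accumulating into a small node-local per-cpu diff and folding into the shared global counter only when a threshold is crossed, can be sketched in userspace with thread-local storage. The names and the threshold below are made up for illustration; the kernel uses per-cpu s8 diffs and per-counter thresholds:

#include <stdatomic.h>

/* The shared counter: every update to it can bounce its cache line
 * between nodes, so updates are batched. */
static atomic_long global_count;

/* Thread-local delta, playing the role of the kernel's per-cpu
 * vm_node_stat_diff: updates here touch only local memory. */
static _Thread_local long local_diff;

#define FOLD_THRESHOLD 32	/* arbitrary; the kernel tunes this per counter */

static void mod_state(long delta)
{
	local_diff += delta;
	/* Only when the batched delta grows past the threshold is it
	 * folded into the global counter, amortizing the contended
	 * atomic over many cheap local updates. */
	if (local_diff > FOLD_THRESHOLD || local_diff < -FOLD_THRESHOLD) {
		atomic_fetch_add_explicit(&global_count, local_diff,
					  memory_order_relaxed);
		local_diff = 0;
	}
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		mod_state(1);
	return atomic_load(&global_count) > 0 ? 0 : 1;
}

Whatever remains in local_diff has to be folded in periodically, and in the kernel that periodic fold is exactly the refresh_cpu_vm_stats() work that perf c2c caught contending above.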
$ wc -l tools/perf/util/bpf_skel/*.bpf.c
   191 tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
    78 tools/perf/util/bpf_skel/bperf_follower.bpf.c
    55 tools/perf/util/bpf_skel/bperf_leader.bpf.c
    92 tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c
   116 tools/perf/util/bpf_skel/func_latency.bpf.c
   383 tools/perf/util/bpf_skel/kwork_trace.bpf.c
   175 tools/perf/util/bpf_skel/lock_contention.bpf.c
   273 tools/perf/util/bpf_skel/off_cpu.bpf.c
  1363 total
$
$ sudo perf lock contention -b
^C
 contended   total wait     max wait     avg wait         type   caller

        42    192.67 us     13.64 us      4.59 us     spinlock   queue_work_on+0x20
        23     85.54 us     10.28 us      3.72 us     spinlock   worker_thread+0x14a
         6     13.92 us      6.51 us      2.32 us        mutex   kernfs_iop_permission+0x30
         3     11.59 us     10.04 us      3.86 us        mutex   kernfs_dop_revalidate+0x3c
         1      7.52 us      7.52 us      7.52 us     spinlock   kthread+0x115
         1      7.24 us      7.24 us      7.24 us     rwlock:W   sys_epoll_wait+0x148
         2      7.08 us      3.99 us      3.54 us     spinlock   delayed_work_timer_fn+0x1b
         1      6.41 us      6.41 us      6.41 us     spinlock   idle_balance+0xa06
         2      2.50 us      1.83 us      1.25 us        mutex   kernfs_iop_lookup+0x2f
         1      1.71 us      1.71 us      1.71 us        mutex   kernfs_iop_getattr+0x2c
 ...