linux-uconsole/kernel
Alexei Starovoitov e3bc64c9aa bpf: fix lockdep false positive in percpu_freelist
[ Upstream commit a89fac57b5 ]

Lockdep warns about false positive:
[   12.492084] 00000000e6b28347 (&head->lock){+...}, at: pcpu_freelist_push+0x2a/0x40
[   12.492696] but this lock was taken by another, HARDIRQ-safe lock in the past:
[   12.493275]  (&rq->lock){-.-.}
[   12.493276]
[   12.493276]
[   12.493276] and interrupts could create inverse lock ordering between them.
[   12.493276]
[   12.494435]
[   12.494435] other info that might help us debug this:
[   12.494979]  Possible interrupt unsafe locking scenario:
[   12.494979]
[   12.495518]        CPU0                    CPU1
[   12.495879]        ----                    ----
[   12.496243]   lock(&head->lock);
[   12.496502]                                local_irq_disable();
[   12.496969]                                lock(&rq->lock);
[   12.497431]                                lock(&head->lock);
[   12.497890]   <Interrupt>
[   12.498104]     lock(&rq->lock);
[   12.498368]
[   12.498368]  *** DEADLOCK ***
[   12.498368]
[   12.498837] 1 lock held by dd/276:
[   12.499110]  #0: 00000000c58cb2ee (rcu_read_lock){....}, at: trace_call_bpf+0x5e/0x240
[   12.499747]
[   12.499747] the shortest dependencies between 2nd lock and 1st lock:
[   12.500389]  -> (&rq->lock){-.-.} {
[   12.500669]     IN-HARDIRQ-W at:
[   12.500934]                       _raw_spin_lock+0x2f/0x40
[   12.501373]                       scheduler_tick+0x4c/0xf0
[   12.501812]                       update_process_times+0x40/0x50
[   12.502294]                       tick_periodic+0x27/0xb0
[   12.502723]                       tick_handle_periodic+0x1f/0x60
[   12.503203]                       timer_interrupt+0x11/0x20
[   12.503651]                       __handle_irq_event_percpu+0x43/0x2c0
[   12.504167]                       handle_irq_event_percpu+0x20/0x50
[   12.504674]                       handle_irq_event+0x37/0x60
[   12.505139]                       handle_level_irq+0xa7/0x120
[   12.505601]                       handle_irq+0xa1/0x150
[   12.506018]                       do_IRQ+0x77/0x140
[   12.506411]                       ret_from_intr+0x0/0x1d
[   12.506834]                       _raw_spin_unlock_irqrestore+0x53/0x60
[   12.507362]                       __setup_irq+0x481/0x730
[   12.507789]                       setup_irq+0x49/0x80
[   12.508195]                       hpet_time_init+0x21/0x32
[   12.508644]                       x86_late_time_init+0xb/0x16
[   12.509106]                       start_kernel+0x390/0x42a
[   12.509554]                       secondary_startup_64+0xa4/0xb0
[   12.510034]     IN-SOFTIRQ-W at:
[   12.510305]                       _raw_spin_lock+0x2f/0x40
[   12.510772]                       try_to_wake_up+0x1c7/0x4e0
[   12.511220]                       swake_up_locked+0x20/0x40
[   12.511657]                       swake_up_one+0x1a/0x30
[   12.512070]                       rcu_process_callbacks+0xc5/0x650
[   12.512553]                       __do_softirq+0xe6/0x47b
[   12.512978]                       irq_exit+0xc3/0xd0
[   12.513372]                       smp_apic_timer_interrupt+0xa9/0x250
[   12.513876]                       apic_timer_interrupt+0xf/0x20
[   12.514343]                       default_idle+0x1c/0x170
[   12.514765]                       do_idle+0x199/0x240
[   12.515159]                       cpu_startup_entry+0x19/0x20
[   12.515614]                       start_kernel+0x422/0x42a
[   12.516045]                       secondary_startup_64+0xa4/0xb0
[   12.516521]     INITIAL USE at:
[   12.516774]                      _raw_spin_lock_irqsave+0x38/0x50
[   12.517258]                      rq_attach_root+0x16/0xd0
[   12.517685]                      sched_init+0x2f2/0x3eb
[   12.518096]                      start_kernel+0x1fb/0x42a
[   12.518525]                      secondary_startup_64+0xa4/0xb0
[   12.518986]   }
[   12.519132]   ... key      at: [<ffffffff82b7bc28>] __key.71384+0x0/0x8
[   12.519649]   ... acquired at:
[   12.519892]    pcpu_freelist_pop+0x7b/0xd0
[   12.520221]    bpf_get_stackid+0x1d2/0x4d0
[   12.520563]    ___bpf_prog_run+0x8b4/0x11a0
[   12.520887]
[   12.521008] -> (&head->lock){+...} {
[   12.521292]    HARDIRQ-ON-W at:
[   12.521539]                     _raw_spin_lock+0x2f/0x40
[   12.521950]                     pcpu_freelist_push+0x2a/0x40
[   12.522396]                     bpf_get_stackid+0x494/0x4d0
[   12.522828]                     ___bpf_prog_run+0x8b4/0x11a0
[   12.523296]    INITIAL USE at:
[   12.523537]                    _raw_spin_lock+0x2f/0x40
[   12.523944]                    pcpu_freelist_populate+0xc0/0x120
[   12.524417]                    htab_map_alloc+0x405/0x500
[   12.524835]                    __do_sys_bpf+0x1a3/0x1a90
[   12.525253]                    do_syscall_64+0x4a/0x180
[   12.525659]                    entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   12.526167]  }
[   12.526311]  ... key      at: [<ffffffff838f7668>] __key.13130+0x0/0x8
[   12.526812]  ... acquired at:
[   12.527047]    __lock_acquire+0x521/0x1350
[   12.527371]    lock_acquire+0x98/0x190
[   12.527680]    _raw_spin_lock+0x2f/0x40
[   12.527994]    pcpu_freelist_push+0x2a/0x40
[   12.528325]    bpf_get_stackid+0x494/0x4d0
[   12.528645]    ___bpf_prog_run+0x8b4/0x11a0
[   12.528970]
[   12.529092]
[   12.529092] stack backtrace:
[   12.529444] CPU: 0 PID: 276 Comm: dd Not tainted 5.0.0-rc3-00018-g2fa53f892422 #475
[   12.530043] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
[   12.530750] Call Trace:
[   12.530948]  dump_stack+0x5f/0x8b
[   12.531248]  check_usage_backwards+0x10c/0x120
[   12.531598]  ? ___bpf_prog_run+0x8b4/0x11a0
[   12.531935]  ? mark_lock+0x382/0x560
[   12.532229]  mark_lock+0x382/0x560
[   12.532496]  ? print_shortest_lock_dependencies+0x180/0x180
[   12.532928]  __lock_acquire+0x521/0x1350
[   12.533271]  ? find_get_entry+0x17f/0x2e0
[   12.533586]  ? find_get_entry+0x19c/0x2e0
[   12.533902]  ? lock_acquire+0x98/0x190
[   12.534196]  lock_acquire+0x98/0x190
[   12.534482]  ? pcpu_freelist_push+0x2a/0x40
[   12.534810]  _raw_spin_lock+0x2f/0x40
[   12.535099]  ? pcpu_freelist_push+0x2a/0x40
[   12.535432]  pcpu_freelist_push+0x2a/0x40
[   12.535750]  bpf_get_stackid+0x494/0x4d0
[   12.536062]  ___bpf_prog_run+0x8b4/0x11a0

It has been explained that is a false positive here:
https://lkml.org/lkml/2018/7/25/756
Recap:
- stackmap uses pcpu_freelist
- The lock in pcpu_freelist is a percpu lock
- stackmap is only used by tracing bpf_prog
- A tracing bpf_prog cannot be run if another bpf_prog
  has already been running (ensured by the percpu bpf_prog_active counter).

Eric pointed out that this lockdep splats stops other
legit lockdep splats in selftests/bpf/test_progs.c.

Fix this by calling local_irq_save/restore for stackmap.

Another false positive had also been worked around by calling
local_irq_save in commit 89ad2fa3f0 ("bpf: fix lockdep splat").
That commit added unnecessary irq_save/restore to fast path of
bpf hash map. irqs are already disabled at that point, since htab
is holding per bucket spin_lock with irqsave.

Let's reduce overhead for htab by introducing __pcpu_freelist_push/pop
function w/o irqsave and convert pcpu_freelist_push/pop to irqsave
to be used elsewhere (right now only in stackmap).
It stops lockdep false positive in stackmap with a bit of acceptable overhead.

Fixes: 557c0c6e7d ("bpf: convert stackmap to pre-allocation")
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2019-03-13 14:02:36 -07:00
..
bpf bpf: fix lockdep false positive in percpu_freelist 2019-03-13 14:02:36 -07:00
cgroup cgroup: fix parsing empty mount option string 2019-02-12 19:47:17 +01:00
configs kconfig: tinyconfig: remove stale stack protector fixups 2018-06-15 07:15:28 +09:00
debug kdb: Don't back trace on a cpu that didn't round up 2019-02-12 19:47:19 +01:00
dma dma-direct: do not include SME mask in the DMA supported check 2019-01-13 09:51:05 +01:00
events perf core: Fix perf_proc_update_handler() bug 2019-03-13 14:02:26 -07:00
gcov gcov: remove CONFIG_GCOV_FORMAT_AUTODETECT 2018-06-08 18:56:02 +09:00
irq genirq: Make sure the initial affinity is not empty 2019-03-05 17:58:47 +01:00
livepatch Merge branch 'for-4.19/upstream' into for-linus 2018-08-20 18:33:50 +02:00
locking locking/rwsem: Fix (possible) missed wakeup 2019-03-05 17:58:49 +01:00
power Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input 2018-10-12 12:35:02 +02:00
printk printk: Fix panic caused by passing log_buf_len to command line 2018-11-13 11:08:48 -08:00
rcu srcu: Lock srcu_data structure in srcu_gp_start() 2019-01-13 09:51:06 +01:00
sched sched/wake_q: Fix wakeup ordering for wake_q 2019-03-05 17:58:49 +01:00
time timekeeping: Use proper seqcount initializer 2019-02-12 19:47:05 +01:00
trace tracing: Fix event filters and triggers to handle negative numbers 2019-03-10 07:17:20 +01:00
.gitignore
acct.c
async.c
audit.c audit: use ktime_get_coarse_real_ts64() for timestamps 2018-07-17 14:45:08 -04:00
audit.h
audit_fsnotify.c fsnotify: add fsnotify_add_inode_mark() wrappers 2018-05-18 14:58:22 +02:00
audit_tree.c \n 2018-08-17 09:41:28 -07:00
audit_watch.c audit: fix use-after-free in audit_add_watch 2018-07-18 11:43:36 -04:00
auditfilter.c audit: rename FILTER_TYPE to FILTER_EXCLUDE 2018-06-19 10:39:54 -04:00
auditsc.c audit/stable-4.18 PR 20180814 2018-08-15 10:46:54 -07:00
backtracetest.c
bounds.c kbuild: fix kernel/bounds.c 'W=1' warning 2018-11-13 11:08:47 -08:00
capability.c
compat.c time: Enable get/put_compat_itimerspec64 always 2018-06-24 14:39:47 +02:00
configs.c
context_tracking.c
cpu.c cpu/hotplug: Fix "SMT disabled by BIOS" detection for KVM 2019-02-12 19:47:25 +01:00
cpu_pm.c
crash_core.c kernel/crash_core.c: print timestamp using time64_t 2018-08-22 10:52:47 -07:00
crash_dump.c
cred.c
delayacct.c delayacct: Use raw_spinlocks 2018-04-27 14:34:51 +02:00
dma.c proc: introduce proc_create_single{,_data} 2018-05-16 07:23:35 +02:00
elfcore.c
exec_domain.c proc: introduce proc_create_single{,_data} 2018-05-16 07:23:35 +02:00
exit.c sched/wait: Fix rcuwait_wake_up() ordering 2019-03-05 17:58:49 +01:00
extable.c
fail_function.c bpf/error-inject/kprobes: Clear current_kprobe and enable preempt in kprobe 2018-06-21 12:33:19 +02:00
fork.c fork: record start_time late 2019-01-13 09:51:04 +01:00
freezer.c PM / reboot: Eliminate race between reboot and suspend 2018-08-06 12:35:20 +02:00
futex.c futex: Fix (possible) missed wakeup 2019-03-05 17:58:49 +01:00
futex_compat.c
groups.c
hung_task.c kernel/hung_task.c: force console verbose before panic 2019-02-12 19:47:19 +01:00
iomem.c memremap: split devm_memremap_pages() and memremap() infrastructure 2018-05-15 23:08:33 -07:00
irq_work.c
jump_label.c jump_label: Fix typo in warning message 2018-09-10 10:15:48 +02:00
kallsyms.c kallsyms, x86: Export addresses of PTI entry trampolines 2018-08-14 19:12:29 -03:00
kcmp.c
Kconfig.freezer
Kconfig.hz
Kconfig.locks
Kconfig.preempt kconfig: include kernel/Kconfig.preempt from init/Kconfig 2018-08-02 08:06:54 +09:00
kcov.c kernel/kcov.c: mark write_comp_data() as notrace 2019-02-12 19:47:20 +01:00
kexec.c kexec: add call to LSM hook in original kexec_load syscall 2018-07-16 12:31:57 -07:00
kexec_core.c kexec: yield to scheduler when loading kimage segments 2018-06-15 07:55:24 +09:00
kexec_file.c treewide: Use array_size() in vzalloc() 2018-06-12 16:19:22 -07:00
kexec_internal.h
kmod.c
kprobes.c kprobes: Return error if we fail to reuse kprobe instead of BUG_ON() 2018-11-13 11:08:28 -08:00
ksysfs.c
kthread.c Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip 2018-08-13 11:25:07 -07:00
latencytop.c
Makefile kbuild: move bin2c back to scripts/ from scripts/basic/ 2018-07-18 01:18:05 +09:00
memremap.c mm, devm_memremap_pages: add MEMORY_DEVICE_PRIVATE support 2019-01-13 09:51:04 +01:00
module-internal.h modsign: log module name in the event of an error 2018-07-02 11:36:17 +02:00
module.c kobject: return error code if writing /sys/.../uevent fails 2019-02-12 19:47:06 +01:00
module_signing.c modsign: log module name in the event of an error 2018-07-02 11:36:17 +02:00
notifier.c
nsproxy.c
padata.c
panic.c panic: avoid deadlocks in re-entrant console drivers 2018-12-29 13:37:57 +01:00
params.c
pid.c Fix failure path in alloc_pid() 2019-01-13 09:51:06 +01:00
pid_namespace.c
profile.c
ptrace.c ptrace: Remove unused ptrace_may_access_sched() and MODE_IBRS 2018-12-05 19:32:03 +01:00
range.c
reboot.c PM / reboot: Eliminate race between reboot and suspend 2018-08-06 12:35:20 +02:00
relay.c relay: check return of create_buf_file() properly 2019-03-13 14:02:35 -07:00
resource.c libnvdimm for 4.18 2018-06-08 17:21:52 -07:00
rseq.c rseq: uapi: Declare rseq_cs field as union, update includes 2018-07-10 22:18:52 +02:00
seccomp.c audit/stable-4.18 PR 20180605 2018-06-06 16:34:00 -07:00
signal.c signal: Restore the stop PTRACE_EVENT_EXIT 2019-02-20 10:25:48 +01:00
smp.c cpu/hotplug: Fix "SMT disabled by BIOS" detection for KVM 2019-02-12 19:47:25 +01:00
smpboot.c smpboot: Remove cpumask from the API 2018-07-03 09:20:44 +02:00
smpboot.h
softirq.c nohz: Fix missing tick reprogram when interrupting an inline softirq 2018-08-03 15:52:10 +02:00
stacktrace.c
stop_machine.c Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip 2018-08-13 11:25:07 -07:00
sys.c kernel/sys.c: remove duplicated include 2018-09-20 22:01:11 +02:00
sys_ni.c Merge branch 'core-rseq-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip 2018-06-10 10:17:09 -07:00
sysctl.c proc/sysctl: fix return error for proc_doulongvec_minmax() 2019-02-12 19:47:19 +01:00
sysctl_binary.c
task_work.c
taskstats.c
test_kprobes.c kprobes: Remove jprobe API implementation 2018-06-21 12:33:05 +02:00
torture.c torture: Keep old-school dmesg format 2018-06-25 11:30:10 -07:00
tracepoint.c tracepoint: Fix tracepoint array element size mismatch 2018-10-17 15:35:29 -04:00
tsacct.c
ucount.c
uid16.c
uid16.h
umh.c umh: fix race condition 2018-06-07 16:56:28 -04:00
up.c
user-return-notifier.c
user.c userns: use irqsave variant of refcount_dec_and_lock() 2018-08-22 10:52:47 -07:00
user_namespace.c userns: also map extents in the reverse map to kernel IDs 2018-11-13 11:09:00 -08:00
utsname.c
utsname_sysctl.c sys: don't hold uts_sem while accessing userspace memory 2018-08-11 02:05:53 -05:00
watchdog.c watchdog: Mark watchdog touch functions as notrace 2018-08-30 12:56:40 +02:00
watchdog_hld.c watchdog: Mark watchdog touch functions as notrace 2018-08-30 12:56:40 +02:00
workqueue.c watchdog: Mark watchdog touch functions as notrace 2018-08-30 12:56:40 +02:00
workqueue_internal.h workqueue: Set worker->desc to workqueue name by default 2018-05-18 08:47:13 -07:00