mirror of
https://github.com/rd-stuffs/msm-4.14.git
synced 2025-02-20 11:45:48 +08:00
-----BEGIN PGP SIGNATURE----- iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAl3jdnkACgkQONu9yGCS aT63nhAAnjTfLWAkeluVyLdyTRSoAedY21PJUtMYJVsULQcU886kRluCuz9Md/Um GVuZDTqlsXroH88xeiwKLsjt8bYOnjFwWQKoexi4MjtePkTkhmEAca3zl5CG7GH6 /e4VdkYvGMc9/+Lkv/+lBgZx7w+hoSEpEyAQ9NFFDDySEnzKpdg66e4fuSU54xhH 9sHAc/dpQX3a9+tgCs5V+tMczGqJH04MiX3zjS/SYvbI45QTOse4KVsZdxuHE0ip Ls4vEporl0C08bnZwyjFaJ7qT/j5vcXAGQ2ikqpegn/jSThG3rgPE1NB4+rUvLS6 3CRcu6CLuoYIpo/7BAzjZTPbYbgDwXqk3P9SSxSGHtt/Iy3nQ0Qt7J129IloHqm8 6mpqtM+D1xbxM/bi7C/16HAYmENos3HW5mv835yc8Xa7hi47FuQCNLY1cRYssDnE RxsCOni/im5Zp+rxbWmXGr0m/BZ7B2P5KdwXuUIeMVit2ROcDKy6DxZNH05RpjDp tTCqjSB27ubl6IfmvSsOD6JjHHNRqgvzsW8PVaSI/dx0jfiAOvn/tRoeHf/gNLQ4 SXVsYCpXyRgGCysABnYOT84ZioGpJABCQDDC6Tpoc8ikbGU7YA3Ju7vpGBl0qBAU 8S/Z0LHoGZFJtIVbJei3176QE/uASDqLctIR6FZMJuw+6pfXEZE= =cWv6 -----END PGP SIGNATURE----- Merge 4.14.157 into android-4.14 Changes in 4.14.157 net/mlx4_en: fix mlx4 ethtool -N insertion net: rtnetlink: prevent underflows in do_setvfinfo() sfc: Only cancel the PPS workqueue if it exists net/mlx5e: Fix set vf link state error flow net/mlxfw: Verify FSM error code translation doesn't exceed array size net/sched: act_pedit: fix WARN() in the traffic path vhost/vsock: split packets to send using multiple buffers gpio: max77620: Fixup debounce delays tools: gpio: Correctly add make dependencies for gpio_utils nbd:fix memory leak in nbd_get_socket() virtio_console: allocate inbufs in add_port() only if it is needed Revert "fs: ocfs2: fix possible null-pointer dereferences in ocfs2_xa_prepare_entry()" mm/ksm.c: don't WARN if page is still mapped in remove_stable_node() drm/i915/userptr: Try to acquire the page lock around set_page_dirty() platform/x86: asus-nb-wmi: Support ALS on the Zenbook UX430UQ platform/x86: asus-wmi: Only Tell EC the OS will handle display hotkeys from asus_nb_wmi mwifiex: Fix NL80211_TX_POWER_LIMITED ALSA: isight: fix leak of reference to firewire unit in error path of .probe callback printk: fix integer overflow in 
setup_log_buf() gfs2: Fix marking bitmaps non-full pty: fix compat ioctls synclink_gt(): fix compat_ioctl() powerpc: Fix signedness bug in update_flash_db() powerpc/boot: Disable vector instructions powerpc/eeh: Fix use of EEH_PE_KEEP on wrong field EDAC, thunderx: Fix memory leak in thunderx_l2c_threaded_isr() brcmsmac: AP mode: update beacon when TIM changes ath10k: allocate small size dma memory in ath10k_pci_diag_write_mem skd: fixup usage of legacy IO API cdrom: don't attempt to fiddle with cdo->capability spi: sh-msiof: fix deferred probing mmc: mediatek: fix cannot receive new request when msdc_cmd_is_ready fail btrfs: handle error of get_old_root gsmi: Fix bug in append_to_eventlog sysfs handler misc: mic: fix a DMA pool free failure w1: IAD Register is yet readable trough iad sys file. Fix snprintf (%u for unsigned, count for max size). m68k: fix command-line parsing when passed from u-boot RDMA/bnxt_re: Fix qp async event reporting pinctrl: sunxi: Fix a memory leak in 'sunxi_pinctrl_build_state()' pwm: lpss: Only set update bit if we are actually changing the settings amiflop: clean up on errors during setup qed: Align local and global PTT to propagate through the APIs. 
scsi: ips: fix missing break in switch KVM: nVMX: reset cache/shadows when switching loaded VMCS KVM/x86: Fix invvpid and invept register operand size in 64-bit mode scsi: isci: Use proper enumerated type in atapi_d2h_reg_frame_handler scsi: isci: Change sci_controller_start_task's return type to sci_status scsi: iscsi_tcp: Explicitly cast param in iscsi_sw_tcp_host_get_param crypto: ccree - avoid implicit enum conversion nvmet-fcloop: suppress a compiler warning clk: mmp2: fix the clock id for sdh2_clk and sdh3_clk clk: at91: audio-pll: fix audio pmc type ASoC: tegra_sgtl5000: fix device_node refcounting scsi: dc395x: fix dma API usage in srb_done scsi: dc395x: fix DMA API usage in sg_update_list net: dsa: mv88e6xxx: Fix 88E6141/6341 2500mbps SERDES speed net: fix warning in af_unix net: ena: Fix Kconfig dependency on X86 xfs: fix use-after-free race in xfs_buf_rele kprobes, x86/ptrace.h: Make regs_get_kernel_stack_nth() not fault on bad stack PM / Domains: Deal with multiple states but no governor in genpd ALSA: i2c/cs8427: Fix int to char conversion macintosh/windfarm_smu_sat: Fix debug output PCI: vmd: Detach resources after stopping root bus USB: misc: appledisplay: fix backlight update_status return code usbip: tools: fix atoi() on non-null terminated string dm raid: avoid bitmap with raid4/5/6 journal device SUNRPC: Fix a compile warning for cmpxchg64() sunrpc: safely reallow resvport min/max inversion atm: zatm: Fix empty body Clang warnings s390/perf: Return error when debug_register fails spi: omap2-mcspi: Set FIFO DMA trigger level to word length sparc: Fix parport build warnings. 
powerpc/pseries: Export raw per-CPU VPA data via debugfs ceph: fix dentry leak in ceph_readdir_prepopulate rtc: s35390a: Change buf's type to u8 in s35390a_init f2fs: fix to spread clear_cold_data() mISDN: Fix type of switch control variable in ctrl_teimanager qlcnic: fix a return in qlcnic_dcb_get_capability() net: ethernet: ti: cpsw: unsync mcast entries while switch promisc mode mfd: arizona: Correct calling of runtime_put_sync mfd: mc13xxx-core: Fix PMIC shutdown when reading ADC values mfd: intel_soc_pmic_bxtwc: Chain power button IRQs as well mfd: max8997: Enale irq-wakeup unconditionally selftests/ftrace: Fix to test kprobe $comm arg only if available selftests: watchdog: fix message when /dev/watchdog open fails selftests: watchdog: Fix error message. thermal: rcar_thermal: Prevent hardware access during system suspend bpf: devmap: fix wrong interface selection in notifier_call powerpc/process: Fix flush_all_to_thread for SPE sparc64: Rework xchg() definition to avoid warnings. arm64: lib: use C string functions with KASAN enabled fs/ocfs2/dlm/dlmdebug.c: fix a sleep-in-atomic-context bug in dlm_print_one_mle() mm/page-writeback.c: fix range_cyclic writeback vs writepages deadlock macsec: update operstate when lower device changes macsec: let the administrator set UP state even if lowerdev is down block: fix the DISCARD request merge i2c: uniphier-f: make driver robust against concurrency i2c: uniphier-f: fix occasional timeout error i2c: uniphier-f: fix race condition when IRQ is cleared um: Make line/tty semantics use true write IRQ vfs: avoid problematic remapping requests into partial EOF block powerpc/xmon: Relax frame size for clang selftests/powerpc/signal: Fix out-of-tree build selftests/powerpc/switch_endian: Fix out-of-tree build selftests/powerpc/cache_shape: Fix out-of-tree build linux/bitmap.h: handle constant zero-size bitmaps correctly linux/bitmap.h: fix type of nbits in bitmap_shift_right() hfsplus: fix BUG on bnode parent update hfs: fix 
BUG on bnode parent update hfsplus: prevent btree data loss on ENOSPC hfs: prevent btree data loss on ENOSPC hfsplus: fix return value of hfsplus_get_block() hfs: fix return value of hfs_get_block() hfsplus: update timestamps on truncate() hfs: update timestamp on truncate() fs/hfs/extent.c: fix array out of bounds read of array extent mm/memory_hotplug: make add_memory() take the device_hotplug_lock igb: shorten maximum PHC timecounter update interval net: hns3: bugfix for buffer not free problem during resetting ntb_netdev: fix sleep time mismatch ntb: intel: fix return value for ndev_vec_mask() arm64: makefile fix build of .i file in external module case ocfs2: don't put and assigning null to bh allocated outside ocfs2: fix clusters leak in ocfs2_defrag_extent() net: do not abort bulk send on BQL status sched/topology: Fix off by one bug sched/fair: Don't increase sd->balance_interval on newidle balance openvswitch: fix linking without CONFIG_NF_CONNTRACK_LABELS clk: sunxi-ng: enable so-said LDOs for A64 SoC's pll-mipi clock audit: print empty EXECVE args btrfs: avoid link error with CONFIG_NO_AUTO_INLINE wil6210: fix locking in wmi_call wlcore: Fix the return value in case of error in 'wlcore_vendor_cmd_smart_config_start()' rtl8xxxu: Fix missing break in switch brcmsmac: never log "tid x is not agg'able" by default wireless: airo: potential buffer overflow in sprintf() rtlwifi: rtl8192de: Fix misleading REG_MCUFWDL information net: dsa: bcm_sf2: Turn on PHY to allow successful registration scsi: mpt3sas: Fix Sync cache command failure during driver unload scsi: mpt3sas: Don't modify EEDPTagMode field setting on SAS3.5 HBA devices scsi: mpt3sas: Fix driver modifying persistent data in Manufacturing page11 scsi: megaraid_sas: Fix msleep granularity scsi: megaraid_sas: Fix goto labels in error handling scsi: lpfc: fcoe: Fix link down issue after 1000+ link bounces scsi: lpfc: Correct loss of fc4 type on remote port address change dlm: fix invalid free dlm: don't 
leak kernel pointer to userspace vrf: mark skb for multicast or link-local as enslaved to VRF ACPICA: Use %d for signed int print formatting instead of %u net: bcmgenet: return correct value 'ret' from bcmgenet_power_down of: unittest: allow base devicetree to have symbol metadata cfg80211: Prevent regulatory restore during STA disconnect in concurrent interfaces pinctrl: qcom: spmi-gpio: fix gpio-hog related boot issues pinctrl: lpc18xx: Use define directive for PIN_CONFIG_GPIO_PIN_INT pinctrl: zynq: Use define directive for PIN_CONFIG_IO_STANDARD PCI: keystone: Use quirk to limit MRRS for K2G spi: omap2-mcspi: Fix DMA and FIFO event trigger size mismatch i2c: uniphier-f: fix timeout error after reading 8 bytes mm/memory_hotplug: Do not unlock when fails to take the device_hotplug_lock ipv6: Fix handling of LLA with VRF and sockets bound to VRF cfg80211: call disconnect_wk when AP stops Bluetooth: Fix invalid-free in bcsp_close() KVM: MMU: Do not treat ZONE_DEVICE pages as being reserved ath10k: Fix a NULL-ptr-deref bug in ath10k_usb_alloc_urb_from_pipe ath9k_hw: fix uninitialized variable data md/raid10: prevent access of uninitialized resync_pages offset mm/memory_hotplug: don't access uninitialized memmaps in shrink_zone_span() net: phy: dp83867: fix speed 10 in sgmii mode net: phy: dp83867: increase SGMII autoneg timer duration arm64: fix for bad_mode() handler to always result in panic cpufreq: Skip cpufreq resume if it's not suspended ocfs2: remove ocfs2_is_o2cb_active() ARM: 8904/1: skip nomap memblocks while finding the lowmem/highmem boundary ARC: perf: Accommodate big-endian CPU x86/insn: Fix awk regexp warnings x86/speculation: Fix incorrect MDS/TAA mitigation status x86/speculation: Fix redundant MDS mitigation message nbd: prevent memory leak nfc: port100: handle command failure cleanly media: vivid: Set vid_cap_streaming and vid_out_streaming to true media: vivid: Fix wrong locking that causes race conditions on streaming stop media: usbvision: Fix 
races among open, close, and disconnect cpufreq: Add NULL checks to show() and store() methods of cpufreq media: uvcvideo: Fix error path in control parsing failure media: b2c2-flexcop-usb: add sanity checking media: cxusb: detect cxusb_ctrl_msg error in query media: imon: invalid dereference in imon_touch_event virtio_ring: fix return code on DMA mapping fails usbip: tools: fix fd leakage in the function of read_attr_usbip_status usbip: Fix uninitialized symbol 'nents' in stub_recv_cmd_submit() usb-serial: cp201x: support Mark-10 digital force gauge USB: chaoskey: fix error case of a timeout appledisplay: fix error handling in the scheduled work USB: serial: mos7840: add USB ID to support Moxa UPort 2210 USB: serial: mos7720: fix remote wakeup USB: serial: mos7840: fix remote wakeup USB: serial: option: add support for DW5821e with eSIM support USB: serial: option: add support for Foxconn T77W968 LTE modules staging: comedi: usbduxfast: usbduxfast_ai_cmdtest rounding error powerpc/64s: support nospectre_v2 cmdline option powerpc/book3s64: Fix link stack flush on context switch KVM: PPC: Book3S HV: Flush link stack on guest exit to host kernel x86/hyperv: mark hyperv_init as __init function Linux 4.14.157 Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
421 lines
13 KiB
C
421 lines
13 KiB
C
/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of version 2 of the GNU General Public
|
|
* License as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful, but
|
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*/
|
|
|
|
/* Devmap's primary use is as a backend map for the XDP BPF helper call
|
|
* bpf_redirect_map(). Because XDP is mostly concerned with performance we
|
|
* spent some effort to ensure the datapath with redirect maps does not use
|
|
* any locking. This is a quick note on the details.
|
|
*
|
|
* We have three possible paths to get into the devmap control plane: bpf
|
|
* syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
|
|
* will invoke an update, delete, or lookup operation. To ensure updates and
|
|
* deletes appear atomic from the datapath side xchg() is used to modify the
|
|
* netdev_map array. Then because the datapath does a lookup into the netdev_map
|
|
* array (read-only) from an RCU critical section we use call_rcu() to wait for
|
|
* an rcu grace period before free'ing the old data structures. This ensures the
|
|
* datapath always has a valid copy. However, the datapath does a "flush"
|
|
* operation that pushes any pending packets in the driver outside the RCU
|
|
* critical section. Each bpf_dtab_netdev tracks these pending operations using
|
|
* an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
|
|
* until all bits are cleared indicating outstanding flush operations have
|
|
* completed.
|
|
*
|
|
* BPF syscalls may race with BPF program calls on any of the update, delete
|
|
* or lookup operations. As noted above the xchg() operation also keeps the
|
|
* netdev_map consistent in this case. From the devmap side BPF programs
|
|
* calling into these operations are the same as multiple user space threads
|
|
* making system calls.
|
|
*
|
|
* Finally, any of the above may race with a netdev_unregister notifier. The
|
|
* unregister notifier must search for net devices in the map structure that
|
|
* contain a reference to the net device and remove them. This is a two step
|
|
* process (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
|
|
* check to see if the ifindex is the same as the net_device being removed.
|
|
* When removing the dev a cmpxchg() is used to ensure the correct dev is
|
|
* removed; in the case of a concurrent update or delete operation it is
|
|
* possible that the initially referenced dev is no longer in the map. As the
|
|
* notifier hook walks the map we know that new dev references can not be
|
|
* added by the user because core infrastructure ensures dev_get_by_index()
|
|
* calls will fail at this point.
|
|
*/
|
|
#include <linux/bpf.h>
|
|
#include <linux/filter.h>
|
|
|
|
/* The only map_flags bits dev_map_alloc() accepts at creation time. */
#define DEV_CREATE_FLAG_MASK \
	(BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_NUMA_NODE)
|
|
|
|
struct bpf_dtab_netdev {
|
|
struct net_device *dev;
|
|
struct bpf_dtab *dtab;
|
|
unsigned int bit;
|
|
struct rcu_head rcu;
|
|
};
|
|
|
|
struct bpf_dtab {
|
|
struct bpf_map map;
|
|
struct bpf_dtab_netdev **netdev_map;
|
|
unsigned long __percpu *flush_needed;
|
|
struct list_head list;
|
|
};
|
|
|
|
/* dev_map_list holds every live devmap; writers add and remove entries
 * under dev_map_lock, the netdevice notifier walks it under RCU.
 */
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
|
|
|
|
static u64 dev_map_bitmap_size(const union bpf_attr *attr)
|
|
{
|
|
return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
|
|
}
|
|
|
|
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
|
|
{
|
|
struct bpf_dtab *dtab;
|
|
int err = -EINVAL;
|
|
u64 cost;
|
|
|
|
if (!capable(CAP_NET_ADMIN))
|
|
return ERR_PTR(-EPERM);
|
|
|
|
/* check sanity of attributes */
|
|
if (attr->max_entries == 0 || attr->key_size != 4 ||
|
|
attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
dtab = kzalloc(sizeof(*dtab), GFP_USER);
|
|
if (!dtab)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
/* mandatory map attributes */
|
|
dtab->map.map_type = attr->map_type;
|
|
dtab->map.key_size = attr->key_size;
|
|
dtab->map.value_size = attr->value_size;
|
|
dtab->map.max_entries = attr->max_entries;
|
|
dtab->map.map_flags = attr->map_flags;
|
|
dtab->map.numa_node = bpf_map_attr_numa_node(attr);
|
|
|
|
/* make sure page count doesn't overflow */
|
|
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
|
|
cost += dev_map_bitmap_size(attr) * num_possible_cpus();
|
|
if (cost >= U32_MAX - PAGE_SIZE)
|
|
goto free_dtab;
|
|
|
|
dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
|
|
|
|
/* if map size is larger than memlock limit, reject it early */
|
|
err = bpf_map_precharge_memlock(dtab->map.pages);
|
|
if (err)
|
|
goto free_dtab;
|
|
|
|
err = -ENOMEM;
|
|
|
|
/* A per cpu bitfield with a bit per possible net device */
|
|
dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
|
|
__alignof__(unsigned long),
|
|
GFP_KERNEL | __GFP_NOWARN);
|
|
if (!dtab->flush_needed)
|
|
goto free_dtab;
|
|
|
|
dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
|
|
sizeof(struct bpf_dtab_netdev *),
|
|
dtab->map.numa_node);
|
|
if (!dtab->netdev_map)
|
|
goto free_dtab;
|
|
|
|
spin_lock(&dev_map_lock);
|
|
list_add_tail_rcu(&dtab->list, &dev_map_list);
|
|
spin_unlock(&dev_map_lock);
|
|
|
|
return &dtab->map;
|
|
free_dtab:
|
|
free_percpu(dtab->flush_needed);
|
|
kfree(dtab);
|
|
return ERR_PTR(err);
|
|
}
|
|
|
|
static void dev_map_free(struct bpf_map *map)
|
|
{
|
|
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
|
|
int i, cpu;
|
|
|
|
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
|
|
* so the programs (can be more than one that used this map) were
|
|
* disconnected from events. Wait for outstanding critical sections in
|
|
* these programs to complete. The rcu critical section only guarantees
|
|
* no further reads against netdev_map. It does __not__ ensure pending
|
|
* flush operations (if any) are complete.
|
|
*/
|
|
|
|
spin_lock(&dev_map_lock);
|
|
list_del_rcu(&dtab->list);
|
|
spin_unlock(&dev_map_lock);
|
|
|
|
synchronize_rcu();
|
|
|
|
/* Make sure prior __dev_map_entry_free() have completed. */
|
|
rcu_barrier();
|
|
|
|
/* To ensure all pending flush operations have completed wait for flush
|
|
* bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
|
|
* Because the above synchronize_rcu() ensures the map is disconnected
|
|
* from the program we can assume no new bits will be set.
|
|
*/
|
|
for_each_online_cpu(cpu) {
|
|
unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
|
|
|
|
while (!bitmap_empty(bitmap, dtab->map.max_entries))
|
|
cond_resched();
|
|
}
|
|
|
|
for (i = 0; i < dtab->map.max_entries; i++) {
|
|
struct bpf_dtab_netdev *dev;
|
|
|
|
dev = dtab->netdev_map[i];
|
|
if (!dev)
|
|
continue;
|
|
|
|
dev_put(dev->dev);
|
|
kfree(dev);
|
|
}
|
|
|
|
free_percpu(dtab->flush_needed);
|
|
bpf_map_area_free(dtab->netdev_map);
|
|
kfree(dtab);
|
|
}
|
|
|
|
static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
|
|
{
|
|
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
|
|
u32 index = key ? *(u32 *)key : U32_MAX;
|
|
u32 *next = next_key;
|
|
|
|
if (index >= dtab->map.max_entries) {
|
|
*next = 0;
|
|
return 0;
|
|
}
|
|
|
|
if (index == dtab->map.max_entries - 1)
|
|
return -ENOENT;
|
|
*next = index + 1;
|
|
return 0;
|
|
}
|
|
|
|
/* Record on the current cpu that slot @bit of @map has a flush pending. */
void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);

	__set_bit(bit, this_cpu_ptr(dtab->flush_needed));
}
|
|
|
|
/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
|
|
* from the driver before returning from its napi->poll() routine. The poll()
|
|
* routine is called either from busy_poll context or net_rx_action signaled
|
|
* from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
|
|
* net device can be torn down. On devmap tear down we ensure the ctx bitmap
|
|
* is zeroed before completing to ensure all flush operations have completed.
|
|
*/
|
|
void __dev_map_flush(struct bpf_map *map)
|
|
{
|
|
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
|
|
unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
|
|
u32 bit;
|
|
|
|
for_each_set_bit(bit, bitmap, map->max_entries) {
|
|
struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
|
|
struct net_device *netdev;
|
|
|
|
/* This is possible if the dev entry is removed by user space
|
|
* between xdp redirect and flush op.
|
|
*/
|
|
if (unlikely(!dev))
|
|
continue;
|
|
|
|
__clear_bit(bit, bitmap);
|
|
netdev = dev->dev;
|
|
if (likely(netdev->netdev_ops->ndo_xdp_flush))
|
|
netdev->netdev_ops->ndo_xdp_flush(netdev);
|
|
}
|
|
}
|
|
|
|
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
|
|
* update happens in parallel here a dev_put wont happen until after reading the
|
|
* ifindex.
|
|
*/
|
|
struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
|
|
{
|
|
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
|
|
struct bpf_dtab_netdev *dev;
|
|
|
|
if (key >= map->max_entries)
|
|
return NULL;
|
|
|
|
dev = READ_ONCE(dtab->netdev_map[key]);
|
|
return dev ? dev->dev : NULL;
|
|
}
|
|
|
|
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
|
|
{
|
|
struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
|
|
|
|
return dev ? &dev->ifindex : NULL;
|
|
}
|
|
|
|
static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
|
|
{
|
|
if (dev->dev->netdev_ops->ndo_xdp_flush) {
|
|
struct net_device *fl = dev->dev;
|
|
unsigned long *bitmap;
|
|
int cpu;
|
|
|
|
for_each_online_cpu(cpu) {
|
|
bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
|
|
__clear_bit(dev->bit, bitmap);
|
|
|
|
fl->netdev_ops->ndo_xdp_flush(dev->dev);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void __dev_map_entry_free(struct rcu_head *rcu)
|
|
{
|
|
struct bpf_dtab_netdev *dev;
|
|
|
|
dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
|
|
dev_map_flush_old(dev);
|
|
dev_put(dev->dev);
|
|
kfree(dev);
|
|
}
|
|
|
|
static int dev_map_delete_elem(struct bpf_map *map, void *key)
|
|
{
|
|
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
|
|
struct bpf_dtab_netdev *old_dev;
|
|
int k = *(u32 *)key;
|
|
|
|
if (k >= map->max_entries)
|
|
return -EINVAL;
|
|
|
|
/* Use call_rcu() here to ensure any rcu critical sections have
|
|
* completed, but this does not guarantee a flush has happened
|
|
* yet. Because driver side rcu_read_lock/unlock only protects the
|
|
* running XDP program. However, for pending flush operations the
|
|
* dev and ctx are stored in another per cpu map. And additionally,
|
|
* the driver tear down ensures all soft irqs are complete before
|
|
* removing the net device in the case of dev_put equals zero.
|
|
*/
|
|
old_dev = xchg(&dtab->netdev_map[k], NULL);
|
|
if (old_dev)
|
|
call_rcu(&old_dev->rcu, __dev_map_entry_free);
|
|
return 0;
|
|
}
|
|
|
|
static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
|
|
u64 map_flags)
|
|
{
|
|
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
|
|
struct net *net = current->nsproxy->net_ns;
|
|
struct bpf_dtab_netdev *dev, *old_dev;
|
|
u32 i = *(u32 *)key;
|
|
u32 ifindex = *(u32 *)value;
|
|
|
|
if (unlikely(map_flags > BPF_EXIST))
|
|
return -EINVAL;
|
|
if (unlikely(i >= dtab->map.max_entries))
|
|
return -E2BIG;
|
|
if (unlikely(map_flags == BPF_NOEXIST))
|
|
return -EEXIST;
|
|
|
|
if (!ifindex) {
|
|
dev = NULL;
|
|
} else {
|
|
dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
|
|
map->numa_node);
|
|
if (!dev)
|
|
return -ENOMEM;
|
|
|
|
dev->dev = dev_get_by_index(net, ifindex);
|
|
if (!dev->dev) {
|
|
kfree(dev);
|
|
return -EINVAL;
|
|
}
|
|
|
|
dev->bit = i;
|
|
dev->dtab = dtab;
|
|
}
|
|
|
|
/* Use call_rcu() here to ensure rcu critical sections have completed
|
|
* Remembering the driver side flush operation will happen before the
|
|
* net device is removed.
|
|
*/
|
|
old_dev = xchg(&dtab->netdev_map[i], dev);
|
|
if (old_dev)
|
|
call_rcu(&old_dev->rcu, __dev_map_entry_free);
|
|
|
|
return 0;
|
|
}
|
|
|
|
const struct bpf_map_ops dev_map_ops = {
|
|
.map_alloc = dev_map_alloc,
|
|
.map_free = dev_map_free,
|
|
.map_get_next_key = dev_map_get_next_key,
|
|
.map_lookup_elem = dev_map_lookup_elem,
|
|
.map_update_elem = dev_map_update_elem,
|
|
.map_delete_elem = dev_map_delete_elem,
|
|
};
|
|
|
|
static int dev_map_notification(struct notifier_block *notifier,
|
|
ulong event, void *ptr)
|
|
{
|
|
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
|
|
struct bpf_dtab *dtab;
|
|
int i;
|
|
|
|
switch (event) {
|
|
case NETDEV_UNREGISTER:
|
|
/* This rcu_read_lock/unlock pair is needed because
|
|
* dev_map_list is an RCU list AND to ensure a delete
|
|
* operation does not free a netdev_map entry while we
|
|
* are comparing it against the netdev being unregistered.
|
|
*/
|
|
rcu_read_lock();
|
|
list_for_each_entry_rcu(dtab, &dev_map_list, list) {
|
|
for (i = 0; i < dtab->map.max_entries; i++) {
|
|
struct bpf_dtab_netdev *dev, *odev;
|
|
|
|
dev = READ_ONCE(dtab->netdev_map[i]);
|
|
if (!dev || netdev != dev->dev)
|
|
continue;
|
|
odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
|
|
if (dev == odev)
|
|
call_rcu(&dev->rcu,
|
|
__dev_map_entry_free);
|
|
}
|
|
}
|
|
rcu_read_unlock();
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
return NOTIFY_OK;
|
|
}
|
|
|
|
static struct notifier_block dev_map_notifier = {
|
|
.notifier_call = dev_map_notification,
|
|
};
|
|
|
|
/* Register the devmap netdevice notifier; run as a subsys initcall.
 *
 * Returns the result of register_netdevice_notifier() so that a failed
 * registration is reported instead of being silently dropped.
 */
static int __init dev_map_init(void)
{
	return register_netdevice_notifier(&dev_map_notifier);
}

subsys_initcall(dev_map_init);
|