msm-4.14/net/ipv4/inet_fragment.c
Blagovest Kolenichev dc1d03db8d Merge android-4.14.114 (c680586) into msm-4.14
* refs/heads/tmp-c680586:
  dm: Restore reverted changes
  Linux 4.14.114
  kernel/sysctl.c: fix out-of-bounds access when setting file-max
  Revert "locking/lockdep: Add debug_locks check in __lock_downgrade()"
  i2c-hid: properly terminate i2c_hid_dmi_desc_override_table[] array
  xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute
  xfs: add the ability to join a held buffer to a defer_ops
  iomap: report collisions between directio and buffered writes to userspace
  tools include: Adopt linux/bits.h
  percpu: stop printing kernel addresses
  ALSA: info: Fix racy addition/deletion of nodes
  mm/vmstat.c: fix /proc/vmstat format for CONFIG_DEBUG_TLBFLUSH=y CONFIG_SMP=n
  device_cgroup: fix RCU imbalance in error case
  sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup
  Revert "kbuild: use -Oz instead of -Os when using clang"
  net: IP6 defrag: use rbtrees in nf_conntrack_reasm.c
  net: IP6 defrag: use rbtrees for IPv6 defrag
  ipv6: remove dependency of nf_defrag_ipv6 on ipv6 module
  net: IP defrag: encapsulate rbtree defrag code into callable functions
  ipv6: frags: fix a lockdep false positive
  tpm/tpm_i2c_atmel: Return -E2BIG when the transfer is incomplete
  modpost: file2alias: check prototype of handler
  modpost: file2alias: go back to simple devtable lookup
  mmc: sdhci: Handle auto-command errors
  mmc: sdhci: Rename SDHCI_ACMD12_ERR and SDHCI_INT_ACMD12ERR
  mmc: sdhci: Fix data command CRC error handling
  crypto: crypto4xx - properly set IV after de- and encrypt
  x86/speculation: Prevent deadlock on ssb_state::lock
  perf/x86: Fix incorrect PEBS_REGS
  x86/cpu/bugs: Use __initconst for 'const' init data
  perf/x86/amd: Add event map for AMD Family 17h
  mac80211: do not call driver wake_tx_queue op during reconfig
  rt2x00: do not increment sequence number while re-transmitting
  kprobes: Fix error check when reusing optimized probes
  kprobes: Mark ftrace mcount handler functions nokprobe
  x86/kprobes: Verify stack frame on kretprobe
  arm64: futex: Restore oldval initialization to work around buggy compilers
  crypto: x86/poly1305 - fix overflow during partial reduction
  coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping
  Revert "svm: Fix AVIC incomplete IPI emulation"
  Revert "scsi: fcoe: clear FC_RP_STARTED flags when receiving a LOGO"
  scsi: core: set result when the command cannot be dispatched
  ALSA: core: Fix card races between register and disconnect
  ALSA: hda/realtek - add two more pin configuration sets to quirk table
  staging: comedi: ni_usb6501: Fix possible double-free of ->usb_rx_buf
  staging: comedi: ni_usb6501: Fix use of uninitialized mutex
  staging: comedi: vmk80xx: Fix possible double-free of ->usb_rx_buf
  staging: comedi: vmk80xx: Fix use of uninitialized semaphore
  io: accel: kxcjk1013: restore the range after resume.
  iio: core: fix a possible circular locking dependency
  iio: adc: at91: disable adc channel interrupt in timeout case
  iio: Fix scan mask selection
  iio: dac: mcp4725: add missing powerdown bits in store eeprom
  iio: ad_sigma_delta: select channel when reading register
  iio: cros_ec: Fix the maths for gyro scale calculation
  iio/gyro/bmg160: Use millidegrees for temperature scale
  iio: gyro: mpu3050: fix chip ID reading
  staging: iio: ad7192: Fix ad7193 channel address
  Staging: iio: meter: fixed typo
  KVM: x86: svm: make sure NMI is injected after nmi_singlestep
  KVM: x86: Don't clear EFER during SMM transitions for 32-bit vCPU
  CIFS: keep FileInfo handle live during oplock break
  net: thunderx: don't allow jumbo frames with XDP
  net: thunderx: raise XDP MTU to 1508
  ipv4: ensure rcu_read_lock() in ipv4_link_failure()
  ipv4: recompile ip options in ipv4_link_failure
  vhost: reject zero size iova range
  team: set slave to promisc if team is already in promisc mode
  tcp: tcp_grow_window() needs to respect tcp_space()
  net: fou: do not use guehdr after iptunnel_pull_offloads in gue_udp_recv
  net: bridge: multicast: use rcu to access port list from br_multicast_start_querier
  net: bridge: fix per-port af_packet sockets
  net: atm: Fix potential Spectre v1 vulnerabilities
  bonding: fix event handling for stacked bonds
  ANDROID: cuttlefish_defconfig: Enable CONFIG_XFRM_STATISTICS
  Linux 4.14.113
  appletalk: Fix compile regression
  mm: hide incomplete nr_indirectly_reclaimable in sysfs
  net: stmmac: Set dma ring length before enabling the DMA
  bpf: Fix selftests are changes for CVE 2019-7308
  bpf: fix sanitation rewrite in case of non-pointers
  bpf: do not restore dst_reg when cur_state is freed
  bpf: fix inner map masking to prevent oob under speculation
  bpf: fix sanitation of alu op with pointer / scalar type from different paths
  bpf: prevent out of bounds speculation on pointer arithmetic
  bpf: fix check_map_access smin_value test when pointer contains offset
  bpf: restrict unknown scalars of mixed signed bounds for unprivileged
  bpf: restrict stack pointer arithmetic for unprivileged
  bpf: restrict map value pointer arithmetic for unprivileged
  bpf: enable access to ax register also from verifier rewrite
  bpf: move tmp variable into ax register in interpreter
  bpf: move {prev_,}insn_idx into verifier env
  bpf: fix stack state printing in verifier log
  bpf: fix verifier NULL pointer dereference
  bpf: fix verifier memory leaks
  bpf: reduce verifier memory consumption
  dm: disable CRYPTO_TFM_REQ_MAY_SLEEP to fix a GFP_KERNEL recursion deadlock
  bpf: fix use after free in bpf_evict_inode
  include/linux/swap.h: use offsetof() instead of custom __swapoffset macro
  lib/div64.c: off by one in shift
  appletalk: Fix use-after-free in atalk_proc_exit
  drm/amdkfd: use init_mqd function to allocate object for hid_mqd (CI)
  ARM: 8839/1: kprobe: make patch_lock a raw_spinlock_t
  drm/nouveau/volt/gf117: fix speedo readout register
  coresight: cpu-debug: Support for CA73 CPUs
  Revert "ACPI / EC: Remove old CLEAR_ON_RESUME quirk"
  crypto: axis - fix for recursive locking from bottom half
  drm/panel: panel-innolux: set display off in innolux_panel_unprepare
  lkdtm: Add tests for NULL pointer dereference
  lkdtm: Print real addresses
  soc/tegra: pmc: Drop locking from tegra_powergate_is_powered()
  iommu/dmar: Fix buffer overflow during PCI bus notification
  crypto: sha512/arm - fix crash bug in Thumb2 build
  crypto: sha256/arm - fix crash bug in Thumb2 build
  kernel: hung_task.c: disable on suspend
  cifs: fallback to older infolevels on findfirst queryinfo retry
  compiler.h: update definition of unreachable()
  KVM: nVMX: restore host state in nested_vmx_vmexit for VMFail
  ACPI / SBS: Fix GPE storm on recent MacBookPro's
  usbip: fix vhci_hcd controller counting
  ARM: samsung: Limit SAMSUNG_PM_CHECK config option to non-Exynos platforms
  HID: i2c-hid: override HID descriptors for certain devices
  media: au0828: cannot kfree dev before usb disconnect
  powerpc/pseries: Remove prrn_work workqueue
  serial: uartps: console_setup() can't be placed to init section
  netfilter: xt_cgroup: shrink size of v2 path
  f2fs: fix to do sanity check with current segment number
  9p locks: add mount option for lock retry interval
  9p: do not trust pdu content for stat item size
  rsi: improve kernel thread handling to fix kernel panic
  gpio: pxa: handle corner case of unprobed device
  ext4: prohibit fstrim in norecovery mode
  fix incorrect error code mapping for OBJECTID_NOT_FOUND
  x86/hw_breakpoints: Make default case in hw_breakpoint_arch_parse() return an error
  iommu/vt-d: Check capability before disabling protected memory
  drm/nouveau/debugfs: Fix check of pm_runtime_get_sync failure
  x86/cpu/cyrix: Use correct macros for Cyrix calls on Geode processors
  x86/hpet: Prevent potential NULL pointer dereference
  irqchip/mbigen: Don't clear eventid when freeing an MSI
  perf tests: Fix a memory leak in test__perf_evsel__tp_sched_test()
  perf tests: Fix memory leak by expr__find_other() in test__expr()
  perf tests: Fix a memory leak of cpu_map object in the openat_syscall_event_on_all_cpus test
  perf evsel: Free evsel->counts in perf_evsel__exit()
  perf hist: Add missing map__put() in error case
  perf top: Fix error handling in cmd_top()
  perf build-id: Fix memory leak in print_sdt_events()
  perf config: Fix a memory leak in collect_config()
  perf config: Fix an error in the config template documentation
  perf list: Don't forget to drop the reference to the allocated thread_map
  tools/power turbostat: return the exit status of a command
  x86/mm: Don't leak kernel addresses
  scsi: iscsi: flush running unbind operations when removing a session
  thermal/intel_powerclamp: fix truncated kthread name
  thermal/int340x_thermal: fix mode setting
  thermal/int340x_thermal: Add additional UUIDs
  thermal: bcm2835: Fix crash in bcm2835_thermal_debugfs
  thermal/intel_powerclamp: fix __percpu declaration of worker_data
  ALSA: opl3: fix mismatch between snd_opl3_drum_switch definition and declaration
  mmc: davinci: remove extraneous __init annotation
  IB/mlx4: Fix race condition between catas error reset and aliasguid flows
  auxdisplay: hd44780: Fix memory leak on ->remove()
  ALSA: sb8: add a check for request_region
  ALSA: echoaudio: add a check for ioremap_nocache
  ext4: report real fs size after failed resize
  ext4: add missing brelse() in add_new_gdb_meta_bg()
  perf/core: Restore mmap record type correctly
  arc: hsdk_defconfig: Enable CONFIG_BLK_DEV_RAM
  ARC: u-boot args: check that magic number is correct
  ANDROID: cuttlefish_defconfig: Enable L2TP/PPTP
  ANDROID: Makefile: Properly resolve 4.14.112 merge
  Make arm64 serial port config compatible with crosvm
  Linux 4.14.112
  arm64: dts: rockchip: Fix vcc_host1_5v GPIO polarity on rk3328-rock64
  arm64: dts: rockchip: fix vcc_host1_5v pin assign on rk3328-rock64
  dm table: propagate BDI_CAP_STABLE_WRITES to fix sporadic checksum errors
  PCI: Add function 1 DMA alias quirk for Marvell 9170 SATA controller
  x86/perf/amd: Remove need to check "running" bit in NMI handler
  x86/perf/amd: Resolve NMI latency issues for active PMCs
  x86/perf/amd: Resolve race condition when disabling PMC
  xtensa: fix return_address
  sched/fair: Do not re-read ->h_load_next during hierarchical load calculation
  xen: Prevent buffer overflow in privcmd ioctl
  arm64: backtrace: Don't bother trying to unwind the userspace stack
  arm64: dts: rockchip: fix rk3328 rgmii high tx error rate
  arm64: futex: Fix FUTEX_WAKE_OP atomic ops with non-zero result value
  ARM: dts: at91: Fix typo in ISC_D0 on PC9
  ARM: dts: am335x-evm: Correct the regulators for the audio codec
  ARM: dts: am335x-evmsk: Correct the regulators for the audio codec
  virtio: Honour 'may_reduce_num' in vring_create_virtqueue
  genirq: Initialize request_mutex if CONFIG_SPARSE_IRQ=n
  genirq: Respect IRQCHIP_SKIP_SET_WAKE in irq_chip_set_wake_parent()
  block: fix the return errno for direct IO
  block: do not leak memory in bio_copy_user_iov()
  btrfs: prop: fix vanished compression property after failed set
  btrfs: prop: fix zstd compression parameter validation
  Btrfs: do not allow trimming when a fs is mounted with the nologreplay option
  ASoC: fsl_esai: fix channel swap issue when stream starts
  include/linux/bitrev.h: fix constant bitrev
  drm/udl: add a release method and delay modeset teardown
  alarmtimer: Return correct remaining time
  parisc: regs_return_value() should return gpr28
  parisc: Detect QEMU earlier in boot process
  arm64: dts: rockchip: fix rk3328 sdmmc0 write errors
  hv_netvsc: Fix unwanted wakeup after tx_disable
  ip6_tunnel: Match to ARPHRD_TUNNEL6 for dev type
  ALSA: seq: Fix OOB-reads from strlcpy
  net: ethtool: not call vzalloc for zero sized memory request
  netns: provide pure entropy for net_hash_mix()
  net/sched: act_sample: fix divide by zero in the traffic path
  bnxt_en: Reset device on RX buffer errors.
  bnxt_en: Improve RX consumer index validity check.
  nfp: validate the return code from dev_queue_xmit()
  net/mlx5e: Add a lock on tir list
  net/mlx5e: Fix error handling when refreshing TIRs
  vrf: check accept_source_route on the original netdevice
  tcp: Ensure DCTCP reacts to losses
  sctp: initialize _pad of sockaddr_in before copying to user memory
  qmi_wwan: add Olicard 600
  openvswitch: fix flow actions reallocation
  net/sched: fix ->get helper of the matchall cls
  net: rds: force to destroy connection if t_sock is NULL in rds_tcp_kill_sock().
  net/mlx5: Decrease default mr cache size
  net-gro: Fix GRO flush when receiving a GSO packet.
  kcm: switch order of device registration to fix a crash
  ipv6: sit: reset ip header pointer in ipip6_rcv
  ipv6: Fix dangling pointer when ipv6 fragment
  tty: ldisc: add sysctl to prevent autoloading of ldiscs
  tty: mark Siemens R3964 line discipline as BROKEN
  arm64: kaslr: Reserve size of ARM64_MEMSTART_ALIGN in linear region
  stating: ccree: revert "staging: ccree: fix leak of import() after init()"
  lib/string.c: implement a basic bcmp
  x86/vdso: Drop implicit common-page-size linker flag
  x86: vdso: Use $LD instead of $CC to link
  kbuild: clang: choose GCC_TOOLCHAIN_DIR not on LD
  powerpc/tm: Limit TM code inside PPC_TRANSACTIONAL_MEM
  drm/i915/gvt: do not let pin count of shadow mm go negative
  x86/power: Make restore_processor_context() sane
  x86/power/32: Move SYSENTER MSR restoration to fix_processor_context()
  x86/power/64: Use struct desc_ptr for the IDT in struct saved_context
  x86/power: Fix some ordering bugs in __restore_processor_context()
  net: sfp: move sfp_register_socket call from sfp_remove to sfp_probe
  Revert "CHROMIUM: dm: boot time specification of dm="
  Revert "ANDROID: dm: do_mounts_dm: Rebase on top of 4.9"
  Revert "ANDROID: dm: do_mounts_dm: fix dm_substitute_devices()"
  Revert "ANDROID: dm: do_mounts_dm: Update init/do_mounts_dm.c to the latest ChromiumOS version."
  sched/fair: remove printk while schedule is in progress
  ANDROID: Makefile: Add '-fsplit-lto-unit' to cfi-clang-flags
  ANDROID: cfi: Remove unused variable in ptr_to_check_fn
  ANDROID: cuttlefish_defconfig: Enable CONFIG_FUSE_FS

Conflicts:
	arch/arm64/kernel/traps.c
	drivers/mmc/host/sdhci.c
	drivers/mmc/host/sdhci.h
	drivers/tty/Kconfig
	kernel/sched/fair.c

Change-Id: Ic4c01204f58cdb536e2cab04e4f1a2451977f6a3
Signed-off-by: Blagovest Kolenichev <bkolenichev@codeaurora.org>
2019-06-25 03:05:18 -07:00

519 lines
13 KiB
C

/*
* inet fragments management
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Pavel Emelyanov <xemul@openvz.org>
* Started as consolidation of ipv4/ip_fragment.c,
* ipv6/reassembly. and ipv6 nf conntrack reassembly
*/
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>
/* Use skb->cb to track consecutive/adjacent fragments coming at
* the end of the queue. Nodes in the rb-tree queue will
* contain "runs" of one or more adjacent fragments.
*
* Invariants:
* - next_frag is NULL at the tail of a "run";
* - the head of a "run" has the sum of all fragment lengths in frag_run_len.
*/
struct ipfrag_skb_cb {
union {
struct inet_skb_parm h4;
struct inet6_skb_parm h6;
};
struct sk_buff *next_frag;
int frag_run_len;
};
#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
static void fragcb_clear(struct sk_buff *skb)
{
RB_CLEAR_NODE(&skb->rbnode);
FRAG_CB(skb)->next_frag = NULL;
FRAG_CB(skb)->frag_run_len = skb->len;
}
/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
struct sk_buff *skb)
{
fragcb_clear(skb);
FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
FRAG_CB(q->fragments_tail)->next_frag = skb;
q->fragments_tail = skb;
}
/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
fragcb_clear(skb);
if (q->last_run_head)
rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
&q->last_run_head->rbnode.rb_right);
else
rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
rb_insert_color(&skb->rbnode, &q->rb_fragments);
q->fragments_tail = skb;
q->last_run_head = skb;
}
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
* Value : 0xff if frame should be dropped.
* 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
*/
const u8 ip_frag_ecn_table[16] = {
/* at least one fragment had CE, and others ECT_0 or ECT_1 */
[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
/* invalid combinations : drop frame */
[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);
int inet_frags_init(struct inet_frags *f)
{
f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
NULL);
if (!f->frags_cachep)
return -ENOMEM;
return 0;
}
EXPORT_SYMBOL(inet_frags_init);
void inet_frags_fini(struct inet_frags *f)
{
/* We must wait that all inet_frag_destroy_rcu() have completed. */
rcu_barrier();
kmem_cache_destroy(f->frags_cachep);
f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);
static void inet_frags_free_cb(void *ptr, void *arg)
{
struct inet_frag_queue *fq = ptr;
/* If we can not cancel the timer, it means this frag_queue
* is already disappearing, we have nothing to do.
* Otherwise, we own a refcount until the end of this function.
*/
if (!del_timer(&fq->timer))
return;
spin_lock_bh(&fq->lock);
if (!(fq->flags & INET_FRAG_COMPLETE)) {
fq->flags |= INET_FRAG_COMPLETE;
refcount_dec(&fq->refcnt);
}
spin_unlock_bh(&fq->lock);
inet_frag_put(fq);
}
void inet_frags_exit_net(struct netns_frags *nf)
{
nf->high_thresh = 0; /* prevent creation of new frags */
rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
}
EXPORT_SYMBOL(inet_frags_exit_net);
void inet_frag_kill(struct inet_frag_queue *fq)
{
if (del_timer(&fq->timer))
refcount_dec(&fq->refcnt);
if (!(fq->flags & INET_FRAG_COMPLETE)) {
struct netns_frags *nf = fq->net;
fq->flags |= INET_FRAG_COMPLETE;
rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
refcount_dec(&fq->refcnt);
}
}
EXPORT_SYMBOL(inet_frag_kill);
static void inet_frag_destroy_rcu(struct rcu_head *head)
{
struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
rcu);
struct inet_frags *f = q->net->f;
if (f->destructor)
f->destructor(q);
kmem_cache_free(f->frags_cachep, q);
}
unsigned int inet_frag_rbtree_purge(struct rb_root *root)
{
struct rb_node *p = rb_first(root);
unsigned int sum = 0;
while (p) {
struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
p = rb_next(p);
rb_erase(&skb->rbnode, root);
while (skb) {
struct sk_buff *next = FRAG_CB(skb)->next_frag;
sum += skb->truesize;
kfree_skb(skb);
skb = next;
}
}
return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);
void inet_frag_destroy(struct inet_frag_queue *q)
{
struct sk_buff *fp;
struct netns_frags *nf;
unsigned int sum, sum_truesize = 0;
struct inet_frags *f;
WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
WARN_ON(del_timer(&q->timer) != 0);
/* Release all fragment data. */
fp = q->fragments;
nf = q->net;
f = nf->f;
if (fp) {
do {
struct sk_buff *xp = fp->next;
sum_truesize += fp->truesize;
kfree_skb(fp);
fp = xp;
} while (fp);
} else {
sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
}
sum = sum_truesize + f->qsize;
call_rcu(&q->rcu, inet_frag_destroy_rcu);
sub_frag_mem_limit(nf, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
struct inet_frags *f,
void *arg)
{
struct inet_frag_queue *q;
if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
return NULL;
q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
if (!q)
return NULL;
q->net = nf;
f->constructor(q, arg);
add_frag_mem_limit(nf, f->qsize);
timer_setup(&q->timer, f->frag_expire, 0);
spin_lock_init(&q->lock);
refcount_set(&q->refcnt, 3);
return q;
}
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
void *arg,
struct inet_frag_queue **prev)
{
struct inet_frags *f = nf->f;
struct inet_frag_queue *q;
q = inet_frag_alloc(nf, f, arg);
if (!q) {
*prev = ERR_PTR(-ENOMEM);
return NULL;
}
mod_timer(&q->timer, jiffies + nf->timeout);
*prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key,
&q->node, f->rhash_params);
if (*prev) {
q->flags |= INET_FRAG_COMPLETE;
inet_frag_kill(q);
inet_frag_destroy(q);
return NULL;
}
return q;
}
/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
{
struct inet_frag_queue *fq = NULL, *prev;
rcu_read_lock();
prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
if (!prev)
fq = inet_frag_create(nf, key, &prev);
if (prev && !IS_ERR(prev)) {
fq = prev;
if (!refcount_inc_not_zero(&fq->refcnt))
fq = NULL;
}
rcu_read_unlock();
return fq;
}
EXPORT_SYMBOL(inet_frag_find);
int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
int offset, int end)
{
struct sk_buff *last = q->fragments_tail;
/* RFC5722, Section 4, amended by Errata ID : 3089
* When reassembling an IPv6 datagram, if
* one or more its constituent fragments is determined to be an
* overlapping fragment, the entire datagram (and any constituent
* fragments) MUST be silently discarded.
*
* Duplicates, however, should be ignored (i.e. skb dropped, but the
* queue/fragments kept for later reassembly).
*/
if (!last)
fragrun_create(q, skb); /* First fragment. */
else if (last->ip_defrag_offset + last->len < end) {
/* This is the common case: skb goes to the end. */
/* Detect and discard overlaps. */
if (offset < last->ip_defrag_offset + last->len)
return IPFRAG_OVERLAP;
if (offset == last->ip_defrag_offset + last->len)
fragrun_append_to_last(q, skb);
else
fragrun_create(q, skb);
} else {
/* Binary search. Note that skb can become the first fragment,
* but not the last (covered above).
*/
struct rb_node **rbn, *parent;
rbn = &q->rb_fragments.rb_node;
do {
struct sk_buff *curr;
int curr_run_end;
parent = *rbn;
curr = rb_to_skb(parent);
curr_run_end = curr->ip_defrag_offset +
FRAG_CB(curr)->frag_run_len;
if (end <= curr->ip_defrag_offset)
rbn = &parent->rb_left;
else if (offset >= curr_run_end)
rbn = &parent->rb_right;
else if (offset >= curr->ip_defrag_offset &&
end <= curr_run_end)
return IPFRAG_DUP;
else
return IPFRAG_OVERLAP;
} while (*rbn);
/* Here we have parent properly set, and rbn pointing to
* one of its NULL left/right children. Insert skb.
*/
fragcb_clear(skb);
rb_link_node(&skb->rbnode, parent, rbn);
rb_insert_color(&skb->rbnode, &q->rb_fragments);
}
skb->ip_defrag_offset = offset;
return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);
void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
struct sk_buff *parent)
{
struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
struct sk_buff **nextp;
int delta;
if (head != skb) {
fp = skb_clone(skb, GFP_ATOMIC);
if (!fp)
return NULL;
FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
if (RB_EMPTY_NODE(&skb->rbnode))
FRAG_CB(parent)->next_frag = fp;
else
rb_replace_node(&skb->rbnode, &fp->rbnode,
&q->rb_fragments);
if (q->fragments_tail == skb)
q->fragments_tail = fp;
skb_morph(skb, head);
FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
rb_replace_node(&head->rbnode, &skb->rbnode,
&q->rb_fragments);
consume_skb(head);
head = skb;
}
WARN_ON(head->ip_defrag_offset != 0);
delta = -head->truesize;
/* Head of list must not be cloned. */
if (skb_unclone(head, GFP_ATOMIC))
return NULL;
delta += head->truesize;
if (delta)
add_frag_mem_limit(q->net, delta);
/* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
* and the second, holding only fragments.
*/
if (skb_has_frag_list(head)) {
struct sk_buff *clone;
int i, plen = 0;
clone = alloc_skb(0, GFP_ATOMIC);
if (!clone)
return NULL;
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
skb_frag_list_init(head);
for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
clone->data_len = head->data_len - plen;
clone->len = clone->data_len;
head->truesize += clone->truesize;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
add_frag_mem_limit(q->net, clone->truesize);
skb_shinfo(head)->frag_list = clone;
nextp = &clone->next;
} else {
nextp = &skb_shinfo(head)->frag_list;
}
return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
void *reasm_data)
{
struct sk_buff **nextp = (struct sk_buff **)reasm_data;
struct rb_node *rbn;
struct sk_buff *fp;
skb_push(head, head->data - skb_network_header(head));
/* Traverse the tree in order, to build frag_list. */
fp = FRAG_CB(head)->next_frag;
rbn = rb_next(&head->rbnode);
rb_erase(&head->rbnode, &q->rb_fragments);
while (rbn || fp) {
/* fp points to the next sk_buff in the current run;
* rbn points to the next run.
*/
/* Go through the current run. */
while (fp) {
*nextp = fp;
nextp = &fp->next;
fp->prev = NULL;
memset(&fp->rbnode, 0, sizeof(fp->rbnode));
fp->sk = NULL;
head->data_len += fp->len;
head->len += fp->len;
if (head->ip_summed != fp->ip_summed)
head->ip_summed = CHECKSUM_NONE;
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
head->truesize += fp->truesize;
fp = FRAG_CB(fp)->next_frag;
}
/* Move to the next run. */
if (rbn) {
struct rb_node *rbnext = rb_next(rbn);
fp = rb_to_skb(rbn);
rb_erase(rbn, &q->rb_fragments);
rbn = rbnext;
}
}
sub_frag_mem_limit(q->net, head->truesize);
*nextp = NULL;
head->next = NULL;
head->prev = NULL;
head->tstamp = q->stamp;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);
struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
struct sk_buff *head;
if (q->fragments) {
head = q->fragments;
q->fragments = head->next;
} else {
struct sk_buff *skb;
head = skb_rb_first(&q->rb_fragments);
if (!head)
return NULL;
skb = FRAG_CB(head)->next_frag;
if (skb)
rb_replace_node(&head->rbnode, &skb->rbnode,
&q->rb_fragments);
else
rb_erase(&head->rbnode, &q->rb_fragments);
memset(&head->rbnode, 0, sizeof(head->rbnode));
barrier();
}
if (head == q->fragments_tail)
q->fragments_tail = NULL;
sub_frag_mem_limit(q->net, head->truesize);
return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);