From 24a2d43d8886f5a29c3cf108927f630c545a9a38 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Sep 2014 09:50:55 -0700 Subject: [PATCH 1/3] ipv4: rename ip_options_echo to __ip_options_echo() ip_options_echo() assumes struct ip_options is provided in &IPCB(skb)->opt Lets break this assumption, but provide a helper to not change all call points. ip_send_unicast_reply() gets a new struct ip_options pointer. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 15 ++++++++++++--- net/ipv4/ip_options.c | 6 ++---- net/ipv4/ip_output.c | 8 +++++--- net/ipv4/tcp_ipv4.c | 10 ++++++---- 4 files changed, 25 insertions(+), 14 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index fcd9068fb8c3..0bb620702929 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -180,8 +180,10 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, - __be32 saddr, const struct ip_reply_arg *arg, +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, + const struct ip_options *sopt, + __be32 daddr, __be32 saddr, + const struct ip_reply_arg *arg, unsigned int len); #define IP_INC_STATS(net, field) SNMP_INC_STATS64((net)->mib.ip_statistics, field) @@ -511,7 +513,14 @@ int ip_forward(struct sk_buff *skb); void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt, int is_frag); -int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb); + +int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, + const struct ip_options *sopt); +static inline int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) +{ + return __ip_options_echo(dopt, skb, &IPCB(skb)->opt); +} + void ip_options_fragment(struct sk_buff *skb); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ad382499bace..5b3d91be2db0 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -87,17 +87,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, * NOTE: dopt cannot point to skb. */ -int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) +int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, + const struct ip_options *sopt) { - const struct ip_options *sopt; unsigned char *sptr, *dptr; int soffset, doffset; int optlen; memset(dopt, 0, sizeof(struct ip_options)); - sopt = &(IPCB(skb)->opt); - if (sopt->optlen == 0) return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 215af2b155cb..c8fa62476461 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1522,8 +1522,10 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { .uc_ttl = -1, }; -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, - __be32 saddr, const struct ip_reply_arg *arg, +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, + const struct ip_options *sopt, + __be32 daddr, __be32 saddr, + const struct ip_reply_arg *arg, unsigned int len) { struct ip_options_data replyopts; @@ -1534,7 +1536,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, struct sock *sk; struct inet_sock *inet; - if (ip_options_echo(&replyopts.opt.opt, skb)) + if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) return; ipc.addr = daddr; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3b2e49cb2b61..28ab90382c01 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -681,8 +681,9 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; - ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, + &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); @@ -764,8 +765,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; - ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, + &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); } From a224772db8420ecb7ce91a9ba5d535ee3a50d982 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Sep 2014 09:50:56 -0700 Subject: [PATCH 2/3] ipv6: add a struct inet6_skb_parm param to ipv6_opt_accepted() ipv6_opt_accepted() assumes IP6CB(skb) holds the struct inet6_skb_parm that it needs. Lets not assume this, as TCP stack might use a different place. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ipv6.h | 3 ++- net/dccp/ipv6.c | 2 +- net/ipv6/af_inet6.c | 4 ++-- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 5 +++-- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7e247e9b8765..97f472012438 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -288,7 +288,8 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); -bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb); +bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, + const struct inet6_skb_parm *opt); static inline bool ipv6_accept_ra(struct inet6_dev *idev) { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 04cb17d4b0ce..ad2acfe1ca61 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -404,7 +404,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (ipv6_opt_accepted(sk, skb) || + if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { atomic_inc(&skb->users); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e4865a3ebe1d..34f726f59814 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -672,10 +672,10 @@ int inet6_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); -bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb) +bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, + const struct inet6_skb_parm *opt) { const struct ipv6_pinfo *np = inet6_sk(sk); - const struct inet6_skb_parm *opt = IP6CB(skb); if (np->rxopt.all) { if ((opt->hop && (np->rxopt.bits.hopopts || diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index c643dc907ce7..9a2838e93cc5 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -203,7 +203,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->ir_num = ntohs(th->dest); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (ipv6_opt_accepted(sk, skb) || + if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { atomic_inc(&skb->users); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index de51a88bec6f..9400b4326f22 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -742,7 +742,8 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, ireq->ir_iif = inet6_iif(skb); if (!TCP_SKB_CB(skb)->tcp_tw_isn && - (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || + (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || np->repflow)) { atomic_inc(&skb->users); @@ -1367,7 +1368,7 @@ ipv6_pktoptions: np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (np->repflow) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); - if (ipv6_opt_accepted(sk, opt_skb)) { + if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { skb_set_owner_r(opt_skb, sk); opt_skb = xchg(&np->pktoptions, opt_skb); } else { From 971f10eca186cab238c49daa91f703c5a001b0b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Sep 2014 09:50:57 -0700 Subject: [PATCH 3/3] tcp: better TCP_SKB_CB layout to reduce cache line misses TCP maintains lists of skb in write queue, and in receive queues (in order and out of order queues) Scanning these lists both in input and output path usually requires access to skb->next, TCP_SKB_CB(skb)->seq, and TCP_SKB_CB(skb)->end_seq These fields are currently in two different cache lines, meaning we waste lot of memory bandwidth when these queues are big and flows have either packet drops or packet reorders. We can move TCP_SKB_CB(skb)->header at the end of TCP_SKB_CB, because this header is not used in fast path. This allows TCP to search much faster in the skb lists. Even with regular flows, we save one cache line miss in fast path. Thanks to Christoph Paasch for noticing we need to cleanup skb->cb[] (IPCB/IP6CB) before entering IP stack in tx path, and that I forgot IPCB use in tcp_v4_hnd_req() and tcp_v4_save_options(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 12 ++++++------ net/ipv4/tcp_ipv4.c | 19 ++++++++++++------- net/ipv4/tcp_output.c | 5 +++++ net/ipv6/tcp_ipv6.c | 7 +++++++ 4 files changed, 30 insertions(+), 13 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index a4201ef216e8..4dc6641ee990 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -696,12 +696,6 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately. */ struct tcp_skb_cb { - union { - struct inet_skb_parm h4; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; -#endif - } header; /* For incoming frames */ __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ @@ -720,6 +714,12 @@ struct tcp_skb_cb { __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ /* 1 byte hole */ __u32 ack_seq; /* Sequence number ACK'd */ + union { + struct inet_skb_parm h4; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_skb_parm h6; +#endif + } header; /* For incoming frames */ }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 28ab90382c01..9ce3eac02957 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -886,18 +886,16 @@ EXPORT_SYMBOL(tcp_syn_flood_action); */ static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) { - const struct ip_options *opt = &(IPCB(skb)->opt); + const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; if (opt && opt->optlen) { int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); - if (dopt) { - if (ip_options_echo(&dopt->opt, skb)) { - kfree(dopt); - dopt = NULL; - } + if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) { + kfree(dopt); + dopt = NULL; } } return dopt; @@ -1431,7 +1429,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_SYN_COOKIES if (!th->syn) - sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); + sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt); #endif return sk; } @@ -1636,6 +1634,13 @@ int tcp_v4_rcv(struct sk_buff *skb) th = tcp_hdr(skb); iph = ip_hdr(skb); + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), + sizeof(struct inet_skb_parm)); + barrier(); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f173b1c4f815..a462fb1db896 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -974,6 +974,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, /* Our usage of tstamp should remain private */ skb->tstamp.tv64 = 0; + + /* Cleanup our debris for IP stacks */ + memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), + sizeof(struct inet6_skb_parm))); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (likely(err <= 0)) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9400b4326f22..132bac137aed 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1412,6 +1412,13 @@ static int tcp_v6_rcv(struct sk_buff *skb) th = tcp_hdr(skb); hdr = ipv6_hdr(skb); + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), + sizeof(struct inet6_skb_parm)); + barrier(); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4);