linux-pinenote/net/ipv6/ip6_offload.c

336 lines
7.9 KiB
C
Raw Normal View History

/*
* IPV6 GSO/GRO offload support
* Linux INET6 implementation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/printk.h>
#include <net/protocol.h>
#include <net/ipv6.h>
#include "ip6_offload.h"
static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto)
{
const struct net_offload *ops = NULL;
for (;;) {
struct ipv6_opt_hdr *opth;
int len;
if (proto != NEXTHDR_HOP) {
ops = rcu_dereference(inet6_offloads[proto]);
if (unlikely(!ops))
break;
if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
break;
}
if (unlikely(!pskb_may_pull(skb, 8)))
break;
opth = (void *)skb->data;
len = ipv6_optlen(opth);
if (unlikely(!pskb_may_pull(skb, len)))
break;
proto = opth->nexthdr;
__skb_pull(skb, len);
}
return proto;
}
static int ipv6_gso_send_check(struct sk_buff *skb)
{
const struct ipv6hdr *ipv6h;
const struct net_offload *ops;
int err = -EINVAL;
if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
goto out;
ipv6h = ipv6_hdr(skb);
__skb_pull(skb, sizeof(*ipv6h));
err = -EPROTONOSUPPORT;
ops = rcu_dereference(inet6_offloads[
ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr)]);
if (likely(ops && ops->callbacks.gso_send_check)) {
skb_reset_transport_header(skb);
err = ops->callbacks.gso_send_check(skb);
}
out:
return err;
}
static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct ipv6hdr *ipv6h;
const struct net_offload *ops;
int proto;
struct frag_hdr *fptr;
unsigned int unfrag_ip6hlen;
u8 *prevhdr;
int offset = 0;
bool tunnel;
int nhoff;
if (unlikely(skb_shinfo(skb)->gso_type &
~(SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
SKB_GSO_GRE |
SKB_GSO_IPIP |
SKB_GSO_SIT |
SKB_GSO_UDP_TUNNEL |
MPLS: Add limited GSO support In the case where a non-MPLS packet is received and an MPLS stack is added it may well be the case that the original skb is GSO but the NIC used for transmit does not support GSO of MPLS packets. The aim of this code is to provide GSO in software for MPLS packets whose skbs are GSO. SKB Usage: When an implementation adds an MPLS stack to a non-MPLS packet it should do the following to skb metadata: * Set skb->inner_protocol to the old non-MPLS ethertype of the packet. skb->inner_protocol is added by this patch. * Set skb->protocol to the new MPLS ethertype of the packet. * Set skb->network_header to correspond to the end of the L3 header, including the MPLS label stack. I have posted a patch, "[PATCH v3.29] datapath: Add basic MPLS support to kernel" which adds MPLS support to the kernel datapath of Open vSwtich. That patch sets the above requirements in datapath/actions.c:push_mpls() and was used to exercise this code. The datapath patch is against the Open vSwtich tree but it is intended that it be added to the Open vSwtich code present in the mainline Linux kernel at some point. Features: I believe that the approach that I have taken is at least partially consistent with the handling of other protocols. Jesse, I understand that you have some ideas here. I am more than happy to change my implementation. This patch adds dev->mpls_features which may be used by devices to advertise features supported for MPLS packets. A new NETIF_F_MPLS_GSO feature is added for devices which support hardware MPLS GSO offload. Currently no devices support this and MPLS GSO always falls back to software. Alternate Implementation: One possible alternate implementation is to teach netif_skb_features() and skb_network_protocol() about MPLS, in a similar way to their understanding of VLANs. I believe this would avoid the need for net/mpls/mpls_gso.c and in particular the calls to __skb_push() and __skb_push() in mpls_gso_segment(). I have decided on the implementation in this patch as it should not introduce any overhead in the case where mpls_gso is not compiled into the kernel or inserted as a module. MPLS GSO suggested by Jesse Gross. Based in part on "v4 GRE: Add TCP segmentation offload for GRE" by Pravin B Shelar. Cc: Jesse Gross <jesse@nicira.com> Cc: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Simon Horman <horms@verge.net.au> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-05-23 21:02:52 +00:00
SKB_GSO_MPLS |
SKB_GSO_TCPV6 |
0)))
goto out;
skb_reset_network_header(skb);
nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
goto out;
tunnel = SKB_GSO_CB(skb)->encap_level > 0;
if (tunnel)
features = skb->dev->hw_enc_features & netif_skb_features(skb);
SKB_GSO_CB(skb)->encap_level += sizeof(*ipv6h);
ipv6h = ipv6_hdr(skb);
__skb_pull(skb, sizeof(*ipv6h));
segs = ERR_PTR(-EPROTONOSUPPORT);
proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr);
ops = rcu_dereference(inet6_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment)) {
skb_reset_transport_header(skb);
segs = ops->callbacks.gso_segment(skb, features);
}
if (IS_ERR(segs))
goto out;
for (skb = segs; skb; skb = skb->next) {
ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
ipv6h->payload_len = htons(skb->len - nhoff - sizeof(*ipv6h));
if (tunnel) {
skb_reset_inner_headers(skb);
skb->encapsulation = 1;
}
skb->network_header = (u8 *)ipv6h - skb->head;
if (!tunnel && proto == IPPROTO_UDP) {
unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen);
fptr->frag_off = htons(offset);
if (skb->next != NULL)
fptr->frag_off |= htons(IP6_MF);
offset += (ntohs(ipv6h->payload_len) -
sizeof(struct frag_hdr));
}
}
out:
return segs;
}
/* Return the total length of all the extension hdrs, following the same
* logic in ipv6_gso_pull_exthdrs() when parsing ext-hdrs.
*/
static int ipv6_exthdrs_len(struct ipv6hdr *iph,
const struct net_offload **opps)
{
struct ipv6_opt_hdr *opth = (void *)iph;
int len = 0, proto, optlen = sizeof(*iph);
proto = iph->nexthdr;
for (;;) {
if (proto != NEXTHDR_HOP) {
*opps = rcu_dereference(inet6_offloads[proto]);
if (unlikely(!(*opps)))
break;
if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR))
break;
}
opth = (void *)opth + optlen;
optlen = ipv6_optlen(opth);
len += optlen;
proto = opth->nexthdr;
}
return len;
}
static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
{
const struct net_offload *ops;
struct sk_buff **pp = NULL;
struct sk_buff *p;
struct ipv6hdr *iph;
unsigned int nlen;
unsigned int hlen;
unsigned int off;
net-gre-gro: Add GRE support to the GRO stack This patch built on top of Commit 299603e8370a93dd5d8e8d800f0dff1ce2c53d36 ("net-gro: Prepare GRO stack for the upcoming tunneling support") to add the support of the standard GRE (RFC1701/RFC2784/RFC2890) to the GRO stack. It also serves as an example for supporting other encapsulation protocols in the GRO stack in the future. The patch supports version 0 and all the flags (key, csum, seq#) but will flush any pkt with the S (seq#) flag. This is because the S flag is not support by GSO, and a GRO pkt may end up in the forwarding path, thus requiring GSO support to break it up correctly. Currently the "packet_offload" structure only contains L3 (ETH_P_IP/ ETH_P_IPV6) GRO offload support so the encapped pkts are limited to IP pkts (i.e., w/o L2 hdr). But support for other protocol type can be easily added, so is the support for GRE variations like NVGRE. The patch also support csum offload. Specifically if the csum flag is on and the h/w is capable of checksumming the payload (CHECKSUM_COMPLETE), the code will take advantage of the csum computed by the h/w when validating the GRE csum. Note that commit 60769a5dcd8755715c7143b4571d5c44f01796f1 "ipv4: gre: add GRO capability" already introduces GRO capability to IPv4 GRE tunnels, using the gro_cells infrastructure. But GRO is done after GRE hdr has been removed (i.e., decapped). The following patch applies GRO when pkts first come in (before hitting the GRE tunnel code). There is some performance advantage for applying GRO as early as possible. Also this approach is transparent to other subsystem like Open vSwitch where GRE decap is handled outside of the IP stack hence making it harder for the gro_cells stuff to apply. On the other hand, some NICs are still not capable of hashing on the inner hdr of a GRE pkt (RSS). In that case the GRO processing of pkts from the same remote host will all happen on the same CPU and the performance may be suboptimal. I'm including some rough preliminary performance numbers below. Note that the performance will be highly dependent on traffic load, mix as usual. Moreover it also depends on NIC offload features hence the following is by no means a comprehesive study. Local testing and tuning will be needed to decide the best setting. All tests spawned 50 copies of netperf TCP_STREAM and ran for 30 secs. (super_netperf 50 -H 192.168.1.18 -l 30) An IP GRE tunnel with only the key flag on (e.g., ip tunnel add gre1 mode gre local 10.246.17.18 remote 10.246.17.17 ttl 255 key 123) is configured. The GRO support for pkts AFTER decap are controlled through the device feature of the GRE device (e.g., ethtool -K gre1 gro on/off). 1.1 ethtool -K gre1 gro off; ethtool -K eth0 gro off thruput: 9.16Gbps CPU utilization: 19% 1.2 ethtool -K gre1 gro on; ethtool -K eth0 gro off thruput: 5.9Gbps CPU utilization: 15% 1.3 ethtool -K gre1 gro off; ethtool -K eth0 gro on thruput: 9.26Gbps CPU utilization: 12-13% 1.4 ethtool -K gre1 gro on; ethtool -K eth0 gro on thruput: 9.26Gbps CPU utilization: 10% The following tests were performed on a different NIC that is capable of csum offload. I.e., the h/w is capable of computing IP payload csum (CHECKSUM_COMPLETE). 2.1 ethtool -K gre1 gro on (hence will use gro_cells) 2.1.1 ethtool -K eth0 gro off; csum offload disabled thruput: 8.53Gbps CPU utilization: 9% 2.1.2 ethtool -K eth0 gro off; csum offload enabled thruput: 8.97Gbps CPU utilization: 7-8% 2.1.3 ethtool -K eth0 gro on; csum offload disabled thruput: 8.83Gbps CPU utilization: 5-6% 2.1.4 ethtool -K eth0 gro on; csum offload enabled thruput: 8.98Gbps CPU utilization: 5% 2.2 ethtool -K gre1 gro off 2.2.1 ethtool -K eth0 gro off; csum offload disabled thruput: 5.93Gbps CPU utilization: 9% 2.2.2 ethtool -K eth0 gro off; csum offload enabled thruput: 5.62Gbps CPU utilization: 8% 2.2.3 ethtool -K eth0 gro on; csum offload disabled thruput: 7.69Gbps CPU utilization: 8% 2.2.4 ethtool -K eth0 gro on; csum offload enabled thruput: 8.96Gbps CPU utilization: 5-6% Signed-off-by: H.K. Jerry Chu <hkchu@google.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-07 10:23:19 -08:00
u16 flush = 1;
int proto;
__wsum csum;
off = skb_gro_offset(skb);
hlen = off + sizeof(*iph);
iph = skb_gro_header_fast(skb, off);
if (skb_gro_header_hard(skb, hlen)) {
iph = skb_gro_header_slow(skb, hlen, off);
if (unlikely(!iph))
goto out;
}
skb_set_network_header(skb, off);
skb_gro_pull(skb, sizeof(*iph));
skb_set_transport_header(skb, skb_gro_offset(skb));
flush += ntohs(iph->payload_len) != skb_gro_len(skb);
rcu_read_lock();
proto = iph->nexthdr;
ops = rcu_dereference(inet6_offloads[proto]);
if (!ops || !ops->callbacks.gro_receive) {
__pskb_pull(skb, skb_gro_offset(skb));
proto = ipv6_gso_pull_exthdrs(skb, proto);
skb_gro_pull(skb, -skb_transport_offset(skb));
skb_reset_transport_header(skb);
__skb_push(skb, skb_gro_offset(skb));
ops = rcu_dereference(inet6_offloads[proto]);
if (!ops || !ops->callbacks.gro_receive)
goto out_unlock;
iph = ipv6_hdr(skb);
}
NAPI_GRO_CB(skb)->proto = proto;
flush--;
nlen = skb_network_header_len(skb);
for (p = *head; p; p = p->next) {
const struct ipv6hdr *iph2;
__be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */
if (!NAPI_GRO_CB(p)->same_flow)
continue;
iph2 = (struct ipv6hdr *)(p->data + off);
first_word = *(__be32 *)iph ^ *(__be32 *)iph2 ;
/* All fields must match except length and Traffic Class.
* XXX skbs on the gro_list have all been parsed and pulled
* already so we don't need to compare nlen
* (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops)))
* memcmp() alone below is suffcient, right?
*/
if ((first_word & htonl(0xF00FFFFF)) ||
memcmp(&iph->nexthdr, &iph2->nexthdr,
nlen - offsetof(struct ipv6hdr, nexthdr))) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
/* flush if Traffic Class fields are different */
NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
NAPI_GRO_CB(p)->flush |= flush;
}
NAPI_GRO_CB(skb)->flush |= flush;
csum = skb->csum;
skb_postpull_rcsum(skb, iph, skb_network_header_len(skb));
pp = ops->callbacks.gro_receive(head, skb);
skb->csum = csum;
out_unlock:
rcu_read_unlock();
out:
NAPI_GRO_CB(skb)->flush |= flush;
return pp;
}
static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
{
const struct net_offload *ops;
struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
int err = -ENOSYS;
iph->payload_len = htons(skb->len - nhoff - sizeof(*iph));
rcu_read_lock();
nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out_unlock;
err = ops->callbacks.gro_complete(skb, nhoff);
out_unlock:
rcu_read_unlock();
return err;
}
static struct packet_offload ipv6_packet_offload __read_mostly = {
.type = cpu_to_be16(ETH_P_IPV6),
.callbacks = {
.gso_send_check = ipv6_gso_send_check,
.gso_segment = ipv6_gso_segment,
.gro_receive = ipv6_gro_receive,
.gro_complete = ipv6_gro_complete,
},
};
static const struct net_offload sit_offload = {
.callbacks = {
.gso_send_check = ipv6_gso_send_check,
.gso_segment = ipv6_gso_segment,
},
};
static int __init ipv6_offload_init(void)
{
if (tcpv6_offload_init() < 0)
pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
if (udp_offload_init() < 0)
pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
if (ipv6_exthdrs_offload_init() < 0)
pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__);
dev_add_offload(&ipv6_packet_offload);
inet_add_offload(&sit_offload, IPPROTO_IPV6);
return 0;
}
fs_initcall(ipv6_offload_init);