Home Home > GIT Browse
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Oskolkov <posk@google.com>2018-10-10 12:30:13 -0700
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2019-02-08 11:25:32 +0100
commit3f78a3f45e79ca378cb850a598e4c76633710e92 (patch)
tree88ac41341cfb584a738e20ae320b79f5bd6a2224
parent7fab8b2f0c994decf580027286b97533c8b7f6fd (diff)
ip: use rb trees for IP frag queue.
commit fa0f527358bd900ef92f925878ed6bfbd51305cc upstream. Similar to TCP OOO RX queue, it makes sense to use rb trees to store IP fragments, so that OOO fragments are inserted faster. Tested: - a follow-up patch contains a rather comprehensive ip defrag self-test (functional) - ran neper `udp_stream -c -H <host> -F 100 -l 300 -T 20`: netstat --statistics Ip: 282078937 total packets received 0 forwarded 0 incoming packets discarded 946760 incoming packets delivered 18743456 requests sent out 101 fragments dropped after timeout 282077129 reassemblies required 944952 packets reassembled ok 262734239 packet reassembles failed (The numbers/stats above are somewhat better re: reassemblies vs a kernel without this patchset. More comprehensive performance testing TBD). Reported-by: Jann Horn <jannh@google.com> Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi> Suggested-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Peter Oskolkov <posk@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Florian Westphal <fw@strlen.de> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Mao Wenan <maowenan@huawei.com> [bwh: Backported to 4.4: - Keep using frag_kfree_skb() in inet_frag_destroy() - Adjust context] Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--include/linux/skbuff.h4
-rw-r--r--include/net/inet_frag.h3
-rw-r--r--net/ipv4/inet_fragment.c16
-rw-r--r--net/ipv4/ip_fragment.c182
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c1
-rw-r--r--net/ipv6/reassembly.c1
6 files changed, 117 insertions, 90 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1e734e221ea3..502787c29ce9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -556,14 +556,14 @@ struct sk_buff {
struct skb_mstamp skb_mstamp;
};
};
- struct rb_node rbnode; /* used in netem & tcp stack */
+ struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
};
union {
+ struct sock *sk;
int ip_defrag_offset;
};
- struct sock *sk;
struct net_device *dev;
/*
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 41a830ba11fc..5fd8e76dcd26 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -73,7 +73,8 @@ struct inet_frag_queue {
struct timer_list timer;
spinlock_t lock;
atomic_t refcnt;
- struct sk_buff *fragments;
+ struct sk_buff *fragments; /* Used in IPv6. */
+ struct rb_root rb_fragments; /* Used in IPv4. */
struct sk_buff *fragments_tail;
ktime_t stamp;
int len;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index bb784c3a40fb..02d399618b8f 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -144,12 +144,16 @@ void inet_frag_destroy(struct inet_frag_queue *q)
fp = q->fragments;
nf = q->net;
f = nf->f;
- while (fp) {
- struct sk_buff *xp = fp->next;
-
- sum_truesize += fp->truesize;
- frag_kfree_skb(nf, f, fp);
- fp = xp;
+ if (fp) {
+ do {
+ struct sk_buff *xp = fp->next;
+
+ sum_truesize += fp->truesize;
+ frag_kfree_skb(nf, f, fp);
+ fp = xp;
+ } while (fp);
+ } else {
+ sum_truesize = skb_rbtree_purge(&q->rb_fragments);
}
sum = sum_truesize + f->qsize;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 6e9d0c7c5159..270792776403 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -136,7 +136,7 @@ static bool frag_expire_skip_icmp(u32 user)
static void ip_expire(unsigned long arg)
{
const struct iphdr *iph;
- struct sk_buff *head;
+ struct sk_buff *head = NULL;
struct net *net;
struct ipq *qp;
int err;
@@ -152,14 +152,31 @@ static void ip_expire(unsigned long arg)
ipq_kill(qp);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
-
- head = qp->q.fragments;
-
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
- if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+ if (!qp->q.flags & INET_FRAG_FIRST_IN)
goto out;
+ /* sk_buff::dev and sk_buff::rbnode are unionized. So we
+ * pull the head out of the tree in order to be able to
+ * deal with head->dev.
+ */
+ if (qp->q.fragments) {
+ head = qp->q.fragments;
+ qp->q.fragments = head->next;
+ } else {
+ head = skb_rb_first(&qp->q.rb_fragments);
+ if (!head)
+ goto out;
+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
+ memset(&head->rbnode, 0, sizeof(head->rbnode));
+ barrier();
+ }
+ if (head == qp->q.fragments_tail)
+ qp->q.fragments_tail = NULL;
+
+ sub_frag_mem_limit(qp->q.net, head->truesize);
+
head->dev = dev_get_by_index_rcu(net, qp->iif);
if (!head->dev)
goto out;
@@ -179,16 +196,16 @@ static void ip_expire(unsigned long arg)
(skb_rtable(head)->rt_type != RTN_LOCAL))
goto out;
- skb_get(head);
spin_unlock(&qp->q.lock);
icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
- kfree_skb(head);
goto out_rcu_unlock;
out:
spin_unlock(&qp->q.lock);
out_rcu_unlock:
rcu_read_unlock();
+ if (head)
+ kfree_skb(head);
ipq_put(qp);
}
@@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *qp)
end = atomic_inc_return(&peer->rid);
qp->rid = end;
- rc = qp->q.fragments && (end - start) > max;
+ rc = qp->q.fragments_tail && (end - start) > max;
if (rc) {
struct net *net;
@@ -245,7 +262,6 @@ static int ip_frag_too_far(struct ipq *qp)
static int ip_frag_reinit(struct ipq *qp)
{
- struct sk_buff *fp;
unsigned int sum_truesize = 0;
if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
@@ -253,20 +269,14 @@ static int ip_frag_reinit(struct ipq *qp)
return -ETIMEDOUT;
}
- fp = qp->q.fragments;
- do {
- struct sk_buff *xp = fp->next;
-
- sum_truesize += fp->truesize;
- kfree_skb(fp);
- fp = xp;
- } while (fp);
+ sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
sub_frag_mem_limit(qp->q.net, sum_truesize);
qp->q.flags = 0;
qp->q.len = 0;
qp->q.meat = 0;
qp->q.fragments = NULL;
+ qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
qp->iif = 0;
qp->ecn = 0;
@@ -278,7 +288,8 @@ static int ip_frag_reinit(struct ipq *qp)
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
- struct sk_buff *prev, *next;
+ struct rb_node **rbn, *parent;
+ struct sk_buff *skb1;
struct net_device *dev;
unsigned int fragsize;
int flags, offset;
@@ -341,58 +352,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (err)
goto err;
- /* Find out which fragments are in front and at the back of us
- * in the chain of fragments so far. We must know where to put
- * this fragment, right?
- */
- prev = qp->q.fragments_tail;
- if (!prev || prev->ip_defrag_offset < offset) {
- next = NULL;
- goto found;
- }
- prev = NULL;
- for (next = qp->q.fragments; next != NULL; next = next->next) {
- if (next->ip_defrag_offset >= offset)
- break; /* bingo! */
- prev = next;
- }
+ /* Note : skb->rbnode and skb->dev share the same location. */
+ dev = skb->dev;
+ /* Makes sure compiler wont do silly aliasing games */
+ barrier();
-found:
/* RFC5722, Section 4, amended by Errata ID : 3089
* When reassembling an IPv6 datagram, if
* one or more its constituent fragments is determined to be an
* overlapping fragment, the entire datagram (and any constituent
* fragments) MUST be silently discarded.
*
- * We do the same here for IPv4.
+ * We do the same here for IPv4 (and increment an snmp counter).
*/
- /* Is there an overlap with the previous fragment? */
- if (prev &&
- (prev->ip_defrag_offset + prev->len) > offset)
- goto discard_qp;
-
- /* Is there an overlap with the next fragment? */
- if (next && next->ip_defrag_offset < end)
- goto discard_qp;
+ /* Find out where to put this fragment. */
+ skb1 = qp->q.fragments_tail;
+ if (!skb1) {
+ /* This is the first fragment we've received. */
+ rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
+ qp->q.fragments_tail = skb;
+ } else if ((skb1->ip_defrag_offset + skb1->len) < end) {
+ /* This is the common/special case: skb goes to the end. */
+ /* Detect and discard overlaps. */
+ if (offset < (skb1->ip_defrag_offset + skb1->len))
+ goto discard_qp;
+ /* Insert after skb1. */
+ rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
+ qp->q.fragments_tail = skb;
+ } else {
+ /* Binary search. Note that skb can become the first fragment, but
+ * not the last (covered above). */
+ rbn = &qp->q.rb_fragments.rb_node;
+ do {
+ parent = *rbn;
+ skb1 = rb_to_skb(parent);
+ if (end <= skb1->ip_defrag_offset)
+ rbn = &parent->rb_left;
+ else if (offset >= skb1->ip_defrag_offset + skb1->len)
+ rbn = &parent->rb_right;
+ else /* Found an overlap with skb1. */
+ goto discard_qp;
+ } while (*rbn);
+ /* Here we have parent properly set, and rbn pointing to
+ * one of its NULL left/right children. Insert skb. */
+ rb_link_node(&skb->rbnode, parent, rbn);
+ }
+ rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
- /* Note : skb->ip_defrag_offset and skb->dev share the same location */
- dev = skb->dev;
if (dev)
qp->iif = dev->ifindex;
- /* Makes sure compiler wont do silly aliasing games */
- barrier();
skb->ip_defrag_offset = offset;
- /* Insert this fragment in the chain of fragments. */
- skb->next = next;
- if (!next)
- qp->q.fragments_tail = skb;
- if (prev)
- prev->next = skb;
- else
- qp->q.fragments = skb;
-
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
qp->ecn |= ecn;
@@ -414,7 +425,7 @@ found:
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = ip_frag_reasm(qp, prev, dev);
+ err = ip_frag_reasm(qp, skb, dev);
skb->_skb_refdst = orefdst;
return err;
}
@@ -431,15 +442,15 @@ err:
return err;
}
-
/* Build a new IP datagram from all its fragments. */
-
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
struct net_device *dev)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
- struct sk_buff *fp, *head = qp->q.fragments;
+ struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
+ struct sk_buff **nextp; /* To build frag_list. */
+ struct rb_node *rbn;
int len;
int ihlen;
int err;
@@ -453,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
goto out_fail;
}
/* Make the one we just received the head. */
- if (prev) {
- head = prev->next;
- fp = skb_clone(head, GFP_ATOMIC);
+ if (head != skb) {
+ fp = skb_clone(skb, GFP_ATOMIC);
if (!fp)
goto out_nomem;
-
- fp->next = head->next;
- if (!fp->next)
+ rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
+ if (qp->q.fragments_tail == skb)
qp->q.fragments_tail = fp;
- prev->next = fp;
-
- skb_morph(head, qp->q.fragments);
- head->next = qp->q.fragments->next;
-
- consume_skb(qp->q.fragments);
- qp->q.fragments = head;
+ skb_morph(skb, head);
+ rb_replace_node(&head->rbnode, &skb->rbnode,
+ &qp->q.rb_fragments);
+ consume_skb(head);
+ head = skb;
}
- WARN_ON(!head);
WARN_ON(head->ip_defrag_offset != 0);
/* Allocate a new buffer for the datagram. */
@@ -496,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
clone = alloc_skb(0, GFP_ATOMIC);
if (!clone)
goto out_nomem;
- clone->next = head->next;
- head->next = clone;
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
skb_frag_list_init(head);
for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
clone->len = clone->data_len = head->data_len - plen;
- head->data_len -= clone->len;
- head->len -= clone->len;
+ skb->truesize += clone->truesize;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
add_frag_mem_limit(qp->q.net, clone->truesize);
+ skb_shinfo(head)->frag_list = clone;
+ nextp = &clone->next;
+ } else {
+ nextp = &skb_shinfo(head)->frag_list;
}
- skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- for (fp=head->next; fp; fp = fp->next) {
+ /* Traverse the tree in order, to build frag_list. */
+ rbn = rb_next(&head->rbnode);
+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
+ while (rbn) {
+ struct rb_node *rbnext = rb_next(rbn);
+ fp = rb_to_skb(rbn);
+ rb_erase(rbn, &qp->q.rb_fragments);
+ rbn = rbnext;
+ *nextp = fp;
+ nextp = &fp->next;
+ fp->prev = NULL;
+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
head->data_len += fp->len;
head->len += fp->len;
if (head->ip_summed != fp->ip_summed)
@@ -524,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
}
sub_frag_mem_limit(qp->q.net, head->truesize);
+ *nextp = NULL;
head->next = NULL;
+ head->prev = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@@ -552,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
+ qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
return 0;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index cb72239dcff7..5ce591aef46e 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -445,6 +445,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
head->csum);
fq->q.fragments = NULL;
+ fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL;
/* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 3cbcf099f6b2..ec917f58d105 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -465,6 +465,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
rcu_read_unlock();
fq->q.fragments = NULL;
+ fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL;
return 1;