19952cc4f8
This patch implements per hash bucket locking for the frag queue hash. This removes two write locks, and the only remaining write lock is for protecting hash rebuild. This essentially reduces the readers-writer lock to a rebuild lock.

This patch is part of "net: frag performance followup" http://thread.gmane.org/gmane.linux.network/263644 of which two patches have already been accepted.

Same test setup as previous (http://thread.gmane.org/gmane.linux.network/257155): Two 10G interfaces, on separate NUMA nodes, are under test, and use Ethernet flow-control. A third interface is used for generating the DoS attack (with trafgen).

Notice, I have changed the frag DoS generator script to be more efficient/deadly. Before it would only hit one RX queue; now it is sending packets causing multi-queue RX, due to "better" RX hashing.

Test types summary (netperf UDP_STREAM):
 Test-20G64K     == 2x10G with 65K fragments
 Test-20G3F      == 2x10G with 3x fragments (3*1472 bytes)
 Test-20G64K+DoS == Same as 20G64K with frag DoS
 Test-20G3F+DoS  == Same as 20G3F with frag DoS
 Test-20G64K+MQ  == Same as 20G64K with Multi-Queue frag DoS
 Test-20G3F+MQ   == Same as 20G3F with Multi-Queue frag DoS

When I rebased this-patch(03) (on top of net-next commit a210576c) and removed the _bh spinlock, I saw a performance regression. BUT this was caused by some unrelated change in-between. See tests below.

Test (A) is what I reported before for patch-02, accepted in commit 1b5ab0de.
Test (B) is a verifying retest of commit 1b5ab0de, which corresponds to patch-02.
Test (C) is what I reported before for this-patch.
Test (D) is net-next master HEAD (commit a210576c), which reveals some (unknown) performance regression (compared against test (B)). Test (D) functions as a new base-test.

Performance table summary (in Mbit/s):

 (#) Test-type:  20G64K    20G3F    20G64K+DoS  20G3F+DoS  20G64K+MQ  20G3F+MQ
 ----------      -------   -------  ----------  ---------  ---------  --------
 (A) Patch-02  : 18848.7   13230.1   4103.04     5310.36     130.0     440.2
 (B) 1b5ab0de  : 18841.5   13156.8   4101.08     5314.57     129.0     424.2
 (C) Patch-03v1: 18838.0   13490.5   4405.11     6814.72     196.6     461.6
 (D) a210576c  : 18321.5   11250.4   3635.34     5160.13     119.1     405.2
 (E) with _bh  : 17247.3   11492.6   3994.74     6405.29     166.7     413.6
 (F) without bh: 17471.3   11298.7   3818.05     6102.11     165.7     406.3

Tests (E) and (F) are this-patch(03), with (V1) and without (V2) the _bh spinlocks.

I cannot explain the slowdown for 20G64K (but it is an artificial "lab-test" so I'm not worried). But the other results do show improvements. And test (E), the "with _bh" version, is slightly better.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Eric Dumazet <edumazet@google.com>

----
V2:
 - By analysis from Hannes Frederic Sowa and Eric Dumazet, we don't need the spinlock _bh versions, as Netfilter currently does a local_bh_disable() before entering inet_fragment.
 - Fold-in desc from cover-mail
V3:
 - Drop the chain_len counter per hash bucket.

Signed-off-by: David S. Miller <davem@davemloft.net>
176 lines
4.8 KiB
C
176 lines
4.8 KiB
C
#ifndef __NET_FRAG_H__
|
|
#define __NET_FRAG_H__
|
|
|
|
#include <linux/percpu_counter.h>
|
|
|
|
struct netns_frags {
|
|
int nqueues;
|
|
struct list_head lru_list;
|
|
spinlock_t lru_lock;
|
|
|
|
/* The percpu_counter "mem" need to be cacheline aligned.
|
|
* mem.count must not share cacheline with other writers
|
|
*/
|
|
struct percpu_counter mem ____cacheline_aligned_in_smp;
|
|
|
|
/* sysctls */
|
|
int timeout;
|
|
int high_thresh;
|
|
int low_thresh;
|
|
};
|
|
|
|
struct inet_frag_queue {
|
|
spinlock_t lock;
|
|
struct timer_list timer; /* when will this queue expire? */
|
|
struct list_head lru_list; /* lru list member */
|
|
struct hlist_node list;
|
|
atomic_t refcnt;
|
|
struct sk_buff *fragments; /* list of received fragments */
|
|
struct sk_buff *fragments_tail;
|
|
ktime_t stamp;
|
|
int len; /* total length of orig datagram */
|
|
int meat;
|
|
__u8 last_in; /* first/last segment arrived? */
|
|
|
|
#define INET_FRAG_COMPLETE 4
|
|
#define INET_FRAG_FIRST_IN 2
|
|
#define INET_FRAG_LAST_IN 1
|
|
|
|
u16 max_size;
|
|
|
|
struct netns_frags *net;
|
|
};
|
|
|
|
/* Number of buckets in inet_frags.hash[]. */
#define INETFRAGS_HASHSZ	64

/* Average chain depth limit, derived as:
 *
 *   max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
 *		 rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
 *		 struct frag_queue))
 */
#define INETFRAGS_MAXDEPTH	128
|
|
|
|
struct inet_frag_bucket {
|
|
struct hlist_head chain;
|
|
spinlock_t chain_lock;
|
|
};
|
|
|
|
struct inet_frags {
|
|
struct inet_frag_bucket hash[INETFRAGS_HASHSZ];
|
|
/* This rwlock is a global lock (seperate per IPv4, IPv6 and
|
|
* netfilter). Important to keep this on a seperate cacheline.
|
|
* Its primarily a rebuild protection rwlock.
|
|
*/
|
|
rwlock_t lock ____cacheline_aligned_in_smp;
|
|
int secret_interval;
|
|
struct timer_list secret_timer;
|
|
u32 rnd;
|
|
int qsize;
|
|
|
|
unsigned int (*hashfn)(struct inet_frag_queue *);
|
|
bool (*match)(struct inet_frag_queue *q, void *arg);
|
|
void (*constructor)(struct inet_frag_queue *q,
|
|
void *arg);
|
|
void (*destructor)(struct inet_frag_queue *);
|
|
void (*skb_free)(struct sk_buff *);
|
|
void (*frag_expire)(unsigned long data);
|
|
};
|
|
|
|
void inet_frags_init(struct inet_frags *);
|
|
void inet_frags_fini(struct inet_frags *);
|
|
|
|
void inet_frags_init_net(struct netns_frags *nf);
|
|
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
|
|
|
|
void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
|
|
void inet_frag_destroy(struct inet_frag_queue *q,
|
|
struct inet_frags *f, int *work);
|
|
int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
|
|
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
|
|
struct inet_frags *f, void *key, unsigned int hash)
|
|
__releases(&f->lock);
|
|
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
|
|
const char *prefix);
|
|
|
|
static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
|
|
{
|
|
if (atomic_dec_and_test(&q->refcnt))
|
|
inet_frag_destroy(q, f, NULL);
|
|
}
|
|
|
|
/* Memory Tracking Functions. */
|
|
|
|
/* The default percpu_counter batch size is not big enough to scale to
 * fragmentation mem acct sizes.
 * The mem size of a 64K fragment is approx:
 *  (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes
 *
 * This is a fixed tuning value that is only ever read by the accounting
 * helpers in this header.  Because it is "static" in a header, every
 * translation unit that includes this file gets its own copy; declaring
 * it const keeps those copies read-only and prevents one file from
 * silently diverging from the others.
 */
static const unsigned int frag_percpu_counter_batch = 130000;
|
|
|
|
static inline int frag_mem_limit(struct netns_frags *nf)
|
|
{
|
|
return percpu_counter_read(&nf->mem);
|
|
}
|
|
|
|
static inline void sub_frag_mem_limit(struct inet_frag_queue *q, int i)
|
|
{
|
|
__percpu_counter_add(&q->net->mem, -i, frag_percpu_counter_batch);
|
|
}
|
|
|
|
static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
|
|
{
|
|
__percpu_counter_add(&q->net->mem, i, frag_percpu_counter_batch);
|
|
}
|
|
|
|
static inline void init_frag_mem_limit(struct netns_frags *nf)
|
|
{
|
|
percpu_counter_init(&nf->mem, 0);
|
|
}
|
|
|
|
static inline int sum_frag_mem_limit(struct netns_frags *nf)
|
|
{
|
|
int res;
|
|
|
|
local_bh_disable();
|
|
res = percpu_counter_sum_positive(&nf->mem);
|
|
local_bh_enable();
|
|
|
|
return res;
|
|
}
|
|
|
|
static inline void inet_frag_lru_move(struct inet_frag_queue *q)
|
|
{
|
|
spin_lock(&q->net->lru_lock);
|
|
list_move_tail(&q->lru_list, &q->net->lru_list);
|
|
spin_unlock(&q->net->lru_lock);
|
|
}
|
|
|
|
static inline void inet_frag_lru_del(struct inet_frag_queue *q)
|
|
{
|
|
spin_lock(&q->net->lru_lock);
|
|
list_del(&q->lru_list);
|
|
q->net->nqueues--;
|
|
spin_unlock(&q->net->lru_lock);
|
|
}
|
|
|
|
static inline void inet_frag_lru_add(struct netns_frags *nf,
|
|
struct inet_frag_queue *q)
|
|
{
|
|
spin_lock(&nf->lru_lock);
|
|
list_add_tail(&q->lru_list, &nf->lru_list);
|
|
q->net->nqueues++;
|
|
spin_unlock(&nf->lru_lock);
|
|
}
|
|
|
|
/* RFC 3168 support:
 * The ECN values of all fragments are checked in order to detect
 * invalid combinations.  ipq->ecn holds the OR of the ip4_frag_ecn()
 * value of each fragment; every codepoint seen sets one bit below.
 */
#define	IPFRAG_ECN_NOT_ECT	0x01	/* a fragment carried ECN_NOT_ECT */
#define	IPFRAG_ECN_ECT_1	0x02	/* a fragment carried ECN_ECT_1 */
#define	IPFRAG_ECN_ECT_0	0x04	/* a fragment carried ECN_ECT_0 */
#define	IPFRAG_ECN_CE		0x08	/* a fragment carried ECN_CE */
|
|
|
|
extern const u8 ip_frag_ecn_table[16];
|
|
|
|
#endif
|