Skip to content

Commit 09736a3

Browse files
author
Herton R. Krzesinski
committed
Merge: udp: some performance optimizations
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/1541 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2133057 Tested: LNST, Tier1, tput test This series improves UDP protocol RX tput, to keep it on equal footing with the rhel-8 one. Patches 1,3,4 are there just to reduce the conflicts, and patch 4 is a very partial backport, to avoid pulling unrelated features. Signed-off-by: Paolo Abeni <pabeni@redhat.com> Approved-by: Antoine Tenart <atenart@redhat.com> Approved-by: Florian Westphal <fwestpha@redhat.com> Signed-off-by: Herton R. Krzesinski <herton@redhat.com>
2 parents 8919de9 + 2657483 commit 09736a3

File tree

12 files changed

+152
-20
lines changed

12 files changed

+152
-20
lines changed

include/linux/net.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ struct net;
4141
#define SOCK_NOSPACE 2
4242
#define SOCK_PASSCRED 3
4343
#define SOCK_PASSSEC 4
44+
#define SOCK_CUSTOM_SOCKOPT 5
4445

4546
#ifndef ARCH_HAS_SOCKET_TYPES
4647
/**

include/linux/netdevice.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3738,6 +3738,7 @@ void netif_receive_skb_list(struct list_head *head);
37383738
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
37393739
void napi_gro_flush(struct napi_struct *napi, bool flush_old);
37403740
struct sk_buff *napi_get_frags(struct napi_struct *napi);
3741+
void napi_get_frags_check(struct napi_struct *napi);
37413742
gro_result_t napi_gro_frags(struct napi_struct *napi);
37423743
struct packet_offload *gro_find_receive_by_type(__be16 type);
37433744
struct packet_offload *gro_find_complete_by_type(__be16 type);

include/linux/udp.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,9 @@ struct udp_sock {
8686

8787
/* This field is dirtied by udp_recvmsg() */
8888
int forward_deficit;
89+
90+
/* This field follows the rcvbuf value, and is touched by udp_recvmsg() */
91+
int forward_threshold;
8992
};
9093

9194
#define UDP_MAX_SEGMENTS (1 << 6UL)

include/net/sock.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1805,6 +1805,8 @@ void sock_pfree(struct sk_buff *skb);
18051805
#define sock_edemux sock_efree
18061806
#endif
18071807

1808+
int sk_setsockopt(struct sock *sk, int level, int optname,
1809+
sockptr_t optval, unsigned int optlen);
18081810
int sock_setsockopt(struct socket *sock, int level, int op,
18091811
sockptr_t optval, unsigned int optlen);
18101812

include/net/udp.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,15 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *));
174174
struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
175175
netdev_features_t features, bool is_ipv6);
176176

177+
static inline void udp_lib_init_sock(struct sock *sk)
178+
{
179+
struct udp_sock *up = udp_sk(sk);
180+
181+
skb_queue_head_init(&up->reader_queue);
182+
up->forward_threshold = sk->sk_rcvbuf >> 2;
183+
set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
184+
}
185+
177186
/* hash routines shared between UDPv4/6 and UDP-Litev4/6 */
178187
static inline int udp_lib_hash(struct sock *sk)
179188
{

net/core/dev.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6356,6 +6356,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
63566356
set_bit(NAPI_STATE_NPSVC, &napi->state);
63576357
list_add_rcu(&napi->dev_list, &dev->napi_list);
63586358
napi_hash_add(napi);
6359+
napi_get_frags_check(napi);
63596360
/* Create kthread for this napi if dev->threaded is set.
63606361
* Clear dev->threaded if kthread creation failed so that
63616362
* threaded mode will not be enabled in napi_enable().

net/core/skbuff.c

Lines changed: 103 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,66 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
133133
#define NAPI_SKB_CACHE_BULK 16
134134
#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
135135

136+
#if PAGE_SIZE == SZ_4K
137+
138+
#define NAPI_HAS_SMALL_PAGE_FRAG 1
139+
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc)
140+
141+
/* specialized page frag allocator using a single order 0 page
142+
* and slicing it into 1K sized fragments. Constrained to systems
143+
* with a very limited amount of 1K fragments fitting a single
144+
* page - to avoid excessive truesize underestimation
145+
*/
146+
147+
struct page_frag_1k {
148+
void *va;
149+
u16 offset;
150+
bool pfmemalloc;
151+
};
152+
153+
static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
154+
{
155+
struct page *page;
156+
int offset;
157+
158+
offset = nc->offset - SZ_1K;
159+
if (likely(offset >= 0))
160+
goto use_frag;
161+
162+
page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
163+
if (!page)
164+
return NULL;
165+
166+
nc->va = page_address(page);
167+
nc->pfmemalloc = page_is_pfmemalloc(page);
168+
offset = PAGE_SIZE - SZ_1K;
169+
page_ref_add(page, offset / SZ_1K);
170+
171+
use_frag:
172+
nc->offset = offset;
173+
return nc->va + offset;
174+
}
175+
#else
176+
177+
/* the small page is actually unused in this build; add dummy helpers
178+
* to please the compiler and avoid later preprocessor conditionals
179+
*/
180+
#define NAPI_HAS_SMALL_PAGE_FRAG 0
181+
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false
182+
183+
struct page_frag_1k {
184+
};
185+
186+
static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
187+
{
188+
return NULL;
189+
}
190+
191+
#endif
192+
136193
struct napi_alloc_cache {
137194
struct page_frag_cache page;
195+
struct page_frag_1k page_small;
138196
unsigned int skb_count;
139197
void *skb_cache[NAPI_SKB_CACHE_SIZE];
140198
};
@@ -150,6 +208,23 @@ static void *__alloc_frag_align(unsigned int fragsz, gfp_t gfp_mask,
150208
return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align_mask);
151209
}
152210

211+
/* Double check that napi_get_frags() allocates skbs with
212+
* skb->head being backed by slab, not a page fragment.
213+
* This is to make sure the bug fixed in 3226b158e67c
214+
* ("net: avoid 32 x truesize under-estimation for tiny skbs")
215+
* does not accidentally come back.
216+
*/
217+
void napi_get_frags_check(struct napi_struct *napi)
218+
{
219+
struct sk_buff *skb;
220+
221+
local_bh_disable();
222+
skb = napi_get_frags(napi);
223+
WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
224+
napi_free_frags(napi);
225+
local_bh_enable();
226+
}
227+
153228
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
154229
{
155230
fragsz = SKB_DATA_ALIGN(fragsz);
@@ -562,14 +637,17 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
562637
{
563638
struct napi_alloc_cache *nc;
564639
struct sk_buff *skb;
640+
bool pfmemalloc;
565641
void *data;
566642

567643
len += NET_SKB_PAD + NET_IP_ALIGN;
568644

569645
/* If requested length is either too small or too big,
570646
* we use kmalloc() for skb->head allocation.
647+
* When the small frag allocator is available, prefer it over kmalloc
648+
* for small fragments
571649
*/
572-
if (len <= SKB_WITH_OVERHEAD(1024) ||
650+
if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
573651
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
574652
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
575653
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -580,13 +658,33 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
580658
}
581659

582660
nc = this_cpu_ptr(&napi_alloc_cache);
583-
len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
584-
len = SKB_DATA_ALIGN(len);
585661

586662
if (sk_memalloc_socks())
587663
gfp_mask |= __GFP_MEMALLOC;
588664

589-
data = page_frag_alloc(&nc->page, len, gfp_mask);
665+
if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
666+
/* we are artificially inflating the allocation size, but
667+
* that is not as bad as it may look, as:
668+
* - 'len' less than GRO_MAX_HEAD makes little sense
669+
* - On most systems, larger 'len' values lead to fragment
670+
* size above 512 bytes
671+
* - kmalloc would use the kmalloc-1k slab for such values
672+
* - Builds with smaller GRO_MAX_HEAD will very likely do
673+
* little networking, as that implies no WiFi and no
674+
* tunnel support, and 32-bit arches.
675+
*/
676+
len = SZ_1K;
677+
678+
data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
679+
pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
680+
} else {
681+
len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
682+
len = SKB_DATA_ALIGN(len);
683+
684+
data = page_frag_alloc(&nc->page, len, gfp_mask);
685+
pfmemalloc = nc->page.pfmemalloc;
686+
}
687+
590688
if (unlikely(!data))
591689
return NULL;
592690

@@ -596,7 +694,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
596694
return NULL;
597695
}
598696

599-
if (nc->page.pfmemalloc)
697+
if (pfmemalloc)
600698
skb->pfmemalloc = 1;
601699
skb->head_frag = 1;
602700

net/core/sock.c

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,12 +1030,12 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
10301030
* at the socket level. Everything here is generic.
10311031
*/
10321032

1033-
int sock_setsockopt(struct socket *sock, int level, int optname,
1034-
sockptr_t optval, unsigned int optlen)
1033+
int sk_setsockopt(struct sock *sk, int level, int optname,
1034+
sockptr_t optval, unsigned int optlen)
10351035
{
10361036
struct so_timestamping timestamping;
1037+
struct socket *sock = sk->sk_socket;
10371038
struct sock_txtime sk_txtime;
1038-
struct sock *sk = sock->sk;
10391039
int val;
10401040
int valbool;
10411041
struct linger ling;
@@ -1468,6 +1468,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
14681468
release_sock(sk);
14691469
return ret;
14701470
}
1471+
1472+
int sock_setsockopt(struct socket *sock, int level, int optname,
1473+
sockptr_t optval, unsigned int optlen)
1474+
{
1475+
return sk_setsockopt(sock->sk, level, optname,
1476+
optval, optlen);
1477+
}
14711478
EXPORT_SYMBOL(sock_setsockopt);
14721479

14731480
static const struct cred *sk_get_peer_cred(struct sock *sk)

net/ipv4/udp.c

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1446,7 +1446,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
14461446
if (likely(partial)) {
14471447
up->forward_deficit += size;
14481448
size = up->forward_deficit;
1449-
if (size < (sk->sk_rcvbuf >> 2) &&
1449+
if (size < READ_ONCE(up->forward_threshold) &&
14501450
!skb_queue_empty(&up->reader_queue))
14511451
return;
14521452
} else {
@@ -1620,7 +1620,7 @@ static void udp_destruct_sock(struct sock *sk)
16201620

16211621
int udp_init_sock(struct sock *sk)
16221622
{
1623-
skb_queue_head_init(&udp_sk(sk)->reader_queue);
1623+
udp_lib_init_sock(sk);
16241624
sk->sk_destruct = udp_destruct_sock;
16251625
return 0;
16261626
}
@@ -2684,6 +2684,18 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
26842684
int err = 0;
26852685
int is_udplite = IS_UDPLITE(sk);
26862686

2687+
if (level == SOL_SOCKET) {
2688+
err = sk_setsockopt(sk, level, optname, optval, optlen);
2689+
2690+
if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) {
2691+
lock_sock(sk);
2692+
/* paired with READ_ONCE in udp_rmem_release() */
2693+
WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2);
2694+
release_sock(sk);
2695+
}
2696+
return err;
2697+
}
2698+
26872699
if (optlen < sizeof(int))
26882700
return -EINVAL;
26892701

@@ -2797,7 +2809,7 @@ EXPORT_SYMBOL(udp_lib_setsockopt);
27972809
int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
27982810
unsigned int optlen)
27992811
{
2800-
if (level == SOL_UDP || level == SOL_UDPLITE)
2812+
if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET)
28012813
return udp_lib_setsockopt(sk, level, optname,
28022814
optval, optlen,
28032815
udp_push_pending_frames);

net/ipv6/udp.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ static void udpv6_destruct_sock(struct sock *sk)
6363

6464
int udpv6_init_sock(struct sock *sk)
6565
{
66-
skb_queue_head_init(&udp_sk(sk)->reader_queue);
66+
udp_lib_init_sock(sk);
6767
sk->sk_destruct = udpv6_destruct_sock;
6868
return 0;
6969
}
@@ -1671,7 +1671,7 @@ void udpv6_destroy_sock(struct sock *sk)
16711671
int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
16721672
unsigned int optlen)
16731673
{
1674-
if (level == SOL_UDP || level == SOL_UDPLITE)
1674+
if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET)
16751675
return udp_lib_setsockopt(sk, level, optname,
16761676
optval, optlen,
16771677
udp_v6_push_pending_frames);

0 commit comments

Comments
 (0)