Development discussion of WireGuard
* [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
@ 2024-06-09  8:27 Julia Lawall
  2024-06-09  8:27 ` [PATCH 01/14] wireguard: allowedips: " Julia Lawall
  2024-06-12 21:33 ` [PATCH 00/14] " Jakub Kicinski
  0 siblings, 2 replies; 50+ messages in thread
From: Julia Lawall @ 2024-06-09  8:27 UTC (permalink / raw)
  To: linux-block
  Cc: kernel-janitors, bridge, linux-trace-kernel, Mathieu Desnoyers,
	kvm, linuxppc-dev, Naveen N. Rao, Christophe Leroy,
	Nicholas Piggin, netdev, wireguard, linux-kernel, ecryptfs,
	Neil Brown, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs,
	linux-can, Lai Jiangshan, netfilter-devel, coreteam,
	Paul E . McKenney, Vlastimil Babka

Since SLOB was removed, it is not necessary to use call_rcu
when the callback only performs kmem_cache_free. Use
kfree_rcu() directly.

The changes were done using the following Coccinelle semantic patch.
This semantic patch is designed to ignore cases where the callback
function is used in another way.
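
For illustration, this is the shape of the transformation, using the
wireguard code from patch 01 (node_cache, struct allowedips_node, and
the rcu field are the real names there; the sketch is abbreviated):

	/* Before: a callback whose only work is kmem_cache_free()... */
	static void node_free_rcu(struct rcu_head *rcu)
	{
		kmem_cache_free(node_cache,
				container_of(rcu, struct allowedips_node, rcu));
	}
	/* ...invoked as: */
	call_rcu(&node->rcu, node_free_rcu);

	/* After: the callback is gone; kfree_rcu() names the rcu_head
	 * field and frees the slab object after a grace period. */
	kfree_rcu(node, rcu);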

// <smpl>
@r@
expression e;
local idexpression e2;
identifier cb,f;
position p;
@@

(
call_rcu(...,e2)
|
call_rcu(&e->f,cb@p)
)

@r1@
type T;
identifier x,r.cb;
@@

 cb(...) {
(
   kmem_cache_free(...);
|
   T x = ...;
   kmem_cache_free(...,x);
|
   T x;
   x = ...;
   kmem_cache_free(...,x);
)
 }

@s depends on r1@
position p != r.p;
identifier r.cb;
@@

 cb@p

@script:ocaml@
cb << r.cb;
p << s.p;
@@

Printf.eprintf "Other use of %s at %s:%d\n"
   cb (List.hd p).file (List.hd p).line

@depends on r1 && !s@
expression e;
identifier r.cb,f;
position r.p;
@@

- call_rcu(&e->f,cb@p)
+ kfree_rcu(e,f)

@r1a depends on !s@
type T;
identifier x,r.cb;
@@

- cb(...) {
(
-  kmem_cache_free(...);
|
-  T x = ...;
-  kmem_cache_free(...,x);
|
-  T x;
-  x = ...;
-  kmem_cache_free(...,x);
)
- }
// </smpl>

Signed-off-by: Julia Lawall <Julia.Lawall@inria.fr>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>

---

 arch/powerpc/kvm/book3s_mmu_hpte.c  |    8 +-------
 block/blk-ioc.c                     |    9 +--------
 drivers/net/wireguard/allowedips.c  |    9 ++-------
 fs/ecryptfs/dentry.c                |    8 +-------
 fs/nfsd/nfs4state.c                 |    9 +--------
 fs/tracefs/inode.c                  |   10 +---------
 kernel/time/posix-timers.c          |    9 +--------
 kernel/workqueue.c                  |    8 +-------
 net/bridge/br_fdb.c                 |    9 +--------
 net/can/gw.c                        |   13 +++----------
 net/ipv4/fib_trie.c                 |    8 +-------
 net/ipv4/inetpeer.c                 |    9 ++-------
 net/ipv6/ip6_fib.c                  |    9 +--------
 net/ipv6/xfrm6_tunnel.c             |    8 +-------
 net/kcm/kcmsock.c                   |   10 +---------
 net/netfilter/nf_conncount.c        |   10 +---------
 net/netfilter/nf_conntrack_expect.c |   10 +---------
 net/netfilter/xt_hashlimit.c        |    9 +--------
 18 files changed, 22 insertions(+), 143 deletions(-)


* [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-09  8:27 [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback Julia Lawall
@ 2024-06-09  8:27 ` Julia Lawall
  2024-06-09 14:32   ` Jason A. Donenfeld
  2024-06-12 21:33 ` [PATCH 00/14] " Jakub Kicinski
  1 sibling, 1 reply; 50+ messages in thread
From: Julia Lawall @ 2024-06-09  8:27 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: kernel-janitors, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, wireguard, netdev, linux-kernel, Paul E . McKenney,
	Vlastimil Babka

Since SLOB was removed, it is not necessary to use call_rcu
when the callback only performs kmem_cache_free. Use
kfree_rcu() directly.

The changes were done using the following Coccinelle semantic patch.
This semantic patch is designed to ignore cases where the callback
function is used in another way.

// <smpl>
@r@
expression e;
local idexpression e2;
identifier cb,f;
position p;
@@

(
call_rcu(...,e2)
|
call_rcu(&e->f,cb@p)
)

@r1@
type T;
identifier x,r.cb;
@@

 cb(...) {
(
   kmem_cache_free(...);
|
   T x = ...;
   kmem_cache_free(...,x);
|
   T x;
   x = ...;
   kmem_cache_free(...,x);
)
 }

@s depends on r1@
position p != r.p;
identifier r.cb;
@@

 cb@p

@script:ocaml@
cb << r.cb;
p << s.p;
@@

Printf.eprintf "Other use of %s at %s:%d\n"
   cb (List.hd p).file (List.hd p).line

@depends on r1 && !s@
expression e;
identifier r.cb,f;
position r.p;
@@

- call_rcu(&e->f,cb@p)
+ kfree_rcu(e,f)

@r1a depends on !s@
type T;
identifier x,r.cb;
@@

- cb(...) {
(
-  kmem_cache_free(...);
|
-  T x = ...;
-  kmem_cache_free(...,x);
|
-  T x;
-  x = ...;
-  kmem_cache_free(...,x);
)
- }
// </smpl>

Signed-off-by: Julia Lawall <Julia.Lawall@inria.fr>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>

---
 drivers/net/wireguard/allowedips.c |    9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c
index 0ba714ca5185..e4e1638fce1b 100644
--- a/drivers/net/wireguard/allowedips.c
+++ b/drivers/net/wireguard/allowedips.c
@@ -48,11 +48,6 @@ static void push_rcu(struct allowedips_node **stack,
 	}
 }
 
-static void node_free_rcu(struct rcu_head *rcu)
-{
-	kmem_cache_free(node_cache, container_of(rcu, struct allowedips_node, rcu));
-}
-
 static void root_free_rcu(struct rcu_head *rcu)
 {
 	struct allowedips_node *node, *stack[MAX_ALLOWEDIPS_DEPTH] = {
@@ -330,13 +325,13 @@ void wg_allowedips_remove_by_peer(struct allowedips *table,
 			child = rcu_dereference_protected(
 					parent->bit[!(node->parent_bit_packed & 1)],
 					lockdep_is_held(lock));
-		call_rcu(&node->rcu, node_free_rcu);
+		kfree_rcu(node, rcu);
 		if (!free_parent)
 			continue;
 		if (child)
 			child->parent_bit_packed = parent->parent_bit_packed;
 		*(struct allowedips_node **)(parent->parent_bit_packed & ~3UL) = child;
-		call_rcu(&parent->rcu, node_free_rcu);
+		kfree_rcu(parent, rcu);
 	}
 }
 



* Re: [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-09  8:27 ` [PATCH 01/14] wireguard: allowedips: " Julia Lawall
@ 2024-06-09 14:32   ` Jason A. Donenfeld
  2024-06-09 14:36     ` Julia Lawall
  2024-06-10 20:38     ` Vlastimil Babka
  0 siblings, 2 replies; 50+ messages in thread
From: Jason A. Donenfeld @ 2024-06-09 14:32 UTC (permalink / raw)
  To: Julia Lawall
  Cc: kernel-janitors, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, wireguard, netdev, linux-kernel, Paul E . McKenney,
	Vlastimil Babka

Hi Julia & Vlastimil,

On Sun, Jun 09, 2024 at 10:27:13AM +0200, Julia Lawall wrote:
> Since SLOB was removed, it is not necessary to use call_rcu
> when the callback only performs kmem_cache_free. Use
> kfree_rcu() directly.

Thanks, I applied this to the wireguard tree, and I'll send this out as
a fix for 6.10. Let me know if this is unfavorable to you and if you'd
like to take this somewhere yourself, in which case I'll give you my
ack.

Just a question, though, for Vlastimil -- I know that with the SLOB
removal, kfree() is now allowed on kmemcache'd objects. Do you plan to
do a blanket s/kmem_cache_free/kfree/g at some point, and then remove
kmem_cache_free altogether?

Jason


* Re: [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-09 14:32   ` Jason A. Donenfeld
@ 2024-06-09 14:36     ` Julia Lawall
  2024-06-10 20:38     ` Vlastimil Babka
  1 sibling, 0 replies; 50+ messages in thread
From: Julia Lawall @ 2024-06-09 14:36 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Julia Lawall, kernel-janitors, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, wireguard, netdev, linux-kernel,
	Paul E . McKenney, Vlastimil Babka



On Sun, 9 Jun 2024, Jason A. Donenfeld wrote:

> Hi Julia & Vlastimil,
>
> On Sun, Jun 09, 2024 at 10:27:13AM +0200, Julia Lawall wrote:
> > Since SLOB was removed, it is not necessary to use call_rcu
> > when the callback only performs kmem_cache_free. Use
> > kfree_rcu() directly.
>
> Thanks, I applied this to the wireguard tree, and I'll send this out as
> a fix for 6.10. Let me know if this is unfavorable to you and if you'd
> like to take this somewhere yourself, in which case I'll give you my
> ack.

Please push it onward.

julia

>
> Just a question, though, for Vlastimil -- I know that with the SLOB
> removal, kfree() is now allowed on kmemcache'd objects. Do you plan to
> do a blanket s/kmem_cache_free/kfree/g at some point, and then remove
> kmem_cache_free altogether?
>
> Jason
>


* Re: [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-09 14:32   ` Jason A. Donenfeld
  2024-06-09 14:36     ` Julia Lawall
@ 2024-06-10 20:38     ` Vlastimil Babka
  2024-06-10 20:59       ` Jason A. Donenfeld
  1 sibling, 1 reply; 50+ messages in thread
From: Vlastimil Babka @ 2024-06-10 20:38 UTC (permalink / raw)
  To: Jason A. Donenfeld, Julia Lawall
  Cc: kernel-janitors, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, wireguard, netdev, linux-kernel, Paul E . McKenney,
	Matthew Wilcox

On 6/9/24 4:32 PM, Jason A. Donenfeld wrote:
> Hi Julia & Vlastimil,
> 
> On Sun, Jun 09, 2024 at 10:27:13AM +0200, Julia Lawall wrote:
>> Since SLOB was removed, it is not necessary to use call_rcu
>> when the callback only performs kmem_cache_free. Use
>> kfree_rcu() directly.
> 
> Thanks, I applied this to the wireguard tree, and I'll send this out as
> a fix for 6.10. Let me know if this is unfavorable to you and if you'd
> like to take this somewhere yourself, in which case I'll give you my
> ack.
> 
> Just a question, though, for Vlastimil -- I know that with the SLOB
> removal, kfree() is now allowed on kmemcache'd objects. Do you plan to
> do a blanket s/kmem_cache_free/kfree/g at some point, and then remove
> kmem_cache_free altogether?

Hmm, not really, but obligatory Cc for willy who'd love to have "one free()
to rule them all" IIRC.

My current thinking is that kmem_cache_free() can save the kmem_cache
lookup, or serve as a double check if debugging is enabled, and doesn't have
much downside. If someone wants to not care about the kmem_cache pointer,
they can use kfree(). Even convert their subsystem at will. But a mass
conversion of everything would be rather a lot of churn for not much of a
benefit, IMHO.
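
(As a minimal sketch of the two idioms in question, with foo_cache and
struct foo as placeholder names:)

	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
	...
	/* Either: no cache lookup needed, and debugging builds can
	 * verify that f really belongs to foo_cache. */
	kmem_cache_free(foo_cache, f);
	/* Or, valid since SLOB's removal: the kmem_cache is derived
	 * from the slab backing f. */
	kfree(f);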


* Re: [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-10 20:38     ` Vlastimil Babka
@ 2024-06-10 20:59       ` Jason A. Donenfeld
  0 siblings, 0 replies; 50+ messages in thread
From: Jason A. Donenfeld @ 2024-06-10 20:59 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Julia Lawall, kernel-janitors, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, wireguard, netdev, linux-kernel,
	Paul E . McKenney, Matthew Wilcox, linux-mm

Hi Vlastimil,

On Mon, Jun 10, 2024 at 10:38:08PM +0200, Vlastimil Babka wrote:
> On 6/9/24 4:32 PM, Jason A. Donenfeld wrote:
> > Hi Julia & Vlastimil,
> > 
> > On Sun, Jun 09, 2024 at 10:27:13AM +0200, Julia Lawall wrote:
> >> Since SLOB was removed, it is not necessary to use call_rcu
> >> when the callback only performs kmem_cache_free. Use
> >> kfree_rcu() directly.
> > 
> > Thanks, I applied this to the wireguard tree, and I'll send this out as
> > a fix for 6.10. Let me know if this is unfavorable to you and if you'd
> > like to take this somewhere yourself, in which case I'll give you my
> > ack.
> > 
> > Just a question, though, for Vlastimil -- I know that with the SLOB
> > removal, kfree() is now allowed on kmemcache'd objects. Do you plan to
> > do a blanket s/kmem_cache_free/kfree/g at some point, and then remove
> > kmem_cache_free altogether?
> 
> Hmm, not really, but obligatory Cc for willy who'd love to have "one free()
> to rule them all" IIRC.
> 
> My current thinking is that kmem_cache_free() can save the kmem_cache
> lookup, or serve as a double check if debugging is enabled, and doesn't have
> much downside. If someone wants to not care about the kmem_cache pointer,
> they can use kfree(). Even convert their subsystem at will. But a mass
> conversion of everything would be rather a lot of churn for not much of a
> benefit, IMHO.

Huh, interesting. I can see the practical sense in that, not causing
unnecessary churn and such.

At the same time, this doesn't appeal much to some sort of orderly part
of my mind. Either all kmalloc/kmem_cache memory is kfree()d as the rule
for what is best, or a kmalloc pairs with a kfree and a kmem_cache_alloc
pairs with a kmem_cache_free and that's the rule. And those can be
checked and enforced and so forth. But saying, "oh, well, they might
work a bit different, but whatever you want is basically fine; there's
no rhyme or reason" is somehow dissatisfying. Maybe the rule is
actually, "use kmem_cache_free if you can because it saves a pointer
lookup, but don't go out of your way to do that and certainly don't
bloat .text to make it happen," then maybe that makes sense? But I
dunno, I find myself wanting a rule and consistency. (Did you find it
annoying that in this paragraph, I used () on only one function mention
but not on the others? If so, maybe you're like me.) Maybe I should just
chill though. Anyway, only my 2¢, and my opinion here isn't worth much,
so please regard this as only a gut statement from a bystander.

Jason


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-09  8:27 [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback Julia Lawall
  2024-06-09  8:27 ` [PATCH 01/14] wireguard: allowedips: " Julia Lawall
@ 2024-06-12 21:33 ` Jakub Kicinski
  2024-06-12 22:37   ` Paul E. McKenney
  1 sibling, 1 reply; 50+ messages in thread
From: Jakub Kicinski @ 2024-06-12 21:33 UTC (permalink / raw)
  To: Julia Lawall
  Cc: linux-block, kernel-janitors, bridge, linux-trace-kernel,
	Mathieu Desnoyers, kvm, linuxppc-dev, Naveen N. Rao,
	Christophe Leroy, Nicholas Piggin, netdev, wireguard,
	linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia, Dai Ngo,
	Tom Talpey, linux-nfs, linux-can, Lai Jiangshan, netfilter-devel,
	coreteam, Paul E . McKenney, Vlastimil Babka

On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> Since SLOB was removed, it is not necessary to use call_rcu
> when the callback only performs kmem_cache_free. Use
> kfree_rcu() directly.
> 
> The changes were done using the following Coccinelle semantic patch.
> This semantic patch is designed to ignore cases where the callback
> function is used in another way.

How does the discussion on:
  [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
  https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
reflect on this series? IIUC we should hold off..


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-12 21:33 ` [PATCH 00/14] " Jakub Kicinski
@ 2024-06-12 22:37   ` Paul E. McKenney
  2024-06-12 22:46     ` Jakub Kicinski
                       ` (3 more replies)
  0 siblings, 4 replies; 50+ messages in thread
From: Paul E. McKenney @ 2024-06-12 22:37 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Julia Lawall, linux-block, kernel-janitors, bridge,
	linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, Vlastimil Babka

On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > Since SLOB was removed, it is not necessary to use call_rcu
> > when the callback only performs kmem_cache_free. Use
> > kfree_rcu() directly.
> > 
> > The changes were done using the following Coccinelle semantic patch.
> > This semantic patch is designed to ignore cases where the callback
> > function is used in another way.
> 
> How does the discussion on:
>   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
>   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> reflect on this series? IIUC we should hold off..

We do need to hold off for the ones in kernel modules (such as 07/14)
where the kmem_cache is destroyed during module unload.

OK, I might as well go through them...

[PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
	Needs to wait, see wg_allowedips_slab_uninit().

[PATCH 02/14] net: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
	I don't immediately see the rcu_barrier(), but if there isn't
	one in there somewhere there probably should be.  Caution
	suggests a need to wait.

[PATCH 03/14] KVM: PPC: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
	I don't immediately see the rcu_barrier(), but if there isn't
	one in there somewhere there probably should be.  Caution
	suggests a need to wait.

[PATCH 04/14] xfrm6_tunnel: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
	Needs to wait, see xfrm6_tunnel_fini().

[PATCH 05/14] tracefs: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
	This one is fine because the tracefs_inode_cachep kmem_cache
	is created at boot and never destroyed.

[PATCH 06/14] eCryptfs: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
	I don't see a kmem_cache_destroy(), but then again, I also don't
	see the kmem_cache_create().  Unless someone can see what I am
	not seeing, let's wait.

[PATCH 07/14] net: bridge: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
	Needs to wait, see br_fdb_fini() and br_deinit().

[PATCH 08/14] nfsd: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
	I don't immediately see the rcu_barrier(), but if there isn't
	one in there somewhere there probably should be.  Caution
	suggests a need to wait.

[PATCH 09/14] block: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
	I don't see a kmem_cache_destroy(), but then again, I also don't
	see the kmem_cache_create().  Unless someone can see what I am
	not seeing, let's wait.

[PATCH 10/14] can: gw: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
	Needs to wait, see cgw_module_exit().

[PATCH 11/14] posix-timers: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
	This one is fine because the posix_timers_cache kmem_cache is
	created at boot and never destroyed.

[PATCH 12/14] workqueue: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
	This one is fine because the pwq_cache kmem_cache is created at
	boot and never destroyed.

[PATCH 13/14] kcm: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
	I don't immediately see the rcu_barrier(), but if there isn't
	one in there somewhere there probably should be.  Caution
	suggests a need to wait.

[PATCH 14/14] netfilter: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
	Needs to wait, see hashlimit_mt_exit().

So 05/14, 11/14 and 12/14 are OK and can go ahead.  The rest need some
help.

Apologies for my having gotten overly enthusiastic about this change!

							Thanx, Paul


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-12 22:37   ` Paul E. McKenney
@ 2024-06-12 22:46     ` Jakub Kicinski
       [not found]     ` <7e58e73d-4173-49fe-8f05-38a3699bc2c1@kernel.dk>
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 50+ messages in thread
From: Jakub Kicinski @ 2024-06-12 22:46 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Julia Lawall, linux-block, kernel-janitors, bridge,
	linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, Vlastimil Babka

On Wed, 12 Jun 2024 15:37:55 -0700 Paul E. McKenney wrote:
> So 05/14, 11/14 and 12/14 are OK and can go ahead.  The rest need some
> help.

Thank you for the breakdown!


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
       [not found]     ` <7e58e73d-4173-49fe-8f05-38a3699bc2c1@kernel.dk>
@ 2024-06-12 23:04       ` Paul E. McKenney
  0 siblings, 0 replies; 50+ messages in thread
From: Paul E. McKenney @ 2024-06-12 23:04 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, Vlastimil Babka

On Wed, Jun 12, 2024 at 04:52:57PM -0600, Jens Axboe wrote:
> On 6/12/24 4:37 PM, Paul E. McKenney wrote:
> > [PATCH 09/14] block: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> > 	I don't see a kmem_cache_destroy(), but then again, I also don't
> > 	see the kmem_cache_create().  Unless someone can see what I am
> > 	not seeing, let's wait.
> 
> It's in that same file:
> 
> blk_ioc_init()
> 
> the cache itself never goes away, as the ioc code is not unloadable. So
> I think the change there should be fine.

Thank you, Jens!  (And to Jakub for motivating me to go look.)

So to update the scorecard, 05/14, 09/14, 11/14 and 12/14 are OK and
can go ahead.

							Thanx, Paul


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-12 22:37   ` Paul E. McKenney
  2024-06-12 22:46     ` Jakub Kicinski
       [not found]     ` <7e58e73d-4173-49fe-8f05-38a3699bc2c1@kernel.dk>
@ 2024-06-12 23:31     ` Jason A. Donenfeld
  2024-06-13  0:31       ` Jason A. Donenfeld
  2024-06-13 11:58     ` Jason A. Donenfeld
  3 siblings, 1 reply; 50+ messages in thread
From: Jason A. Donenfeld @ 2024-06-12 23:31 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, Vlastimil Babka

On Wed, Jun 12, 2024 at 03:37:55PM -0700, Paul E. McKenney wrote:
> On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> > On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > > Since SLOB was removed, it is not necessary to use call_rcu
> > > when the callback only performs kmem_cache_free. Use
> > > kfree_rcu() directly.
> > > 
> > > The changes were done using the following Coccinelle semantic patch.
> > > This semantic patch is designed to ignore cases where the callback
> > > function is used in another way.
> > 
> > How does the discussion on:
> >   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
> >   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> > reflect on this series? IIUC we should hold off..
> 
> We do need to hold off for the ones in kernel modules (such as 07/14)
> where the kmem_cache is destroyed during module unload.
> 
> OK, I might as well go through them...
> 
> [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> 	Needs to wait, see wg_allowedips_slab_uninit().

Right, this has exactly the same pattern as the batman-adv issue:

    void wg_allowedips_slab_uninit(void)
    {
            rcu_barrier();
            kmem_cache_destroy(node_cache);
    }

I'll hold off on sending that up until this matter is resolved.

Jason


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-12 23:31     ` Jason A. Donenfeld
@ 2024-06-13  0:31       ` Jason A. Donenfeld
  2024-06-13  3:38         ` Paul E. McKenney
  0 siblings, 1 reply; 50+ messages in thread
From: Jason A. Donenfeld @ 2024-06-13  0:31 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, Vlastimil Babka

On Thu, Jun 13, 2024 at 01:31:57AM +0200, Jason A. Donenfeld wrote:
> On Wed, Jun 12, 2024 at 03:37:55PM -0700, Paul E. McKenney wrote:
> > On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> > > On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > > > Since SLOB was removed, it is not necessary to use call_rcu
> > > > when the callback only performs kmem_cache_free. Use
> > > > kfree_rcu() directly.
> > > > 
> > > > The changes were done using the following Coccinelle semantic patch.
> > > > This semantic patch is designed to ignore cases where the callback
> > > > function is used in another way.
> > > 
> > > How does the discussion on:
> > >   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
> > >   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> > > reflect on this series? IIUC we should hold off..
> > 
> > We do need to hold off for the ones in kernel modules (such as 07/14)
> > where the kmem_cache is destroyed during module unload.
> > 
> > OK, I might as well go through them...
> > 
> > [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> > 	Needs to wait, see wg_allowedips_slab_uninit().
> 
> Right, this has exactly the same pattern as the batman-adv issue:
> 
>     void wg_allowedips_slab_uninit(void)
>     {
>             rcu_barrier();
>             kmem_cache_destroy(node_cache);
>     }
> 
> I'll hold off on sending that up until this matter is resolved.

BTW, I think this whole thing might be caused by:

    a35d16905efc ("rcu: Add basic support for kfree_rcu() batching")

The commit message there mentions:

    There is an implication with rcu_barrier() with this patch. Since the
    kfree_rcu() calls can be batched, and may not be handed yet to the RCU
    machinery in fact, the monitor may not have even run yet to do the
    queue_rcu_work(), there seems no easy way of implementing rcu_barrier()
    to wait for those kfree_rcu()s that are already made. So this means a
    kfree_rcu() followed by an rcu_barrier() does not imply that memory will
    be freed once rcu_barrier() returns.

Before that, a kfree_rcu() used to just add a normal call_rcu() into the
list, but with the function offset < 4096 as a special marker. So the
kfree_rcu() calls would be treated alongside the other call_rcu() ones
and thus affected by rcu_barrier(). Looks like that behavior is no more
since this commit.
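
(Roughly, and simplified from the pre-batching code, the trick was:)

	/* kfree_rcu(ptr, field) queued an ordinary callback whose
	 * "function" was really the offset of the rcu_head within
	 * the enclosing object: */
	call_rcu(&ptr->field,
		 (rcu_callback_t)(unsigned long)offsetof(typeof(*ptr), field));

	/* ...and the callback invoker special-cased small values: */
	static void rcu_reclaim_sketch(struct rcu_head *head)
	{
		unsigned long offset = (unsigned long)head->func;

		if (offset < 4096)	/* an offset, not a function pointer */
			kfree((void *)head - offset);
		else
			head->func(head);
	}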

Rather than getting rid of the batching, which seems good for
efficiency, I wonder if the right fix to this would be adding a
`should_destroy` boolean to kmem_cache, which kmem_cache_destroy() sets
to true. And then right after it checks `if (number_of_allocations == 0)
actually_destroy()`, and likewise on each kmem_cache_free(), it could
check `if (should_destroy && number_of_allocations == 0)
actually_destroy()`. This way, the work is delayed until it's safe to do
so. This might also mitigate other lurking bugs of bad code that calls
kmem_cache_destroy() before kmem_cache_free().
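
(In pseudo-C, with invented field and helper names rather than real slab
internals, that would look something like:)

	void kmem_cache_destroy(struct kmem_cache *s)
	{
		s->should_destroy = true;
		if (number_of_allocations(s) == 0)
			actually_destroy(s);
	}

	void kmem_cache_free(struct kmem_cache *s, void *obj)
	{
		__free_to_slab(s, obj);		/* the existing free path */
		if (s->should_destroy && number_of_allocations(s) == 0)
			actually_destroy(s);
	}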

Jason


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13  0:31       ` Jason A. Donenfeld
@ 2024-06-13  3:38         ` Paul E. McKenney
  2024-06-13 12:22           ` Jason A. Donenfeld
  2024-06-13 14:17           ` Jakub Kicinski
  0 siblings, 2 replies; 50+ messages in thread
From: Paul E. McKenney @ 2024-06-13  3:38 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, Vlastimil Babka

On Thu, Jun 13, 2024 at 02:31:53AM +0200, Jason A. Donenfeld wrote:
> On Thu, Jun 13, 2024 at 01:31:57AM +0200, Jason A. Donenfeld wrote:
> > On Wed, Jun 12, 2024 at 03:37:55PM -0700, Paul E. McKenney wrote:
> > > On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> > > > On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > > > > Since SLOB was removed, it is not necessary to use call_rcu
> > > > > when the callback only performs kmem_cache_free. Use
> > > > > kfree_rcu() directly.
> > > > > 
> > > > > The changes were done using the following Coccinelle semantic patch.
> > > > > This semantic patch is designed to ignore cases where the callback
> > > > > function is used in another way.
> > > > 
> > > > How does the discussion on:
> > > >   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
> > > >   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> > > > reflect on this series? IIUC we should hold off..
> > > 
> > > We do need to hold off for the ones in kernel modules (such as 07/14)
> > > where the kmem_cache is destroyed during module unload.
> > > 
> > > OK, I might as well go through them...
> > > 
> > > [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> > > 	Needs to wait, see wg_allowedips_slab_uninit().
> > 
> > Right, this has exactly the same pattern as the batman-adv issue:
> > 
> >     void wg_allowedips_slab_uninit(void)
> >     {
> >             rcu_barrier();
> >             kmem_cache_destroy(node_cache);
> >     }
> > 
> > I'll hold off on sending that up until this matter is resolved.
> 
> BTW, I think this whole thing might be caused by:
> 
>     a35d16905efc ("rcu: Add basic support for kfree_rcu() batching")
> 
> The commit message there mentions:
> 
>     There is an implication with rcu_barrier() with this patch. Since the
>     kfree_rcu() calls can be batched, and may not be handed yet to the RCU
>     machinery in fact, the monitor may not have even run yet to do the
>     queue_rcu_work(), there seems no easy way of implementing rcu_barrier()
>     to wait for those kfree_rcu()s that are already made. So this means a
>     kfree_rcu() followed by an rcu_barrier() does not imply that memory will
>     be freed once rcu_barrier() returns.
> 
> Before that, a kfree_rcu() used to just add a normal call_rcu() into the
> list, but with the function offset < 4096 as a special marker. So the
> kfree_rcu() calls would be treated alongside the other call_rcu() ones
> and thus affected by rcu_barrier(). Looks like that behavior is no more
> since this commit.

You might well be right, and thank you for digging into this!

> Rather than getting rid of the batching, which seems good for
> efficiency, I wonder if the right fix to this would be adding a
> `should_destroy` boolean to kmem_cache, which kmem_cache_destroy() sets
> to true. And then right after it checks `if (number_of_allocations == 0)
> actually_destroy()`, and likewise on each kmem_cache_free(), it could
> check `if (should_destroy && number_of_allocations == 0)
> actually_destroy()`. This way, the work is delayed until it's safe to do
> so. This might also mitigate other lurking bugs of bad code that calls
> kmem_cache_destroy() before kmem_cache_free().

Here are the current options being considered, including those that
are completely brain-dead:

o	Document current state.  (Must use call_rcu() if module
	destroys slab of RCU-protected objects.)

	Need to review Julia's and Uladzislau's series of patches
	that change call_rcu() of slab objects to kfree_rcu().

o	Make rcu_barrier() wait for kfree_rcu() objects.  (This is
	surprisingly complex and will wait unnecessarily in some cases.
	However, it does preserve current code.)

o	Make a kfree_rcu_barrier() that waits for kfree_rcu() objects.
	(This avoids the unnecessary waits, but adds complexity to
	kfree_rcu().  This is harder than it looks, but could be done,
	for example by maintaining pairs of per-CPU counters and handling
	them in an SRCU-like fashion.  Need some way of communicating the
	index, though.)

	(There might be use cases where both rcu_barrier() and
	kfree_rcu_barrier() would need to be invoked.)

	A simpler way to implement this is to scan all of the in-flight
	objects, and queue each (either separately or in bulk) using
	call_rcu().  This still has problems with kfree_rcu_mightsleep()
	under low-memory conditions, in which case there are a bunch
	of synchronize_rcu() instances waiting.  These instances could
	use SRCU-like per-CPU arrays of counters.  Or just protect the
	calls to synchronize_rcu() and the later frees with an SRCU
	reader, then have the other end call synchronize_srcu().

o	Make the current kmem_cache_destroy() asynchronously wait for
	all memory to be returned, then complete the destruction.
	(This gets rid of a valuable debugging technique because
	in normal use, it is a bug to attempt to destroy a kmem_cache
	that has objects still allocated.)

o	Make a kmem_cache_destroy_rcu() that asynchronously waits for
	all memory to be returned, then completes the destruction.
	(This raises the question of what to do if it takes a "long time"
	for the objects to be freed.)

o	Make a kmem_cache_free_barrier() that blocks until all
	objects in the specified kmem_cache have been freed.

o	Make a kmem_cache_destroy_wait() that waits for all memory to
	be returned, then does the destruction.  This is equivalent to:

		kmem_cache_free_barrier(&mycache);
		kmem_cache_destroy(&mycache);

Uladzislau has started discussions on the last few of these:
https://lore.kernel.org/all/ZmnL4jkhJLIW924W@pc636/

I have also added this information to a Google Document for
easier tracking:
https://docs.google.com/document/d/1v0rcZLvvjVGejT3523W0rDy_sLFu2LWc_NR3fQItZaA/edit?usp=sharing

Other thoughts?

							Thanx, Paul


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-12 22:37   ` Paul E. McKenney
                       ` (2 preceding siblings ...)
  2024-06-12 23:31     ` Jason A. Donenfeld
@ 2024-06-13 11:58     ` Jason A. Donenfeld
  2024-06-13 12:47       ` Paul E. McKenney
  3 siblings, 1 reply; 50+ messages in thread
From: Jason A. Donenfeld @ 2024-06-13 11:58 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, Vlastimil Babka

On Wed, Jun 12, 2024 at 03:37:55PM -0700, Paul E. McKenney wrote:
> On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> > On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > > Since SLOB was removed, it is not necessary to use call_rcu
> > > when the callback only performs kmem_cache_free. Use
> > > kfree_rcu() directly.
> > > 
> > > The changes were done using the following Coccinelle semantic patch.
> > > This semantic patch is designed to ignore cases where the callback
> > > function is used in another way.
> > 
> > How does the discussion on:
> >   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
> >   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> > reflect on this series? IIUC we should hold off..
> 
> We do need to hold off for the ones in kernel modules (such as 07/14)
> where the kmem_cache is destroyed during module unload.
> 
> OK, I might as well go through them...
> 
> [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> 	Needs to wait, see wg_allowedips_slab_uninit().

Also, notably, this patch needs additionally:

diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c
index e4e1638fce1b..c95f6937c3f1 100644
--- a/drivers/net/wireguard/allowedips.c
+++ b/drivers/net/wireguard/allowedips.c
@@ -377,7 +377,6 @@ int __init wg_allowedips_slab_init(void)

 void wg_allowedips_slab_uninit(void)
 {
-	rcu_barrier();
 	kmem_cache_destroy(node_cache);
 }

Once kmem_cache_destroy has been fixed to be deferrable.

I assume the other patches are similar -- an rcu_barrier() can be
removed. So some manual meddling of these might be in order.

Jason


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13  3:38         ` Paul E. McKenney
@ 2024-06-13 12:22           ` Jason A. Donenfeld
  2024-06-13 12:46             ` Paul E. McKenney
  2024-06-17 15:10             ` Vlastimil Babka
  2024-06-13 14:17           ` Jakub Kicinski
  1 sibling, 2 replies; 50+ messages in thread
From: Jason A. Donenfeld @ 2024-06-13 12:22 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, Vlastimil Babka

On Wed, Jun 12, 2024 at 08:38:02PM -0700, Paul E. McKenney wrote:
> o	Make the current kmem_cache_destroy() asynchronously wait for
> 	all memory to be returned, then complete the destruction.
> 	(This gets rid of a valuable debugging technique because
> 	in normal use, it is a bug to attempt to destroy a kmem_cache
> 	that has objects still allocated.)
> 
> o	Make a kmem_cache_destroy_rcu() that asynchronously waits for
> 	all memory to be returned, then completes the destruction.
> 	(This raises the question of what to do if it takes a "long time"
> 	for the objects to be freed.)

These seem like the best two options.

> o	Make a kmem_cache_free_barrier() that blocks until all
> 	objects in the specified kmem_cache have been freed.
> 
> o	Make a kmem_cache_destroy_wait() that waits for all memory to
> 	be returned, then does the destruction.  This is equivalent to:
> 
> 		kmem_cache_free_barrier(&mycache);
> 		kmem_cache_destroy(&mycache);

These also seem fine, but I'm less keen about blocking behavior.

Though, along the lines of kmem_cache_destroy_rcu(), you might also
consider renaming this last one to kmem_cache_destroy_rcu_wait/barrier().
This way, it's RCU focused, and you can deal directly with the question
of, "how long is too long to block/to memleak?"

Specifically what I mean is that we can still claim a memory leak has
occurred if one batched kfree_rcu freeing grace period has elapsed since
the last call to kmem_cache_destroy_rcu_wait/barrier() or
kmem_cache_destroy_rcu(). In that case, you quit blocking, or you quit
asynchronously waiting, and then you splat about a memleak like we have
now.

But then, if that mechanism generally works, we don't really need a new
function and we can just go with the first option of making
kmem_cache_destroy() asynchronously wait. It'll wait, as you described,
but then we adjust the tail of every kfree_rcu batch freeing cycle to
check if there are _still_ any old outstanding kmem_cache_destroy()
requests. If so, then we can splat and keep the old debugging info we
currently have for finding memleaks.

Jason


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13 12:22           ` Jason A. Donenfeld
@ 2024-06-13 12:46             ` Paul E. McKenney
  2024-06-13 14:11               ` Jason A. Donenfeld
  2024-06-17 15:10             ` Vlastimil Babka
  1 sibling, 1 reply; 50+ messages in thread
From: Paul E. McKenney @ 2024-06-13 12:46 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, Vlastimil Babka

On Thu, Jun 13, 2024 at 02:22:41PM +0200, Jason A. Donenfeld wrote:
> On Wed, Jun 12, 2024 at 08:38:02PM -0700, Paul E. McKenney wrote:
> > o	Make the current kmem_cache_destroy() asynchronously wait for
> > 	all memory to be returned, then complete the destruction.
> > 	(This gets rid of a valuable debugging technique because
> > 	in normal use, it is a bug to attempt to destroy a kmem_cache
> > 	that has objects still allocated.)
> > 
> > o	Make a kmem_cache_destroy_rcu() that asynchronously waits for
> > 	all memory to be returned, then completes the destruction.
> > 	(This raises the question of what to is it takes a "long time"
> > 	for the objects to be freed.)
> 
> These seem like the best two options.

I like them myself, but much depends on how much violence they do to
the slab subsystem and to debuggability.

> > o	Make a kmem_cache_free_barrier() that blocks until all
> > 	objects in the specified kmem_cache have been freed.
> > 
> > o	Make a kmem_cache_destroy_wait() that waits for all memory to
> > 	be returned, then does the destruction.  This is equivalent to:
> > 
> > 		kmem_cache_free_barrier(&mycache);
> > 		kmem_cache_destroy(&mycache);
> 
> These also seem fine, but I'm less keen about blocking behavior.

One advantage of the blocking behavior is that it pinpoints memory
leaks from that slab.  On the other hand, one can argue that you want
this to block during testing but to be asynchronous in production.
Borrowing someone else's hand, there are probably lots of other arguments
one can make.

> Though, along the lines of kmem_cache_destroy_rcu(), you might also
> consider renaming this last one to kmem_cache_destroy_rcu_wait/barrier().
> This way, it's RCU focused, and you can deal directly with the question
> of, "how long is too long to block/to memleak?"

Good point!

> Specifically what I mean is that we can still claim a memory leak has
> occurred if one batched kfree_rcu freeing grace period has elapsed since
> the last call to kmem_cache_destroy_rcu_wait/barrier() or
> kmem_cache_destroy_rcu(). In that case, you quit blocking, or you quit
> asynchronously waiting, and then you splat about a memleak like we have
> now.

How about a kmem_cache_destroy_rcu() that marks the specified cache
for destruction, and then a kmem_cache_destroy_barrier() that waits?

I took the liberty of adding your name to the Google document [1] and
adding this section:

	kmem_cache_destroy_rcu/_barrier()

	The idea here is to provide an asynchronous
	kmem_cache_destroy_rcu() as described above along with a
	kmem_cache_destroy_barrier() that waits for the destruction
	of all prior kmem_cache instances previously passed
	to kmem_cache_destroy_rcu().  Alternatively, kmem_cache_destroy_rcu() could
	return a cookie that could be passed into a later call to
	kmem_cache_destroy_barrier().  This alternative has the
	advantage of isolating which kmem_cache instance is suffering
	the memory leak.
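
(Sketching hypothetical signatures for that alternative; nothing here
exists today:)

	/* Mark for destruction once all objects are freed; returns a
	 * cookie identifying this request. */
	unsigned long kmem_cache_destroy_rcu(struct kmem_cache *s);

	/* Wait for the destruction identified by @cookie to complete. */
	void kmem_cache_destroy_barrier(unsigned long cookie);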

Please let me know if either liberty is in any way problematic.

> But then, if that mechanism generally works, we don't really need a new
> function and we can just go with the first option of making
> kmem_cache_destroy() asynchronously wait. It'll wait, as you described,
> but then we adjust the tail of every kfree_rcu batch freeing cycle to
> check if there are _still_ any old outstanding kmem_cache_destroy()
> requests. If so, then we can splat and keep the old debugging info we
> currently have for finding memleaks.

The mechanism can always be sabotaged by memory-leak bugs on the part
of the user of the kmem_cache structure in play, right?

OK, but I see your point.  I added this to the existing
"kmem_cache_destroy() Lingers for kfree_rcu()" section:

	One way of preserving this debugging information is to splat if
	all of the slab’s memory has not been freed within a reasonable
	timeframe, perhaps the same 21 seconds that causes an RCU CPU
	stall warning.

Does that capture it?

							Thanx, Paul

[1] https://docs.google.com/document/d/1v0rcZLvvjVGejT3523W0rDy_sLFu2LWc_NR3fQItZaA/edit?usp=sharing


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13 11:58     ` Jason A. Donenfeld
@ 2024-06-13 12:47       ` Paul E. McKenney
  2024-06-13 13:06         ` Uladzislau Rezki
  0 siblings, 1 reply; 50+ messages in thread
From: Paul E. McKenney @ 2024-06-13 12:47 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, Vlastimil Babka

On Thu, Jun 13, 2024 at 01:58:59PM +0200, Jason A. Donenfeld wrote:
> On Wed, Jun 12, 2024 at 03:37:55PM -0700, Paul E. McKenney wrote:
> > On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> > > On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > > > Since SLOB was removed, it is not necessary to use call_rcu
> > > > when the callback only performs kmem_cache_free. Use
> > > > kfree_rcu() directly.
> > > > 
> > > > The changes were done using the following Coccinelle semantic patch.
> > > > This semantic patch is designed to ignore cases where the callback
> > > > function is used in another way.
> > > 
> > > How does the discussion on:
> > >   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
> > >   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> > > reflect on this series? IIUC we should hold off..
> > 
> > We do need to hold off for the ones in kernel modules (such as 07/14)
> > where the kmem_cache is destroyed during module unload.
> > 
> > OK, I might as well go through them...
> > 
> > [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> > 	Needs to wait, see wg_allowedips_slab_uninit().
> 
> Also, notably, this patch needs additionally:
> 
> diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c
> index e4e1638fce1b..c95f6937c3f1 100644
> --- a/drivers/net/wireguard/allowedips.c
> +++ b/drivers/net/wireguard/allowedips.c
> @@ -377,7 +377,6 @@ int __init wg_allowedips_slab_init(void)
> 
>  void wg_allowedips_slab_uninit(void)
>  {
> -	rcu_barrier();
>  	kmem_cache_destroy(node_cache);
>  }
> 
> Once kmem_cache_destroy has been fixed to be deferrable.
> 
> I assume the other patches are similar -- an rcu_barrier() can be
> removed. So some manual meddling of these might be in order.

Assuming that the deferrable kmem_cache_destroy() is the option chosen,
agreed.

							Thanx, Paul


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13 12:47       ` Paul E. McKenney
@ 2024-06-13 13:06         ` Uladzislau Rezki
  2024-06-13 15:06           ` Paul E. McKenney
  0 siblings, 1 reply; 50+ messages in thread
From: Uladzislau Rezki @ 2024-06-13 13:06 UTC (permalink / raw)
  To: Paul E. McKenney, Vlastimil Babka
  Cc: Jason A. Donenfeld, Jakub Kicinski, Julia Lawall, linux-block,
	kernel-janitors, bridge, linux-trace-kernel, Mathieu Desnoyers,
	kvm, linuxppc-dev, Naveen N. Rao, Christophe Leroy,
	Nicholas Piggin, netdev, wireguard, linux-kernel, ecryptfs,
	Neil Brown, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs,
	linux-can, Lai Jiangshan, netfilter-devel, coreteam,
	Vlastimil Babka

On Thu, Jun 13, 2024 at 05:47:08AM -0700, Paul E. McKenney wrote:
> On Thu, Jun 13, 2024 at 01:58:59PM +0200, Jason A. Donenfeld wrote:
> > On Wed, Jun 12, 2024 at 03:37:55PM -0700, Paul E. McKenney wrote:
> > > On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> > > > On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > > > > Since SLOB was removed, it is not necessary to use call_rcu
> > > > > when the callback only performs kmem_cache_free. Use
> > > > > kfree_rcu() directly.
> > > > > 
> > > > > The changes were done using the following Coccinelle semantic patch.
> > > > > This semantic patch is designed to ignore cases where the callback
> > > > > function is used in another way.
> > > > 
> > > > How does the discussion on:
> > > >   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
> > > >   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> > > > reflect on this series? IIUC we should hold off..
> > > 
> > > We do need to hold off for the ones in kernel modules (such as 07/14)
> > > where the kmem_cache is destroyed during module unload.
> > > 
> > > OK, I might as well go through them...
> > > 
> > > [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> > > 	Needs to wait, see wg_allowedips_slab_uninit().
> > 
> > Also, notably, this patch needs additionally:
> > 
> > diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c
> > index e4e1638fce1b..c95f6937c3f1 100644
> > --- a/drivers/net/wireguard/allowedips.c
> > +++ b/drivers/net/wireguard/allowedips.c
> > @@ -377,7 +377,6 @@ int __init wg_allowedips_slab_init(void)
> > 
> >  void wg_allowedips_slab_uninit(void)
> >  {
> > -	rcu_barrier();
> >  	kmem_cache_destroy(node_cache);
> >  }
> > 
> > Once kmem_cache_destroy has been fixed to be deferrable.
> > 
> > I assume the other patches are similar -- an rcu_barrier() can be
> > removed. So some manual meddling of these might be in order.
> 
> Assuming that the deferrable kmem_cache_destroy() is the option chosen,
> agreed.
>
<snip>
void kmem_cache_destroy(struct kmem_cache *s)
{
	int err = -EBUSY;
	bool rcu_set;

	if (unlikely(!s) || !kasan_check_byte(s))
		return;

	cpus_read_lock();
	mutex_lock(&slab_mutex);

	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;

	s->refcount--;
	if (s->refcount)
		goto out_unlock;

	err = shutdown_cache(s);
	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
	     __func__, s->name, (void *)_RET_IP_);
...
	cpus_read_unlock();
	if (!err && !rcu_set)
		kmem_cache_release(s);
}
<snip>

so we have the SLAB_TYPESAFE_BY_RCU flag, which defers freeing of slab
pages and the cache itself by a grace period. A similar flag could be
added, say SLAB_DESTROY_ONCE_FULLY_FREED, in which case a worker rearms
itself if there are still objects which should be freed.
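
A rough sketch of such a rearming worker (all names invented):

<snip>
static void kmem_cache_destroy_work_fn(struct work_struct *work)
{
	struct kmem_cache *s = container_of(to_delayed_work(work),
					    struct kmem_cache, destroy_work);

	if (objects_outstanding(s))	/* invented helper */
		schedule_delayed_work(&s->destroy_work, HZ);
	else
		shutdown_cache(s);
}
<snip>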

Any thoughts here?

--
Uladzislau Rezki


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13 12:46             ` Paul E. McKenney
@ 2024-06-13 14:11               ` Jason A. Donenfeld
  2024-06-13 15:12                 ` Paul E. McKenney
  0 siblings, 1 reply; 50+ messages in thread
From: Jason A. Donenfeld @ 2024-06-13 14:11 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, Vlastimil Babka

On Thu, Jun 13, 2024 at 05:46:11AM -0700, Paul E. McKenney wrote:
> How about a kmem_cache_destroy_rcu() that marks the specified cache
> for destruction, and then a kmem_cache_destroy_barrier() that waits?
> 
> I took the liberty of adding your name to the Google document [1] and
> adding this section:

Cool, though no need to make me yellow!

> > But then, if that mechanism generally works, we don't really need a new
> > function and we can just go with the first option of making
> > kmem_cache_destroy() asynchronously wait. It'll wait, as you described,
> > but then we adjust the tail of every kfree_rcu batch freeing cycle to
> > check if there are _still_ any old outstanding kmem_cache_destroy()
> > requests. If so, then we can splat and keep the old debugging info we
> > currently have for finding memleaks.
> 
> The mechanism can always be sabotaged by memory-leak bugs on the part
> of the user of the kmem_cache structure in play, right?
> 
> OK, but I see your point.  I added this to the existing
> "kmem_cache_destroy() Lingers for kfree_rcu()" section:
> 
> 	One way of preserving this debugging information is to splat if
> 	all of the slab’s memory has not been freed within a reasonable
> 	timeframe, perhaps the same 21 seconds that causes an RCU CPU
> 	stall warning.
> 
> Does that capture it?

Not quite what I was thinking. Your 21 seconds as a time-based thing I
guess could be fine. But I was mostly thinking:

1) kmem_cache_destroy() is called, but there are outstanding objects, so
   it defers.

2) Sometime later, a kfree_rcu_work batch freeing operation runs.

3) At the end of this batch freeing, the kernel notices that the
   kmem_cache whose destruction was previously deferred still has
   outstanding objects and has not been destroyed. It can conclude that
   there's thus been a memory leak.

In other words, instead of having to do this based on timers, you can
just have the batch freeing code ask, "did those pending kmem_cache
destructions get completed as a result of this last operation?"
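
(A hypothetical hook at the tail of the batch path, with invented names:)

	/* At the end of a kfree_rcu() batch-freeing cycle: */
	struct kmem_cache *s;

	list_for_each_entry(s, &deferred_destroy_list, destroy_node)
		if (number_of_allocations(s))
			pr_warn("kmem_cache '%s': deferred destroy still pending, possible leak\n",
				s->name);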


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13  3:38         ` Paul E. McKenney
  2024-06-13 12:22           ` Jason A. Donenfeld
@ 2024-06-13 14:17           ` Jakub Kicinski
  2024-06-13 14:53             ` Paul E. McKenney
  1 sibling, 1 reply; 50+ messages in thread
From: Jakub Kicinski @ 2024-06-13 14:17 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Jason A. Donenfeld, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, Vlastimil Babka

On Wed, 12 Jun 2024 20:38:02 -0700 Paul E. McKenney wrote:
> o	Make rcu_barrier() wait for kfree_rcu() objects.  (This is
> 	surprisingly complex and will wait unnecessarily in some cases.
> 	However, it does preserve current code.)

Not sure how much mental capacity for API variations we expect from
people using caches, but I feel like this would score the highest
on Rusty's API scale. I'd even venture an opinion that it's less
confusing to require cache users to have their own (trivial) callbacks
than add API variants we can't error check even at runtime...


* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13 14:17           ` Jakub Kicinski
@ 2024-06-13 14:53             ` Paul E. McKenney
  0 siblings, 0 replies; 50+ messages in thread
From: Paul E. McKenney @ 2024-06-13 14:53 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Jason A. Donenfeld, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, Vlastimil Babka

On Thu, Jun 13, 2024 at 07:17:38AM -0700, Jakub Kicinski wrote:
> On Wed, 12 Jun 2024 20:38:02 -0700 Paul E. McKenney wrote:
> > o	Make rcu_barrier() wait for kfree_rcu() objects.  (This is
> > 	surprisingly complex and will wait unnecessarily in some cases.
> > 	However, it does preserve current code.)
> 
> Not sure how much mental capacity for API variations we expect from
> people using caches, but I feel like this would score the highest
> on Rusty's API scale. I'd even venture an opinion that it's less
> confusing to require cache users to have their own (trivial) callbacks
> than add API variants we can't error check even at runtime...

Fair point, though please see Jason's emails.

And the underlying within-RCU mechanism is the same either way, so that
API decision can be deferred for some time.

But the within-slab mechanism does have the advantage of also possibly
simplifying reference-counting and the potential upcoming hazard pointers.
On the other hand, I currently have no idea what level of violence this
change would make to the slab subsystem.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13 13:06         ` Uladzislau Rezki
@ 2024-06-13 15:06           ` Paul E. McKenney
  2024-06-13 17:38             ` Uladzislau Rezki
  0 siblings, 1 reply; 50+ messages in thread
From: Paul E. McKenney @ 2024-06-13 15:06 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Vlastimil Babka, Jason A. Donenfeld, Jakub Kicinski,
	Julia Lawall, linux-block, kernel-janitors, bridge,
	linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam

On Thu, Jun 13, 2024 at 03:06:54PM +0200, Uladzislau Rezki wrote:
> On Thu, Jun 13, 2024 at 05:47:08AM -0700, Paul E. McKenney wrote:
> > On Thu, Jun 13, 2024 at 01:58:59PM +0200, Jason A. Donenfeld wrote:
> > > On Wed, Jun 12, 2024 at 03:37:55PM -0700, Paul E. McKenney wrote:
> > > > On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> > > > > On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > > > > > Since SLOB was removed, it is not necessary to use call_rcu
> > > > > > when the callback only performs kmem_cache_free. Use
> > > > > > kfree_rcu() directly.
> > > > > > 
> > > > > > The changes were done using the following Coccinelle semantic patch.
> > > > > > This semantic patch is designed to ignore cases where the callback
> > > > > > function is used in another way.
> > > > > 
> > > > > How does the discussion on:
> > > > >   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
> > > > >   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> > > > > reflect on this series? IIUC we should hold off..
> > > > 
> > > > We do need to hold off for the ones in kernel modules (such as 07/14)
> > > > where the kmem_cache is destroyed during module unload.
> > > > 
> > > > OK, I might as well go through them...
> > > > 
> > > > [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> > > > 	Needs to wait, see wg_allowedips_slab_uninit().
> > > 
> > > Also, notably, this patch additionally needs:
> > > 
> > > diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c
> > > index e4e1638fce1b..c95f6937c3f1 100644
> > > --- a/drivers/net/wireguard/allowedips.c
> > > +++ b/drivers/net/wireguard/allowedips.c
> > > @@ -377,7 +377,6 @@ int __init wg_allowedips_slab_init(void)
> > > 
> > >  void wg_allowedips_slab_uninit(void)
> > >  {
> > > -	rcu_barrier();
> > >  	kmem_cache_destroy(node_cache);
> > >  }
> > > 
> > > Once kmem_cache_destroy has been fixed to be deferrable.
> > > 
> > > I assume the other patches are similar -- an rcu_barrier() can be
> > > removed. So some manual meddling of these might be in order.
> > 
> > Assuming that the deferrable kmem_cache_destroy() is the option chosen,
> > agreed.
> >
> <snip>
> void kmem_cache_destroy(struct kmem_cache *s)
> {
> 	int err = -EBUSY;
> 	bool rcu_set;
> 
> 	if (unlikely(!s) || !kasan_check_byte(s))
> 		return;
> 
> 	cpus_read_lock();
> 	mutex_lock(&slab_mutex);
> 
> 	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
> 
> 	s->refcount--;
> 	if (s->refcount)
> 		goto out_unlock;
> 
> 	err = shutdown_cache(s);
> 	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
> 	     __func__, s->name, (void *)_RET_IP_);
> ...
> 	cpus_read_unlock();
> 	if (!err && !rcu_set)
> 		kmem_cache_release(s);
> }
> <snip>
> 
> so we have the SLAB_TYPESAFE_BY_RCU flag that defers freeing slab pages
> and a cache by a grace period. A similar flag can be added, like
> SLAB_DESTROY_ONCE_FULLY_FREED; in this case a worker rearms itself
> if there are still objects which should be freed.
> 
> Any thoughts here?

Wouldn't we also need some additional code to later check for all objects
being freed to the slab, whether or not that code is initiated from
kmem_cache_destroy()?

Either way, I am adding the SLAB_DESTROY_ONCE_FULLY_FREED possibility,
thank you! [1]

							Thanx, Paul

[1] https://docs.google.com/document/d/1v0rcZLvvjVGejT3523W0rDy_sLFu2LWc_NR3fQItZaA/edit?usp=sharing

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13 14:11               ` Jason A. Donenfeld
@ 2024-06-13 15:12                 ` Paul E. McKenney
  0 siblings, 0 replies; 50+ messages in thread
From: Paul E. McKenney @ 2024-06-13 15:12 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, Vlastimil Babka

On Thu, Jun 13, 2024 at 04:11:52PM +0200, Jason A. Donenfeld wrote:
> On Thu, Jun 13, 2024 at 05:46:11AM -0700, Paul E. McKenney wrote:
> > How about a kmem_cache_destroy_rcu() that marks that specified cache
> > for destruction, and then a kmem_cache_destroy_barrier() that waits?
> > 
> > I took the liberty of adding your name to the Google document [1] and
> > adding this section:
> 
> Cool, though no need to make me yellow!

No worries, Jakub is also colored yellow.  People added tomorrow
will be cyan if I follow my usual change-color ordering.  ;-)

> > > But then, if that mechanism generally works, we don't really need a new
> > > function and we can just go with the first option of making
> > > kmem_cache_destroy() asynchronously wait. It'll wait, as you described,
> > > but then we adjust the tail of every kfree_rcu batch freeing cycle to
> > > check if there are _still_ any old outstanding kmem_cache_destroy()
> > > requests. If so, then we can splat and keep the old debugging info we
> > > currently have for finding memleaks.
> > 
> > The mechanism can always be sabotaged by memory-leak bugs on the part
> > of the user of the kmem_cache structure in play, right?
> > 
> > OK, but I see your point.  I added this to the existing
> > "kmem_cache_destroy() Lingers for kfree_rcu()" section:
> > 
> > 	One way of preserving this debugging information is to splat if
> > 	all of the slab’s memory has not been freed within a reasonable
> > 	timeframe, perhaps the same 21 seconds that causes an RCU CPU
> > 	stall warning.
> > 
> > Does that capture it?
> 
> Not quite what I was thinking. Your 21 seconds as a time-based thing I
> guess could be fine. But I was mostly thinking:
> 
> 1) kmem_cache_destroy() is called, but there are outstanding objects, so
>    it defers.
> 
> 2) Sometime later, a kfree_rcu_work batch freeing operation runs.

Or not, if there has been a leak and there happens to be no outstanding
kfree_rcu() memory.

> 3) At the end of this batch freeing, the kernel notices that the
>    kmem_cache whose destruction was previously deferred still has
>    outstanding objects and has not been destroyed. It can conclude that
>    there's thus been a memory leak.

And the batch freeing can be replicated across CPUs, so it would be
necessary to determine which was last to do this effectively.  Don't get
me wrong, this can be done, but the performance/latency tradeoffs can
be interesting.

> In other words, instead of having to do this based on timers, you can
> just have the batch freeing code ask, "did those pending kmem_cache
> destructions get completed as a result of this last operation?"

I agree that kfree_rcu_work-batch time is a good time to evaluate slab
(and I have added this to the document), but I do not believe that it
can completely replace timeouts.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13 15:06           ` Paul E. McKenney
@ 2024-06-13 17:38             ` Uladzislau Rezki
  2024-06-13 17:45               ` Paul E. McKenney
  0 siblings, 1 reply; 50+ messages in thread
From: Uladzislau Rezki @ 2024-06-13 17:38 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Uladzislau Rezki, Vlastimil Babka, Jason A. Donenfeld,
	Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam

On Thu, Jun 13, 2024 at 08:06:30AM -0700, Paul E. McKenney wrote:
> On Thu, Jun 13, 2024 at 03:06:54PM +0200, Uladzislau Rezki wrote:
> > On Thu, Jun 13, 2024 at 05:47:08AM -0700, Paul E. McKenney wrote:
> > > On Thu, Jun 13, 2024 at 01:58:59PM +0200, Jason A. Donenfeld wrote:
> > > > On Wed, Jun 12, 2024 at 03:37:55PM -0700, Paul E. McKenney wrote:
> > > > > On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> > > > > > On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > > > > > > Since SLOB was removed, it is not necessary to use call_rcu
> > > > > > > when the callback only performs kmem_cache_free. Use
> > > > > > > kfree_rcu() directly.
> > > > > > > 
> > > > > > > The changes were done using the following Coccinelle semantic patch.
> > > > > > > This semantic patch is designed to ignore cases where the callback
> > > > > > > function is used in another way.
> > > > > > 
> > > > > > How does the discussion on:
> > > > > >   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
> > > > > >   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> > > > > > reflect on this series? IIUC we should hold off..
> > > > > 
> > > > > We do need to hold off for the ones in kernel modules (such as 07/14)
> > > > > where the kmem_cache is destroyed during module unload.
> > > > > 
> > > > > OK, I might as well go through them...
> > > > > 
> > > > > [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> > > > > 	Needs to wait, see wg_allowedips_slab_uninit().
> > > > 
> > > > Also, notably, this patch additionally needs:
> > > > 
> > > > diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c
> > > > index e4e1638fce1b..c95f6937c3f1 100644
> > > > --- a/drivers/net/wireguard/allowedips.c
> > > > +++ b/drivers/net/wireguard/allowedips.c
> > > > @@ -377,7 +377,6 @@ int __init wg_allowedips_slab_init(void)
> > > > 
> > > >  void wg_allowedips_slab_uninit(void)
> > > >  {
> > > > -	rcu_barrier();
> > > >  	kmem_cache_destroy(node_cache);
> > > >  }
> > > > 
> > > > Once kmem_cache_destroy has been fixed to be deferrable.
> > > > 
> > > > I assume the other patches are similar -- an rcu_barrier() can be
> > > > removed. So some manual meddling of these might be in order.
> > > 
> > > Assuming that the deferrable kmem_cache_destroy() is the option chosen,
> > > agreed.
> > >
> > <snip>
> > void kmem_cache_destroy(struct kmem_cache *s)
> > {
> > 	int err = -EBUSY;
> > 	bool rcu_set;
> > 
> > 	if (unlikely(!s) || !kasan_check_byte(s))
> > 		return;
> > 
> > 	cpus_read_lock();
> > 	mutex_lock(&slab_mutex);
> > 
> > 	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
> > 
> > 	s->refcount--;
> > 	if (s->refcount)
> > 		goto out_unlock;
> > 
> > 	err = shutdown_cache(s);
> > 	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
> > 	     __func__, s->name, (void *)_RET_IP_);
> > ...
> > 	cpus_read_unlock();
> > 	if (!err && !rcu_set)
> > 		kmem_cache_release(s);
> > }
> > <snip>
> > 
> > so we have the SLAB_TYPESAFE_BY_RCU flag that defers freeing slab pages
> > and a cache by a grace period. A similar flag can be added, like
> > SLAB_DESTROY_ONCE_FULLY_FREED; in this case a worker rearms itself
> > if there are still objects which should be freed.
> > 
> > Any thoughts here?
> 
> Wouldn't we also need some additional code to later check for all objects
> being freed to the slab, whether or not that code is initiated from
> kmem_cache_destroy()?
>
The same way as SLAB_TYPESAFE_BY_RCU is handled from the kmem_cache_destroy() function.
It checks that flag, and if it is set, an extra worker is scheduled to perform a
deferred (instead of immediate) destroy after rcu_barrier() finishes.
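
For reference, the existing deferral path looks roughly like this (condensed
from mm/slab_common.c as of this discussion, with sysfs and error handling
elided):

<snip>
static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
{
	LIST_HEAD(to_destroy);
	struct kmem_cache *s, *s2;

	/* Snapshot the list of caches waiting on a grace period. */
	mutex_lock(&slab_mutex);
	list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy);
	mutex_unlock(&slab_mutex);

	if (list_empty(&to_destroy))
		return;

	/* Wait for the grace period covering all of their slab pages. */
	rcu_barrier();

	list_for_each_entry_safe(s, s2, &to_destroy, list) {
		debugfs_slab_release(s);
		kfence_shutdown_cache(s);
		kmem_cache_release(s);
	}
}
<snip>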

--
Uladzislau Rezki

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13 17:38             ` Uladzislau Rezki
@ 2024-06-13 17:45               ` Paul E. McKenney
  2024-06-13 17:58                 ` Uladzislau Rezki
  0 siblings, 1 reply; 50+ messages in thread
From: Paul E. McKenney @ 2024-06-13 17:45 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Vlastimil Babka, Jason A. Donenfeld, Jakub Kicinski,
	Julia Lawall, linux-block, kernel-janitors, bridge,
	linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam

On Thu, Jun 13, 2024 at 07:38:59PM +0200, Uladzislau Rezki wrote:
> On Thu, Jun 13, 2024 at 08:06:30AM -0700, Paul E. McKenney wrote:
> > On Thu, Jun 13, 2024 at 03:06:54PM +0200, Uladzislau Rezki wrote:
> > > On Thu, Jun 13, 2024 at 05:47:08AM -0700, Paul E. McKenney wrote:
> > > > On Thu, Jun 13, 2024 at 01:58:59PM +0200, Jason A. Donenfeld wrote:
> > > > > On Wed, Jun 12, 2024 at 03:37:55PM -0700, Paul E. McKenney wrote:
> > > > > > On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> > > > > > > On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > > > > > > > Since SLOB was removed, it is not necessary to use call_rcu
> > > > > > > > when the callback only performs kmem_cache_free. Use
> > > > > > > > kfree_rcu() directly.
> > > > > > > > 
> > > > > > > > The changes were done using the following Coccinelle semantic patch.
> > > > > > > > This semantic patch is designed to ignore cases where the callback
> > > > > > > > function is used in another way.
> > > > > > > 
> > > > > > > How does the discussion on:
> > > > > > >   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
> > > > > > >   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> > > > > > > reflect on this series? IIUC we should hold off..
> > > > > > 
> > > > > > We do need to hold off for the ones in kernel modules (such as 07/14)
> > > > > > where the kmem_cache is destroyed during module unload.
> > > > > > 
> > > > > > OK, I might as well go through them...
> > > > > > 
> > > > > > [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> > > > > > 	Needs to wait, see wg_allowedips_slab_uninit().
> > > > > 
> > > > > Also, notably, this patch additionally needs:
> > > > > 
> > > > > diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c
> > > > > index e4e1638fce1b..c95f6937c3f1 100644
> > > > > --- a/drivers/net/wireguard/allowedips.c
> > > > > +++ b/drivers/net/wireguard/allowedips.c
> > > > > @@ -377,7 +377,6 @@ int __init wg_allowedips_slab_init(void)
> > > > > 
> > > > >  void wg_allowedips_slab_uninit(void)
> > > > >  {
> > > > > -	rcu_barrier();
> > > > >  	kmem_cache_destroy(node_cache);
> > > > >  }
> > > > > 
> > > > > Once kmem_cache_destroy has been fixed to be deferrable.
> > > > > 
> > > > > I assume the other patches are similar -- an rcu_barrier() can be
> > > > > removed. So some manual meddling of these might be in order.
> > > > 
> > > > Assuming that the deferrable kmem_cache_destroy() is the option chosen,
> > > > agreed.
> > > >
> > > <snip>
> > > void kmem_cache_destroy(struct kmem_cache *s)
> > > {
> > > 	int err = -EBUSY;
> > > 	bool rcu_set;
> > > 
> > > 	if (unlikely(!s) || !kasan_check_byte(s))
> > > 		return;
> > > 
> > > 	cpus_read_lock();
> > > 	mutex_lock(&slab_mutex);
> > > 
> > > 	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
> > > 
> > > 	s->refcount--;
> > > 	if (s->refcount)
> > > 		goto out_unlock;
> > > 
> > > 	err = shutdown_cache(s);
> > > 	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
> > > 	     __func__, s->name, (void *)_RET_IP_);
> > > ...
> > > 	cpus_read_unlock();
> > > 	if (!err && !rcu_set)
> > > 		kmem_cache_release(s);
> > > }
> > > <snip>
> > > 
> > > so we have the SLAB_TYPESAFE_BY_RCU flag that defers freeing slab pages
> > > and a cache by a grace period. A similar flag can be added, like
> > > SLAB_DESTROY_ONCE_FULLY_FREED; in this case a worker rearms itself
> > > if there are still objects which should be freed.
> > > 
> > > Any thoughts here?
> > 
> > Wouldn't we also need some additional code to later check for all objects
> > being freed to the slab, whether or not that code is initiated from
> > kmem_cache_destroy()?
> >
> The same way as SLAB_TYPESAFE_BY_RCU is handled from the kmem_cache_destroy() function.
> It checks that flag, and if it is set, an extra worker is scheduled to perform a
> deferred (instead of immediate) destroy after rcu_barrier() finishes.

Like this?

	SLAB_DESTROY_ONCE_FULLY_FREED

	Instead of adding a new kmem_cache_destroy_rcu()
	or kmem_cache_destroy_wait() API member, add a
	SLAB_DESTROY_ONCE_FULLY_FREED flag that can be passed to the
	existing kmem_cache_destroy() function.  Use of this flag would
	suppress any warnings that would otherwise be issued if there
	was still slab memory yet to be freed, and it would also spawn
	workqueues (or timers or whatever) to do any needed cleanup work.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13 17:45               ` Paul E. McKenney
@ 2024-06-13 17:58                 ` Uladzislau Rezki
  2024-06-13 18:13                   ` Paul E. McKenney
  0 siblings, 1 reply; 50+ messages in thread
From: Uladzislau Rezki @ 2024-06-13 17:58 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Uladzislau Rezki, Vlastimil Babka, Jason A. Donenfeld,
	Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam

On Thu, Jun 13, 2024 at 10:45:59AM -0700, Paul E. McKenney wrote:
> On Thu, Jun 13, 2024 at 07:38:59PM +0200, Uladzislau Rezki wrote:
> > On Thu, Jun 13, 2024 at 08:06:30AM -0700, Paul E. McKenney wrote:
> > > On Thu, Jun 13, 2024 at 03:06:54PM +0200, Uladzislau Rezki wrote:
> > > > On Thu, Jun 13, 2024 at 05:47:08AM -0700, Paul E. McKenney wrote:
> > > > > On Thu, Jun 13, 2024 at 01:58:59PM +0200, Jason A. Donenfeld wrote:
> > > > > > On Wed, Jun 12, 2024 at 03:37:55PM -0700, Paul E. McKenney wrote:
> > > > > > > On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> > > > > > > > On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > > > > > > > > Since SLOB was removed, it is not necessary to use call_rcu
> > > > > > > > > when the callback only performs kmem_cache_free. Use
> > > > > > > > > kfree_rcu() directly.
> > > > > > > > > 
> > > > > > > > > The changes were done using the following Coccinelle semantic patch.
> > > > > > > > > This semantic patch is designed to ignore cases where the callback
> > > > > > > > > function is used in another way.
> > > > > > > > 
> > > > > > > > How does the discussion on:
> > > > > > > >   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
> > > > > > > >   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> > > > > > > > reflect on this series? IIUC we should hold off..
> > > > > > > 
> > > > > > > We do need to hold off for the ones in kernel modules (such as 07/14)
> > > > > > > where the kmem_cache is destroyed during module unload.
> > > > > > > 
> > > > > > > OK, I might as well go through them...
> > > > > > > 
> > > > > > > [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> > > > > > > 	Needs to wait, see wg_allowedips_slab_uninit().
> > > > > > 
> > > > > > Also, notably, this patch additionally needs:
> > > > > > 
> > > > > > diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c
> > > > > > index e4e1638fce1b..c95f6937c3f1 100644
> > > > > > --- a/drivers/net/wireguard/allowedips.c
> > > > > > +++ b/drivers/net/wireguard/allowedips.c
> > > > > > @@ -377,7 +377,6 @@ int __init wg_allowedips_slab_init(void)
> > > > > > 
> > > > > >  void wg_allowedips_slab_uninit(void)
> > > > > >  {
> > > > > > -	rcu_barrier();
> > > > > >  	kmem_cache_destroy(node_cache);
> > > > > >  }
> > > > > > 
> > > > > > Once kmem_cache_destroy has been fixed to be deferrable.
> > > > > > 
> > > > > > I assume the other patches are similar -- an rcu_barrier() can be
> > > > > > removed. So some manual meddling of these might be in order.
> > > > > 
> > > > > Assuming that the deferrable kmem_cache_destroy() is the option chosen,
> > > > > agreed.
> > > > >
> > > > <snip>
> > > > void kmem_cache_destroy(struct kmem_cache *s)
> > > > {
> > > > 	int err = -EBUSY;
> > > > 	bool rcu_set;
> > > > 
> > > > 	if (unlikely(!s) || !kasan_check_byte(s))
> > > > 		return;
> > > > 
> > > > 	cpus_read_lock();
> > > > 	mutex_lock(&slab_mutex);
> > > > 
> > > > 	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
> > > > 
> > > > 	s->refcount--;
> > > > 	if (s->refcount)
> > > > 		goto out_unlock;
> > > > 
> > > > 	err = shutdown_cache(s);
> > > > 	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
> > > > 	     __func__, s->name, (void *)_RET_IP_);
> > > > ...
> > > > 	cpus_read_unlock();
> > > > 	if (!err && !rcu_set)
> > > > 		kmem_cache_release(s);
> > > > }
> > > > <snip>
> > > > 
> > > > so we have the SLAB_TYPESAFE_BY_RCU flag that defers freeing slab pages
> > > > and a cache by a grace period. A similar flag can be added, like
> > > > SLAB_DESTROY_ONCE_FULLY_FREED; in this case a worker rearms itself
> > > > if there are still objects which should be freed.
> > > > 
> > > > Any thoughts here?
> > > 
> > > Wouldn't we also need some additional code to later check for all objects
> > > being freed to the slab, whether or not that code is initiated from
> > > kmem_cache_destroy()?
> > >
> > The same way as SLAB_TYPESAFE_BY_RCU is handled from the kmem_cache_destroy() function.
> > It checks that flag, and if it is set, an extra worker is scheduled to perform a
> > deferred (instead of immediate) destroy after rcu_barrier() finishes.
> 
> Like this?
> 
> 	SLAB_DESTROY_ONCE_FULLY_FREED
> 
> 	Instead of adding a new kmem_cache_destroy_rcu()
> 	or kmem_cache_destroy_wait() API member, add a
> 	SLAB_DESTROY_ONCE_FULLY_FREED flag that can be passed to the
> 	existing kmem_cache_destroy() function.  Use of this flag would
> 	suppress any warnings that would otherwise be issued if there
> 	was still slab memory yet to be freed, and it would also spawn
> 	workqueues (or timers or whatever) to do any needed cleanup work.
> 
>
The flag is passed like all the other flags when creating a cache:

  slab = kmem_cache_create(name, size, ..., SLAB_DESTROY_ONCE_FULLY_FREED | OTHER_FLAGS, NULL);

the rest of the description looks correct to me.
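
And the destroy side then needs nothing special; a module exit path would
just be (a sketch, with a made-up foo_cache):

<snip>
static void __exit foo_module_exit(void)
{
	/*
	 * No rcu_barrier() needed: with SLAB_DESTROY_ONCE_FULLY_FREED set
	 * at creation time, kmem_cache_destroy() defers the real destroy
	 * until the last in-flight object has come back to the cache.
	 */
	kmem_cache_destroy(foo_cache);
}
<snip>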

--
Uladzislau Rezki

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13 17:58                 ` Uladzislau Rezki
@ 2024-06-13 18:13                   ` Paul E. McKenney
  2024-06-14 12:35                     ` Uladzislau Rezki
  0 siblings, 1 reply; 50+ messages in thread
From: Paul E. McKenney @ 2024-06-13 18:13 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Vlastimil Babka, Jason A. Donenfeld, Jakub Kicinski,
	Julia Lawall, linux-block, kernel-janitors, bridge,
	linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam

On Thu, Jun 13, 2024 at 07:58:17PM +0200, Uladzislau Rezki wrote:
> On Thu, Jun 13, 2024 at 10:45:59AM -0700, Paul E. McKenney wrote:
> > On Thu, Jun 13, 2024 at 07:38:59PM +0200, Uladzislau Rezki wrote:
> > > On Thu, Jun 13, 2024 at 08:06:30AM -0700, Paul E. McKenney wrote:
> > > > On Thu, Jun 13, 2024 at 03:06:54PM +0200, Uladzislau Rezki wrote:
> > > > > On Thu, Jun 13, 2024 at 05:47:08AM -0700, Paul E. McKenney wrote:
> > > > > > On Thu, Jun 13, 2024 at 01:58:59PM +0200, Jason A. Donenfeld wrote:
> > > > > > > On Wed, Jun 12, 2024 at 03:37:55PM -0700, Paul E. McKenney wrote:
> > > > > > > > On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> > > > > > > > > On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > > > > > > > > > Since SLOB was removed, it is not necessary to use call_rcu
> > > > > > > > > > when the callback only performs kmem_cache_free. Use
> > > > > > > > > > kfree_rcu() directly.
> > > > > > > > > > 
> > > > > > > > > > The changes were done using the following Coccinelle semantic patch.
> > > > > > > > > > This semantic patch is designed to ignore cases where the callback
> > > > > > > > > > function is used in another way.
> > > > > > > > > 
> > > > > > > > > How does the discussion on:
> > > > > > > > >   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
> > > > > > > > >   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> > > > > > > > > reflect on this series? IIUC we should hold off..
> > > > > > > > 
> > > > > > > > We do need to hold off for the ones in kernel modules (such as 07/14)
> > > > > > > > where the kmem_cache is destroyed during module unload.
> > > > > > > > 
> > > > > > > > OK, I might as well go through them...
> > > > > > > > 
> > > > > > > > [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> > > > > > > > 	Needs to wait, see wg_allowedips_slab_uninit().
> > > > > > > 
> > > > > > > Also, notably, this patch additionally needs:
> > > > > > > 
> > > > > > > diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c
> > > > > > > index e4e1638fce1b..c95f6937c3f1 100644
> > > > > > > --- a/drivers/net/wireguard/allowedips.c
> > > > > > > +++ b/drivers/net/wireguard/allowedips.c
> > > > > > > @@ -377,7 +377,6 @@ int __init wg_allowedips_slab_init(void)
> > > > > > > 
> > > > > > >  void wg_allowedips_slab_uninit(void)
> > > > > > >  {
> > > > > > > -	rcu_barrier();
> > > > > > >  	kmem_cache_destroy(node_cache);
> > > > > > >  }
> > > > > > > 
> > > > > > > Once kmem_cache_destroy has been fixed to be deferrable.
> > > > > > > 
> > > > > > > I assume the other patches are similar -- an rcu_barrier() can be
> > > > > > > removed. So some manual meddling of these might be in order.
> > > > > > 
> > > > > > Assuming that the deferrable kmem_cache_destroy() is the option chosen,
> > > > > > agreed.
> > > > > >
> > > > > <snip>
> > > > > void kmem_cache_destroy(struct kmem_cache *s)
> > > > > {
> > > > > 	int err = -EBUSY;
> > > > > 	bool rcu_set;
> > > > > 
> > > > > 	if (unlikely(!s) || !kasan_check_byte(s))
> > > > > 		return;
> > > > > 
> > > > > 	cpus_read_lock();
> > > > > 	mutex_lock(&slab_mutex);
> > > > > 
> > > > > 	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
> > > > > 
> > > > > 	s->refcount--;
> > > > > 	if (s->refcount)
> > > > > 		goto out_unlock;
> > > > > 
> > > > > 	err = shutdown_cache(s);
> > > > > 	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
> > > > > 	     __func__, s->name, (void *)_RET_IP_);
> > > > > ...
> > > > > 	cpus_read_unlock();
> > > > > 	if (!err && !rcu_set)
> > > > > 		kmem_cache_release(s);
> > > > > }
> > > > > <snip>
> > > > > 
> > > > > so we have the SLAB_TYPESAFE_BY_RCU flag that defers freeing slab pages
> > > > > and a cache by a grace period. A similar flag can be added, like
> > > > > SLAB_DESTROY_ONCE_FULLY_FREED; in this case a worker rearms itself
> > > > > if there are still objects which should be freed.
> > > > > 
> > > > > Any thoughts here?
> > > > 
> > > > Wouldn't we also need some additional code to later check for all objects
> > > > being freed to the slab, whether or not that code is initiated from
> > > > kmem_cache_destroy()?
> > > >
> > > The same way as SLAB_TYPESAFE_BY_RCU is handled from the kmem_cache_destroy() function.
> > > It checks that flag, and if it is set, an extra worker is scheduled to perform a
> > > deferred (instead of immediate) destroy after rcu_barrier() finishes.
> > 
> > Like this?
> > 
> > 	SLAB_DESTROY_ONCE_FULLY_FREED
> > 
> > 	Instead of adding a new kmem_cache_destroy_rcu()
> > 	or kmem_cache_destroy_wait() API member, add a
> > 	SLAB_DESTROY_ONCE_FULLY_FREED flag that can be passed to the
> > 	existing kmem_cache_destroy() function.  Use of this flag would
> > 	suppress any warnings that would otherwise be issued if there
> > 	was still slab memory yet to be freed, and it would also spawn
> > 	workqueues (or timers or whatever) to do any needed cleanup work.
> > 
> >
> The flag is passed like all the other flags when creating a cache:
> 
>   slab = kmem_cache_create(name, size, ..., SLAB_DESTROY_ONCE_FULLY_FREED | OTHER_FLAGS, NULL);
> 
> the rest of the description looks correct to me.

Good catch, fixed, thank you!

							Thanx, Paul

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13 18:13                   ` Paul E. McKenney
@ 2024-06-14 12:35                     ` Uladzislau Rezki
  2024-06-14 14:17                       ` Paul E. McKenney
  2024-06-14 19:33                       ` Jason A. Donenfeld
  0 siblings, 2 replies; 50+ messages in thread
From: Uladzislau Rezki @ 2024-06-14 12:35 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Uladzislau Rezki, Vlastimil Babka, Jason A. Donenfeld,
	Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam

On Thu, Jun 13, 2024 at 11:13:52AM -0700, Paul E. McKenney wrote:
> On Thu, Jun 13, 2024 at 07:58:17PM +0200, Uladzislau Rezki wrote:
> > On Thu, Jun 13, 2024 at 10:45:59AM -0700, Paul E. McKenney wrote:
> > > On Thu, Jun 13, 2024 at 07:38:59PM +0200, Uladzislau Rezki wrote:
> > > > On Thu, Jun 13, 2024 at 08:06:30AM -0700, Paul E. McKenney wrote:
> > > > > On Thu, Jun 13, 2024 at 03:06:54PM +0200, Uladzislau Rezki wrote:
> > > > > > On Thu, Jun 13, 2024 at 05:47:08AM -0700, Paul E. McKenney wrote:
> > > > > > > On Thu, Jun 13, 2024 at 01:58:59PM +0200, Jason A. Donenfeld wrote:
> > > > > > > > On Wed, Jun 12, 2024 at 03:37:55PM -0700, Paul E. McKenney wrote:
> > > > > > > > > On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> > > > > > > > > > On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > > > > > > > > > > Since SLOB was removed, it is not necessary to use call_rcu
> > > > > > > > > > > when the callback only performs kmem_cache_free. Use
> > > > > > > > > > > kfree_rcu() directly.
> > > > > > > > > > > 
> > > > > > > > > > > The changes were done using the following Coccinelle semantic patch.
> > > > > > > > > > > This semantic patch is designed to ignore cases where the callback
> > > > > > > > > > > function is used in another way.
> > > > > > > > > > 
> > > > > > > > > > How does the discussion on:
> > > > > > > > > >   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
> > > > > > > > > >   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> > > > > > > > > > reflect on this series? IIUC we should hold off..
> > > > > > > > > 
> > > > > > > > > We do need to hold off for the ones in kernel modules (such as 07/14)
> > > > > > > > > where the kmem_cache is destroyed during module unload.
> > > > > > > > > 
> > > > > > > > > OK, I might as well go through them...
> > > > > > > > > 
> > > > > > > > > [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> > > > > > > > > 	Needs to wait, see wg_allowedips_slab_uninit().
> > > > > > > > 
> > > > > > > > Also, notably, this patch additionally needs:
> > > > > > > > 
> > > > > > > > diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c
> > > > > > > > index e4e1638fce1b..c95f6937c3f1 100644
> > > > > > > > --- a/drivers/net/wireguard/allowedips.c
> > > > > > > > +++ b/drivers/net/wireguard/allowedips.c
> > > > > > > > @@ -377,7 +377,6 @@ int __init wg_allowedips_slab_init(void)
> > > > > > > > 
> > > > > > > >  void wg_allowedips_slab_uninit(void)
> > > > > > > >  {
> > > > > > > > -	rcu_barrier();
> > > > > > > >  	kmem_cache_destroy(node_cache);
> > > > > > > >  }
> > > > > > > > 
> > > > > > > > Once kmem_cache_destroy has been fixed to be deferrable.
> > > > > > > > 
> > > > > > > > I assume the other patches are similar -- an rcu_barrier() can be
> > > > > > > > removed. So some manual meddling of these might be in order.
> > > > > > > 
> > > > > > > Assuming that the deferrable kmem_cache_destroy() is the option chosen,
> > > > > > > agreed.
> > > > > > >
> > > > > > <snip>
> > > > > > void kmem_cache_destroy(struct kmem_cache *s)
> > > > > > {
> > > > > > 	int err = -EBUSY;
> > > > > > 	bool rcu_set;
> > > > > > 
> > > > > > 	if (unlikely(!s) || !kasan_check_byte(s))
> > > > > > 		return;
> > > > > > 
> > > > > > 	cpus_read_lock();
> > > > > > 	mutex_lock(&slab_mutex);
> > > > > > 
> > > > > > 	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
> > > > > > 
> > > > > > 	s->refcount--;
> > > > > > 	if (s->refcount)
> > > > > > 		goto out_unlock;
> > > > > > 
> > > > > > 	err = shutdown_cache(s);
> > > > > > 	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
> > > > > > 	     __func__, s->name, (void *)_RET_IP_);
> > > > > > ...
> > > > > > 	cpus_read_unlock();
> > > > > > 	if (!err && !rcu_set)
> > > > > > 		kmem_cache_release(s);
> > > > > > }
> > > > > > <snip>
> > > > > > 
> > > > > > so we have the SLAB_TYPESAFE_BY_RCU flag that defers freeing slab pages
> > > > > > and a cache by a grace period. A similar flag can be added, like
> > > > > > SLAB_DESTROY_ONCE_FULLY_FREED; in this case a worker rearms itself
> > > > > > if there are still objects which should be freed.
> > > > > > 
> > > > > > Any thoughts here?
> > > > > 
> > > > > Wouldn't we also need some additional code to later check for all objects
> > > > > being freed to the slab, whether or not that code is initiated from
> > > > > kmem_cache_destroy()?
> > > > >
> > > > The same way as SLAB_TYPESAFE_BY_RCU is handled from the kmem_cache_destroy() function.
> > > > It checks that flag, and if it is set, an extra worker is scheduled to perform a
> > > > deferred (instead of immediate) destroy after rcu_barrier() finishes.
> > > 
> > > Like this?
> > > 
> > > 	SLAB_DESTROY_ONCE_FULLY_FREED
> > > 
> > > 	Instead of adding a new kmem_cache_destroy_rcu()
> > > 	or kmem_cache_destroy_wait() API member, add a
> > > 	SLAB_DESTROY_ONCE_FULLY_FREED flag that can be passed to the
> > > 	existing kmem_cache_destroy() function.  Use of this flag would
> > > 	suppress any warnings that would otherwise be issued if there
> > > 	was still slab memory yet to be freed, and it would also spawn
> > > 	workqueues (or timers or whatever) to do any needed cleanup work.
> > > 
> > >
> > The flag is passed like all the other flags when creating a cache:
> > 
> >   slab = kmem_cache_create(name, size, ..., SLAB_DESTROY_ONCE_FULLY_FREED | OTHER_FLAGS, NULL);
> > 
> > the rest of the description looks correct to me.
> 
> Good catch, fixed, thank you!
> 
And here we go with a prototype (untested):

<snip>
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 7247e217e21b..700b8a909f8a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -59,6 +59,7 @@ enum _slab_flag_bits {
 #ifdef CONFIG_SLAB_OBJ_EXT
 	_SLAB_NO_OBJ_EXT,
 #endif
+	_SLAB_DEFER_DESTROY,
 	_SLAB_FLAGS_LAST_BIT
 };
 
@@ -139,6 +140,7 @@ enum _slab_flag_bits {
  */
 /* Defer freeing slabs to RCU */
 #define SLAB_TYPESAFE_BY_RCU	__SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU)
+#define SLAB_DEFER_DESTROY __SLAB_FLAG_BIT(_SLAB_DEFER_DESTROY)
 /* Trace allocations and frees */
 #define SLAB_TRACE		__SLAB_FLAG_BIT(_SLAB_TRACE)
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1560a1546bb1..99458a0197b5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -45,6 +45,11 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
 static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
 		    slab_caches_to_rcu_destroy_workfn);
 
+static LIST_HEAD(slab_caches_defer_destroy);
+static void slab_caches_defer_destroy_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(slab_caches_defer_destroy_work,
+	slab_caches_defer_destroy_workfn);
+
 /*
  * Set of flags that will prevent slab merging
  */
@@ -448,6 +453,31 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
 	}
 }
 
+static void
+slab_caches_defer_destroy_workfn(struct work_struct *work)
+{
+	struct kmem_cache *s, *s2;
+
+	mutex_lock(&slab_mutex);
+	list_for_each_entry_safe(s, s2, &slab_caches_defer_destroy, list) {
+		if (__kmem_cache_empty(s)) {
+			/* free asan quarantined objects */
+			kasan_cache_shutdown(s);
+			(void) __kmem_cache_shutdown(s);
+
+			list_del(&s->list);
+
+			debugfs_slab_release(s);
+			kfence_shutdown_cache(s);
+			kmem_cache_release(s);
+		}
+	}
+	mutex_unlock(&slab_mutex);
+
+	if (!list_empty(&slab_caches_defer_destroy))
+		schedule_delayed_work(&slab_caches_defer_destroy_work, HZ);
+}
+
 static int shutdown_cache(struct kmem_cache *s)
 {
 	/* free asan quarantined objects */
@@ -493,6 +523,13 @@ void kmem_cache_destroy(struct kmem_cache *s)
 	if (s->refcount)
 		goto out_unlock;
 
+	/* Should a destroy process be deferred? */
+	if (s->flags & SLAB_DEFER_DESTROY) {
+		list_move_tail(&s->list, &slab_caches_defer_destroy);
+		schedule_delayed_work(&slab_caches_defer_destroy_work, HZ);
+		goto out_unlock;
+	}
+
 	err = shutdown_cache(s);
 	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
 	     __func__, s->name, (void *)_RET_IP_);
<snip>
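
As a usage sketch (not part of the patch above), a module such as
wireguard's allowedips would then opt in at creation time and drop its
rcu_barrier(); the kmem_cache_create() call is spelled out here rather
than matching the driver's exact code:

<snip>
int __init wg_allowedips_slab_init(void)
{
	node_cache = kmem_cache_create("wg_allowedips_node",
				       sizeof(struct allowedips_node),
				       0, SLAB_DEFER_DESTROY, NULL);
	return node_cache ? 0 : -ENOMEM;
}

void wg_allowedips_slab_uninit(void)
{
	/* No rcu_barrier(): the destroy is deferred until the cache is empty. */
	kmem_cache_destroy(node_cache);
}
<snip>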

Thanks!

--
Uladzislau Rezki

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-14 12:35                     ` Uladzislau Rezki
@ 2024-06-14 14:17                       ` Paul E. McKenney
  2024-06-14 14:50                         ` Uladzislau Rezki
  2024-06-14 19:33                       ` Jason A. Donenfeld
  1 sibling, 1 reply; 50+ messages in thread
From: Paul E. McKenney @ 2024-06-14 14:17 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Vlastimil Babka, Jason A. Donenfeld, Jakub Kicinski,
	Julia Lawall, linux-block, kernel-janitors, bridge,
	linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam

On Fri, Jun 14, 2024 at 02:35:33PM +0200, Uladzislau Rezki wrote:
> On Thu, Jun 13, 2024 at 11:13:52AM -0700, Paul E. McKenney wrote:
> > On Thu, Jun 13, 2024 at 07:58:17PM +0200, Uladzislau Rezki wrote:
> > > On Thu, Jun 13, 2024 at 10:45:59AM -0700, Paul E. McKenney wrote:
> > > > On Thu, Jun 13, 2024 at 07:38:59PM +0200, Uladzislau Rezki wrote:
> > > > > On Thu, Jun 13, 2024 at 08:06:30AM -0700, Paul E. McKenney wrote:
> > > > > > On Thu, Jun 13, 2024 at 03:06:54PM +0200, Uladzislau Rezki wrote:
> > > > > > > On Thu, Jun 13, 2024 at 05:47:08AM -0700, Paul E. McKenney wrote:
> > > > > > > > On Thu, Jun 13, 2024 at 01:58:59PM +0200, Jason A. Donenfeld wrote:
> > > > > > > > > On Wed, Jun 12, 2024 at 03:37:55PM -0700, Paul E. McKenney wrote:
> > > > > > > > > > On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> > > > > > > > > > > On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > > > > > > > > > > > Since SLOB was removed, it is not necessary to use call_rcu
> > > > > > > > > > > > when the callback only performs kmem_cache_free. Use
> > > > > > > > > > > > kfree_rcu() directly.
> > > > > > > > > > > > 
> > > > > > > > > > > > The changes were done using the following Coccinelle semantic patch.
> > > > > > > > > > > > This semantic patch is designed to ignore cases where the callback
> > > > > > > > > > > > function is used in another way.
> > > > > > > > > > > 
> > > > > > > > > > > How does the discussion on:
> > > > > > > > > > >   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
> > > > > > > > > > >   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> > > > > > > > > > > reflect on this series? IIUC we should hold off..
> > > > > > > > > > 
> > > > > > > > > > We do need to hold off for the ones in kernel modules (such as 07/14)
> > > > > > > > > > where the kmem_cache is destroyed during module unload.
> > > > > > > > > > 
> > > > > > > > > > OK, I might as well go through them...
> > > > > > > > > > 
> > > > > > > > > > [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> > > > > > > > > > 	Needs to wait, see wg_allowedips_slab_uninit().
> > > > > > > > > 
> > > > > > > > > Also, notably, this patch additionally needs:
> > > > > > > > > 
> > > > > > > > > diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c
> > > > > > > > > index e4e1638fce1b..c95f6937c3f1 100644
> > > > > > > > > --- a/drivers/net/wireguard/allowedips.c
> > > > > > > > > +++ b/drivers/net/wireguard/allowedips.c
> > > > > > > > > @@ -377,7 +377,6 @@ int __init wg_allowedips_slab_init(void)
> > > > > > > > > 
> > > > > > > > >  void wg_allowedips_slab_uninit(void)
> > > > > > > > >  {
> > > > > > > > > -	rcu_barrier();
> > > > > > > > >  	kmem_cache_destroy(node_cache);
> > > > > > > > >  }
> > > > > > > > > 
> > > > > > > > > Once kmem_cache_destroy has been fixed to be deferrable.
> > > > > > > > > 
> > > > > > > > > I assume the other patches are similar -- an rcu_barrier() can be
> > > > > > > > > removed. So some manual meddling of these might be in order.
> > > > > > > > 
> > > > > > > > Assuming that the deferrable kmem_cache_destroy() is the option chosen,
> > > > > > > > agreed.
> > > > > > > >
> > > > > > > <snip>
> > > > > > > void kmem_cache_destroy(struct kmem_cache *s)
> > > > > > > {
> > > > > > > 	int err = -EBUSY;
> > > > > > > 	bool rcu_set;
> > > > > > > 
> > > > > > > 	if (unlikely(!s) || !kasan_check_byte(s))
> > > > > > > 		return;
> > > > > > > 
> > > > > > > 	cpus_read_lock();
> > > > > > > 	mutex_lock(&slab_mutex);
> > > > > > > 
> > > > > > > 	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
> > > > > > > 
> > > > > > > 	s->refcount--;
> > > > > > > 	if (s->refcount)
> > > > > > > 		goto out_unlock;
> > > > > > > 
> > > > > > > 	err = shutdown_cache(s);
> > > > > > > 	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
> > > > > > > 	     __func__, s->name, (void *)_RET_IP_);
> > > > > > > ...
> > > > > > > 	cpus_read_unlock();
> > > > > > > 	if (!err && !rcu_set)
> > > > > > > 		kmem_cache_release(s);
> > > > > > > }
> > > > > > > <snip>
> > > > > > > 
> > > > > > > so we have the SLAB_TYPESAFE_BY_RCU flag that defers freeing slab pages
> > > > > > > and a cache by a grace period. A similar flag can be added, like
> > > > > > > SLAB_DESTROY_ONCE_FULLY_FREED; in this case a worker rearms itself
> > > > > > > if there are still objects which should be freed.
> > > > > > > 
> > > > > > > Any thoughts here?
> > > > > > 
> > > > > > Wouldn't we also need some additional code to later check for all objects
> > > > > > being freed to the slab, whether or not that code is initiated from
> > > > > > kmem_cache_destroy()?
> > > > > >
> > > > > The same way as SLAB_TYPESAFE_BY_RCU is handled from the kmem_cache_destroy() function.
> > > > > It checks that flag, and if it is set, an extra worker is scheduled to perform a
> > > > > deferred (instead of immediate) destroy after rcu_barrier() finishes.
> > > > 
> > > > Like this?
> > > > 
> > > > 	SLAB_DESTROY_ONCE_FULLY_FREED
> > > > 
> > > > 	Instead of adding a new kmem_cache_destroy_rcu()
> > > > 	or kmem_cache_destroy_wait() API member, add a
> > > > 	SLAB_DESTROY_ONCE_FULLY_FREED flag that can be passed to the
> > > > 	existing kmem_cache_destroy() function.  Use of this flag would
> > > > 	suppress any warnings that would otherwise be issued if there
> > > > 	was still slab memory yet to be freed, and it would also spawn
> > > > 	workqueues (or timers or whatever) to do any needed cleanup work.
> > > > 
> > > >
> > > The flag is passed like all the other flags when creating a cache:
> > > 
> > >   slab = kmem_cache_create(name, size, ..., SLAB_DESTROY_ONCE_FULLY_FREED | OTHER_FLAGS, NULL);
> > > 
> > > the rest of the description looks correct to me.
> > 
> > Good catch, fixed, thank you!
> > 
> And here we go with a prototype (untested):

Thank you for putting this together!  It looks way simpler than I would
have guessed, and quite a bit simpler than I would expect it would be
to extend rcu_barrier() to cover kfree_rcu().

> <snip>
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 7247e217e21b..700b8a909f8a 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -59,6 +59,7 @@ enum _slab_flag_bits {
>  #ifdef CONFIG_SLAB_OBJ_EXT
>  	_SLAB_NO_OBJ_EXT,
>  #endif
> +	_SLAB_DEFER_DESTROY,
>  	_SLAB_FLAGS_LAST_BIT
>  };
>  
> @@ -139,6 +140,7 @@ enum _slab_flag_bits {
>   */
>  /* Defer freeing slabs to RCU */
>  #define SLAB_TYPESAFE_BY_RCU	__SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU)
> +#define SLAB_DEFER_DESTROY __SLAB_FLAG_BIT(_SLAB_DEFER_DESTROY)
>  /* Trace allocations and frees */
>  #define SLAB_TRACE		__SLAB_FLAG_BIT(_SLAB_TRACE)
>  
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 1560a1546bb1..99458a0197b5 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -45,6 +45,11 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
>  static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
>  		    slab_caches_to_rcu_destroy_workfn);
>  
> +static LIST_HEAD(slab_caches_defer_destroy);
> +static void slab_caches_defer_destroy_workfn(struct work_struct *work);
> +static DECLARE_DELAYED_WORK(slab_caches_defer_destroy_work,
> +	slab_caches_defer_destroy_workfn);
> +
>  /*
>   * Set of flags that will prevent slab merging
>   */
> @@ -448,6 +453,31 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
>  	}
>  }
>  
> +static void
> +slab_caches_defer_destroy_workfn(struct work_struct *work)
> +{
> +	struct kmem_cache *s, *s2;
> +
> +	mutex_lock(&slab_mutex);
> +	list_for_each_entry_safe(s, s2, &slab_caches_defer_destroy, list) {
> +		if (__kmem_cache_empty(s)) {
> +			/* free asan quarantined objects */
> +			kasan_cache_shutdown(s);
> +			(void) __kmem_cache_shutdown(s);
> +
> +			list_del(&s->list);
> +
> +			debugfs_slab_release(s);
> +			kfence_shutdown_cache(s);
> +			kmem_cache_release(s);
> +		}

My guess is that there would want to be a splat if the slab stuck around
for too long, but maybe that should instead be handled elsewhere or in
some other way?  I must defer to you guys on that one.

							Thanx, Paul

> +	}
> +	mutex_unlock(&slab_mutex);
> +
> +	if (!list_empty(&slab_caches_defer_destroy))
> +		schedule_delayed_work(&slab_caches_defer_destroy_work, HZ);
> +}
> +
>  static int shutdown_cache(struct kmem_cache *s)
>  {
>  	/* free asan quarantined objects */
> @@ -493,6 +523,13 @@ void kmem_cache_destroy(struct kmem_cache *s)
>  	if (s->refcount)
>  		goto out_unlock;
>  
> +	/* Should a destroy process be deferred? */
> +	if (s->flags & SLAB_DEFER_DESTROY) {
> +		list_move_tail(&s->list, &slab_caches_defer_destroy);
> +		schedule_delayed_work(&slab_caches_defer_destroy_work, HZ);
> +		goto out_unlock;
> +	}
> +
>  	err = shutdown_cache(s);
>  	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
>  	     __func__, s->name, (void *)_RET_IP_);
> <snip>



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-14 14:17                       ` Paul E. McKenney
@ 2024-06-14 14:50                         ` Uladzislau Rezki
  0 siblings, 0 replies; 50+ messages in thread
From: Uladzislau Rezki @ 2024-06-14 14:50 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Uladzislau Rezki, Vlastimil Babka, Jason A. Donenfeld,
	Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam

On Fri, Jun 14, 2024 at 07:17:29AM -0700, Paul E. McKenney wrote:
> On Fri, Jun 14, 2024 at 02:35:33PM +0200, Uladzislau Rezki wrote:
> > On Thu, Jun 13, 2024 at 11:13:52AM -0700, Paul E. McKenney wrote:
> > > On Thu, Jun 13, 2024 at 07:58:17PM +0200, Uladzislau Rezki wrote:
> > > > On Thu, Jun 13, 2024 at 10:45:59AM -0700, Paul E. McKenney wrote:
> > > > > On Thu, Jun 13, 2024 at 07:38:59PM +0200, Uladzislau Rezki wrote:
> > > > > > On Thu, Jun 13, 2024 at 08:06:30AM -0700, Paul E. McKenney wrote:
> > > > > > > On Thu, Jun 13, 2024 at 03:06:54PM +0200, Uladzislau Rezki wrote:
> > > > > > > > On Thu, Jun 13, 2024 at 05:47:08AM -0700, Paul E. McKenney wrote:
> > > > > > > > > On Thu, Jun 13, 2024 at 01:58:59PM +0200, Jason A. Donenfeld wrote:
> > > > > > > > > > On Wed, Jun 12, 2024 at 03:37:55PM -0700, Paul E. McKenney wrote:
> > > > > > > > > > > On Wed, Jun 12, 2024 at 02:33:05PM -0700, Jakub Kicinski wrote:
> > > > > > > > > > > > On Sun,  9 Jun 2024 10:27:12 +0200 Julia Lawall wrote:
> > > > > > > > > > > > > Since SLOB was removed, it is not necessary to use call_rcu
> > > > > > > > > > > > > when the callback only performs kmem_cache_free. Use
> > > > > > > > > > > > > kfree_rcu() directly.
> > > > > > > > > > > > > 
> > > > > > > > > > > > > The changes were done using the following Coccinelle semantic patch.
> > > > > > > > > > > > > This semantic patch is designed to ignore cases where the callback
> > > > > > > > > > > > > function is used in another way.
> > > > > > > > > > > > 
> > > > > > > > > > > > How does the discussion on:
> > > > > > > > > > > >   [PATCH] Revert "batman-adv: prefer kfree_rcu() over call_rcu() with free-only callbacks"
> > > > > > > > > > > >   https://lore.kernel.org/all/20240612133357.2596-1-linus.luessing@c0d3.blue/
> > > > > > > > > > > > reflect on this series? IIUC we should hold off..
> > > > > > > > > > > 
> > > > > > > > > > > We do need to hold off for the ones in kernel modules (such as 07/14)
> > > > > > > > > > > where the kmem_cache is destroyed during module unload.
> > > > > > > > > > > 
> > > > > > > > > > > OK, I might as well go through them...
> > > > > > > > > > > 
> > > > > > > > > > > [PATCH 01/14] wireguard: allowedips: replace call_rcu by kfree_rcu for simple kmem_cache_free callback
> > > > > > > > > > > 	Needs to wait, see wg_allowedips_slab_uninit().
> > > > > > > > > > 
> > > > > > > > > > Also, notably, this patch additionally needs:
> > > > > > > > > > 
> > > > > > > > > > diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c
> > > > > > > > > > index e4e1638fce1b..c95f6937c3f1 100644
> > > > > > > > > > --- a/drivers/net/wireguard/allowedips.c
> > > > > > > > > > +++ b/drivers/net/wireguard/allowedips.c
> > > > > > > > > > @@ -377,7 +377,6 @@ int __init wg_allowedips_slab_init(void)
> > > > > > > > > > 
> > > > > > > > > >  void wg_allowedips_slab_uninit(void)
> > > > > > > > > >  {
> > > > > > > > > > -	rcu_barrier();
> > > > > > > > > >  	kmem_cache_destroy(node_cache);
> > > > > > > > > >  }
> > > > > > > > > > 
> > > > > > > > > > Once kmem_cache_destroy has been fixed to be deferrable.
> > > > > > > > > > 
> > > > > > > > > > I assume the other patches are similar -- an rcu_barrier() can be
> > > > > > > > > > removed. So some manual meddling of these might be in order.
> > > > > > > > > 
> > > > > > > > > Assuming that the deferrable kmem_cache_destroy() is the option chosen,
> > > > > > > > > agreed.
> > > > > > > > >
> > > > > > > > <snip>
> > > > > > > > void kmem_cache_destroy(struct kmem_cache *s)
> > > > > > > > {
> > > > > > > > 	int err = -EBUSY;
> > > > > > > > 	bool rcu_set;
> > > > > > > > 
> > > > > > > > 	if (unlikely(!s) || !kasan_check_byte(s))
> > > > > > > > 		return;
> > > > > > > > 
> > > > > > > > 	cpus_read_lock();
> > > > > > > > 	mutex_lock(&slab_mutex);
> > > > > > > > 
> > > > > > > > 	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
> > > > > > > > 
> > > > > > > > 	s->refcount--;
> > > > > > > > 	if (s->refcount)
> > > > > > > > 		goto out_unlock;
> > > > > > > > 
> > > > > > > > 	err = shutdown_cache(s);
> > > > > > > > 	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
> > > > > > > > 	     __func__, s->name, (void *)_RET_IP_);
> > > > > > > > ...
> > > > > > > > 	cpus_read_unlock();
> > > > > > > > 	if (!err && !rcu_set)
> > > > > > > > 		kmem_cache_release(s);
> > > > > > > > }
> > > > > > > > <snip>
> > > > > > > > 
> > > > > > > > so we have the SLAB_TYPESAFE_BY_RCU flag that defers freeing slab pages
> > > > > > > > and a cache by a grace period. A similar flag can be added, like
> > > > > > > > SLAB_DESTROY_ONCE_FULLY_FREED; in this case a worker rearms itself
> > > > > > > > if there are still objects which should be freed.
> > > > > > > > 
> > > > > > > > Any thoughts here?
> > > > > > > 
> > > > > > > Wouldn't we also need some additional code to later check for all objects
> > > > > > > being freed to the slab, whether or not that code is initiated from
> > > > > > > kmem_cache_destroy()?
> > > > > > >
> > > > > > The same way as SLAB_TYPESAFE_BY_RCU is handled from the kmem_cache_destroy() function.
> > > > > > It checks that flag, and if it is set, an extra worker is scheduled to perform a
> > > > > > deferred (instead of immediate) destroy after rcu_barrier() finishes.
> > > > > 
> > > > > Like this?
> > > > > 
> > > > > 	SLAB_DESTROY_ONCE_FULLY_FREED
> > > > > 
> > > > > 	Instead of adding a new kmem_cache_destroy_rcu()
> > > > > 	or kmem_cache_destroy_wait() API member, add a
> > > > > 	SLAB_DESTROY_ONCE_FULLY_FREED flag that can be passed to the
> > > > > 	existing kmem_cache_destroy() function.  Use of this flag would
> > > > > 	suppress any warnings that would otherwise be issued if there
> > > > > 	was still slab memory yet to be freed, and it would also spawn
> > > > > 	workqueues (or timers or whatever) to do any needed cleanup work.
> > > > > 
> > > > >
> > > > The flag is passed like all the others when creating a cache:
> > > > 
> > > >   slab = kmem_cache_create(name, size, ..., SLAB_DESTROY_ONCE_FULLY_FREED | OTHER_FLAGS, NULL);
> > > > 
> > > > the rest of the description looks correct to me.
> > > 
> > > Good catch, fixed, thank you!
> > > 
> > And here we go with a prototype (untested):
> 
> Thank you for putting this together!  It looks way simpler than I would
> have guessed, and quite a bit simpler than I would expect it would be
> to extend rcu_barrier() to cover kfree_rcu().
> 
Yep, it should be pretty straightforward. The slab mechanism does not
have any functionality for deferring destruction, i.e. it is not
allowed to destroy a cache that is not fully freed:

<snip>
void kmem_cache_destroy(struct kmem_cache *s)
{
...
	err = shutdown_cache(s);
	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
	     __func__, s->name, (void *)_RET_IP_);
...
<snip>

So, this patch extends it.
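
For illustration, usage of such a deferred-destroy cache could look like
this (a minimal sketch; SLAB_DEFER_DESTROY is the flag name used in the
prototype in this thread, and my_obj/my_cache are placeholder names):

<snip>
struct my_obj {
	struct rcu_head rcu;
};

/* Opt in to deferred destruction when creating the cache. */
my_cache = kmem_cache_create("my_obj", sizeof(struct my_obj),
			     0, SLAB_DEFER_DESTROY, NULL);

obj = kmem_cache_alloc(my_cache, GFP_KERNEL);
...
kfree_rcu(obj, rcu);		/* this free may still be in flight... */
kmem_cache_destroy(my_cache);	/* ...so the destroy gets deferred */
<snip>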

> >  
> > +static void
> > +slab_caches_defer_destroy_workfn(struct work_struct *work)
> > +{
> > +	struct kmem_cache *s, *s2;
> > +
> > +	mutex_lock(&slab_mutex);
> > +	list_for_each_entry_safe(s, s2, &slab_caches_defer_destroy, list) {
> > +		if (__kmem_cache_empty(s)) {
> > +			/* free asan quarantined objects */
> > +			kasan_cache_shutdown(s);
> > +			(void) __kmem_cache_shutdown(s);
> > +
> > +			list_del(&s->list);
> > +
> > +			debugfs_slab_release(s);
> > +			kfence_shutdown_cache(s);
> > +			kmem_cache_release(s);
> > +		}
> 
> My guess is that there would want to be a splat if the slab stuck around
> for too long, but maybe that should instead be handled elsewhere or in
> some other way?  I must defer to you guys on that one.
> 
Probably yes.

--
Uladzislau Rezki

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-14 12:35                     ` Uladzislau Rezki
  2024-06-14 14:17                       ` Paul E. McKenney
@ 2024-06-14 19:33                       ` Jason A. Donenfeld
  2024-06-17 13:50                         ` Uladzislau Rezki
  2024-06-17 14:37                         ` Vlastimil Babka
  1 sibling, 2 replies; 50+ messages in thread
From: Jason A. Donenfeld @ 2024-06-14 19:33 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Paul E. McKenney, Vlastimil Babka, Jakub Kicinski, Julia Lawall,
	linux-block, kernel-janitors, bridge, linux-trace-kernel,
	Mathieu Desnoyers, kvm, linuxppc-dev, Naveen N. Rao,
	Christophe Leroy, Nicholas Piggin, netdev, wireguard,
	linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia, Dai Ngo,
	Tom Talpey, linux-nfs, linux-can, Lai Jiangshan, netfilter-devel,
	coreteam

On Fri, Jun 14, 2024 at 02:35:33PM +0200, Uladzislau Rezki wrote:
> +	/* Should a destroy process be deferred? */
> +	if (s->flags & SLAB_DEFER_DESTROY) {
> +		list_move_tail(&s->list, &slab_caches_defer_destroy);
> +		schedule_delayed_work(&slab_caches_defer_destroy_work, HZ);
> +		goto out_unlock;
> +	}

Wouldn't it be smoother to have the actual kmem_cache_free() function
check to see if it's been marked for destruction and the refcount is
zero, rather than polling every one second? I mentioned this approach
in: https://lore.kernel.org/all/Zmo9-YGraiCj5-MI@zx2c4.com/ -

    I wonder if the right fix to this would be adding a `should_destroy`
    boolean to kmem_cache, which kmem_cache_destroy() sets to true. And
    then right after it checks `if (number_of_allocations == 0)
    actually_destroy()`, and likewise on each kmem_cache_free(), it
    could check `if (should_destroy && number_of_allocations == 0)
    actually_destroy()`. 

Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-14 19:33                       ` Jason A. Donenfeld
@ 2024-06-17 13:50                         ` Uladzislau Rezki
  2024-06-17 14:56                           ` Jason A. Donenfeld
  2024-06-17 14:37                         ` Vlastimil Babka
  1 sibling, 1 reply; 50+ messages in thread
From: Uladzislau Rezki @ 2024-06-17 13:50 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Uladzislau Rezki, Paul E. McKenney, Vlastimil Babka,
	Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam

On Fri, Jun 14, 2024 at 09:33:45PM +0200, Jason A. Donenfeld wrote:
> On Fri, Jun 14, 2024 at 02:35:33PM +0200, Uladzislau Rezki wrote:
> > +	/* Should a destroy process be deferred? */
> > +	if (s->flags & SLAB_DEFER_DESTROY) {
> > +		list_move_tail(&s->list, &slab_caches_defer_destroy);
> > +		schedule_delayed_work(&slab_caches_defer_destroy_work, HZ);
> > +		goto out_unlock;
> > +	}
> 
> Wouldn't it be smoother to have the actual kmem_cache_free() function
> check to see if it's been marked for destruction and the refcount is
> zero, rather than polling every one second? I mentioned this approach
> in: https://lore.kernel.org/all/Zmo9-YGraiCj5-MI@zx2c4.com/ -
> 
>     I wonder if the right fix to this would be adding a `should_destroy`
>     boolean to kmem_cache, which kmem_cache_destroy() sets to true. And
>     then right after it checks `if (number_of_allocations == 0)
>     actually_destroy()`, and likewise on each kmem_cache_free(), it
>     could check `if (should_destroy && number_of_allocations == 0)
>     actually_destroy()`. 
> 
I do not find polling to be a bad way to go. But your proposal
sounds reasonable to me also. We can combine both "prototypes" into
one and offer that.

Can you post a prototype here?

Thanks!

--
Uladzislau Rezki

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-14 19:33                       ` Jason A. Donenfeld
  2024-06-17 13:50                         ` Uladzislau Rezki
@ 2024-06-17 14:37                         ` Vlastimil Babka
  1 sibling, 0 replies; 50+ messages in thread
From: Vlastimil Babka @ 2024-06-17 14:37 UTC (permalink / raw)
  To: Jason A. Donenfeld, Uladzislau Rezki
  Cc: Paul E. McKenney, Jakub Kicinski, Julia Lawall, linux-block,
	kernel-janitors, bridge, linux-trace-kernel, Mathieu Desnoyers,
	kvm, linuxppc-dev, Naveen N. Rao, Christophe Leroy,
	Nicholas Piggin, netdev, wireguard, linux-kernel, ecryptfs,
	Neil Brown, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs,
	linux-can, Lai Jiangshan, netfilter-devel, coreteam

On 6/14/24 9:33 PM, Jason A. Donenfeld wrote:
> On Fri, Jun 14, 2024 at 02:35:33PM +0200, Uladzislau Rezki wrote:
>> +	/* Should a destroy process be deferred? */
>> +	if (s->flags & SLAB_DEFER_DESTROY) {
>> +		list_move_tail(&s->list, &slab_caches_defer_destroy);
>> +		schedule_delayed_work(&slab_caches_defer_destroy_work, HZ);
>> +		goto out_unlock;
>> +	}
> 
> Wouldn't it be smoother to have the actual kmem_cache_free() function
> check to see if it's been marked for destruction and the refcount is
> zero, rather than polling every one second? I mentioned this approach
> in: https://lore.kernel.org/all/Zmo9-YGraiCj5-MI@zx2c4.com/ -
> 
>     I wonder if the right fix to this would be adding a `should_destroy`
>     boolean to kmem_cache, which kmem_cache_destroy() sets to true. And
>     then right after it checks `if (number_of_allocations == 0)
>     actually_destroy()`, and likewise on each kmem_cache_free(), it
>     could check `if (should_destroy && number_of_allocations == 0)
>     actually_destroy()`. 

I would prefer not to affect the performance of kmem_cache_free() by doing
such checks, if possible. Ideally we'd have a way to wait/poll for the
kfree_rcu() "grace period" expiring even with the batching that's
implemented there. Even if it's pessimistically long to avoid affecting
kfree_rcu() performance. The goal here is just to print the warnings if
there was a leak and the precise timing of them shouldn't matter. The owning
module could already be unloaded at that point? I guess only a kunit test
could want to be synchronous and then it could just ask for
kmem_cache_free() to wait synchronously.

> Jason


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 13:50                         ` Uladzislau Rezki
@ 2024-06-17 14:56                           ` Jason A. Donenfeld
  2024-06-17 16:30                             ` Uladzislau Rezki
  0 siblings, 1 reply; 50+ messages in thread
From: Jason A. Donenfeld @ 2024-06-17 14:56 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Paul E. McKenney, Vlastimil Babka, Jakub Kicinski, Julia Lawall,
	linux-block, kernel-janitors, bridge, linux-trace-kernel,
	Mathieu Desnoyers, kvm, linuxppc-dev, Naveen N. Rao,
	Christophe Leroy, Nicholas Piggin, netdev, wireguard,
	linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia, Dai Ngo,
	Tom Talpey, linux-nfs, linux-can, Lai Jiangshan, netfilter-devel,
	coreteam

On Mon, Jun 17, 2024 at 03:50:56PM +0200, Uladzislau Rezki wrote:
> On Fri, Jun 14, 2024 at 09:33:45PM +0200, Jason A. Donenfeld wrote:
> > On Fri, Jun 14, 2024 at 02:35:33PM +0200, Uladzislau Rezki wrote:
> > > +	/* Should a destroy process be deferred? */
> > > +	if (s->flags & SLAB_DEFER_DESTROY) {
> > > +		list_move_tail(&s->list, &slab_caches_defer_destroy);
> > > +		schedule_delayed_work(&slab_caches_defer_destroy_work, HZ);
> > > +		goto out_unlock;
> > > +	}
> > 
> > Wouldn't it be smoother to have the actual kmem_cache_free() function
> > check to see if it's been marked for destruction and the refcount is
> > zero, rather than polling every one second? I mentioned this approach
> > in: https://lore.kernel.org/all/Zmo9-YGraiCj5-MI@zx2c4.com/ -
> > 
> >     I wonder if the right fix to this would be adding a `should_destroy`
> >     boolean to kmem_cache, which kmem_cache_destroy() sets to true. And
> >     then right after it checks `if (number_of_allocations == 0)
> >     actually_destroy()`, and likewise on each kmem_cache_free(), it
> >     could check `if (should_destroy && number_of_allocations == 0)
> >     actually_destroy()`. 
> > 
> I do not find polling to be a bad way to go. But your proposal
> sounds reasonable to me also. We can combine both "prototypes" into
> one and offer that.
> 
> Can you post a prototype here?

This is untested, but the simplest, shortest possible version would be:

diff --git a/mm/slab.h b/mm/slab.h
index 5f8f47c5bee0..907c0ea56c01 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -275,6 +275,7 @@ struct kmem_cache {
 	unsigned int inuse;		/* Offset to metadata */
 	unsigned int align;		/* Alignment */
 	unsigned int red_left_pad;	/* Left redzone padding size */
+	bool is_destroyed;		/* Destruction happens when no objects */
 	const char *name;		/* Name (only for display!) */
 	struct list_head list;		/* List of slab caches */
 #ifdef CONFIG_SYSFS
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1560a1546bb1..f700bed066d9 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -494,8 +494,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
 		goto out_unlock;

 	err = shutdown_cache(s);
-	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
-	     __func__, s->name, (void *)_RET_IP_);
+	if (err)
+		s->is_destroyed = true;
 out_unlock:
 	mutex_unlock(&slab_mutex);
 	cpus_read_unlock();
diff --git a/mm/slub.c b/mm/slub.c
index 1373ac365a46..7db8fe90a323 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4510,6 +4510,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
 		return;
 	trace_kmem_cache_free(_RET_IP_, x, s);
 	slab_free(s, virt_to_slab(x), x, _RET_IP_);
+	if (s->is_destroyed)
+		kmem_cache_destroy(s);
 }
 EXPORT_SYMBOL(kmem_cache_free);

@@ -5342,9 +5344,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 		if (!slab->inuse) {
 			remove_partial(n, slab);
 			list_add(&slab->slab_list, &discard);
-		} else {
-			list_slab_objects(s, slab,
-			  "Objects remaining in %s on __kmem_cache_shutdown()");
 		}
 	}
 	spin_unlock_irq(&n->list_lock);


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-13 12:22           ` Jason A. Donenfeld
  2024-06-13 12:46             ` Paul E. McKenney
@ 2024-06-17 15:10             ` Vlastimil Babka
  2024-06-17 16:12               ` Paul E. McKenney
  1 sibling, 1 reply; 50+ messages in thread
From: Vlastimil Babka @ 2024-06-17 15:10 UTC (permalink / raw)
  To: Jason A. Donenfeld, Paul E. McKenney, Uladzislau Rezki (Sony)
  Cc: Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam

On 6/13/24 2:22 PM, Jason A. Donenfeld wrote:
> On Wed, Jun 12, 2024 at 08:38:02PM -0700, Paul E. McKenney wrote:
>> o	Make the current kmem_cache_destroy() asynchronously wait for
>> 	all memory to be returned, then complete the destruction.
>> 	(This gets rid of a valuable debugging technique because
>> 	in normal use, it is a bug to attempt to destroy a kmem_cache
>> 	that has objects still allocated.)

This seems like the best option to me. As Jason already said, the debugging
technique is not affected significantly, if the warning just occurs
asynchronously later. The module can already be unloaded at that point, as
the leak is never checked programmatically anyway to control further
execution, it's just a splat in dmesg.

> Specifically what I mean is that we can still claim a memory leak has
> occurred if one batched kfree_rcu freeing grace period has elapsed since
> the last call to kmem_cache_destroy_rcu_wait/barrier() or
> kmem_cache_destroy_rcu(). In that case, you quit blocking, or you quit
> asynchronously waiting, and then you splat about a memleak like we have
> now.

Yes so we'd need the kmem_cache_free_barrier() for a slab kunit test (or the
pessimistic variant waiting for the 21 seconds), and a polling variant of
the same thing for the asynchronous destruction. Or we don't need a polling
variant if it's ok to invoke such a barrier in a schedule_work() workfn.
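
For illustration, such a workfn could look roughly like this (a sketch;
kmem_cache_free_barrier() is the hypothetical API discussed in this
thread, the field/function names are placeholders, and locking is elided):

<snip>
static void cache_async_destroy_workfn(struct work_struct *work)
{
	struct kmem_cache *s =
		container_of(work, struct kmem_cache, async_destroy_work);

	/* Pessimistically wait for batched kfree_rcu() frees to drain. */
	kmem_cache_free_barrier(s);

	/* Anything still allocated now is a real leak: warn and release. */
	shutdown_cache(s);
}
<snip>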

We should not need any new kmem_cache flag nor kmem_cache_destroy() flag to
burden the users of kfree_rcu() with. We have __kmem_cache_shutdown() that
will try to flush everything immediately and if it doesn't succeed, we can
assume kfree_rcu() might be in flight and try to wait for it asynchronously,
without any flags.

SLAB_TYPESAFE_BY_RCU is still handled specially because it has special
semantics as well.

As for users of call_rcu() with arbitrary callbacks that might be functions
from the module that is about to unload, these should not return from
kmem_cache_destroy() with objects in flight. But those should be using
rcu_barrier() before calling kmem_cache_destroy() already, and probably we
should not try to handle this automagically? Maybe one potential change with
the described approach is that today they would get the "cache not empty"
warning immediately. But that wouldn't stop the module unload so later the
callbacks would try to execute unmapped code anyway. With the new approach
the asynchronous handling might delay the "cache not empty" warnings (or
not, if kmem_cache_free_barrier() would finish before a rcu_barrier() would)
so the unmapped code execution would come first. I don't think that would be
a regression.
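
For reference, the established pattern for such call_rcu() users is to
drain the callbacks before destroying the cache; a minimal sketch with
placeholder names:

<snip>
static struct kmem_cache *my_cache;

static void my_obj_free_rcu(struct rcu_head *head)
{
	struct my_obj *obj = container_of(head, struct my_obj, rcu);

	kmem_cache_free(my_cache, obj);
}

static void __exit my_module_exit(void)
{
	/* Wait for all in-flight call_rcu() callbacks to run... */
	rcu_barrier();
	/* ...so the cache is empty (or genuinely leaked) when destroyed. */
	kmem_cache_destroy(my_cache);
}
<snip>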


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 15:10             ` Vlastimil Babka
@ 2024-06-17 16:12               ` Paul E. McKenney
  2024-06-17 17:23                 ` Vlastimil Babka
  0 siblings, 1 reply; 50+ messages in thread
From: Paul E. McKenney @ 2024-06-17 16:12 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Jason A. Donenfeld, Uladzislau Rezki (Sony),
	Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam

On Mon, Jun 17, 2024 at 05:10:50PM +0200, Vlastimil Babka wrote:
> On 6/13/24 2:22 PM, Jason A. Donenfeld wrote:
> > On Wed, Jun 12, 2024 at 08:38:02PM -0700, Paul E. McKenney wrote:
> >> o	Make the current kmem_cache_destroy() asynchronously wait for
> >> 	all memory to be returned, then complete the destruction.
> >> 	(This gets rid of a valuable debugging technique because
> >> 	in normal use, it is a bug to attempt to destroy a kmem_cache
> >> 	that has objects still allocated.)
> 
> This seems like the best option to me. As Jason already said, the debugging
> technique is not affected significantly, if the warning just occurs
> asynchronously later. The module can already be unloaded at that point, as
> the leak is never checked programmatically anyway to control further
> execution, it's just a splat in dmesg.

Works for me!

> > Specifically what I mean is that we can still claim a memory leak has
> > occurred if one batched kfree_rcu freeing grace period has elapsed since
> > the last call to kmem_cache_destroy_rcu_wait/barrier() or
> > kmem_cache_destroy_rcu(). In that case, you quit blocking, or you quit
> > asynchronously waiting, and then you splat about a memleak like we have
> > now.
> 
> Yes so we'd need the kmem_cache_free_barrier() for a slab kunit test (or the
> pessimistic variant waiting for the 21 seconds), and a polling variant of
> the same thing for the asynchronous destruction. Or we don't need a polling
> variant if it's ok to invoke such a barrier in a schedule_work() workfn.
> 
> We should not need any new kmem_cache flag nor kmem_cache_destroy() flag to
> burden the users of kfree_rcu() with. We have __kmem_cache_shutdown() that
> will try to flush everything immediately and if it doesn't succeed, we can
> assume kfree_rcu() might be in flight and try to wait for it asynchronously,
> without any flags.

That does sound like a very attractive approach.

> SLAB_TYPESAFE_BY_RCU is still handled specially because it has special
> semantics as well.
> 
> As for users of call_rcu() with arbitrary callbacks that might be functions
> from the module that is about to unload, these should not return from
> kmem_cache_destroy() with objects in flight. But those should be using
> rcu_barrier() before calling kmem_cache_destroy() already, and probably we
> should not try to handle this automagically? Maybe one potential change with
> the described approach is that today they would get the "cache not empty"
> warning immediately. But that wouldn't stop the module unload so later the
> callbacks would try to execute unmapped code anyway. With the new approach
> the asynchronous handling might delay the "cache not empty" warnings (or
> not, if kmem_cache_free_barrier() would finish before a rcu_barrier() would)
> so the unmapped code execution would come first. I don't think that would be
> a regression.

Agreed.

There are some use cases where a call_rcu() from a module without an
rcu_barrier() would be OK, for example, if the callback function was
defined in the core kernel and either: (1) The memory was from kmalloc()
or (2) The memory was from kmem_cache_alloc() and your suggested
changes above have been applied.  My current belief is that these are
too special of cases to be worth optimizing for, so that the rule should
remain "If you use call_rcu() in a module, you must call rcu_barrier()
within the module-unload code."

There have been discussions of having module-unload automatically invoke
rcu_barrier() if needed, but thus far we have not come up with a good
way to do this.  Challenges include things like static inline functions
from the core kernel invoking call_rcu(), in which case how to figure
out that the rcu_barrier() is not needed?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 14:56                           ` Jason A. Donenfeld
@ 2024-06-17 16:30                             ` Uladzislau Rezki
  2024-06-17 16:33                               ` Jason A. Donenfeld
  0 siblings, 1 reply; 50+ messages in thread
From: Uladzislau Rezki @ 2024-06-17 16:30 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Uladzislau Rezki, Paul E. McKenney, Vlastimil Babka,
	Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam

On Mon, Jun 17, 2024 at 04:56:17PM +0200, Jason A. Donenfeld wrote:
> On Mon, Jun 17, 2024 at 03:50:56PM +0200, Uladzislau Rezki wrote:
> > On Fri, Jun 14, 2024 at 09:33:45PM +0200, Jason A. Donenfeld wrote:
> > > On Fri, Jun 14, 2024 at 02:35:33PM +0200, Uladzislau Rezki wrote:
> > > > +	/* Should a destroy process be deferred? */
> > > > +	if (s->flags & SLAB_DEFER_DESTROY) {
> > > > +		list_move_tail(&s->list, &slab_caches_defer_destroy);
> > > > +		schedule_delayed_work(&slab_caches_defer_destroy_work, HZ);
> > > > +		goto out_unlock;
> > > > +	}
> > > 
> > > Wouldn't it be smoother to have the actual kmem_cache_free() function
> > > check to see if it's been marked for destruction and the refcount is
> > > zero, rather than polling every one second? I mentioned this approach
> > > in: https://lore.kernel.org/all/Zmo9-YGraiCj5-MI@zx2c4.com/ -
> > > 
> > >     I wonder if the right fix to this would be adding a `should_destroy`
> > >     boolean to kmem_cache, which kmem_cache_destroy() sets to true. And
> > >     then right after it checks `if (number_of_allocations == 0)
> > >     actually_destroy()`, and likewise on each kmem_cache_free(), it
> > >     could check `if (should_destroy && number_of_allocations == 0)
> > >     actually_destroy()`. 
> > > 
> > I do not find polling to be a bad way to go. But your proposal
> > sounds reasonable to me also. We can combine both "prototypes" into
> > one and offer that.
> > 
> > Can you post a prototype here?
> 
> This is untested, but the simplest, shortest possible version would be:
> 
> diff --git a/mm/slab.h b/mm/slab.h
> index 5f8f47c5bee0..907c0ea56c01 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -275,6 +275,7 @@ struct kmem_cache {
>  	unsigned int inuse;		/* Offset to metadata */
>  	unsigned int align;		/* Alignment */
>  	unsigned int red_left_pad;	/* Left redzone padding size */
> +	bool is_destroyed;		/* Destruction happens when no objects */
>  	const char *name;		/* Name (only for display!) */
>  	struct list_head list;		/* List of slab caches */
>  #ifdef CONFIG_SYSFS
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 1560a1546bb1..f700bed066d9 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -494,8 +494,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
>  		goto out_unlock;
> 
>  	err = shutdown_cache(s);
> -	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
> -	     __func__, s->name, (void *)_RET_IP_);
> +	if (err)
> +		s->is_destroyed = true;
>
Here, if "err" is less than "0", it means there are still objects,
whereas "is_destroyed" is set to "true", which does not correlate
with the comment:

"Destruction happens when no objects"

>  out_unlock:
>  	mutex_unlock(&slab_mutex);
>  	cpus_read_unlock();
> diff --git a/mm/slub.c b/mm/slub.c
> index 1373ac365a46..7db8fe90a323 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -4510,6 +4510,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
>  		return;
>  	trace_kmem_cache_free(_RET_IP_, x, s);
>  	slab_free(s, virt_to_slab(x), x, _RET_IP_);
> +	if (s->is_destroyed)
> +		kmem_cache_destroy(s);
>  }
>  EXPORT_SYMBOL(kmem_cache_free);
> 
> @@ -5342,9 +5344,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
>  		if (!slab->inuse) {
>  			remove_partial(n, slab);
>  			list_add(&slab->slab_list, &discard);
> -		} else {
> -			list_slab_objects(s, slab,
> -			  "Objects remaining in %s on __kmem_cache_shutdown()");
>  		}
>  	}
>  	spin_unlock_irq(&n->list_lock);
> 
Anyway it looks like it was not welcome to do it in the kmem_cache_free()
function due to performance reasons.

--
Uladzislau Rezki

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 16:30                             ` Uladzislau Rezki
@ 2024-06-17 16:33                               ` Jason A. Donenfeld
  2024-06-17 16:38                                 ` Vlastimil Babka
  2024-06-17 16:42                                 ` Uladzislau Rezki
  0 siblings, 2 replies; 50+ messages in thread
From: Jason A. Donenfeld @ 2024-06-17 16:33 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Paul E. McKenney, Vlastimil Babka, Jakub Kicinski, Julia Lawall,
	linux-block, kernel-janitors, bridge, linux-trace-kernel,
	Mathieu Desnoyers, kvm, linuxppc-dev, Naveen N. Rao,
	Christophe Leroy, Nicholas Piggin, netdev, wireguard,
	linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia, Dai Ngo,
	Tom Talpey, linux-nfs, linux-can, Lai Jiangshan, netfilter-devel,
	coreteam

On Mon, Jun 17, 2024 at 6:30 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> Here, if "err" is less than "0", it means there are still objects,
> whereas "is_destroyed" is set to "true", which does not correlate
> with the comment:
>
> "Destruction happens when no objects"

The comment is just poorly written. But the logic of the code is right.

>
> >  out_unlock:
> >       mutex_unlock(&slab_mutex);
> >       cpus_read_unlock();
> > diff --git a/mm/slub.c b/mm/slub.c
> > index 1373ac365a46..7db8fe90a323 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -4510,6 +4510,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
> >               return;
> >       trace_kmem_cache_free(_RET_IP_, x, s);
> >       slab_free(s, virt_to_slab(x), x, _RET_IP_);
> > +     if (s->is_destroyed)
> > +             kmem_cache_destroy(s);
> >  }
> >  EXPORT_SYMBOL(kmem_cache_free);
> >
> > @@ -5342,9 +5344,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
> >               if (!slab->inuse) {
> >                       remove_partial(n, slab);
> >                       list_add(&slab->slab_list, &discard);
> > -             } else {
> > -                     list_slab_objects(s, slab,
> > -                       "Objects remaining in %s on __kmem_cache_shutdown()");
> >               }
> >       }
> >       spin_unlock_irq(&n->list_lock);
> >
> Anyway it looks like it was not welcome to do it in the kmem_cache_free()
> function due to performance reasons.

"was not welcome" - Vlastimil mentioned *potential* performance
concerns before I posted this. I suspect he might have a different
view now, maybe?

Vlastimil, this is just checking a boolean (which could be
unlikely()'d), which should have pretty minimal overhead. Is that
alright with you?

Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 16:33                               ` Jason A. Donenfeld
@ 2024-06-17 16:38                                 ` Vlastimil Babka
  2024-06-17 17:04                                   ` Jason A. Donenfeld
  2024-06-17 16:42                                 ` Uladzislau Rezki
  1 sibling, 1 reply; 50+ messages in thread
From: Vlastimil Babka @ 2024-06-17 16:38 UTC (permalink / raw)
  To: Jason A. Donenfeld, Uladzislau Rezki
  Cc: Paul E. McKenney, Jakub Kicinski, Julia Lawall, linux-block,
	kernel-janitors, bridge, linux-trace-kernel, Mathieu Desnoyers,
	kvm, linuxppc-dev, Naveen N. Rao, Christophe Leroy,
	Nicholas Piggin, netdev, wireguard, linux-kernel, ecryptfs,
	Neil Brown, Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs,
	linux-can, Lai Jiangshan, netfilter-devel, coreteam

On 6/17/24 6:33 PM, Jason A. Donenfeld wrote:
> On Mon, Jun 17, 2024 at 6:30 PM Uladzislau Rezki <urezki@gmail.com> wrote:
>> Here, if "err" is less than "0", it means there are still objects,
>> whereas "is_destroyed" is set to "true", which does not correlate
>> with the comment:
>>
>> "Destruction happens when no objects"
> 
> The comment is just poorly written. But the logic of the code is right.
> 
>>
>> >  out_unlock:
>> >       mutex_unlock(&slab_mutex);
>> >       cpus_read_unlock();
>> > diff --git a/mm/slub.c b/mm/slub.c
>> > index 1373ac365a46..7db8fe90a323 100644
>> > --- a/mm/slub.c
>> > +++ b/mm/slub.c
>> > @@ -4510,6 +4510,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
>> >               return;
>> >       trace_kmem_cache_free(_RET_IP_, x, s);
>> >       slab_free(s, virt_to_slab(x), x, _RET_IP_);
>> > +     if (s->is_destroyed)
>> > +             kmem_cache_destroy(s);
>> >  }
>> >  EXPORT_SYMBOL(kmem_cache_free);
>> >
>> > @@ -5342,9 +5344,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
>> >               if (!slab->inuse) {
>> >                       remove_partial(n, slab);
>> >                       list_add(&slab->slab_list, &discard);
>> > -             } else {
>> > -                     list_slab_objects(s, slab,
>> > -                       "Objects remaining in %s on __kmem_cache_shutdown()");
>> >               }
>> >       }
>> >       spin_unlock_irq(&n->list_lock);
>> >
>> Anyway it looks like it was not welcome to do it in the kmem_cache_free()
>> function due to performance reasons.
> 
> "was not welcome" - Vlastimil mentioned *potential* performance
> concerns before I posted this. I suspect he might have a different
> view now, maybe?
> 
> Vlastimil, this is just checking a boolean (which could be
> unlikely()'d), which should have pretty minimal overhead. Is that
> alright with you?

Well I doubt we can just set and check it without any barriers? The
completion of the last pending kfree_rcu() might race with
kmem_cache_destroy() in a way that will leave the cache there forever, no?
And once we add barriers it becomes a perf issue?

> Jason


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 16:33                               ` Jason A. Donenfeld
  2024-06-17 16:38                                 ` Vlastimil Babka
@ 2024-06-17 16:42                                 ` Uladzislau Rezki
  2024-06-17 16:57                                   ` Jason A. Donenfeld
  1 sibling, 1 reply; 50+ messages in thread
From: Uladzislau Rezki @ 2024-06-17 16:42 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Uladzislau Rezki, Paul E. McKenney, Vlastimil Babka,
	Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam

On Mon, Jun 17, 2024 at 06:33:23PM +0200, Jason A. Donenfeld wrote:
> On Mon, Jun 17, 2024 at 6:30 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> > Here, if "err" is less than "0", it means there are still objects,
> > whereas "is_destroyed" is set to "true", which does not correlate
> > with the comment:
> >
> > "Destruction happens when no objects"
> 
> The comment is just poorly written. But the logic of the code is right.
> 
OK.

> >
> > >  out_unlock:
> > >       mutex_unlock(&slab_mutex);
> > >       cpus_read_unlock();
> > > diff --git a/mm/slub.c b/mm/slub.c
> > > index 1373ac365a46..7db8fe90a323 100644
> > > --- a/mm/slub.c
> > > +++ b/mm/slub.c
> > > @@ -4510,6 +4510,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
> > >               return;
> > >       trace_kmem_cache_free(_RET_IP_, x, s);
> > >       slab_free(s, virt_to_slab(x), x, _RET_IP_);
> > > +     if (s->is_destroyed)
> > > +             kmem_cache_destroy(s);
>
Here I am not following you. How do you see that a cache has been fully
freed? Or is it just super draft code?

Thanks!

--
Uladzislau Rezki

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 16:42                                 ` Uladzislau Rezki
@ 2024-06-17 16:57                                   ` Jason A. Donenfeld
  2024-06-17 17:19                                     ` Uladzislau Rezki
  0 siblings, 1 reply; 50+ messages in thread
From: Jason A. Donenfeld @ 2024-06-17 16:57 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Paul E. McKenney, Vlastimil Babka, Jakub Kicinski, Julia Lawall,
	linux-block, kernel-janitors, bridge, linux-trace-kernel,
	Mathieu Desnoyers, kvm, linuxppc-dev, Naveen N. Rao,
	Christophe Leroy, Nicholas Piggin, netdev, wireguard,
	linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia, Dai Ngo,
	Tom Talpey, linux-nfs, linux-can, Lai Jiangshan, netfilter-devel,
	coreteam

On Mon, Jun 17, 2024 at 06:42:23PM +0200, Uladzislau Rezki wrote:
> On Mon, Jun 17, 2024 at 06:33:23PM +0200, Jason A. Donenfeld wrote:
> > On Mon, Jun 17, 2024 at 6:30 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> > > Here, if "err" is less than "0", it means there are still objects,
> > > whereas "is_destroyed" is set to "true", which does not correlate
> > > with the comment:
> > >
> > > "Destruction happens when no objects"
> > 
> > The comment is just poorly written. But the logic of the code is right.
> > 
> OK.
> 
> > >
> > > >  out_unlock:
> > > >       mutex_unlock(&slab_mutex);
> > > >       cpus_read_unlock();
> > > > diff --git a/mm/slub.c b/mm/slub.c
> > > > index 1373ac365a46..7db8fe90a323 100644
> > > > --- a/mm/slub.c
> > > > +++ b/mm/slub.c
> > > > @@ -4510,6 +4510,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
> > > >               return;
> > > >       trace_kmem_cache_free(_RET_IP_, x, s);
> > > >       slab_free(s, virt_to_slab(x), x, _RET_IP_);
> > > > +     if (s->is_destroyed)
> > > > +             kmem_cache_destroy(s);
> >
> Here I am not following you. How do you see that a cache has been fully
> freed? Or is it just super draft code?

kmem_cache_destroy() does this in shutdown_cache().

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 16:38                                 ` Vlastimil Babka
@ 2024-06-17 17:04                                   ` Jason A. Donenfeld
  2024-06-17 21:19                                     ` Vlastimil Babka
  0 siblings, 1 reply; 50+ messages in thread
From: Jason A. Donenfeld @ 2024-06-17 17:04 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Uladzislau Rezki, Paul E. McKenney, Jakub Kicinski, Julia Lawall,
	linux-block, kernel-janitors, bridge, linux-trace-kernel,
	Mathieu Desnoyers, kvm, linuxppc-dev, Naveen N. Rao,
	Christophe Leroy, Nicholas Piggin, netdev, wireguard,
	linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia, Dai Ngo,
	Tom Talpey, linux-nfs, linux-can, Lai Jiangshan, netfilter-devel,
	coreteam

On Mon, Jun 17, 2024 at 06:38:52PM +0200, Vlastimil Babka wrote:
> On 6/17/24 6:33 PM, Jason A. Donenfeld wrote:
> > On Mon, Jun 17, 2024 at 6:30 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> >> Here, if "err" is less than "0", it means there are still objects,
> >> whereas "is_destroyed" is set to "true", which does not correlate
> >> with the comment:
> >>
> >> "Destruction happens when no objects"
> > 
> > The comment is just poorly written. But the logic of the code is right.
> > 
> >>
> >> >  out_unlock:
> >> >       mutex_unlock(&slab_mutex);
> >> >       cpus_read_unlock();
> >> > diff --git a/mm/slub.c b/mm/slub.c
> >> > index 1373ac365a46..7db8fe90a323 100644
> >> > --- a/mm/slub.c
> >> > +++ b/mm/slub.c
> >> > @@ -4510,6 +4510,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
> >> >               return;
> >> >       trace_kmem_cache_free(_RET_IP_, x, s);
> >> >       slab_free(s, virt_to_slab(x), x, _RET_IP_);
> >> > +     if (s->is_destroyed)
> >> > +             kmem_cache_destroy(s);
> >> >  }
> >> >  EXPORT_SYMBOL(kmem_cache_free);
> >> >
> >> > @@ -5342,9 +5344,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
> >> >               if (!slab->inuse) {
> >> >                       remove_partial(n, slab);
> >> >                       list_add(&slab->slab_list, &discard);
> >> > -             } else {
> >> > -                     list_slab_objects(s, slab,
> >> > -                       "Objects remaining in %s on __kmem_cache_shutdown()");
> >> >               }
> >> >       }
> >> >       spin_unlock_irq(&n->list_lock);
> >> >
> >> Anyway it looks like it was not welcome to do it in the kmem_cache_free()
> >> function due to performance reasons.
> > 
> > "was not welcome" - Vlastimil mentioned *potential* performance
> > concerns before I posted this. I suspect he might have a different
> > view now, maybe?
> > 
> > Vlastimil, this is just checking a boolean (which could be
> > unlikely()'d), which should have pretty minimal overhead. Is that
> > alright with you?
> 
> Well I doubt we can just set and check it without any barriers? The
> completion of the last pending kfree_rcu() might race with
> kmem_cache_destroy() in a way that will leave the cache there forever, no?
> And once we add barriers it becomes a perf issue?

Hm, yeah, you might be right about barriers being required. But actually,
might this point toward a larger problem, no matter which approach,
polling or event-based, is chosen? The current rule is that
kmem_cache_free() must never race with kmem_cache_destroy(), because
users have always made diligent use of call_rcu()/rcu_barrier() and
such. But now we're going to let those race with each other - either by
my change above or by polling - so we're potentially going to get in
trouble and need some barriers anyway.

I think?

Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 16:57                                   ` Jason A. Donenfeld
@ 2024-06-17 17:19                                     ` Uladzislau Rezki
  0 siblings, 0 replies; 50+ messages in thread
From: Uladzislau Rezki @ 2024-06-17 17:19 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Uladzislau Rezki, Paul E. McKenney, Vlastimil Babka,
	Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam

On Mon, Jun 17, 2024 at 06:57:45PM +0200, Jason A. Donenfeld wrote:
> On Mon, Jun 17, 2024 at 06:42:23PM +0200, Uladzislau Rezki wrote:
> > On Mon, Jun 17, 2024 at 06:33:23PM +0200, Jason A. Donenfeld wrote:
> > > On Mon, Jun 17, 2024 at 6:30 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> > > > Here, if "err" is less than "0", it means there are still objects,
> > > > whereas "is_destroyed" is set to "true", which does not correlate
> > > > with the comment:
> > > >
> > > > "Destruction happens when no objects"
> > > 
> > > The comment is just poorly written. But the logic of the code is right.
> > > 
> > OK.
> > 
> > > >
> > > > >  out_unlock:
> > > > >       mutex_unlock(&slab_mutex);
> > > > >       cpus_read_unlock();
> > > > > diff --git a/mm/slub.c b/mm/slub.c
> > > > > index 1373ac365a46..7db8fe90a323 100644
> > > > > --- a/mm/slub.c
> > > > > +++ b/mm/slub.c
> > > > > @@ -4510,6 +4510,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
> > > > >               return;
> > > > >       trace_kmem_cache_free(_RET_IP_, x, s);
> > > > >       slab_free(s, virt_to_slab(x), x, _RET_IP_);
> > > > > +     if (s->is_destroyed)
> > > > > +             kmem_cache_destroy(s);
> > >
> > Here I am not following you. How do you see that a cache has been fully
> > freed? Or is it just super draft code?
> 
> kmem_cache_destroy() does this in shutdown_cache().
>
Right. In this scenario you invoke kmem_cache_destroy() over and over
until the last object gets freed. This potentially slows down
kmem_cache_free(), which is not OK, at least to me.

--
Uladzislau Rezki

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 16:12               ` Paul E. McKenney
@ 2024-06-17 17:23                 ` Vlastimil Babka
  2024-06-17 18:42                   ` Uladzislau Rezki
  2024-06-17 18:54                   ` Paul E. McKenney
  0 siblings, 2 replies; 50+ messages in thread
From: Vlastimil Babka @ 2024-06-17 17:23 UTC (permalink / raw)
  To: paulmck
  Cc: Jason A. Donenfeld, Uladzislau Rezki (Sony),
	Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, kasan-dev

On 6/17/24 6:12 PM, Paul E. McKenney wrote:
> On Mon, Jun 17, 2024 at 05:10:50PM +0200, Vlastimil Babka wrote:
>> On 6/13/24 2:22 PM, Jason A. Donenfeld wrote:
>> > On Wed, Jun 12, 2024 at 08:38:02PM -0700, Paul E. McKenney wrote:
>> >> o	Make the current kmem_cache_destroy() asynchronously wait for
>> >> 	all memory to be returned, then complete the destruction.
>> >> 	(This gets rid of a valuable debugging technique because
>> >> 	in normal use, it is a bug to attempt to destroy a kmem_cache
>> >> 	that has objects still allocated.)
>> 
>> This seems like the best option to me. As Jason already said, the debugging
>> technique is not affected significantly, if the warning just occurs
> >> asynchronously later. The module can already be unloaded at that point, as
> >> the leak is never checked programmatically anyway to control further
>> execution, it's just a splat in dmesg.
> 
> Works for me!

Great. So this is how a prototype could look, hopefully? The kunit test
does generate the splat for me, which should be because the rcu_barrier() in
the implementation (marked to be replaced with the real thing) is really
insufficient. Note the test itself passes as this kind of error isn't wired
up properly.

Another thing to resolve is the marked comment about kasan_shutdown() with
potential kfree_rcu()'s in flight.

Also you need CONFIG_SLUB_DEBUG enabled otherwise node_nr_slabs() is a no-op
and it might fail to notice the pending slabs. This will need to change.

----8<----
diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c
index e6667a28c014..e3e4d0ca40b7 100644
--- a/lib/slub_kunit.c
+++ b/lib/slub_kunit.c
@@ -5,6 +5,7 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/rcupdate.h>
 #include "../mm/slab.h"
 
 static struct kunit_resource resource;
@@ -157,6 +158,26 @@ static void test_kmalloc_redzone_access(struct kunit *test)
 	kmem_cache_destroy(s);
 }
 
+struct test_kfree_rcu_struct {
+	struct rcu_head rcu;
+};
+
+static void test_kfree_rcu(struct kunit *test)
+{
+	struct kmem_cache *s = test_kmem_cache_create("TestSlub_kfree_rcu",
+				sizeof(struct test_kfree_rcu_struct),
+				SLAB_NO_MERGE);
+	struct test_kfree_rcu_struct *p = kmem_cache_alloc(s, GFP_KERNEL);
+
+	kasan_disable_current();
+
+	KUNIT_EXPECT_EQ(test, 0, slab_errors);
+
+	kasan_enable_current();
+	kfree_rcu(p, rcu);
+	kmem_cache_destroy(s);
+}
+
 static int test_init(struct kunit *test)
 {
 	slab_errors = 0;
@@ -177,6 +198,7 @@ static struct kunit_case test_cases[] = {
 
 	KUNIT_CASE(test_clobber_redzone_free),
 	KUNIT_CASE(test_kmalloc_redzone_access),
+	KUNIT_CASE(test_kfree_rcu),
 	{}
 };
 
diff --git a/mm/slab.h b/mm/slab.h
index b16e63191578..a0295600af92 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -277,6 +277,8 @@ struct kmem_cache {
 	unsigned int red_left_pad;	/* Left redzone padding size */
 	const char *name;		/* Name (only for display!) */
 	struct list_head list;		/* List of slab caches */
+	struct work_struct async_destroy_work;
+
 #ifdef CONFIG_SYSFS
 	struct kobject kobj;		/* For sysfs */
 #endif
@@ -474,7 +476,7 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s)
 			      SLAB_NO_USER_FLAGS)
 
 bool __kmem_cache_empty(struct kmem_cache *);
-int __kmem_cache_shutdown(struct kmem_cache *);
+int __kmem_cache_shutdown(struct kmem_cache *, bool);
 void __kmem_cache_release(struct kmem_cache *);
 int __kmem_cache_shrink(struct kmem_cache *);
 void slab_kmem_cache_release(struct kmem_cache *);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5b1f996bed06..c5c356d0235d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -44,6 +44,8 @@ static LIST_HEAD(slab_caches_to_rcu_destroy);
 static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
 static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
 		    slab_caches_to_rcu_destroy_workfn);
+static void kmem_cache_kfree_rcu_destroy_workfn(struct work_struct *work);
+
 
 /*
  * Set of flags that will prevent slab merging
@@ -234,6 +236,7 @@ static struct kmem_cache *create_cache(const char *name,
 
 	s->refcount = 1;
 	list_add(&s->list, &slab_caches);
+	INIT_WORK(&s->async_destroy_work, kmem_cache_kfree_rcu_destroy_workfn);
 	return s;
 
 out_free_cache:
@@ -449,12 +452,16 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
 	}
 }
 
-static int shutdown_cache(struct kmem_cache *s)
+static int shutdown_cache(struct kmem_cache *s, bool warn_inuse)
 {
 	/* free asan quarantined objects */
+	/*
+	 * XXX: is it ok to call this multiple times? and what happens with a
+	 * kfree_rcu() in flight that finishes after or in parallel with this?
+	 */
 	kasan_cache_shutdown(s);
 
-	if (__kmem_cache_shutdown(s) != 0)
+	if (__kmem_cache_shutdown(s, warn_inuse) != 0)
 		return -EBUSY;
 
 	list_del(&s->list);
@@ -477,6 +484,32 @@ void slab_kmem_cache_release(struct kmem_cache *s)
 	kmem_cache_free(kmem_cache, s);
 }
 
+static void kmem_cache_kfree_rcu_destroy_workfn(struct work_struct *work)
+{
+	struct kmem_cache *s;
+	int err = -EBUSY;
+	bool rcu_set;
+
+	s = container_of(work, struct kmem_cache, async_destroy_work);
+
+	// XXX use the real kmem_cache_free_barrier() or similar thing here
+	rcu_barrier();
+
+	cpus_read_lock();
+	mutex_lock(&slab_mutex);
+
+	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
+
+	err = shutdown_cache(s, true);
+	WARN(err, "kmem_cache_destroy %s: Slab cache still has objects",
+	     s->name);
+
+	mutex_unlock(&slab_mutex);
+	cpus_read_unlock();
+	if (!err && !rcu_set)
+		kmem_cache_release(s);
+}
+
 void kmem_cache_destroy(struct kmem_cache *s)
 {
 	int err = -EBUSY;
@@ -494,9 +527,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
 	if (s->refcount)
 		goto out_unlock;
 
-	err = shutdown_cache(s);
-	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
-	     __func__, s->name, (void *)_RET_IP_);
+	err = shutdown_cache(s, false);
+	if (err)
+		schedule_work(&s->async_destroy_work);
 out_unlock:
 	mutex_unlock(&slab_mutex);
 	cpus_read_unlock();
diff --git a/mm/slub.c b/mm/slub.c
index 1617d8014ecd..4d435b3d2b5f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5342,7 +5342,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
  * This is called from __kmem_cache_shutdown(). We must take list_lock
  * because sysfs file might still access partial list after the shutdowning.
  */
-static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
+static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n,
+			 bool warn_inuse)
 {
 	LIST_HEAD(discard);
 	struct slab *slab, *h;
@@ -5353,7 +5354,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 		if (!slab->inuse) {
 			remove_partial(n, slab);
 			list_add(&slab->slab_list, &discard);
-		} else {
+		} else if (warn_inuse) {
 			list_slab_objects(s, slab,
 			  "Objects remaining in %s on __kmem_cache_shutdown()");
 		}
@@ -5378,7 +5379,7 @@ bool __kmem_cache_empty(struct kmem_cache *s)
 /*
  * Release all resources used by a slab cache.
  */
-int __kmem_cache_shutdown(struct kmem_cache *s)
+int __kmem_cache_shutdown(struct kmem_cache *s, bool warn_inuse)
 {
 	int node;
 	struct kmem_cache_node *n;
@@ -5386,7 +5387,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 	flush_all_cpus_locked(s);
 	/* Attempt to free all objects */
 	for_each_kmem_cache_node(s, node, n) {
-		free_partial(s, n);
+		free_partial(s, n, warn_inuse);
 		if (n->nr_partial || node_nr_slabs(n))
 			return 1;
 	}


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 17:23                 ` Vlastimil Babka
@ 2024-06-17 18:42                   ` Uladzislau Rezki
  2024-06-17 21:08                     ` Vlastimil Babka
  2024-06-17 18:54                   ` Paul E. McKenney
  1 sibling, 1 reply; 50+ messages in thread
From: Uladzislau Rezki @ 2024-06-17 18:42 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: paulmck, Jason A. Donenfeld, Uladzislau Rezki (Sony),
	Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, kasan-dev

On Mon, Jun 17, 2024 at 07:23:36PM +0200, Vlastimil Babka wrote:
> On 6/17/24 6:12 PM, Paul E. McKenney wrote:
> > On Mon, Jun 17, 2024 at 05:10:50PM +0200, Vlastimil Babka wrote:
> >> On 6/13/24 2:22 PM, Jason A. Donenfeld wrote:
> >> > On Wed, Jun 12, 2024 at 08:38:02PM -0700, Paul E. McKenney wrote:
> >> >> o	Make the current kmem_cache_destroy() asynchronously wait for
> >> >> 	all memory to be returned, then complete the destruction.
> >> >> 	(This gets rid of a valuable debugging technique because
> >> >> 	in normal use, it is a bug to attempt to destroy a kmem_cache
> >> >> 	that has objects still allocated.)
> >> 
> >> This seems like the best option to me. As Jason already said, the debugging
> >> technique is not affected significantly, if the warning just occurs
> >> asynchronously later. The module can already be unloaded at that point, as
> >> the leak is never checked programmatically anyway to control further
> >> execution, it's just a splat in dmesg.
> > 
> > Works for me!
> 
> Great. So this is how a prototype could look, hopefully? The kunit test
> does generate the splat for me, which should be because the rcu_barrier() in
> the implementation (marked to be replaced with the real thing) is really
> insufficient. Note the test itself passes as this kind of error isn't wired
> up properly.
> 
> Another thing to resolve is the marked comment about kasan_shutdown() with
> potential kfree_rcu()'s in flight.
> 
> Also you need CONFIG_SLUB_DEBUG enabled otherwise node_nr_slabs() is a no-op
> and it might fail to notice the pending slabs. This will need to change.
> 
> ----8<----
> diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c
> index e6667a28c014..e3e4d0ca40b7 100644
> --- a/lib/slub_kunit.c
> +++ b/lib/slub_kunit.c
> @@ -5,6 +5,7 @@
>  #include <linux/slab.h>
>  #include <linux/module.h>
>  #include <linux/kernel.h>
> +#include <linux/rcupdate.h>
>  #include "../mm/slab.h"
>  
>  static struct kunit_resource resource;
> @@ -157,6 +158,26 @@ static void test_kmalloc_redzone_access(struct kunit *test)
>  	kmem_cache_destroy(s);
>  }
>  
> +struct test_kfree_rcu_struct {
> +	struct rcu_head rcu;
> +};
> +
> +static void test_kfree_rcu(struct kunit *test)
> +{
> +	struct kmem_cache *s = test_kmem_cache_create("TestSlub_kfree_rcu",
> +				sizeof(struct test_kfree_rcu_struct),
> +				SLAB_NO_MERGE);
> +	struct test_kfree_rcu_struct *p = kmem_cache_alloc(s, GFP_KERNEL);
> +
> +	kasan_disable_current();
> +
> +	KUNIT_EXPECT_EQ(test, 0, slab_errors);
> +
> +	kasan_enable_current();
> +	kfree_rcu(p, rcu);
> +	kmem_cache_destroy(s);
> +}
> +
>  static int test_init(struct kunit *test)
>  {
>  	slab_errors = 0;
> @@ -177,6 +198,7 @@ static struct kunit_case test_cases[] = {
>  
>  	KUNIT_CASE(test_clobber_redzone_free),
>  	KUNIT_CASE(test_kmalloc_redzone_access),
> +	KUNIT_CASE(test_kfree_rcu),
>  	{}
>  };
>  
> diff --git a/mm/slab.h b/mm/slab.h
> index b16e63191578..a0295600af92 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -277,6 +277,8 @@ struct kmem_cache {
>  	unsigned int red_left_pad;	/* Left redzone padding size */
>  	const char *name;		/* Name (only for display!) */
>  	struct list_head list;		/* List of slab caches */
> +	struct work_struct async_destroy_work;
> +
>  #ifdef CONFIG_SYSFS
>  	struct kobject kobj;		/* For sysfs */
>  #endif
> @@ -474,7 +476,7 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s)
>  			      SLAB_NO_USER_FLAGS)
>  
>  bool __kmem_cache_empty(struct kmem_cache *);
> -int __kmem_cache_shutdown(struct kmem_cache *);
> +int __kmem_cache_shutdown(struct kmem_cache *, bool);
>  void __kmem_cache_release(struct kmem_cache *);
>  int __kmem_cache_shrink(struct kmem_cache *);
>  void slab_kmem_cache_release(struct kmem_cache *);
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 5b1f996bed06..c5c356d0235d 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -44,6 +44,8 @@ static LIST_HEAD(slab_caches_to_rcu_destroy);
>  static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
>  static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
>  		    slab_caches_to_rcu_destroy_workfn);
> +static void kmem_cache_kfree_rcu_destroy_workfn(struct work_struct *work);
> +
>  
>  /*
>   * Set of flags that will prevent slab merging
> @@ -234,6 +236,7 @@ static struct kmem_cache *create_cache(const char *name,
>  
>  	s->refcount = 1;
>  	list_add(&s->list, &slab_caches);
> +	INIT_WORK(&s->async_destroy_work, kmem_cache_kfree_rcu_destroy_workfn);
>  	return s;
>  
>  out_free_cache:
> @@ -449,12 +452,16 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
>  	}
>  }
>  
> -static int shutdown_cache(struct kmem_cache *s)
> +static int shutdown_cache(struct kmem_cache *s, bool warn_inuse)
>  {
>  	/* free asan quarantined objects */
> +	/*
> +	 * XXX: is it ok to call this multiple times? and what happens with a
> +	 * kfree_rcu() in flight that finishes after or in parallel with this?
> +	 */
>  	kasan_cache_shutdown(s);
>  
> -	if (__kmem_cache_shutdown(s) != 0)
> +	if (__kmem_cache_shutdown(s, warn_inuse) != 0)
>  		return -EBUSY;
>  
>  	list_del(&s->list);
> @@ -477,6 +484,32 @@ void slab_kmem_cache_release(struct kmem_cache *s)
>  	kmem_cache_free(kmem_cache, s);
>  }
>  
> +static void kmem_cache_kfree_rcu_destroy_workfn(struct work_struct *work)
> +{
> +	struct kmem_cache *s;
> +	int err = -EBUSY;
> +	bool rcu_set;
> +
> +	s = container_of(work, struct kmem_cache, async_destroy_work);
> +
> +	// XXX use the real kmem_cache_free_barrier() or similar thing here
It implies that we need to introduce kfree_rcu_barrier(), a new API, which I
wanted to avoid initially. Since you do it asynchronously, can we just repeat
and wait until the cache is fully freed?

I am asking because inventing a new kfree_rcu_barrier() might not be so
straightforward.
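
For illustration, the repeat-and-wait idea might look like this (a
sketch assuming the prototype's async_destroy_work were turned into a
delayed_work; the final shutdown and locking are elided):

<snip>
static void kmem_cache_async_destroy_workfn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct kmem_cache *s =
		container_of(dwork, struct kmem_cache, async_destroy_work);

	if (!__kmem_cache_empty(s)) {
		/* kfree_rcu() frees still in flight: check again later. */
		schedule_delayed_work(&s->async_destroy_work, HZ);
		return;
	}

	/* The cache is empty now: proceed with shutdown_cache() as above. */
}
<snip>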

--
Uladzislau Rezki

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 17:23                 ` Vlastimil Babka
  2024-06-17 18:42                   ` Uladzislau Rezki
@ 2024-06-17 18:54                   ` Paul E. McKenney
  2024-06-17 21:34                     ` Vlastimil Babka
  1 sibling, 1 reply; 50+ messages in thread
From: Paul E. McKenney @ 2024-06-17 18:54 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Jason A. Donenfeld, Uladzislau Rezki (Sony),
	Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, kasan-dev

On Mon, Jun 17, 2024 at 07:23:36PM +0200, Vlastimil Babka wrote:
> On 6/17/24 6:12 PM, Paul E. McKenney wrote:
> > On Mon, Jun 17, 2024 at 05:10:50PM +0200, Vlastimil Babka wrote:
> >> On 6/13/24 2:22 PM, Jason A. Donenfeld wrote:
> >> > On Wed, Jun 12, 2024 at 08:38:02PM -0700, Paul E. McKenney wrote:
> >> >> o	Make the current kmem_cache_destroy() asynchronously wait for
> >> >> 	all memory to be returned, then complete the destruction.
> >> >> 	(This gets rid of a valuable debugging technique because
> >> >> 	in normal use, it is a bug to attempt to destroy a kmem_cache
> >> >> 	that has objects still allocated.)
> >> 
> >> This seems like the best option to me. As Jason already said, the debugging
> >> technique is not affected significantly, if the warning just occurs
> >> asynchronously later. The module can already be unloaded at that point, as
> >> the leak is never checked programmatically anyway to control further
> >> execution; it's just a splat in dmesg.
> > 
> > Works for me!
> 
> Great. So this is what a prototype could look like, hopefully? The kunit test
> does generate the splat for me, which should be because the rcu_barrier() in
> the implementation (marked to be replaced with the real thing) is really
> insufficient. Note the test itself passes as this kind of error isn't wired
> up properly.

;-) ;-) ;-)

Some might want confirmation that their cleanup efforts succeeded,
but if so, I will let them make that known.

> Another thing to resolve is the marked comment about kasan_cache_shutdown() with
> potential kfree_rcu()'s in flight.

Could that simply move to the worker function?  (Hey, had to ask!)

> Also you need CONFIG_SLUB_DEBUG enabled, otherwise node_nr_slabs() is a no-op
> and it might fail to notice the pending slabs. This will need to change.

Agreed.

Looks generally good.  A few questions below, to be taken with a
grain of salt.

							Thanx, Paul

> ----8<----
> diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c
> index e6667a28c014..e3e4d0ca40b7 100644
> --- a/lib/slub_kunit.c
> +++ b/lib/slub_kunit.c
> @@ -5,6 +5,7 @@
>  #include <linux/slab.h>
>  #include <linux/module.h>
>  #include <linux/kernel.h>
> +#include <linux/rcupdate.h>
>  #include "../mm/slab.h"
>  
>  static struct kunit_resource resource;
> @@ -157,6 +158,26 @@ static void test_kmalloc_redzone_access(struct kunit *test)
>  	kmem_cache_destroy(s);
>  }
>  
> +struct test_kfree_rcu_struct {
> +	struct rcu_head rcu;
> +};
> +
> +static void test_kfree_rcu(struct kunit *test)
> +{
> +	struct kmem_cache *s = test_kmem_cache_create("TestSlub_kfree_rcu",
> +				sizeof(struct test_kfree_rcu_struct),
> +				SLAB_NO_MERGE);
> +	struct test_kfree_rcu_struct *p = kmem_cache_alloc(s, GFP_KERNEL);
> +
> +	kasan_disable_current();
> +
> +	KUNIT_EXPECT_EQ(test, 0, slab_errors);
> +
> +	kasan_enable_current();
> +	kfree_rcu(p, rcu);
> +	kmem_cache_destroy(s);

Looks like the right type of test for this!

> +}
> +
>  static int test_init(struct kunit *test)
>  {
>  	slab_errors = 0;
> @@ -177,6 +198,7 @@ static struct kunit_case test_cases[] = {
>  
>  	KUNIT_CASE(test_clobber_redzone_free),
>  	KUNIT_CASE(test_kmalloc_redzone_access),
> +	KUNIT_CASE(test_kfree_rcu),
>  	{}
>  };
>  
> diff --git a/mm/slab.h b/mm/slab.h
> index b16e63191578..a0295600af92 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -277,6 +277,8 @@ struct kmem_cache {
>  	unsigned int red_left_pad;	/* Left redzone padding size */
>  	const char *name;		/* Name (only for display!) */
>  	struct list_head list;		/* List of slab caches */
> +	struct work_struct async_destroy_work;
> +
>  #ifdef CONFIG_SYSFS
>  	struct kobject kobj;		/* For sysfs */
>  #endif
> @@ -474,7 +476,7 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s)
>  			      SLAB_NO_USER_FLAGS)
>  
>  bool __kmem_cache_empty(struct kmem_cache *);
> -int __kmem_cache_shutdown(struct kmem_cache *);
> +int __kmem_cache_shutdown(struct kmem_cache *, bool);
>  void __kmem_cache_release(struct kmem_cache *);
>  int __kmem_cache_shrink(struct kmem_cache *);
>  void slab_kmem_cache_release(struct kmem_cache *);
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 5b1f996bed06..c5c356d0235d 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -44,6 +44,8 @@ static LIST_HEAD(slab_caches_to_rcu_destroy);
>  static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
>  static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
>  		    slab_caches_to_rcu_destroy_workfn);
> +static void kmem_cache_kfree_rcu_destroy_workfn(struct work_struct *work);
> +
>  
>  /*
>   * Set of flags that will prevent slab merging
> @@ -234,6 +236,7 @@ static struct kmem_cache *create_cache(const char *name,
>  
>  	s->refcount = 1;
>  	list_add(&s->list, &slab_caches);
> +	INIT_WORK(&s->async_destroy_work, kmem_cache_kfree_rcu_destroy_workfn);
>  	return s;
>  
>  out_free_cache:
> @@ -449,12 +452,16 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
>  	}
>  }
>  
> -static int shutdown_cache(struct kmem_cache *s)
> +static int shutdown_cache(struct kmem_cache *s, bool warn_inuse)
>  {
>  	/* free asan quarantined objects */
> +	/*
> +	 * XXX: is it ok to call this multiple times? and what happens with a
> +	 * kfree_rcu() in flight that finishes after or in parallel with this?
> +	 */
>  	kasan_cache_shutdown(s);
>  
> -	if (__kmem_cache_shutdown(s) != 0)
> +	if (__kmem_cache_shutdown(s, warn_inuse) != 0)
>  		return -EBUSY;
>  
>  	list_del(&s->list);
> @@ -477,6 +484,32 @@ void slab_kmem_cache_release(struct kmem_cache *s)
>  	kmem_cache_free(kmem_cache, s);
>  }
>  
> +static void kmem_cache_kfree_rcu_destroy_workfn(struct work_struct *work)
> +{
> +	struct kmem_cache *s;
> +	int err = -EBUSY;
> +	bool rcu_set;
> +
> +	s = container_of(work, struct kmem_cache, async_destroy_work);
> +
> +	// XXX use the real kmem_cache_free_barrier() or similar thing here
> +	rcu_barrier();
> +
> +	cpus_read_lock();
> +	mutex_lock(&slab_mutex);
> +
> +	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
> +
> +	err = shutdown_cache(s, true);

This is currently the only call to shutdown_cache()?  So is there to be
a way for the caller to have some influence over the value of that bool?

> +	WARN(err, "kmem_cache_destroy %s: Slab cache still has objects",
> +	     s->name);

Don't we want to have some sort of delay here?  Or is this the
21-second delay and/or kfree_rcu_barrier() mentioned before?

> +	mutex_unlock(&slab_mutex);
> +	cpus_read_unlock();
> +	if (!err && !rcu_set)
> +		kmem_cache_release(s);
> +}
> +
>  void kmem_cache_destroy(struct kmem_cache *s)
>  {
>  	int err = -EBUSY;
> @@ -494,9 +527,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
>  	if (s->refcount)
>  		goto out_unlock;
>  
> -	err = shutdown_cache(s);
> -	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
> -	     __func__, s->name, (void *)_RET_IP_);
> +	err = shutdown_cache(s, false);
> +	if (err)
> +		schedule_work(&s->async_destroy_work);
>  out_unlock:
>  	mutex_unlock(&slab_mutex);
>  	cpus_read_unlock();
> diff --git a/mm/slub.c b/mm/slub.c
> index 1617d8014ecd..4d435b3d2b5f 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -5342,7 +5342,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
>   * This is called from __kmem_cache_shutdown(). We must take list_lock
>   * because sysfs file might still access partial list after the shutdowning.
>   */
> -static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
> +static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n,
> +			 bool warn_inuse)
>  {
>  	LIST_HEAD(discard);
>  	struct slab *slab, *h;
> @@ -5353,7 +5354,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
>  		if (!slab->inuse) {
>  			remove_partial(n, slab);
>  			list_add(&slab->slab_list, &discard);
> -		} else {
> +		} else if (warn_inuse) {
>  			list_slab_objects(s, slab,
>  			  "Objects remaining in %s on __kmem_cache_shutdown()");
>  		}
> @@ -5378,7 +5379,7 @@ bool __kmem_cache_empty(struct kmem_cache *s)
>  /*
>   * Release all resources used by a slab cache.
>   */
> -int __kmem_cache_shutdown(struct kmem_cache *s)
> +int __kmem_cache_shutdown(struct kmem_cache *s, bool warn_inuse)
>  {
>  	int node;
>  	struct kmem_cache_node *n;
> @@ -5386,7 +5387,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
>  	flush_all_cpus_locked(s);
>  	/* Attempt to free all objects */
>  	for_each_kmem_cache_node(s, node, n) {
> -		free_partial(s, n);
> +		free_partial(s, n, warn_inuse);
>  		if (n->nr_partial || node_nr_slabs(n))
>  			return 1;
>  	}
> 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 18:42                   ` Uladzislau Rezki
@ 2024-06-17 21:08                     ` Vlastimil Babka
  2024-06-18  9:31                       ` Uladzislau Rezki
  0 siblings, 1 reply; 50+ messages in thread
From: Vlastimil Babka @ 2024-06-17 21:08 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: paulmck, Jason A. Donenfeld, Jakub Kicinski, Julia Lawall,
	linux-block, kernel-janitors, bridge, linux-trace-kernel,
	Mathieu Desnoyers, kvm, linuxppc-dev, Naveen N. Rao,
	Christophe Leroy, Nicholas Piggin, netdev, wireguard,
	linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia, Dai Ngo,
	Tom Talpey, linux-nfs, linux-can, Lai Jiangshan, netfilter-devel,
	coreteam, kasan-dev

On 6/17/24 8:42 PM, Uladzislau Rezki wrote:
>> +
>> +	s = container_of(work, struct kmem_cache, async_destroy_work);
>> +
>> +	// XXX use the real kmem_cache_free_barrier() or similar thing here
> > It implies that we need to introduce kfree_rcu_barrier(), a new API, which I
> wanted to avoid initially.

I wanted to avoid new API or flags for kfree_rcu() users, and this would
be achieved. The barrier is used internally, so I don't consider that an
API to avoid. How difficult the implementation is, is another question,
depending on how the current batching works. Once (if) we have sheaves
proven to work and move kfree_rcu() fully into SLUB, the barrier might
also look different and hopefully be easier. So maybe it's not worth
investing too much into that barrier, and we should just go for the
potentially longer, but easier to implement, approach?
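
(To make the difficulty concrete: kfree_rcu() batches objects per CPU
and flushes the batches from deferred work, so a barrier would roughly
have to do the following. This is a hand-wavy sketch, not a proposal;
kfree_rcu_flush_cpu() is purely hypothetical, only rcu_barrier()
exists today.)

void kfree_rcu_barrier(void)
{
	int cpu;

	/* hand each CPU's deferred kfree_rcu() batch to RCU right now */
	for_each_possible_cpu(cpu)
		kfree_rcu_flush_cpu(cpu);	/* hypothetical helper */

	/* then wait for the resulting RCU callbacks to run */
	rcu_barrier();
}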

> Since you do it asynchronously, can we just repeat
> and wait until the cache is fully freed?

The problem is we want to detect the cases when it's not fully freed
because there was an actual leak. So at some point we'd need to stop the
repeats because we know there can no longer be any kfree_rcu()'s in
flight since the kmem_cache_destroy() was called.
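
To make that concrete, a rough sketch of the bounded-repeat variant -
untested, destroy_retries is a hypothetical field, async_destroy_work
is assumed converted to a delayed_work, and the locking done in the
prototype's worker is omitted for brevity:

static void kmem_cache_async_destroy_workfn(struct work_struct *work)
{
	struct kmem_cache *s = container_of(to_delayed_work(work),
					    struct kmem_cache,
					    async_destroy_work);
	int err;

	if (shutdown_cache(s, false) == 0)
		return;				/* fully drained, done */

	if (++s->destroy_retries < 10) {	/* arbitrary bound */
		schedule_delayed_work(&s->async_destroy_work, HZ);
		return;
	}

	/* bound expired: anything left now is an actual leak */
	err = shutdown_cache(s, true);
	WARN(err, "kmem_cache_destroy %s: Slab cache still has objects",
	     s->name);
}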

> I am asking because inventing a new kfree_rcu_barrier() might not be so
> straightforward.

Agreed.

> 
> --
> Uladzislau Rezki

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 17:04                                   ` Jason A. Donenfeld
@ 2024-06-17 21:19                                     ` Vlastimil Babka
  0 siblings, 0 replies; 50+ messages in thread
From: Vlastimil Babka @ 2024-06-17 21:19 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Uladzislau Rezki, Paul E. McKenney, Jakub Kicinski, Julia Lawall,
	linux-block, kernel-janitors, bridge, linux-trace-kernel,
	Mathieu Desnoyers, kvm, linuxppc-dev, Naveen N. Rao,
	Christophe Leroy, Nicholas Piggin, netdev, wireguard,
	linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia, Dai Ngo,
	Tom Talpey, linux-nfs, linux-can, Lai Jiangshan, netfilter-devel,
	coreteam

On 6/17/24 7:04 PM, Jason A. Donenfeld wrote:
>>> Vlastimil, this is just checking a boolean (which could be
>>> unlikely()'d), which should have pretty minimal overhead. Is that
>>> alright with you?
>>
>> Well I doubt we can just set and check it without any barriers? The
>> completion of the last pending kfree_rcu() might race with
>> kmem_cache_destroy() in a way that will leave the cache there forever, no?
>> And once we add barriers it becomes a perf issue?
> 
> Hm, yea you might be right about barriers being required. But actually,
> might this point toward a larger problem with no matter what approach,
> polling or event, is chosen? If the current rule is that
> kmem_cache_free() must never race with kmem_cache_destroy(), because

Yes, calling alloc/free operations that race with destroy is a bug and we
can't prevent that.

> users have always made diligent use of call_rcu()/rcu_barrier() and

But the issue we are solving here is a bit different - the users are not
buggy: they do kfree_rcu() and then kmem_cache_destroy(), and no more
operations on the cache afterwards. We need to ensure that the handling
of kfree_rcu() (which ultimately is basically kmem_cache_free() but
internally to rcu/slub) doesn't race with kmem_cache_destroy().
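
I.e. the pattern in question is just (illustrative type and names only):

	struct obj {
		struct rcu_head rcu;
	};

	struct obj *p = kmem_cache_alloc(cache, GFP_KERNEL);

	kfree_rcu(p, rcu);		/* deferred kmem_cache_free() */
	kmem_cache_destroy(cache);	/* may run before that free lands */

and it's that internal deferred free, not anything the caller does,
that can still be in flight when the destroy happens.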

> such, but now we're going to let those race with each other - either by
> my thing above or by polling - so we're potentially going to get in trouble
> and need some barriers anyway. 

The barrier in the async part of kmem_cache_destroy() should be enough
to make sure all kfree_rcu() have finished before we proceed with the
potentially racy parts of destroying, and we should be able to avoid
changes in kmem_cache_free().
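
For comparison, a sketch of the fast-path scheme this avoids - in_flight
and dying are hypothetical per-cache fields; this is the classic
store-buffering pattern, so both sides need a full barrier or each can
miss the other's store and the cache leaks forever:

	/* kfree_rcu() completion side - runs for every freed object */
	atomic_dec(&s->in_flight);
	smp_mb();	/* order the dec before the dying load */
	if (READ_ONCE(s->dying) && !atomic_read(&s->in_flight))
		schedule_work(&s->async_destroy_work);

	/* kmem_cache_destroy() side */
	WRITE_ONCE(s->dying, true);
	smp_mb();	/* order the dying store before the in_flight load */
	if (!atomic_read(&s->in_flight))
		schedule_work(&s->async_destroy_work);

(schedule_work() on an already-pending item is a no-op, so a racing
double schedule is mostly harmless; the smp_mb() on the free side is
exactly the cost being objected to.)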

> I think?
> 
> Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 18:54                   ` Paul E. McKenney
@ 2024-06-17 21:34                     ` Vlastimil Babka
  0 siblings, 0 replies; 50+ messages in thread
From: Vlastimil Babka @ 2024-06-17 21:34 UTC (permalink / raw)
  To: paulmck
  Cc: Jason A. Donenfeld, Uladzislau Rezki (Sony),
	Jakub Kicinski, Julia Lawall, linux-block, kernel-janitors,
	bridge, linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, kasan-dev

On 6/17/24 8:54 PM, Paul E. McKenney wrote:
> On Mon, Jun 17, 2024 at 07:23:36PM +0200, Vlastimil Babka wrote:
>> On 6/17/24 6:12 PM, Paul E. McKenney wrote:
>>> On Mon, Jun 17, 2024 at 05:10:50PM +0200, Vlastimil Babka wrote:
>>>> On 6/13/24 2:22 PM, Jason A. Donenfeld wrote:
>>>>> On Wed, Jun 12, 2024 at 08:38:02PM -0700, Paul E. McKenney wrote:
>>>>>> o	Make the current kmem_cache_destroy() asynchronously wait for
>>>>>> 	all memory to be returned, then complete the destruction.
>>>>>> 	(This gets rid of a valuable debugging technique because
>>>>>> 	in normal use, it is a bug to attempt to destroy a kmem_cache
>>>>>> 	that has objects still allocated.)
>>>>
>>>> This seems like the best option to me. As Jason already said, the debugging
>>>> technique is not affected significantly, if the warning just occurs
>>>> asynchronously later. The module can already be unloaded at that point, as
>>>> the leak is never checked programmatically anyway to control further
>>>> execution; it's just a splat in dmesg.
>>>
>>> Works for me!
>>
>> Great. So this is what a prototype could look like, hopefully? The kunit test
>> does generate the splat for me, which should be because the rcu_barrier() in
>> the implementation (marked to be replaced with the real thing) is really
>> insufficient. Note the test itself passes as this kind of error isn't wired
>> up properly.
> 
> ;-) ;-) ;-)

Yeah yeah, I just used the kunit module as a convenient way to add the code
that should see if there's the splat :)

> Some might want confirmation that their cleanup efforts succeeded,
> but if so, I will let them make that known.

It could be just the kunit test that would want that, but I don't see
how it could wrap and inspect the result of the async handling and
suppress the splats for intentionally triggered errors as many of the
other tests do.

>> Another thing to resolve is the marked comment about kasan_cache_shutdown() with
>> potential kfree_rcu()'s in flight.
> 
> Could that simply move to the worker function?  (Hey, had to ask!)

I think I had a reason why not, but I guess it could move. It would just
mean that if any objects are quarantined, we'll go for the async freeing
even though those could be flushed immediately. Guess that's not too bad.
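
Concretely that would be (untested, relative to the prototype above):

	/* in kmem_cache_kfree_rcu_destroy_workfn(), after the barrier: */
	rcu_barrier();			/* or the future kfree_rcu_barrier() */
	kasan_cache_shutdown(s);	/* moved here from shutdown_cache() */

shutdown_cache() itself would then no longer flush the quarantine, so
the XXX about calling it multiple times goes away; the cost is that any
quarantined objects push us onto the async path, as said above.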

>> Also you need CONFIG_SLUB_DEBUG enabled, otherwise node_nr_slabs() is a no-op
>> and it might fail to notice the pending slabs. This will need to change.
> 
> Agreed.
> 
> Looks generally good.  A few questions below, to be taken with a
> grain of salt.

Thanks!

>> +static void kmem_cache_kfree_rcu_destroy_workfn(struct work_struct *work)
>> +{
>> +	struct kmem_cache *s;
>> +	int err = -EBUSY;
>> +	bool rcu_set;
>> +
>> +	s = container_of(work, struct kmem_cache, async_destroy_work);
>> +
>> +	// XXX use the real kmem_cache_free_barrier() or similar thing here
>> +	rcu_barrier();

Note here's the barrier.

>> +	cpus_read_lock();
>> +	mutex_lock(&slab_mutex);
>> +
>> +	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
>> +
>> +	err = shutdown_cache(s, true);
> 
> This is currently the only call to shutdown_cache()?  So is there to be
> a way for the caller to have some influence over the value of that bool?

Not the only caller, there's still the initial attempt in
kmem_cache_destroy() itself below.

> 
>> +	WARN(err, "kmem_cache_destroy %s: Slab cache still has objects",
>> +	     s->name);
> 
> Don't we want to have some sort of delay here?  Or is this the
> 21-second delay and/or kfree_rcu_barrier() mentioned before?

Yes, this is after the barrier. The first, immediate attempt to shut down
doesn't warn.

>> +	mutex_unlock(&slab_mutex);
>> +	cpus_read_unlock();
>> +	if (!err && !rcu_set)
>> +		kmem_cache_release(s);
>> +}
>> +
>>  void kmem_cache_destroy(struct kmem_cache *s)
>>  {
>>  	int err = -EBUSY;
>> @@ -494,9 +527,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
>>  	if (s->refcount)
>>  		goto out_unlock;
>>  
>> -	err = shutdown_cache(s);
>> -	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
>> -	     __func__, s->name, (void *)_RET_IP_);
>> +	err = shutdown_cache(s, false);
>> +	if (err)
>> +		schedule_work(&s->async_destroy_work);

And here's the initial attempt that used to warn but now doesn't and
instead schedules the async one.

>>  out_unlock:
>>  	mutex_unlock(&slab_mutex);
>>  	cpus_read_unlock();
>> diff --git a/mm/slub.c b/mm/slub.c
>> index 1617d8014ecd..4d435b3d2b5f 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -5342,7 +5342,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
>>   * This is called from __kmem_cache_shutdown(). We must take list_lock
>>   * because sysfs file might still access partial list after the shutdowning.
>>   */
>> -static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
>> +static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n,
>> +			 bool warn_inuse)
>>  {
>>  	LIST_HEAD(discard);
>>  	struct slab *slab, *h;
>> @@ -5353,7 +5354,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
>>  		if (!slab->inuse) {
>>  			remove_partial(n, slab);
>>  			list_add(&slab->slab_list, &discard);
>> -		} else {
>> +		} else if (warn_inuse) {
>>  			list_slab_objects(s, slab,
>>  			  "Objects remaining in %s on __kmem_cache_shutdown()");
>>  		}
>> @@ -5378,7 +5379,7 @@ bool __kmem_cache_empty(struct kmem_cache *s)
>>  /*
>>   * Release all resources used by a slab cache.
>>   */
>> -int __kmem_cache_shutdown(struct kmem_cache *s)
>> +int __kmem_cache_shutdown(struct kmem_cache *s, bool warn_inuse)
>>  {
>>  	int node;
>>  	struct kmem_cache_node *n;
>> @@ -5386,7 +5387,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
>>  	flush_all_cpus_locked(s);
>>  	/* Attempt to free all objects */
>>  	for_each_kmem_cache_node(s, node, n) {
>> -		free_partial(s, n);
>> +		free_partial(s, n, warn_inuse);
>>  		if (n->nr_partial || node_nr_slabs(n))
>>  			return 1;
>>  	}
>>

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback
  2024-06-17 21:08                     ` Vlastimil Babka
@ 2024-06-18  9:31                       ` Uladzislau Rezki
  0 siblings, 0 replies; 50+ messages in thread
From: Uladzislau Rezki @ 2024-06-18  9:31 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Uladzislau Rezki, paulmck, Jason A. Donenfeld, Jakub Kicinski,
	Julia Lawall, linux-block, kernel-janitors, bridge,
	linux-trace-kernel, Mathieu Desnoyers, kvm, linuxppc-dev,
	Naveen N. Rao, Christophe Leroy, Nicholas Piggin, netdev,
	wireguard, linux-kernel, ecryptfs, Neil Brown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, linux-nfs, linux-can, Lai Jiangshan,
	netfilter-devel, coreteam, kasan-dev

> On 6/17/24 8:42 PM, Uladzislau Rezki wrote:
> >> +
> >> +	s = container_of(work, struct kmem_cache, async_destroy_work);
> >> +
> >> +	// XXX use the real kmem_cache_free_barrier() or similar thing here
> > It implies that we need to introduce kfree_rcu_barrier(), a new API, which I
> > wanted to avoid initially.
> 
> I wanted to avoid new API or flags for kfree_rcu() users and this would
> be achieved. The barrier is used internally so I don't consider that an
> API to avoid. How difficult is the implementation is another question,
> depending on how the current batching works. Once (if) we have sheaves
> proven to work and move kfree_rcu() fully into SLUB, the barrier might
> also look different and hopefully easier. So maybe it's not worth to
> invest too much into that barrier and just go for the potentially
> longer, but easier to implement?
> 
Right, I agree here. If the cache is not empty, OK, we just defer the
work; we can even use a big 21-second delay. After that we just "warn"
if it is still not empty and leave it as it is, i.e. emit a warning and
we are done.

Destroying the cache is not something that must happen right away. 

> > Since you do it asynchronously, can we just repeat
> > and wait until the cache is fully freed?
> 
> The problem is we want to detect the cases when it's not fully freed
> because there was an actual leak. So at some point we'd need to stop the
> repeats because we know there can no longer be any kfree_rcu()'s in
> flight since the kmem_cache_destroy() was called.
> 
Agree. As noted above, we can go with a 21-second interval (as an example)
and just perform the destroy (without repeating).
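
Something like this, then, on the kmem_cache_destroy() side (again
assuming async_destroy_work were converted to a delayed_work; 21 * HZ
is just the example figure, not a derived value):

	err = shutdown_cache(s, false);
	if (err)
		/* single deferred attempt, no repeating */
		schedule_delayed_work(&s->async_destroy_work, 21 * HZ);

with the worker then doing the final shutdown_cache(s, true) and
warning if objects still remain.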

--
Uladzislau Rezki

^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread, other threads:[~2024-06-18  9:33 UTC | newest]

Thread overview: 50+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-09  8:27 [PATCH 00/14] replace call_rcu by kfree_rcu for simple kmem_cache_free callback Julia Lawall
2024-06-09  8:27 ` [PATCH 01/14] wireguard: allowedips: " Julia Lawall
2024-06-09 14:32   ` Jason A. Donenfeld
2024-06-09 14:36     ` Julia Lawall
2024-06-10 20:38     ` Vlastimil Babka
2024-06-10 20:59       ` Jason A. Donenfeld
2024-06-12 21:33 ` [PATCH 00/14] " Jakub Kicinski
2024-06-12 22:37   ` Paul E. McKenney
2024-06-12 22:46     ` Jakub Kicinski
     [not found]     ` <7e58e73d-4173-49fe-8f05-38a3699bc2c1@kernel.dk>
2024-06-12 23:04       ` Paul E. McKenney
2024-06-12 23:31     ` Jason A. Donenfeld
2024-06-13  0:31       ` Jason A. Donenfeld
2024-06-13  3:38         ` Paul E. McKenney
2024-06-13 12:22           ` Jason A. Donenfeld
2024-06-13 12:46             ` Paul E. McKenney
2024-06-13 14:11               ` Jason A. Donenfeld
2024-06-13 15:12                 ` Paul E. McKenney
2024-06-17 15:10             ` Vlastimil Babka
2024-06-17 16:12               ` Paul E. McKenney
2024-06-17 17:23                 ` Vlastimil Babka
2024-06-17 18:42                   ` Uladzislau Rezki
2024-06-17 21:08                     ` Vlastimil Babka
2024-06-18  9:31                       ` Uladzislau Rezki
2024-06-17 18:54                   ` Paul E. McKenney
2024-06-17 21:34                     ` Vlastimil Babka
2024-06-13 14:17           ` Jakub Kicinski
2024-06-13 14:53             ` Paul E. McKenney
2024-06-13 11:58     ` Jason A. Donenfeld
2024-06-13 12:47       ` Paul E. McKenney
2024-06-13 13:06         ` Uladzislau Rezki
2024-06-13 15:06           ` Paul E. McKenney
2024-06-13 17:38             ` Uladzislau Rezki
2024-06-13 17:45               ` Paul E. McKenney
2024-06-13 17:58                 ` Uladzislau Rezki
2024-06-13 18:13                   ` Paul E. McKenney
2024-06-14 12:35                     ` Uladzislau Rezki
2024-06-14 14:17                       ` Paul E. McKenney
2024-06-14 14:50                         ` Uladzislau Rezki
2024-06-14 19:33                       ` Jason A. Donenfeld
2024-06-17 13:50                         ` Uladzislau Rezki
2024-06-17 14:56                           ` Jason A. Donenfeld
2024-06-17 16:30                             ` Uladzislau Rezki
2024-06-17 16:33                               ` Jason A. Donenfeld
2024-06-17 16:38                                 ` Vlastimil Babka
2024-06-17 17:04                                   ` Jason A. Donenfeld
2024-06-17 21:19                                     ` Vlastimil Babka
2024-06-17 16:42                                 ` Uladzislau Rezki
2024-06-17 16:57                                   ` Jason A. Donenfeld
2024-06-17 17:19                                     ` Uladzislau Rezki
2024-06-17 14:37                         ` Vlastimil Babka
