Commit 3252197

mm: memcg: move soft limit reclaim code to memcontrol-v1.c
JIRA: https://issues.redhat.com/browse/RHEL-80382

Summary of changes:
- The upstream patch didn't apply directly due to a conflict with the
  explicit linux/zswap.h include introduced with commit 94b1bd8
  ("zswap: make shrinking memcg-aware").

commit d12f6d2
Author: Roman Gushchin <roman.gushchin@linux.dev>
Date:   Mon Jun 24 17:58:54 2024 -0700

    mm: memcg: move soft limit reclaim code to memcontrol-v1.c

    Soft limits are cgroup v1-specific and are not supported by cgroup v2,
    so let's move the corresponding code into memcontrol-v1.c.

    Aside from simply moving the code, this commit introduces a trivial
    memcg1_soft_limit_reset() function to reset soft limits and also moves
    the global soft limit tree initialization code into a new memcg1_init()
    function. It also moves the corresponding declarations shared between
    memcontrol.c and memcontrol-v1.c into mm/memcontrol-v1.h.

    Link: https://lkml.kernel.org/r/20240625005906.106920-3-roman.gushchin@linux.dev
    Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
    Acked-by: Michal Hocko <mhocko@suse.com>
    Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
    Cc: Johannes Weiner <hannes@cmpxchg.org>
    Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
    Cc: Muchun Song <muchun.song@linux.dev>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Signed-off-by: Radostin Stoyanov <rstoyano@redhat.com>
1 parent f4132cf commit 3252197
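
The memcontrol.c hunks are not shown on this page, but the point of the new helper is to replace the open-coded soft limit resets there. A minimal sketch of such a call site before and after the change (the surrounding context is assumed, not copied from this commit):

	/* Sketch only: a soft-limit reset site in mm/memcontrol.c. */

	/* before */
	WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);

	/* after */
	memcg1_soft_limit_reset(memcg);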

3 files changed: +353 -333 lines changed

mm/memcontrol-v1.c

Lines changed: 342 additions & 0 deletions
@@ -1,3 +1,345 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include <linux/memcontrol.h>
+#include <linux/swap.h>
+#include <linux/mm_inline.h>
+
 #include "memcontrol-v1.h"
+
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_node {
+	struct rb_root rb_root;
+	struct rb_node *rb_rightmost;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_tree {
+	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
+/*
+ * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
+ * limit reclaim to prevent infinite loops, if they ever occur.
+ */
+#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
+
+static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
+					 struct mem_cgroup_tree_per_node *mctz,
+					 unsigned long new_usage_in_excess)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_node *mz_node;
+	bool rightmost = true;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = new_usage_in_excess;
+	if (!mz->usage_in_excess)
+		return;
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
+					tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess) {
+			p = &(*p)->rb_left;
+			rightmost = false;
+		} else {
+			p = &(*p)->rb_right;
+		}
+	}
+
+	if (rightmost)
+		mctz->rb_rightmost = &mz->tree_node;
+
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+}
+
+static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
+					 struct mem_cgroup_tree_per_node *mctz)
+{
+	if (!mz->on_tree)
+		return;
+
+	if (&mz->tree_node == mctz->rb_rightmost)
+		mctz->rb_rightmost = rb_prev(&mz->tree_node);
+
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
+				       struct mem_cgroup_tree_per_node *mctz)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&mctz->lock, flags);
+	__mem_cgroup_remove_exceeded(mz, mctz);
+	spin_unlock_irqrestore(&mctz->lock, flags);
+}
+
+static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
+{
+	unsigned long nr_pages = page_counter_read(&memcg->memory);
+	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
+	unsigned long excess = 0;
+
+	if (nr_pages > soft_limit)
+		excess = nr_pages - soft_limit;
+
+	return excess;
+}
+
+void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
+{
+	unsigned long excess;
+	struct mem_cgroup_per_node *mz;
+	struct mem_cgroup_tree_per_node *mctz;
+
+	if (lru_gen_enabled()) {
+		if (soft_limit_excess(memcg))
+			lru_gen_soft_reclaim(memcg, nid);
+		return;
+	}
+
+	mctz = soft_limit_tree.rb_tree_per_node[nid];
+	if (!mctz)
+		return;
+	/*
+	 * Necessary to update all ancestors when hierarchy is used,
+	 * because their event counter is not touched.
+	 */
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		mz = memcg->nodeinfo[nid];
+		excess = soft_limit_excess(memcg);
+		/*
+		 * We have to update the tree if mz is on the RB-tree or
+		 * the memcg is over its soft limit.
+		 */
+		if (excess || mz->on_tree) {
+			unsigned long flags;
+
+			spin_lock_irqsave(&mctz->lock, flags);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(mz, mctz);
+			/*
+			 * Insert again. mz->usage_in_excess will be updated.
+			 * If excess is 0, no tree ops.
+			 */
+			__mem_cgroup_insert_exceeded(mz, mctz, excess);
+			spin_unlock_irqrestore(&mctz->lock, flags);
+		}
+	}
+}
+
+void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup_tree_per_node *mctz;
+	struct mem_cgroup_per_node *mz;
+	int nid;
+
+	for_each_node(nid) {
+		mz = memcg->nodeinfo[nid];
+		mctz = soft_limit_tree.rb_tree_per_node[nid];
+		if (mctz)
+			mem_cgroup_remove_exceeded(mz, mctz);
+	}
+}
+
+static struct mem_cgroup_per_node *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
+{
+	struct mem_cgroup_per_node *mz;
+
+retry:
+	mz = NULL;
+	if (!mctz->rb_rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(mctz->rb_rightmost,
+		      struct mem_cgroup_per_node, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back;
+	 * we will add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz, mctz);
+	if (!soft_limit_excess(mz->memcg) ||
+	    !css_tryget(&mz->memcg->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_node *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
+{
+	struct mem_cgroup_per_node *mz;
+
+	spin_lock_irq(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock_irq(&mctz->lock);
+	return mz;
+}
+
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+				   pg_data_t *pgdat,
+				   gfp_t gfp_mask,
+				   unsigned long *total_scanned)
+{
+	struct mem_cgroup *victim = NULL;
+	int total = 0;
+	int loop = 0;
+	unsigned long excess;
+	unsigned long nr_scanned;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.pgdat = pgdat,
+	};
+
+	excess = soft_limit_excess(root_memcg);
+
+	while (1) {
+		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+		if (!victim) {
+			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might be because there are
+				 * no reclaimable pages under this hierarchy.
+				 */
+				if (!total)
+					break;
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not too excessive, so we
+				 * don't reclaim too much, nor too little
+				 * such that we keep coming back to reclaim
+				 * from this cgroup.
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+					break;
+			}
+			continue;
+		}
+		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
+					pgdat, &nr_scanned);
+		*total_scanned += nr_scanned;
+		if (!soft_limit_excess(root_memcg))
+			break;
+	}
+	mem_cgroup_iter_break(root_memcg, victim);
+	return total;
+}
+
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_node *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_node *mctz;
+	unsigned long excess;
+
+	if (lru_gen_enabled())
+		return 0;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
+
+	/*
+	 * Do not even bother to check the largest node if the root
+	 * is empty. Do it lockless to prevent lock bouncing. Races
+	 * are acceptable as soft limit is best effort anyway.
+	 */
+	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
+		return 0;
+
+	/*
+	 * This loop can run a while, especially if mem_cgroups continuously
+	 * keep exceeding their soft limit and putting the system under
+	 * pressure.
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
+						    gfp_mask, total_scanned);
+		nr_reclaimed += reclaimed;
+		spin_lock_irq(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup
+		 * it is time to move on to the next cgroup.
+		 */
+		next_mz = NULL;
+		if (!reclaimed)
+			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
+
+		excess = soft_limit_excess(mz->memcg);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0 simply because, due
+		 * to priority, we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		/* If excess == 0, no tree ops */
+		__mem_cgroup_insert_exceeded(mz, mctz, excess);
+		spin_unlock_irq(&mctz->lock);
+		css_put(&mz->memcg->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+		    (next_mz == NULL ||
+		     loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->memcg->css);
+	return nr_reclaimed;
+}
+
+static int __init memcg1_init(void)
+{
+	int node;
+
+	for_each_node(node) {
+		struct mem_cgroup_tree_per_node *rtpn;
+
+		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
+
+		rtpn->rb_root = RB_ROOT;
+		rtpn->rb_rightmost = NULL;
+		spin_lock_init(&rtpn->lock);
+		soft_limit_tree.rb_tree_per_node[node] = rtpn;
+	}
+
+	return 0;
+}
+subsys_initcall(memcg1_init);
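
For context, mem_cgroup_soft_limit_reclaim() is the entry point driven from global reclaim. A caller-side sketch modeled on kswapd's balance_pgdat() in mm/vmscan.c (the variable names, and `sc` as a struct scan_control pointer, are assumptions for illustration, not part of this diff):

	/* Caller-side sketch, modeled on balance_pgdat() in mm/vmscan.c. */
	unsigned long nr_soft_scanned = 0;
	unsigned long nr_soft_reclaimed;

	/*
	 * Reclaim preferentially from the cgroup with the largest
	 * soft-limit excess, i.e. the rightmost node of this node's
	 * RB-tree.
	 */
	nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc->order,
							  sc->gfp_mask,
							  &nr_soft_scanned);
	sc->nr_reclaimed += nr_soft_reclaimed;
	sc->nr_scanned += nr_soft_scanned;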

mm/memcontrol-v1.h

Lines changed: 7 additions & 0 deletions
@@ -3,5 +3,12 @@
 #ifndef __MM_MEMCONTROL_V1_H
 #define __MM_MEMCONTROL_V1_H
 
+void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid);
+void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg);
+
+static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg)
+{
+	WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
+}
 
 #endif /* __MM_MEMCONTROL_V1_H */
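
The reset value pairs with the READ_ONCE() in soft_limit_excess(): PAGE_COUNTER_MAX means "no soft limit", so the computed excess is always 0 and the memcg never lands on the tree. For reference, a hypothetical sketch of the writer side (the real handler for the v1 memory.soft_limit_in_bytes file lives in the memcontrol.c write path and is not part of this diff; the helper name below is invented for illustration):

	/* Hypothetical sketch of the v1 writer side; the real handler parses
	 * memory.soft_limit_in_bytes and stores a page-granular value. */
	static void memcg1_set_soft_limit(struct mem_cgroup *memcg,
					  unsigned long nr_pages)
	{
		/* Pairs with the READ_ONCE() in soft_limit_excess(). */
		WRITE_ONCE(memcg->soft_limit, nr_pages);
	}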
