// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/memcontrol.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>

#include "memcontrol-v1.h"

/*
 * Cgroups above their soft limits are maintained in an RB-tree, independent
 * of their hierarchy representation.
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/*
 * Maximum number of loops in mem_cgroup_soft_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2

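/*
 * Insert @mz into @mctz, keyed by its usage in excess of the soft limit.
 * The rightmost node is cached so that the cgroup with the largest excess
 * can be looked up in O(1). Callers must hold @mctz->lock.
 */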
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
				   tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		} else {
			p = &(*p)->rb_right;
		}
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

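/*
 * Remove @mz from @mctz, updating the cached rightmost node if necessary.
 * Callers must hold @mctz->lock; mem_cgroup_remove_exceeded() below is the
 * locking wrapper.
 */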
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

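/*
 * Return the number of pages by which @memcg's usage exceeds its soft
 * limit, or 0 if it is at or below the limit.
 */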
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

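/*
 * Reposition @memcg and each of its ancestors in the per-node soft limit
 * tree for @nid according to their current excess. With MGLRU enabled the
 * RB-tree is not used; over-limit cgroups are handed to
 * lru_gen_soft_reclaim() instead.
 */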
void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	if (lru_gen_enabled()) {
		if (soft_limit_excess(memcg))
			lru_gen_soft_reclaim(memcg, nid);
		return;
	}

	mctz = soft_limit_tree.rb_tree_per_node[nid];
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = memcg->nodeinfo[nid];
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on the RB-tree or
		 * the memcg is over its soft limit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

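/*
 * Remove @memcg from the soft limit trees of all nodes.
 */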
void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = memcg->nodeinfo[nid];
		mctz = soft_limit_tree.rb_tree_per_node[nid];
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

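/*
 * Pick the cgroup with the largest excess over its soft limit on this
 * node, i.e. the cached rightmost tree node, remove it from the tree and
 * take a reference on its css. Returns NULL when the tree is empty.
 * Callers must hold @mctz->lock.
 */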
static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;	/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

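/*
 * Reclaim from the cgroups in @root_memcg's subtree until @root_memcg is
 * no longer above its soft limit, a reasonable share of the excess has
 * been reclaimed, or the loop limits are hit. Returns the number of pages
 * reclaimed; the number of pages scanned is added to @total_scanned.
 */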
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not so large that we
				 * reclaim too much, nor so small that we
				 * keep coming back to reclaim from this
				 * cgroup.
				 */
				if (total >= (excess >> 2) ||
				    (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
						pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

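/*
 * Soft limit reclaim for @pgdat: repeatedly pick the cgroup with the
 * largest excess over its soft limit and reclaim from it until some
 * progress has been made. Only used for order-0 reclaim and when MGLRU
 * is disabled. Returns the number of pages reclaimed.
 */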
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
					    gfp_t gfp_mask,
					    unsigned long *total_scanned)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_node *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_node *mctz;
	unsigned long excess;

	if (lru_gen_enabled())
		return 0;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];

	/*
	 * Do not even bother to check the largest node if the root
	 * is empty. Do it lockless to prevent lock bouncing. Races
	 * are acceptable as soft limit is best effort anyway.
	 */
	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
		return 0;

	/*
	 * This loop can run for a while, especially if mem_cgroups
	 * continuously keep exceeding their soft limit and putting the
	 * system under pressure.
	 */
	do {
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
						    gfp_mask, total_scanned);
		nr_reclaimed += reclaimed;
		spin_lock_irq(&mctz->lock);

		/*
		 * If we failed to reclaim anything from this memory cgroup
		 * it is time to move on to the next cgroup.
		 */
		next_mz = NULL;
		if (!reclaimed)
			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

		excess = soft_limit_excess(mz->memcg);
		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0 simply because, due
		 * to priority, we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
		__mem_cgroup_insert_exceeded(mz, mctz, excess);
		spin_unlock_irq(&mctz->lock);
		css_put(&mz->memcg->css);
		loop++;
		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
		if (!nr_reclaimed &&
		    (next_mz == NULL ||
		     loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);
	if (next_mz)
		css_put(&next_mz->memcg->css);
	return nr_reclaimed;
}

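/*
 * Allocate and initialize the per-node soft limit trees at boot.
 */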
static int __init memcg1_init(void)
{
	int node;

	for_each_node(node) {
		struct mem_cgroup_tree_per_node *rtpn;

		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);

		rtpn->rb_root = RB_ROOT;
		rtpn->rb_rightmost = NULL;
		spin_lock_init(&rtpn->lock);
		soft_limit_tree.rb_tree_per_node[node] = rtpn;
	}

	return 0;
}
subsys_initcall(memcg1_init);