| 1 | +mm, compaction: rename compact_control->rescan to finish_pageblock |
| 2 | + |
| 3 | +jira LE-4623 |
| 4 | +Rebuild_History Non-Buildable kernel-4.18.0-553.81.1.el8_10 |
| 5 | +commit-author Mel Gorman <mgorman@techsingularity.net> |
| 6 | +commit 48731c8436c68ce5597dfe72f3836bd6808bedde |
| 7 | +Empty-Commit: Cherry-Pick Conflicts during history rebuild. |
| 8 | +Will be included in final tarball splat. Ref for failed cherry-pick at: |
| 9 | +ciq/ciq_backports/kernel-4.18.0-553.81.1.el8_10/48731c84.failed |
| 10 | + |
| 11 | +Patch series "Fix excessive CPU usage during compaction". |
| 12 | + |
| 13 | +Commit 7efc3b726103 ("mm/compaction: fix set skip in fast_find_migrateblock") |
| 14 | +fixed a problem where pageblocks found by fast_find_migrateblock() were |
| 15 | +ignored. Unfortunately there were numerous bug reports complaining about high |
| 16 | +CPU usage and massive stalls once 6.1 was released. Due to the severity, |
| 17 | +the patch was reverted by Vlastimil as a short-term fix[1] to -stable. |
| 18 | + |
| 19 | +The underlying problem for each of the bugs is suspected to be the |
| 20 | +repeated scanning of the same pageblocks. This series should guarantee |
| 21 | +forward progress even with commit 7efc3b726103. More information is in |
| 22 | +the changelog for patch 4. |
| 23 | + |
| 24 | +[1] http://lore.kernel.org/r/20230113173345.9692-1-vbabka@suse.cz |
| 25 | + |
| 26 | + |
| 27 | +This patch (of 4): |
| 28 | + |
| 29 | +The rescan field was not well named albeit accurate at the time. Rename |
| 30 | +the field to finish_pageblock to indicate that the remainder of the |
| 31 | +pageblock should be scanned regardless of COMPACT_CLUSTER_MAX. The intent |
| 32 | +is that pageblocks with transient failures get marked for skipping to |
| 33 | +avoid revisiting the same pageblock. |
| 34 | + |
| 35 | +Link: https://lkml.kernel.org/r/20230125134434.18017-2-mgorman@techsingularity.net |
| 36 | + Signed-off-by: Mel Gorman <mgorman@techsingularity.net> |
| 37 | + Cc: Chuyi Zhou <zhouchuyi@bytedance.com> |
| 38 | + Cc: Jiri Slaby <jirislaby@kernel.org> |
| 39 | + Cc: Maxim Levitsky <mlevitsk@redhat.com> |
| 40 | + Cc: Michal Hocko <mhocko@kernel.org> |
| 41 | + Cc: Paolo Bonzini <pbonzini@redhat.com> |
| 42 | + Cc: Pedro Falcato <pedro.falcato@gmail.com> |
| 43 | + Cc: Vlastimil Babka <vbabka@suse.cz> |
| 44 | + Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| 45 | +(cherry picked from commit 48731c8436c68ce5597dfe72f3836bd6808bedde) |
| 46 | + Signed-off-by: Jonathan Maple <jmaple@ciq.com> |
| 47 | + |
| 48 | +# Conflicts: |
| 49 | +# mm/internal.h |
| 50 | +diff --cc mm/internal.h |
| 51 | +index 7d89d8d7cead,2d1b9fa8083e..000000000000 |
| 52 | +--- a/mm/internal.h |
| 53 | ++++ b/mm/internal.h |
| 54 | +@@@ -327,6 -263,222 +327,225 @@@ static inline unsigned int buddy_order( |
| 55 | + #define buddy_order_unsafe(page) READ_ONCE(page_private(page)) |
| 56 | + |
| 57 | + /* |
| 58 | +++<<<<<<< HEAD |
| 59 | +++======= |
| 60 | ++ * This function checks whether a page is free && is the buddy |
| 61 | ++ * we can coalesce a page and its buddy if |
| 62 | ++ * (a) the buddy is not in a hole (check before calling!) && |
| 63 | ++ * (b) the buddy is in the buddy system && |
| 64 | ++ * (c) a page and its buddy have the same order && |
| 65 | ++ * (d) a page and its buddy are in the same zone. |
| 66 | ++ * |
| 67 | ++ * For recording whether a page is in the buddy system, we set PageBuddy. |
| 68 | ++ * Setting, clearing, and testing PageBuddy is serialized by zone->lock. |
| 69 | ++ * |
| 70 | ++ * For recording page's order, we use page_private(page). |
| 71 | ++ */ |
| 72 | ++ static inline bool page_is_buddy(struct page *page, struct page *buddy, |
| 73 | ++ unsigned int order) |
| 74 | ++ { |
| 75 | ++ if (!page_is_guard(buddy) && !PageBuddy(buddy)) |
| 76 | ++ return false; |
| 77 | ++ |
| 78 | ++ if (buddy_order(buddy) != order) |
| 79 | ++ return false; |
| 80 | ++ |
| 81 | ++ /* |
| 82 | ++ * zone check is done late to avoid uselessly calculating |
| 83 | ++ * zone/node ids for pages that could never merge. |
| 84 | ++ */ |
| 85 | ++ if (page_zone_id(page) != page_zone_id(buddy)) |
| 86 | ++ return false; |
| 87 | ++ |
| 88 | ++ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
| 89 | ++ |
| 90 | ++ return true; |
| 91 | ++ } |
| 92 | ++ |
| 93 | ++ /* |
| 94 | ++ * Locate the struct page for both the matching buddy in our |
| 95 | ++ * pair (buddy1) and the combined O(n+1) page they form (page). |
| 96 | ++ * |
| 97 | ++ * 1) Any buddy B1 will have an order O twin B2 which satisfies |
| 98 | ++ * the following equation: |
| 99 | ++ * B2 = B1 ^ (1 << O) |
| 100 | ++ * For example, if the starting buddy (buddy2) is #8 its order |
| 101 | ++ * 1 buddy is #10: |
| 102 | ++ * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 |
| 103 | ++ * |
| 104 | ++ * 2) Any buddy B will have an order O+1 parent P which |
| 105 | ++ * satisfies the following equation: |
| 106 | ++ * P = B & ~(1 << O) |
| 107 | ++ * |
| 108 | ++ * Assumption: *_mem_map is contiguous at least up to MAX_ORDER |
| 109 | ++ */ |
| 110 | ++ static inline unsigned long |
| 111 | ++ __find_buddy_pfn(unsigned long page_pfn, unsigned int order) |
| 112 | ++ { |
| 113 | ++ return page_pfn ^ (1 << order); |
| 114 | ++ } |
| 115 | ++ |
| 116 | ++ /* |
| 117 | ++ * Find the buddy of @page and validate it. |
| 118 | ++ * @page: The input page |
| 119 | ++ * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the |
| 120 | ++ * function is used in the performance-critical __free_one_page(). |
| 121 | ++ * @order: The order of the page |
| 122 | ++ * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to |
| 123 | ++ * page_to_pfn(). |
| 124 | ++ * |
| 125 | ++ * The found buddy can be a non PageBuddy, out of @page's zone, or its order is |
| 126 | ++ * not the same as @page. The validation is necessary before use it. |
| 127 | ++ * |
| 128 | ++ * Return: the found buddy page or NULL if not found. |
| 129 | ++ */ |
| 130 | ++ static inline struct page *find_buddy_page_pfn(struct page *page, |
| 131 | ++ unsigned long pfn, unsigned int order, unsigned long *buddy_pfn) |
| 132 | ++ { |
| 133 | ++ unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order); |
| 134 | ++ struct page *buddy; |
| 135 | ++ |
| 136 | ++ buddy = page + (__buddy_pfn - pfn); |
| 137 | ++ if (buddy_pfn) |
| 138 | ++ *buddy_pfn = __buddy_pfn; |
| 139 | ++ |
| 140 | ++ if (page_is_buddy(page, buddy, order)) |
| 141 | ++ return buddy; |
| 142 | ++ return NULL; |
| 143 | ++ } |
| 144 | ++ |
| 145 | ++ extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, |
| 146 | ++ unsigned long end_pfn, struct zone *zone); |
| 147 | ++ |
| 148 | ++ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, |
| 149 | ++ unsigned long end_pfn, struct zone *zone) |
| 150 | ++ { |
| 151 | ++ if (zone->contiguous) |
| 152 | ++ return pfn_to_page(start_pfn); |
| 153 | ++ |
| 154 | ++ return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); |
| 155 | ++ } |
| 156 | ++ |
| 157 | ++ extern int __isolate_free_page(struct page *page, unsigned int order); |
| 158 | ++ extern void __putback_isolated_page(struct page *page, unsigned int order, |
| 159 | ++ int mt); |
| 160 | ++ extern void memblock_free_pages(struct page *page, unsigned long pfn, |
| 161 | ++ unsigned int order); |
| 162 | ++ extern void __free_pages_core(struct page *page, unsigned int order); |
| 163 | ++ extern void prep_compound_page(struct page *page, unsigned int order); |
| 164 | ++ extern void post_alloc_hook(struct page *page, unsigned int order, |
| 165 | ++ gfp_t gfp_flags); |
| 166 | ++ extern int user_min_free_kbytes; |
| 167 | ++ |
| 168 | ++ extern void free_unref_page(struct page *page, unsigned int order); |
| 169 | ++ extern void free_unref_page_list(struct list_head *list); |
| 170 | ++ |
| 171 | ++ extern void zone_pcp_reset(struct zone *zone); |
| 172 | ++ extern void zone_pcp_disable(struct zone *zone); |
| 173 | ++ extern void zone_pcp_enable(struct zone *zone); |
| 174 | ++ |
| 175 | ++ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, |
| 176 | ++ phys_addr_t min_addr, |
| 177 | ++ int nid, bool exact_nid); |
| 178 | ++ |
| 179 | ++ int split_free_page(struct page *free_page, |
| 180 | ++ unsigned int order, unsigned long split_pfn_offset); |
| 181 | ++ |
| 182 | ++ /* |
| 183 | ++ * This will have no effect, other than possibly generating a warning, if the |
| 184 | ++ * caller passes in a non-large folio. |
| 185 | ++ */ |
| 186 | ++ static inline void folio_set_order(struct folio *folio, unsigned int order) |
| 187 | ++ { |
| 188 | ++ if (WARN_ON_ONCE(!folio_test_large(folio))) |
| 189 | ++ return; |
| 190 | ++ |
| 191 | ++ folio->_folio_order = order; |
| 192 | ++ #ifdef CONFIG_64BIT |
| 193 | ++ /* |
| 194 | ++ * When hugetlb dissolves a folio, we need to clear the tail |
| 195 | ++ * page, rather than setting nr_pages to 1. |
| 196 | ++ */ |
| 197 | ++ folio->_folio_nr_pages = order ? 1U << order : 0; |
| 198 | ++ #endif |
| 199 | ++ } |
| 200 | ++ |
| 201 | ++ #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
| 202 | ++ |
| 203 | ++ /* |
| 204 | ++ * in mm/compaction.c |
| 205 | ++ */ |
| 206 | ++ /* |
| 207 | ++ * compact_control is used to track pages being migrated and the free pages |
| 208 | ++ * they are being migrated to during memory compaction. The free_pfn starts |
| 209 | ++ * at the end of a zone and migrate_pfn begins at the start. Movable pages |
| 210 | ++ * are moved to the end of a zone during a compaction run and the run |
| 211 | ++ * completes when free_pfn <= migrate_pfn |
| 212 | ++ */ |
| 213 | ++ struct compact_control { |
| 214 | ++ struct list_head freepages; /* List of free pages to migrate to */ |
| 215 | ++ struct list_head migratepages; /* List of pages being migrated */ |
| 216 | ++ unsigned int nr_freepages; /* Number of isolated free pages */ |
| 217 | ++ unsigned int nr_migratepages; /* Number of pages to migrate */ |
| 218 | ++ unsigned long free_pfn; /* isolate_freepages search base */ |
| 219 | ++ /* |
| 220 | ++ * Acts as an in/out parameter to page isolation for migration. |
| 221 | ++ * isolate_migratepages uses it as a search base. |
| 222 | ++ * isolate_migratepages_block will update the value to the next pfn |
| 223 | ++ * after the last isolated one. |
| 224 | ++ */ |
| 225 | ++ unsigned long migrate_pfn; |
| 226 | ++ unsigned long fast_start_pfn; /* a pfn to start linear scan from */ |
| 227 | ++ struct zone *zone; |
| 228 | ++ unsigned long total_migrate_scanned; |
| 229 | ++ unsigned long total_free_scanned; |
| 230 | ++ unsigned short fast_search_fail;/* failures to use free list searches */ |
| 231 | ++ short search_order; /* order to start a fast search at */ |
| 232 | ++ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ |
| 233 | ++ int order; /* order a direct compactor needs */ |
| 234 | ++ int migratetype; /* migratetype of direct compactor */ |
| 235 | ++ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ |
| 236 | ++ const int highest_zoneidx; /* zone index of a direct compactor */ |
| 237 | ++ enum migrate_mode mode; /* Async or sync migration mode */ |
| 238 | ++ bool ignore_skip_hint; /* Scan blocks even if marked skip */ |
| 239 | ++ bool no_set_skip_hint; /* Don't mark blocks for skipping */ |
| 240 | ++ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ |
| 241 | ++ bool direct_compaction; /* False from kcompactd or /proc/... */ |
| 242 | ++ bool proactive_compaction; /* kcompactd proactive compaction */ |
| 243 | ++ bool whole_zone; /* Whole zone should/has been scanned */ |
| 244 | ++ bool contended; /* Signal lock contention */ |
| 245 | ++ bool finish_pageblock; /* Scan the remainder of a pageblock. Used |
| 246 | ++ * when there are potentially transient |
| 247 | ++ * isolation or migration failures to |
| 248 | ++ * ensure forward progress. |
| 249 | ++ */ |
| 250 | ++ bool alloc_contig; /* alloc_contig_range allocation */ |
| 251 | ++ }; |
| 252 | ++ |
| 253 | ++ /* |
| 254 | ++ * Used in direct compaction when a page should be taken from the freelists |
| 255 | ++ * immediately when one is created during the free path. |
| 256 | ++ */ |
| 257 | ++ struct capture_control { |
| 258 | ++ struct compact_control *cc; |
| 259 | ++ struct page *page; |
| 260 | ++ }; |
| 261 | ++ |
| 262 | ++ unsigned long |
| 263 | ++ isolate_freepages_range(struct compact_control *cc, |
| 264 | ++ unsigned long start_pfn, unsigned long end_pfn); |
| 265 | ++ int |
| 266 | ++ isolate_migratepages_range(struct compact_control *cc, |
| 267 | ++ unsigned long low_pfn, unsigned long end_pfn); |
| 268 | ++ |
| 269 | ++ int __alloc_contig_migrate_range(struct compact_control *cc, |
| 270 | ++ unsigned long start, unsigned long end); |
| 271 | ++ #endif |
| 272 | ++ int find_suitable_fallback(struct free_area *area, unsigned int order, |
| 273 | ++ int migratetype, bool only_stealable, bool *can_steal); |
| 274 | ++ |
| 275 | ++ /* |
| 276 | +++>>>>>>> 48731c8436c6 (mm, compaction: rename compact_control->rescan to finish_pageblock) |
| 277 | + * These three helpers classifies VMAs for virtual memory accounting. |
| 278 | + */ |
| 279 | + |
| 280 | +diff --git a/mm/compaction.c b/mm/compaction.c |
| 281 | +index c8f609371748..d2b0a737ac38 100644 |
| 282 | +--- a/mm/compaction.c |
| 283 | ++++ b/mm/compaction.c |
| 284 | +@@ -1050,12 +1050,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, |
| 285 | + |
| 286 | + /* |
| 287 | + * Avoid isolating too much unless this block is being |
| 288 | +- * rescanned (e.g. dirty/writeback pages, parallel allocation) |
| 289 | ++ * fully scanned (e.g. dirty/writeback pages, parallel allocation) |
| 290 | + * or a lock is contended. For contention, isolate quickly to |
| 291 | + * potentially remove one source of contention. |
| 292 | + */ |
| 293 | + if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && |
| 294 | +- !cc->rescan && !cc->contended) { |
| 295 | ++ !cc->finish_pageblock && !cc->contended) { |
| 296 | + ++low_pfn; |
| 297 | + break; |
| 298 | + } |
| 299 | +@@ -1117,14 +1117,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, |
| 300 | + } |
| 301 | + |
| 302 | + /* |
| 303 | +- * Updated the cached scanner pfn once the pageblock has been scanned |
| 304 | ++ * Update the cached scanner pfn once the pageblock has been scanned. |
| 305 | + * Pages will either be migrated in which case there is no point |
| 306 | + * scanning in the near future or migration failed in which case the |
| 307 | + * failure reason may persist. The block is marked for skipping if |
| 308 | + * there were no pages isolated in the block or if the block is |
| 309 | + * rescanned twice in a row. |
| 310 | + */ |
| 311 | +- if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { |
| 312 | ++ if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) { |
| 313 | + if (valid_page && !skip_updated) |
| 314 | + set_pageblock_skip(valid_page); |
| 315 | + update_cached_migrate(cc, low_pfn); |
| 316 | +@@ -2320,17 +2320,17 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) |
| 317 | + unsigned long iteration_start_pfn = cc->migrate_pfn; |
| 318 | + |
| 319 | + /* |
| 320 | +- * Avoid multiple rescans which can happen if a page cannot be |
| 321 | +- * isolated (dirty/writeback in async mode) or if the migrated |
| 322 | +- * pages are being allocated before the pageblock is cleared. |
| 323 | +- * The first rescan will capture the entire pageblock for |
| 324 | +- * migration. If it fails, it'll be marked skip and scanning |
| 325 | +- * will proceed as normal. |
| 326 | ++ * Avoid multiple rescans of the same pageblock which can |
| 327 | ++ * happen if a page cannot be isolated (dirty/writeback in |
| 328 | ++ * async mode) or if the migrated pages are being allocated |
| 329 | ++ * before the pageblock is cleared. The first rescan will |
| 330 | ++ * capture the entire pageblock for migration. If it fails, |
| 331 | ++ * it'll be marked skip and scanning will proceed as normal. |
| 332 | + */ |
| 333 | +- cc->rescan = false; |
| 334 | ++ cc->finish_pageblock = false; |
| 335 | + if (pageblock_start_pfn(last_migrated_pfn) == |
| 336 | + pageblock_start_pfn(iteration_start_pfn)) { |
| 337 | +- cc->rescan = true; |
| 338 | ++ cc->finish_pageblock = true; |
| 339 | + } |
| 340 | + |
| 341 | + switch (isolate_migratepages(cc)) { |
| 342 | +* Unmerged path mm/internal.h |
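
The substance of the upstream change, separate from the conflict recorded above, is a straight rename of the compact_control flag plus the comment updates around the checks it guards. As a rough aid for anyone resolving the mm/internal.h conflict by hand, here is a minimal, standalone C sketch of the early-exit check from isolate_migratepages_block() using the new field name; this is not kernel code, the struct is reduced to the fields the hunks above touch, and the COMPACT_CLUSTER_MAX value of 32 is assumed for illustration only.

    /*
     * Standalone sketch (not kernel code) of the early-exit check that the
     * rename touches in isolate_migratepages_block().  Field names follow
     * the hunks above; the constant's value is illustrative.
     */
    #include <stdbool.h>
    #include <stdio.h>

    #define COMPACT_CLUSTER_MAX 32          /* assumed value, for illustration */

    struct compact_control {
            unsigned int nr_migratepages;   /* Number of pages to migrate */
            bool finish_pageblock;          /* Scan the remainder of a pageblock */
            bool contended;                 /* Signal lock contention */
    };

    /* True when isolation may stop early instead of finishing the pageblock. */
    static bool stop_isolation_early(const struct compact_control *cc)
    {
            return cc->nr_migratepages >= COMPACT_CLUSTER_MAX &&
                   !cc->finish_pageblock && !cc->contended;
    }

    int main(void)
    {
            struct compact_control cc = {
                    .nr_migratepages = COMPACT_CLUSTER_MAX,
                    .finish_pageblock = true,   /* force scanning the whole block */
            };

            printf("stop early: %s\n", stop_isolation_early(&cc) ? "yes" : "no");
            return 0;
    }

With finish_pageblock set, the remainder of the pageblock keeps being scanned even after COMPACT_CLUSTER_MAX pages have been isolated, which is the forward-progress behaviour the rest of the series builds on.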