|
14 | 14 | #include <linux/pagemap.h> |
15 | 15 | #include <linux/mempolicy.h> |
16 | 16 | #include <linux/compiler.h> |
| 17 | +#include <linux/cpumask.h> |
17 | 18 | #include <linux/cpuset.h> |
18 | 19 | #include <linux/mutex.h> |
19 | 20 | #include <linux/memblock.h> |
| 21 | +#include <linux/minmax.h> |
20 | 22 | #include <linux/sysfs.h> |
21 | 23 | #include <linux/slab.h> |
22 | 24 | #include <linux/sched/mm.h> |
@@ -3605,31 +3607,31 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) |
3605 | 3607 | .numa_aware = true |
3606 | 3608 | }; |
3607 | 3609 |
|
| 3610 | + unsigned int num_allocation_threads = max(num_online_cpus() / 4, 1); |
| 3611 | + |
3608 | 3612 | job.thread_fn = hugetlb_pages_alloc_boot_node; |
3609 | 3613 | job.start = 0; |
3610 | 3614 | job.size = h->max_huge_pages; |
3611 | 3615 |
|
3612 | 3616 | /* |
3613 | | - * job.max_threads is twice the num_node_state(N_MEMORY), |
| 3617 | + * job.max_threads is 25% of the available cpu threads by default. |
3614 | 3618 | * |
3615 | | - * Tests below indicate that a multiplier of 2 significantly improves |
3616 | | - * performance, and although larger values also provide improvements, |
3617 | | - * the gains are marginal. |
| 3619 | + * On large servers with terabytes of memory, huge page allocation |
| 3620 | + * can consume a considerable amount of time. |
3618 | 3621 | * |
3619 | | - * Therefore, choosing 2 as the multiplier strikes a good balance between |
3620 | | - * enhancing parallel processing capabilities and maintaining efficient |
3621 | | - * resource management. |
| 3622 | + * Tests below show how long it takes to allocate 1 TiB of memory |
| 3623 | + * with 2MiB huge pages. Using more threads can significantly improve allocation time. |
3622 | 3624 | * |
3623 | | - * +------------+-------+-------+-------+-------+-------+ |
3624 | | - * | multiplier | 1 | 2 | 3 | 4 | 5 | |
3625 | | - * +------------+-------+-------+-------+-------+-------+ |
3626 | | - * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms | |
3627 | | - * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms | |
3628 | | - * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms | |
3629 | | - * +------------+-------+-------+-------+-------+-------+ |
| 3625 | + * +-----------------------+-------+-------+-------+-------+-------+ |
| 3626 | + * | threads | 8 | 16 | 32 | 64 | 128 | |
| 3627 | + * +-----------------------+-------+-------+-------+-------+-------+ |
| 3628 | + * | skylake 144 cpus | 44s | 22s | 16s | 19s | 20s | |
| 3629 | + * | cascade lake 192 cpus | 39s | 20s | 11s | 10s | 9s | |
| 3630 | + * +-----------------------+-------+-------+-------+-------+-------+ |
3630 | 3631 | */ |
3631 | | - job.max_threads = num_node_state(N_MEMORY) * 2; |
3632 | | - job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2; |
| 3632 | + |
| 3633 | + job.max_threads = num_allocation_threads; |
| 3634 | + job.min_chunk = h->max_huge_pages / num_allocation_threads; |
3633 | 3635 | padata_do_multithreaded(&job); |
3634 | 3636 |
|
3635 | 3637 | return h->nr_huge_pages; |
|
0 commit comments