From 9782e5e22877f283bcb94757404187ca6246a7c9 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 3 Nov 2025 18:19:04 -0800 Subject: [PATCH] kvm: avoid mmio exits when Sentry faults on unmapped memory

Currently, we generate page tables for the entire Sentry address space. Consequently, when the Sentry faults on unmapped memory - meaning a memory region not yet mapped into the VM - an MMIO exit is triggered. Because the faulting instruction is then emulated (instead of executed natively), it becomes impossible to trigger a "normal" memory fault.

To solve this, we must set up page tables only for the regions that are explicitly mapped into the VM. This, however, is more challenging than it sounds, for several reasons: (1) We map memory regions into the VM from a signal handler, where memory allocation is prohibited, so all necessary page table entries must be allocated during platform initialization. (2) Our memory regions are not aligned to huge page boundaries, so when mapping a memory slot we often need to split huge pages and allocate new page table entries. (3) We run into the nosplit stack limit, which requires introducing a PTEs.Get method that avoids indexing slice entries directly; the bounds checks can panic, and the panic path requires a lot of extra stack.

PiperOrigin-RevId: 827726897

--- pkg/ring0/pagetables/BUILD | 1 + pkg/ring0/pagetables/pagetables.go | 9 +-- pkg/ring0/pagetables/pagetables_aarch64.go | 5 +- pkg/ring0/pagetables/pagetables_unsafe.go | 26 +++++++++ pkg/ring0/pagetables/pagetables_x86.go | 9 ++- pkg/ring0/pagetables/walker_amd64.go | 14 +++-- pkg/ring0/pagetables/walker_arm64.go | 6 +- pkg/sentry/platform/kvm/bluepill_fault.go | 21 ++++--- pkg/sentry/platform/kvm/kvm_test.go | 2 + pkg/sentry/platform/kvm/machine.go | 56 +++++++++++-------- pkg/sentry/platform/kvm/physical_map.go | 10 +++- pkg/sentry/platform/kvm/physical_map_amd64.go | 53 ++++++++++++++++++ pkg/sentry/platform/kvm/physical_map_arm64.go | 6 ++ 13 files changed, 171 insertions(+), 47 deletions(-) create mode 100644 pkg/ring0/pagetables/pagetables_unsafe.go diff --git a/pkg/ring0/pagetables/BUILD b/pkg/ring0/pagetables/BUILD index b111991e01..186fcf596d 100644 --- a/pkg/ring0/pagetables/BUILD +++ b/pkg/ring0/pagetables/BUILD @@ -49,6 +49,7 @@ go_library( "pagetables_aarch64.go", "pagetables_amd64.go", "pagetables_arm64.go", + "pagetables_unsafe.go", "pagetables_x86.go", "pcids.go", "pcids_aarch64.go", diff --git a/pkg/ring0/pagetables/pagetables.go b/pkg/ring0/pagetables/pagetables.go index 04f9195d71..e0c67d65fc 100644 --- a/pkg/ring0/pagetables/pagetables.go +++ b/pkg/ring0/pagetables/pagetables.go @@ -110,8 +110,9 @@ func New(a Allocator) *PageTables { type mapVisitor struct { target uintptr // Input. physical uintptr // Input. - opts MapOpts // Input. - prev bool // Output. + // opts is a pointer just to reduce stack usage. It should never be changed. + opts *MapOpts // Input. + prev bool // Output. } // visit is used for map.
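A note on the mapVisitor change above: //go:nosplit code runs on a fixed stack budget, so the patch stores MapOpts behind a pointer instead of copying it into every visitor frame. The sketch below is standalone, with hypothetical opts/visitor types rather than the real gVisor ones; it only shows the shape of the pattern, and the pointed-to options are treated as read-only by the visit path.

package main

import "fmt"

// opts stands in for pagetables.MapOpts: small, but large enough that
// copying it into every nosplit frame adds up.
type opts struct {
	accessType uint8
	global     bool
	user       bool
	static     bool
	memoryType uint8
}

// visitor mirrors the mapVisitor shape: inputs plus one output flag. The
// options are held by pointer purely to keep stack frames small; visit
// must never write through the pointer.
type visitor struct {
	target   uintptr // Input.
	physical uintptr // Input.
	o        *opts   // Input; read-only.
	prev     bool    // Output.
}

//go:nosplit
func (v *visitor) visit(start uintptr, existing opts) {
	// Dereference v.o in place instead of keeping a local copy.
	if existing == *v.o {
		v.prev = true
	}
}

func main() {
	o := opts{user: true}
	v := visitor{o: &o}
	v.visit(0, opts{user: true})
	fmt.Println(v.prev) // true
}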
@@ -119,7 +120,7 @@ type mapVisitor struct { //go:nosplit func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) bool { p := v.physical + (start - v.target) - if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) { + if pte.Valid() && (pte.Address() != p || pte.Opts() != *v.opts) { v.prev = true } if p&align != 0 { @@ -169,7 +170,7 @@ func (p *PageTables) Map(addr hostarch.Addr, length uintptr, opts MapOpts, physi visitor: mapVisitor{ target: uintptr(addr), physical: physical, - opts: opts, + opts: &opts, }, } w.iterateRange(uintptr(addr), uintptr(addr)+length) diff --git a/pkg/ring0/pagetables/pagetables_aarch64.go b/pkg/ring0/pagetables/pagetables_aarch64.go index 97ce934e08..ca0e8c384b 100644 --- a/pkg/ring0/pagetables/pagetables_aarch64.go +++ b/pkg/ring0/pagetables/pagetables_aarch64.go @@ -91,6 +91,9 @@ type MapOpts struct { // User indicates the page is a user page. User bool + // Static indicates the entries should not be cleared/freed. + Static bool + // MemoryType is the memory type. MemoryType hostarch.MemoryType } @@ -156,7 +159,7 @@ func (p *PTE) IsSect() bool { // This does not change the sect page property. // //go:nosplit -func (p *PTE) Set(addr uintptr, opts MapOpts) { +func (p *PTE) Set(addr uintptr, opts *MapOpts) { v := (addr &^ optionMask) | nG | readOnly | protDefault // Note: p.IsSect is manually inlined to reduce stack size for // nosplit-ness. diff --git a/pkg/ring0/pagetables/pagetables_unsafe.go b/pkg/ring0/pagetables/pagetables_unsafe.go new file mode 100644 index 0000000000..b9f5724bc3 --- /dev/null +++ b/pkg/ring0/pagetables/pagetables_unsafe.go @@ -0,0 +1,26 @@ +// Copyright 2025 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pagetables + +import ( + "unsafe" +) + +// Get returns the entry with the specified index. +// +//go:nosplit +func (p *PTEs) Get(idx uint16) *PTE { + return (*PTE)(unsafe.Pointer(uintptr(unsafe.Pointer(&p[0])) + 8*uintptr(idx))) +} diff --git a/pkg/ring0/pagetables/pagetables_x86.go b/pkg/ring0/pagetables/pagetables_x86.go index 2109ccdf33..5b0947c643 100644 --- a/pkg/ring0/pagetables/pagetables_x86.go +++ b/pkg/ring0/pagetables/pagetables_x86.go @@ -73,6 +73,9 @@ type MapOpts struct { // User indicates the page is a user page. User bool + // Static indicates the entries should not be cleared/freed. + Static bool + // MemoryType is the memory type. MemoryType hostarch.MemoryType } @@ -91,7 +94,7 @@ func (p *PTE) Clear() { // //go:nosplit func (p *PTE) Valid() bool { - return atomic.LoadUintptr((*uintptr)(p))&present != 0 + return atomic.LoadUintptr((*uintptr)(p)) != 0 } // Opts returns the PTE options. @@ -138,8 +141,8 @@ func (p *PTE) IsSuper() bool { // This does not change the super page property. 
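The new PTEs.Get above exists because indexing the entries array through a variable (&entries[idx]) carries a bounds check, and the panic path behind that check needs far more stack than a //go:nosplit walker frame can afford. A minimal standalone sketch of the same technique, using hypothetical entry/entries types in place of PTE/PTEs:

package main

import (
	"fmt"
	"unsafe"
)

// entry stands in for a hardware page-table entry: one word.
type entry uintptr

// entries stands in for a 512-entry page-table page.
type entries [512]entry

// get returns the entry at idx via raw pointer arithmetic. Unlike &e[idx],
// this form has no bounds-check panic path, so it stays within the tight
// stack budget of //go:nosplit callers. Callers must keep idx < 512.
//
//go:nosplit
func (e *entries) get(idx uint16) *entry {
	return (*entry)(unsafe.Pointer(uintptr(unsafe.Pointer(&e[0])) + unsafe.Sizeof(e[0])*uintptr(idx)))
}

func main() {
	var e entries
	*e.get(7) = 42
	fmt.Println(e[7]) // 42
}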
// //go:nosplit -func (p *PTE) Set(addr uintptr, opts MapOpts) { - if !opts.AccessType.Any() { +func (p *PTE) Set(addr uintptr, opts *MapOpts) { + if !opts.AccessType.Any() && !opts.Static { p.Clear() return } diff --git a/pkg/ring0/pagetables/walker_amd64.go b/pkg/ring0/pagetables/walker_amd64.go index 1168e82f88..0e10cbc331 100644 --- a/pkg/ring0/pagetables/walker_amd64.go +++ b/pkg/ring0/pagetables/walker_amd64.go @@ -43,7 +43,7 @@ func (w *Walker) walkPTEs(entries *PTEs, start, end uintptr) (bool, uint16) { var clearEntries uint16 for start < end { pteIndex := uint16((start & pteMask) >> pteShift) - entry := &entries[pteIndex] + entry := entries.Get(pteIndex) if !entry.Valid() && !w.visitor.requiresAlloc() { clearEntries++ start += pteSize @@ -81,7 +81,7 @@ func (w *Walker) walkPMDs(pmdEntries *PTEs, start, end uintptr) (bool, uint16) { var pteEntries *PTEs nextBoundary := addrEnd(start, end, pmdSize) pmdIndex := uint16((start & pmdMask) >> pmdShift) - pmdEntry := &pmdEntries[pmdIndex] + pmdEntry := pmdEntries.Get(pmdIndex) if !pmdEntry.Valid() { if !w.visitor.requiresAlloc() { // Skip over this entry. @@ -114,9 +114,10 @@ func (w *Walker) walkPMDs(pmdEntries *PTEs, start, end uintptr) (bool, uint16) { // Install the relevant entries. pteEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { + opts := pmdEntry.Opts() pteEntries[index].Set( pmdEntry.Address()+(pteSize*uintptr(index)), - pmdEntry.Opts()) + &opts) } pmdEntry.setPageTable(w.pageTables, pteEntries) } else { @@ -173,7 +174,7 @@ func (w *Walker) walkPUDs(pudEntries *PTEs, start, end uintptr) (bool, uint16) { var pmdEntries *PTEs nextBoundary := addrEnd(start, end, pudSize) pudIndex := uint16((start & pudMask) >> pudShift) - pudEntry := &pudEntries[pudIndex] + pudEntry := pudEntries.Get(pudIndex) if !pudEntry.Valid() { if !w.visitor.requiresAlloc() { // Skip over this entry. @@ -209,9 +210,10 @@ func (w *Walker) walkPUDs(pudEntries *PTEs, start, end uintptr) (bool, uint16) { pmdEntries = w.pageTables.Allocator.NewPTEs() // escapes: see above. for index := uint16(0); index < entriesPerPage; index++ { pmdEntries[index].SetSuper() + opts := pudEntry.Opts() pmdEntries[index].Set( pudEntry.Address()+(pmdSize*uintptr(index)), - pudEntry.Opts()) + &opts) } pudEntry.setPageTable(w.pageTables, pmdEntries) } else { @@ -261,7 +263,7 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool { var pudEntries *PTEs nextBoundary := addrEnd(start, end, pgdSize) pgdIndex := uint16((start & pgdMask) >> pgdShift) - pgdEntry := &w.pageTables.root[pgdIndex] + pgdEntry := w.pageTables.root.Get(pgdIndex) if !w.pageTables.largeAddressesEnabled { if !pgdEntry.Valid() { if !w.visitor.requiresAlloc() { diff --git a/pkg/ring0/pagetables/walker_arm64.go b/pkg/ring0/pagetables/walker_arm64.go index 726672f817..92dcfe6d3c 100644 --- a/pkg/ring0/pagetables/walker_arm64.go +++ b/pkg/ring0/pagetables/walker_arm64.go @@ -87,9 +87,10 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool { pmdEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pmdEntries[index].SetSect() + opts := pudEntry.Opts() pmdEntries[index].Set( pudEntry.Address()+(pmdSize*uintptr(index)), - pudEntry.Opts()) + &opts) } pudEntry.setPageTable(w.pageTables, pmdEntries) } else { @@ -152,9 +153,10 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool { // Install the relevant entries. 
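Two details in the walker hunks above are easy to miss: Go cannot take the address of a call's return value, so pmdEntry.Opts() is copied into a local opts before &opts is passed to Set, and splitting a huge entry installs one small entry per slot of a freshly allocated page-table page. A rough standalone sketch of that splitting step, with hypothetical pte/mapOpts types and x86-64 sizes assumed:

package main

import "fmt"

const (
	pteSize        = uintptr(1) << 12 // 4 KiB
	pmdSize        = uintptr(1) << 21 // 2 MiB == entriesPerPage * pteSize
	entriesPerPage = 512
)

type mapOpts struct{ user, static bool }

type pte struct {
	addr uintptr
	opts mapOpts
}

// set mirrors the *MapOpts-taking PTE.Set: since &pmdEntry.Opts() is not
// legal Go, callers copy the options into an addressable local first.
func (p *pte) set(addr uintptr, opts *mapOpts) {
	p.addr, p.opts = addr, *opts
}

// splitSuper expands one 2 MiB "super" mapping into 512 contiguous 4 KiB
// entries, which is what the walker does when a KVM memory slot does not
// line up with a huge-page boundary.
func splitSuper(base uintptr, superOpts mapOpts) *[entriesPerPage]pte {
	ptes := new([entriesPerPage]pte)
	for i := uintptr(0); i < entriesPerPage; i++ {
		opts := superOpts // addressable copy, as in the walker
		ptes[i].set(base+pteSize*i, &opts)
	}
	return ptes
}

func main() {
	ptes := splitSuper(0x40000000, mapOpts{user: true})
	fmt.Printf("%#x %#x\n", ptes[0].addr, ptes[entriesPerPage-1].addr) // 0x40000000 0x401ff000
}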
pteEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { + opts := pmdEntry.Opts() pteEntries[index].Set( pmdEntry.Address()+(pteSize*uintptr(index)), - pmdEntry.Opts()) + &opts) } pmdEntry.setPageTable(w.pageTables, pteEntries) } else { diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index 4e51e64aea..1675d55db8 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -25,11 +25,14 @@ import ( var ( // faultBlockSize is the size used for servicing memory faults. // - // This should be large enough to avoid frequent faults and avoid using - // all available KVM slots (~512), but small enough that KVM does not - // complain about slot sizes (~4GB). See handleBluepillFault for how - // this block is used. - faultBlockSize = uintptr(2 << 30) + // This should be large enough so that the total number of slots + // required to cover the 47-bit virtual address space does not exceed + // the KVM slot limit (e.g. 32764). Linux doesn't allocate virtual + // address space above 47-bit by default. + // It must be small enough to limit the memory overhead associated with + // KVM slot allocation. For example, using a 46-bit address space + // results in an overhead of ~250 MB. + faultBlockSize = uintptr(8 << 30) // faultBlockMask is the mask for the fault blocks. // @@ -56,13 +59,17 @@ func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, leng } // Adjust the block to match our size. - physicalStart = pr.physical + (alignedPhysical-pr.physical)&faultBlockMask - virtualStart = pr.virtual + (physicalStart - pr.physical) + physicalStart = pr.physical / faultBlockSize * faultBlockSize + physicalStart = physicalStart + (alignedPhysical-physicalStart)&faultBlockMask physicalEnd := physicalStart + faultBlockSize + if physicalStart < pr.physical { + physicalStart = pr.physical + } if physicalEnd > end { physicalEnd = end } length = physicalEnd - physicalStart + virtualStart = pr.virtual + (physicalStart - pr.physical) return virtualStart, physicalStart, length, &physicalRegions[i] } diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index d64db6a5fd..8e8deacb77 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -39,6 +39,7 @@ import ( var dummyFPState fpu.State type testHarness interface { + Logf(format string, args ...any) Errorf(format string, args ...any) Fatalf(format string, args ...any) } @@ -146,6 +147,7 @@ func applicationTest(t testHarness, useHostMappings bool, targetFn uintptr, fn f // done for regular user code, but is fine for test // purposes.) applyPhysicalRegions(func(pr physicalRegion) bool { + t.Logf("Map %x-%x", pr.virtual, pr.virtual+pr.length) pt.Map(hostarch.Addr(pr.virtual), pr.length, pagetables.MapOpts{ AccessType: hostarch.AnyAccess, User: true, diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 78b7de31a8..43ba744686 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -332,26 +332,33 @@ func newMachine(vm int, config *Config) (*machine, error) { // faultBlockSize has to equal or less than KVM_MEM_MAX_NR_PAGES. faultBlockSize = uintptr(1) << 42 faultBlockMask = ^uintptr(faultBlockSize - 1) + for _, r := range physicalRegions { + m.mapPhysical(r.physical, r.length) + } } else { + // Apply the physical mappings. 
Note that these mappings may point to + // guest physical addresses that are not actually available. These + // physical pages are mapped on demand, see kernel_unsafe.go. + applyPhysicalRegions(func(pr physicalRegion) bool { + physical := pr.physical + for physical < pr.physical+pr.length { + virtualStart, physicalStart, length, _ := calculateBluepillFault(physical) + // Pre-allocate page tables in the lower half. + m.kernel.PageTables.Map( + hostarch.Addr(virtualStart), + length, + pagetables.MapOpts{Static: true}, + physicalStart) + physical += length + } + + return true // Keep iterating. + }) // Install seccomp rules to trap runtime mmap system calls. They will // be handled by seccompMmapHandler. seccompMmapRules(m) } - // Apply the physical mappings. Note that these mappings may point to - // guest physical addresses that are not actually available. These - // physical pages are mapped on demand, see kernel_unsafe.go. - applyPhysicalRegions(func(pr physicalRegion) bool { - // Map everything in the lower half. - m.kernel.PageTables.Map( - hostarch.Addr(pr.virtual), - pr.length, - pagetables.MapOpts{AccessType: hostarch.ReadWrite}, - pr.physical) - - return true // Keep iterating. - }) - // Ensure that the currently mapped virtual regions are actually // available in the VM. Note that this doesn't guarantee no future // faults, however it should guarantee that everything is available to @@ -368,6 +375,9 @@ func newMachine(vm int, config *Config) (*machine, error) { // Cap the length to the end of the area. length = vr.virtual + vr.length - virtual } + // Ensure the physical range is mapped. + m.mapPhysical(physical, length) + // Update page tables for executable mappings. if vr.accessType.Execute { if vr.accessType.Write { @@ -380,8 +390,6 @@ func newMachine(vm int, config *Config) (*machine, error) { physical) } - // Ensure the physical range is mapped. - m.mapPhysical(physical, length) virtual += length } } @@ -404,11 +412,6 @@ func newMachine(vm int, config *Config) (*machine, error) { mapRegion(vr, 0) }) - if mapEntireAddressSpace { - for _, r := range physicalRegions { - m.mapPhysical(r.physical, r.length) - } - } enableAsyncPreemption() // Initialize architecture state. if err := m.initArchState(); err != nil { @@ -458,8 +461,15 @@ func (m *machine) mapPhysical(physical, length uintptr) { } // Is this already mapped? Check the usedSlots. - if !pr.mmio && !m.hasSlot(physicalStart) { - m.mapMemorySlot(virtualStart, physicalStart, length, pr.readOnly) + if !m.hasSlot(physicalStart) { + m.kernel.PageTables.Map( + hostarch.Addr(virtualStart), + length, + pagetables.MapOpts{AccessType: hostarch.ReadWrite}, + physicalStart) + if !pr.mmio { + m.mapMemorySlot(virtualStart, physicalStart, length, pr.readOnly) + } } // Move to the next chunk. diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go index 9784e52041..95ed0790f4 100644 --- a/pkg/sentry/platform/kvm/physical_map.go +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -66,6 +66,7 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) { pSize := uintptr(1) << ring0.PhysicalAddressBits pSize -= reservedMemory + maxUserAddr := uintptr(0) // Add specifically excluded regions; see excludeVirtualRegion. 
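The reworked calculateBluepillFault above aligns each block to an absolute faultBlockSize boundary and then clamps it to the containing physical region, and the new pre-allocation loop in newMachine walks regions in exactly these chunks so that every block's page tables exist before any signal-handler fault. A small standalone sketch of that arithmetic, assuming the 8 GiB block size from the patch, a 64-bit host, and a hypothetical blockFor helper:

package main

import "fmt"

const (
	faultBlockSize = uintptr(8) << 30
	faultBlockMask = ^(faultBlockSize - 1)
)

// blockFor returns the chunk of the physical region [regionStart, regionEnd)
// that covers physical, aligned to absolute faultBlockSize boundaries and
// clamped to the region, mirroring the adjusted calculateBluepillFault.
func blockFor(physical, regionStart, regionEnd uintptr) (start, length uintptr) {
	base := regionStart / faultBlockSize * faultBlockSize
	start = base + (physical-base)&faultBlockMask
	end := start + faultBlockSize
	if start < regionStart {
		start = regionStart
	}
	if end > regionEnd {
		end = regionEnd
	}
	return start, end - start
}

func main() {
	// A physical region spanning [6 GiB, 20 GiB). The first fault lands in
	// the clamped [6 GiB, 8 GiB) chunk; the second gets the full
	// [8 GiB, 16 GiB) block.
	lo, hi := uintptr(6)<<30, uintptr(20)<<30
	for _, addr := range []uintptr{7 << 30, 9 << 30} {
		start, length := blockFor(addr, lo, hi)
		fmt.Printf("fault at %#x -> block [%#x, %#x)\n", addr, start, start+length)
	}
}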
if err := applyVirtualRegions(func(vr virtualRegion) { if excludeVirtualRegion(vr) { @@ -81,10 +82,17 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) { }) log.Infof("mmio: virtual [%x,%x)", vr.virtual, vr.virtual+vr.length) } + if vr.filename != "[vsyscall]" { + maxUserAddr = vr.region.virtual + vr.region.length + } }); err != nil { panic(fmt.Sprintf("error parsing /proc/self/maps: %v", err)) } + var archRegions []specialVirtualRegion + vSize, archRegions = archSpecialRegions(vSize, maxUserAddr) + specialRegions = append(specialRegions, archRegions...) + // Do we need any more work? if vSize < pSize { return specialRegions @@ -109,7 +117,7 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) { current := required // Attempted mmap size. filled := uintptr(0) suggestedAddr := uintptr(0) - if ring0.VirtualAddressBits > 48 { + if exendedAddressSpaceAllowed && ring0.VirtualAddressBits > 48 { // Pass a hint address above 47 bits to indicate to the kernel that // we can handle, and want, mappings above 47 bits: // https://docs.kernel.org/arch/x86/x86_64/5level-paging.html#user-space-and-large-virtual-address-space. diff --git a/pkg/sentry/platform/kvm/physical_map_amd64.go b/pkg/sentry/platform/kvm/physical_map_amd64.go index c5adfb577f..664421e90a 100644 --- a/pkg/sentry/platform/kvm/physical_map_amd64.go +++ b/pkg/sentry/platform/kvm/physical_map_amd64.go @@ -14,9 +14,62 @@ package kvm +import ( + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/ring0" +) + const ( // reservedMemory is a chunk of physical memory reserved starting at // physical address zero. There are some special pages in this region, // so we just call the whole thing off. reservedMemory = 0x100000000 ) + +const ( + // defaultAddressSpaceSize is the default limit for the user virtual + // address space, which is 47 bits (2^47 bytes). The mmap syscall + // respects this limit by default, even with 5-level page tables + // enabled. + defaultAddressSpaceSize = uintptr(1) << 47 + + // exendedAddressSpaceAllowed controls address space usage beyond + // the default 47-bit limit. It is set to 'false' for several reasons: + // * There are no known use cases requiring the extended address space. + // * By restricting the size, we avoid the overhead of: + // a) Aligning the virtual address space size to the physical + // address space size. + // b) Creating unnecessary page table entries for the unused + // extended range. + // * The memory slot size is currently configured only to cover + // the default 47-bit address space. + // * 5-level page table support was primarily introduced to work around + // a specific kernel bug where VDSO could be mapped above the 47-bit + // boundary (v6.9-rc1~186^2~7). + exendedAddressSpaceAllowed = false +) + +// archSpecialRegions returns special regions that are excluded from the virtual +// address space. Linux doesn't map VMAs above the 47-bit boundary by default. +func archSpecialRegions(vSize uintptr, maxUserAddr uintptr) (uintptr, []specialVirtualRegion) { + var specialRegions []specialVirtualRegion + if exendedAddressSpaceAllowed || vSize <= defaultAddressSpaceSize { + return vSize, nil + } + // This is a workaround for the kernel bug where the vdso can be + mapped above the 47-bit address space boundary.
+ if defaultAddressSpaceSize > maxUserAddr { + maxUserAddr = defaultAddressSpaceSize + } + r := region{ + virtual: maxUserAddr, + length: ring0.MaximumUserAddress - defaultAddressSpaceSize, + } + specialRegions = append(specialRegions, specialVirtualRegion{ + region: r, + }) + vSize -= r.length + log.Infof("excluded: virtual [%x,%x)", r.virtual, r.virtual+r.length) + + return vSize, specialRegions +} diff --git a/pkg/sentry/platform/kvm/physical_map_arm64.go b/pkg/sentry/platform/kvm/physical_map_arm64.go index 4d85614539..bd4d06aa36 100644 --- a/pkg/sentry/platform/kvm/physical_map_arm64.go +++ b/pkg/sentry/platform/kvm/physical_map_arm64.go @@ -16,4 +16,10 @@ package kvm const ( reservedMemory = 0 + // 5-level page tables are not implemented on arm64. + exendedAddressSpaceAllowed = false ) + +func archSpecialRegions(vSize uintptr, maxUserAddr uintptr) (uintptr, []specialVirtualRegion) { + return vSize, nil +}
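As a back-of-the-envelope check on how the pieces above fit together (illustrative arithmetic only, not code from the patch): with faultBlockSize raised to 8 GiB, pre-allocating blocks for the default 47-bit user address space stays well under the KVM slot limit quoted in bluepill_fault.go, while anything larger would not, which is why exendedAddressSpaceAllowed is kept false.

package main

import "fmt"

func main() {
	const (
		faultBlockSize = uint64(8) << 30 // 8 GiB, as set in bluepill_fault.go
		kvmSlotLimit   = 32764           // slot limit quoted in the patch
	)
	for _, bits := range []uint{47, 48, 57} {
		slots := (uint64(1) << bits) / faultBlockSize
		fmt.Printf("%d-bit address space: %d slots (limit %d)\n", bits, slots, kvmSlotLimit)
	}
	// Output:
	// 47-bit address space: 16384 slots (limit 32764)
	// 48-bit address space: 32768 slots (limit 32764)
	// 57-bit address space: 16777216 slots (limit 32764)
}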