From de24be468723ff9ace746d48d9d4610f290ddf2e Mon Sep 17 00:00:00 2001 From: Egor Lazarchuk Date: Fri, 15 Nov 2024 10:47:56 +0000 Subject: [PATCH 1/4] refactor: replace PAGE_SIZE with GUEST/HOST_PAGE_SIZE Firecracker was assuming page sizes for both host and guest are 4K. But they can differ, so split into 2 values. Signed-off-by: Egor Lazarchuk --- src/vmm/src/arch/aarch64/mod.rs | 3 ++- src/vmm/src/arch/mod.rs | 7 +++++-- src/vmm/src/arch/x86_64/mod.rs | 2 +- src/vmm/src/builder.rs | 4 ++-- src/vmm/src/devices/virtio/iov_deque.rs | 8 ++++---- src/vmm/src/devices/virtio/iovec.rs | 6 +++--- src/vmm/src/gdb/target.rs | 6 +++--- src/vmm/src/vstate/vm.rs | 2 +- 8 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index 08ca1b65edb..a409f330c56 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -94,7 +94,8 @@ pub fn initrd_load_addr( guest_mem: &GuestMemoryMmap, initrd_size: usize, ) -> Result { - let round_to_pagesize = |size| (size + (super::PAGE_SIZE - 1)) & !(super::PAGE_SIZE - 1); + let round_to_pagesize = + |size| (size + (super::GUEST_PAGE_SIZE - 1)) & !(super::GUEST_PAGE_SIZE - 1); match GuestAddress(get_fdt_addr(guest_mem)).checked_sub(round_to_pagesize(initrd_size) as u64) { Some(offset) => { if guest_mem.address_in_range(offset) { diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index f5a2f98cb7c..0f3bd6e46ef 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -52,8 +52,11 @@ pub struct InitrdConfig { pub size: usize, } -/// Default (smallest) memory page size for the supported architectures. -pub const PAGE_SIZE: usize = 4096; +/// Default page size for the guest OS. +pub const GUEST_PAGE_SIZE: usize = 4096; + +/// Default page size for the host OS. +pub const HOST_PAGE_SIZE: usize = 4096; impl fmt::Display for DeviceType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index 1066d9734c3..40f4f15607a 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -97,7 +97,7 @@ pub fn initrd_load_addr( return Err(ConfigurationError::InitrdAddress); } - let align_to_pagesize = |address| address & !(super::PAGE_SIZE - 1); + let align_to_pagesize = |address| address & !(super::GUEST_PAGE_SIZE - 1); Ok(align_to_pagesize(lowmem_size - initrd_size) as u64) } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 8594da9f077..6131e248d91 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -1305,7 +1305,7 @@ pub(crate) mod tests { use crate::vstate::memory::GuestMemory; let image = make_test_bin(); - let mem_size: usize = image.len() * 2 + crate::arch::PAGE_SIZE; + let mem_size: usize = image.len() * 2 + crate::arch::GUEST_PAGE_SIZE; let tempfile = TempFile::new().unwrap(); let mut tempfile = tempfile.into_file(); @@ -1344,7 +1344,7 @@ pub(crate) mod tests { let tempfile = TempFile::new().unwrap(); let mut tempfile = tempfile.into_file(); tempfile.write_all(&image).unwrap(); - let gm = single_region_mem_at(crate::arch::PAGE_SIZE as u64 + 1, image.len() * 2); + let gm = single_region_mem_at(crate::arch::GUEST_PAGE_SIZE as u64 + 1, image.len() * 2); let res = load_initrd(&gm, &mut tempfile); assert!( diff --git a/src/vmm/src/devices/virtio/iov_deque.rs b/src/vmm/src/devices/virtio/iov_deque.rs index 51bf28a49eb..994ba5cd320 100644 --- a/src/vmm/src/devices/virtio/iov_deque.rs +++ b/src/vmm/src/devices/virtio/iov_deque.rs @@ -6,7 +6,7 @@ use std::os::fd::AsRawFd; use libc::{c_int, c_void, iovec, off_t, size_t}; use memfd; -use crate::arch::PAGE_SIZE; +use crate::arch::HOST_PAGE_SIZE; #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum IovDequeError { @@ -79,8 +79,8 @@ pub enum IovDequeError { // ``` // // This value must be a multiple of 256 because this is the maximum number of `iovec` can fit into -// 1 memory page: 256 * sizeof(iovec) == 4096 == PAGE_SIZE. IovDeque only operates with `PAGE_SIZE` -// granularity. +// 1 memory page: 256 * sizeof(iovec) == 4096 == HOST_PAGE_SIZE. IovDeque only operates with +// `HOST_PAGE_SIZE` granularity. #[derive(Debug)] pub struct IovDeque { pub iov: *mut libc::iovec, @@ -93,7 +93,7 @@ unsafe impl Send for IovDeque {} impl IovDeque { const BYTES: usize = L as usize * std::mem::size_of::(); - const _ASSERT: () = assert!(Self::BYTES % PAGE_SIZE == 0); + const _ASSERT: () = assert!(Self::BYTES % HOST_PAGE_SIZE == 0); /// Create a [`memfd`] object that represents a single physical page fn create_memfd() -> Result { diff --git a/src/vmm/src/devices/virtio/iovec.rs b/src/vmm/src/devices/virtio/iovec.rs index 9262dff661b..c9893260d9e 100644 --- a/src/vmm/src/devices/virtio/iovec.rs +++ b/src/vmm/src/devices/virtio/iovec.rs @@ -815,13 +815,13 @@ mod verification { use vm_memory::VolatileSlice; use super::IoVecBuffer; + use crate::arch::GUEST_PAGE_SIZE; use crate::devices::virtio::iov_deque::IovDeque; // Redefine `IoVecBufferMut` and `IovDeque` with specific length. Otherwise // Rust will not know what to do. type IoVecBufferMutDefault = super::IoVecBufferMut; type IovDequeDefault = IovDeque; - use crate::arch::PAGE_SIZE; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; // Maximum memory size to use for our buffers. For the time being 1KB. @@ -912,8 +912,8 @@ mod verification { // SAFETY: safe because the layout has non-zero size let mem = unsafe { std::alloc::alloc(std::alloc::Layout::from_size_align_unchecked( - 2 * PAGE_SIZE, - PAGE_SIZE, + 2 * GUEST_PAGE_SIZE, + GUEST_PAGE_SIZE, )) }; IovDequeDefault { diff --git a/src/vmm/src/gdb/target.rs b/src/vmm/src/gdb/target.rs index b8230342b27..6f3f2593c15 100644 --- a/src/vmm/src/gdb/target.rs +++ b/src/vmm/src/gdb/target.rs @@ -33,7 +33,7 @@ use vm_memory::{Bytes, GuestAddress, GuestMemoryError}; use super::arch; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::vcpu::VcpuError as AarchVcpuError; -use crate::arch::PAGE_SIZE; +use crate::arch::GUEST_PAGE_SIZE; use crate::logger::{error, info}; use crate::utils::u64_to_usize; use crate::vstate::vcpu::VcpuSendEventError; @@ -396,7 +396,7 @@ impl MultiThreadBase for FirecrackerTarget { // Compute the amount space left in the page after the gpa let read_len = std::cmp::min( data.len(), - PAGE_SIZE - (u64_to_usize(gpa) & (PAGE_SIZE - 1)), + GUEST_PAGE_SIZE - (u64_to_usize(gpa) & (GUEST_PAGE_SIZE - 1)), ); vmm.guest_memory() @@ -430,7 +430,7 @@ impl MultiThreadBase for FirecrackerTarget { // Compute the amount space left in the page after the gpa let write_len = std::cmp::min( data.len(), - PAGE_SIZE - (u64_to_usize(gpa) & (PAGE_SIZE - 1)), + GUEST_PAGE_SIZE - (u64_to_usize(gpa) & (GUEST_PAGE_SIZE - 1)), ); vmm.guest_memory() diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 3d24dc6f9ac..d213d4d7bb6 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -593,7 +593,7 @@ pub(crate) mod tests { let res = vm.set_kvm_memory_regions(&gm, false); res.unwrap(); - // Trying to set a memory region with a size that is not a multiple of PAGE_SIZE + // Trying to set a memory region with a size that is not a multiple of GUEST_PAGE_SIZE // will result in error. let gm = single_region_mem(0x10); let res = vm.set_kvm_memory_regions(&gm, false); From c15554930acea45ace76a493c884c59dbd4fe502 Mon Sep 17 00:00:00 2001 From: Egor Lazarchuk Date: Fri, 15 Nov 2024 11:59:35 +0000 Subject: [PATCH 2/4] feat: query host page size Define global variable with host page size and update it at the very beginning of the main function in Firecracker. This way data types which rely on specific host page size can adapt to it. Signed-off-by: Egor Lazarchuk --- src/firecracker/src/main.rs | 5 +++++ src/vmm/src/arch/mod.rs | 18 ++++++++++++++++-- src/vmm/src/devices/virtio/iov_deque.rs | 5 +++-- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/firecracker/src/main.rs b/src/firecracker/src/main.rs index 8fb5392afcf..300afd0ad66 100644 --- a/src/firecracker/src/main.rs +++ b/src/firecracker/src/main.rs @@ -20,6 +20,7 @@ use seccomp::FilterError; use seccompiler::BpfThreadMap; use utils::arg_parser::{ArgParser, Argument}; use utils::validators::validate_instance_id; +use vmm::arch::host_page_size; use vmm::builder::StartMicrovmError; use vmm::logger::{ debug, error, info, LoggerConfig, ProcessTimeReporter, StoreMetric, LOGGER, METRICS, @@ -108,6 +109,10 @@ fn main_exec() -> Result<(), MainError> { // Initialize the logger. LOGGER.init().map_err(MainError::SetLogger)?; + // First call to this function updates the value to current + // host page size. + _ = host_page_size(); + // We need this so that we can reset terminal to canonical mode if panic occurs. let stdin = io::stdin(); diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index 0f3bd6e46ef..a51055622e4 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -2,7 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt; +use std::sync::LazyLock; +use log::warn; use serde::{Deserialize, Serialize}; /// Module for aarch64 related functionality. @@ -55,8 +57,20 @@ pub struct InitrdConfig { /// Default page size for the guest OS. pub const GUEST_PAGE_SIZE: usize = 4096; -/// Default page size for the host OS. -pub const HOST_PAGE_SIZE: usize = 4096; +/// Get the size of the host page size. +pub fn host_page_size() -> usize { + /// Default page size for the host OS. + static PAGE_SIZE: LazyLock = LazyLock::new(|| { + // # Safety: Value always valid + let r = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }; + usize::try_from(r).unwrap_or_else(|_| { + warn!("Could not get host page size with sysconf, assuming default 4K host pages"); + 4096 + }) + }); + + *PAGE_SIZE +} impl fmt::Display for DeviceType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { diff --git a/src/vmm/src/devices/virtio/iov_deque.rs b/src/vmm/src/devices/virtio/iov_deque.rs index 994ba5cd320..e3a2534994a 100644 --- a/src/vmm/src/devices/virtio/iov_deque.rs +++ b/src/vmm/src/devices/virtio/iov_deque.rs @@ -6,7 +6,7 @@ use std::os::fd::AsRawFd; use libc::{c_int, c_void, iovec, off_t, size_t}; use memfd; -use crate::arch::HOST_PAGE_SIZE; +use crate::arch::host_page_size; #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum IovDequeError { @@ -93,7 +93,6 @@ unsafe impl Send for IovDeque {} impl IovDeque { const BYTES: usize = L as usize * std::mem::size_of::(); - const _ASSERT: () = assert!(Self::BYTES % HOST_PAGE_SIZE == 0); /// Create a [`memfd`] object that represents a single physical page fn create_memfd() -> Result { @@ -153,6 +152,8 @@ impl IovDeque { /// Create a new [`IovDeque`] that can hold memory described by a single VirtIO queue. pub fn new() -> Result { + assert!(Self::BYTES % host_page_size() == 0); + let memfd = Self::create_memfd()?; let raw_memfd = memfd.as_file().as_raw_fd(); let buffer = Self::allocate_ring_buffer_memory()?; From ce5cb58acb43b37220acc689ce3bf3bb3417d8fb Mon Sep 17 00:00:00 2001 From: Egor Lazarchuk Date: Fri, 15 Nov 2024 12:07:06 +0000 Subject: [PATCH 3/4] feat: update IovDeque to support arbitrary size and host page size Remove restriction on size and host page size. Signed-off-by: Egor Lazarchuk --- src/vmm/src/devices/virtio/iov_deque.rs | 57 ++++++++++++++++--------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/src/vmm/src/devices/virtio/iov_deque.rs b/src/vmm/src/devices/virtio/iov_deque.rs index e3a2534994a..b28d7076f43 100644 --- a/src/vmm/src/devices/virtio/iov_deque.rs +++ b/src/vmm/src/devices/virtio/iov_deque.rs @@ -77,10 +77,7 @@ pub enum IovDequeError { // pub iov_len: ::size_t, // } // ``` -// -// This value must be a multiple of 256 because this is the maximum number of `iovec` can fit into -// 1 memory page: 256 * sizeof(iovec) == 4096 == HOST_PAGE_SIZE. IovDeque only operates with -// `HOST_PAGE_SIZE` granularity. + #[derive(Debug)] pub struct IovDeque { pub iov: *mut libc::iovec, @@ -92,17 +89,15 @@ pub struct IovDeque { unsafe impl Send for IovDeque {} impl IovDeque { - const BYTES: usize = L as usize * std::mem::size_of::(); - /// Create a [`memfd`] object that represents a single physical page - fn create_memfd() -> Result { + fn create_memfd(pages_bytes: usize) -> Result { // Create a sealable memfd. let opts = memfd::MemfdOptions::default().allow_sealing(true); let mfd = opts.create("iov_deque")?; // Resize to system page size. mfd.as_file() - .set_len(Self::BYTES.try_into().unwrap()) + .set_len(pages_bytes.try_into().unwrap()) .map_err(IovDequeError::MemfdResize)?; // Add seals to prevent further resizing. @@ -135,13 +130,13 @@ impl IovDeque { /// Allocate memory for our ring buffer /// - /// This will allocate 2 * `Self::BYTES` bytes of virtual memory. - fn allocate_ring_buffer_memory() -> Result<*mut c_void, IovDequeError> { + /// This will allocate 2 * `pages_bytes` bytes of virtual memory. + fn allocate_ring_buffer_memory(pages_bytes: usize) -> Result<*mut c_void, IovDequeError> { // SAFETY: We are calling the system call with valid arguments unsafe { Self::mmap( std::ptr::null_mut(), - Self::BYTES * 2, + pages_bytes * 2, libc::PROT_NONE, libc::MAP_PRIVATE | libc::MAP_ANONYMOUS, -1, @@ -150,20 +145,29 @@ impl IovDeque { } } + /// Calculate a number of bytes in full pages required for + /// the type to operate. + fn pages_bytes() -> usize { + let host_page_size = host_page_size(); + let bytes = L as usize * std::mem::size_of::(); + let num_host_pages = bytes.div_ceil(host_page_size); + num_host_pages * host_page_size + } + /// Create a new [`IovDeque`] that can hold memory described by a single VirtIO queue. pub fn new() -> Result { - assert!(Self::BYTES % host_page_size() == 0); + let pages_bytes = Self::pages_bytes(); - let memfd = Self::create_memfd()?; + let memfd = Self::create_memfd(pages_bytes)?; let raw_memfd = memfd.as_file().as_raw_fd(); - let buffer = Self::allocate_ring_buffer_memory()?; + let buffer = Self::allocate_ring_buffer_memory(pages_bytes)?; // Map the first page of virtual memory to the physical page described by the memfd object // SAFETY: We are calling the system call with valid arguments let _ = unsafe { Self::mmap( buffer, - Self::BYTES, + pages_bytes, libc::PROT_READ | libc::PROT_WRITE, libc::MAP_SHARED | libc::MAP_FIXED, raw_memfd, @@ -174,17 +178,17 @@ impl IovDeque { // Map the second page of virtual memory to the physical page described by the memfd object // // SAFETY: This is safe because: - // * Both `buffer` and the result of `buffer.add(Self::BYTES)` are within bounds of the + // * Both `buffer` and the result of `buffer.add(pages_bytes)` are within bounds of the // allocation we got from `Self::allocate_ring_buffer_memory`. // * The resulting pointer is the beginning of the second page of our allocation, so it // doesn't wrap around the address space. - let next_page = unsafe { buffer.add(Self::BYTES) }; + let next_page = unsafe { buffer.add(pages_bytes) }; // SAFETY: We are calling the system call with valid arguments let _ = unsafe { Self::mmap( next_page, - Self::BYTES, + pages_bytes, libc::PROT_READ | libc::PROT_WRITE, libc::MAP_SHARED | libc::MAP_FIXED, raw_memfd, @@ -312,9 +316,10 @@ impl IovDeque { impl Drop for IovDeque { fn drop(&mut self) { + let pages_bytes = Self::pages_bytes(); // SAFETY: We are passing an address that we got from a previous allocation of `2 * - // Self::BYTES` bytes by calling mmap - let _ = unsafe { libc::munmap(self.iov.cast(), Self::BYTES * 2) }; + // pages_bytes` by calling mmap + let _ = unsafe { libc::munmap(self.iov.cast(), 2 * pages_bytes) }; } } @@ -332,6 +337,18 @@ mod tests { assert_eq!(deque.len(), 0); } + #[test] + fn test_new_less_than_page() { + let deque = super::IovDeque::<128>::new().unwrap(); + assert_eq!(deque.len(), 0); + } + + #[test] + fn test_new_more_than_page() { + let deque = super::IovDeque::<512>::new().unwrap(); + assert_eq!(deque.len(), 0); + } + fn make_iovec(id: u16, len: u16) -> iovec { iovec { iov_base: id as *mut libc::c_void, From d51aca05e70544ee34912c4a57d098c94b7fb921 Mon Sep 17 00:00:00 2001 From: Egor Lazarchuk Date: Thu, 21 Nov 2024 17:13:31 +0000 Subject: [PATCH 4/4] chore: add `IovDeque` fix to the CHANGELOG Add note about making `IovDeque` to work with any host page size. Signed-off-by: Egor Lazarchuk --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 65ef73a1e1f..e402b85ad43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,9 @@ and this project adheres to - [#4921](https://github.com/firecracker-microvm/firecracker/pull/4921): Fixed swagger `CpuConfig` definition to include missing aarch64-specific fields. +- [#4916](https://github.com/firecracker-microvm/firecracker/pull/4916): Fixed + `IovDeque` implementation to work with any host page size. This fixes + virtio-net device on non 4K host kernels. ## [1.10.1]