From 778eb1dea574b9f7a68f41e636d0c7a490a1b3f7 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 30 Jan 2025 15:01:15 +0000 Subject: [PATCH 1/4] fix(test): validate restored test works in test_valid_handler In test_valid_handler, we had a comment to validate that the guest still works if we mess with the balloon after UFFD-based restoration. However, we didn't actually do so. Fix this by running some simple SSH command after both inflation and deflation. Signed-off-by: Patrick Roy --- tests/integration_tests/functional/test_uffd.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/integration_tests/functional/test_uffd.py b/tests/integration_tests/functional/test_uffd.py index fbb86c4c987..398529de77d 100644 --- a/tests/integration_tests/functional/test_uffd.py +++ b/tests/integration_tests/functional/test_uffd.py @@ -118,10 +118,14 @@ def test_valid_handler(uvm_plain, snapshot, uffd_handler_paths): # Inflate balloon. vm.api.balloon.patch(amount_mib=200) + # Verify if the restored guest works. + vm.ssh.check_output("true") + # Deflate balloon. vm.api.balloon.patch(amount_mib=0) # Verify if the restored guest works. + vm.ssh.check_output("true") def test_malicious_handler(uvm_plain, snapshot, uffd_handler_paths): From 67591680ee03b82047689f21e9f19a7056382bf2 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 30 Jan 2025 15:18:49 +0000 Subject: [PATCH 2/4] fix: only use mmap trick when restoring from file When the balloon inflates, and the guest gives us back some pages of memory, we need to free those pages. In booted VMs, we do this with madvise(MADV_DONTNEED), and in restored VMs we do it by MAP_FIXED-ing a new VMA on top of the range-to-free. This is because if guest memory is a MAP_PRIVATE of a memory file, madvise has no effect. However, we also do this MAP_FIXED trick if the snapshot is restored with UFFD. In this case, its not needed (madvise works perfectly fine), and in fact, its wrong: If we map over the memory range, UFFD will not receive Remove events for the specified range. Fix this by only using the mmap trick for file-based restored. Fixes #4988 Signed-off-by: Patrick Roy --- src/vmm/src/builder.rs | 1 + src/vmm/src/device_manager/persist.rs | 7 ++++++- src/vmm/src/devices/virtio/balloon/device.rs | 10 +++++----- src/vmm/src/devices/virtio/balloon/persist.rs | 15 ++++++++++++--- src/vmm/src/devices/virtio/balloon/util.rs | 4 ++-- src/vmm/src/lib.rs | 2 -- 6 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 4b43e67541f..fc23d8add0b 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -531,6 +531,7 @@ pub fn build_microvm_from_snapshot( resource_allocator: &mut vmm.resource_allocator, vm_resources, instance_id: &instance_info.id, + restored_from_file: vmm.uffd.is_none(), }; vmm.mmio_device_manager = diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 5773fa0ba09..bdf63409d68 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -214,6 +214,7 @@ pub struct MMIODevManagerConstructorArgs<'a> { pub resource_allocator: &'a mut ResourceAllocator, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, + pub restored_from_file: bool, } impl fmt::Debug for MMIODevManagerConstructorArgs<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -512,7 +513,10 @@ impl<'a> Persist<'a> for MMIODeviceManager { if let Some(balloon_state) = &state.balloon_device { let device = Arc::new(Mutex::new(Balloon::restore( - BalloonConstructorArgs { mem: mem.clone() }, + BalloonConstructorArgs { + mem: mem.clone(), + restored_from_file: constructor_args.restored_from_file, + }, &balloon_state.device_state, )?)); @@ -807,6 +811,7 @@ mod tests { resource_allocator: &mut resource_allocator, vm_resources, instance_id: "microvm-id", + restored_from_file: true, }; let restored_dev_manager = MMIODeviceManager::restore(restore_args, &device_states).unwrap(); diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 697928ae9c6..f6be2536de5 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -164,7 +164,7 @@ pub struct Balloon { pub(crate) irq_trigger: IrqTrigger, // Implementation specific fields. - pub(crate) restored: bool, + pub(crate) restored_from_file: bool, pub(crate) stats_polling_interval_s: u16, pub(crate) stats_timer: TimerFd, // The index of the previous stats descriptor is saved because @@ -189,7 +189,7 @@ impl fmt::Debug for Balloon { .field("queue_evts", &self.queue_evts) .field("device_state", &self.device_state) .field("irq_trigger", &self.irq_trigger) - .field("restored", &self.restored) + .field("restored_from_file", &self.restored_from_file) .field("stats_polling_interval_s", &self.stats_polling_interval_s) .field("stats_desc_index", &self.stats_desc_index) .field("latest_stats", &self.latest_stats) @@ -204,7 +204,7 @@ impl Balloon { amount_mib: u32, deflate_on_oom: bool, stats_polling_interval_s: u16, - restored: bool, + restored_from_file: bool, ) -> Result { let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; @@ -245,7 +245,7 @@ impl Balloon { irq_trigger: IrqTrigger::new().map_err(BalloonError::EventFd)?, device_state: DeviceState::Inactive, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, - restored, + restored_from_file, stats_polling_interval_s, stats_timer, stats_desc_index: None, @@ -355,7 +355,7 @@ impl Balloon { if let Err(err) = remove_range( mem, (guest_addr, u64::from(range_len) << VIRTIO_BALLOON_PFN_SHIFT), - self.restored, + self.restored_from_file, ) { error!("Error removing memory range: {:?}", err); } diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index 4e768ddd2e2..c1fb6865b5f 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -95,6 +95,7 @@ pub struct BalloonState { pub struct BalloonConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, + pub restored_from_file: bool, } impl Persist<'_> for Balloon { @@ -121,7 +122,12 @@ impl Persist<'_> for Balloon { ) -> Result { // We can safely create the balloon with arbitrary flags and // num_pages because we will overwrite them after. - let mut balloon = Balloon::new(0, false, state.stats_polling_interval_s, true)?; + let mut balloon = Balloon::new( + 0, + false, + state.stats_polling_interval_s, + constructor_args.restored_from_file, + )?; let mut num_queues = BALLOON_NUM_QUEUES; // As per the virtio 1.1 specification, the statistics queue @@ -192,13 +198,16 @@ mod tests { // Deserialize and restore the balloon device. let restored_balloon = Balloon::restore( - BalloonConstructorArgs { mem: guest_mem }, + BalloonConstructorArgs { + mem: guest_mem, + restored_from_file: true, + }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); assert_eq!(restored_balloon.device_type(), TYPE_BALLOON); - assert!(restored_balloon.restored); + assert!(restored_balloon.restored_from_file); assert_eq!(restored_balloon.acked_features, balloon.acked_features); assert_eq!(restored_balloon.avail_features, balloon.avail_features); diff --git a/src/vmm/src/devices/virtio/balloon/util.rs b/src/vmm/src/devices/virtio/balloon/util.rs index f8cf7aa2000..a9960540a60 100644 --- a/src/vmm/src/devices/virtio/balloon/util.rs +++ b/src/vmm/src/devices/virtio/balloon/util.rs @@ -68,7 +68,7 @@ pub(crate) fn compact_page_frame_numbers(v: &mut [u32]) -> Vec<(u32, u32)> { pub(crate) fn remove_range( guest_memory: &GuestMemoryMmap, range: (GuestAddress, u64), - restored: bool, + restored_from_file: bool, ) -> Result<(), RemoveRegionError> { let (guest_address, range_len) = range; @@ -83,7 +83,7 @@ pub(crate) fn remove_range( // Mmap a new anonymous region over the present one in order to create a hole. // This workaround is (only) needed after resuming from a snapshot because the guest memory // is mmaped from file as private and there is no `madvise` flag that works for this case. - if restored { + if restored_from_file { // SAFETY: The address and length are known to be valid. let ret = unsafe { libc::mmap( diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 6833a3a12d2..618f5d7b6c3 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -314,8 +314,6 @@ pub struct Vmm { vm: Vm, guest_memory: GuestMemoryMmap, // Save UFFD in order to keep it open in the Firecracker process, as well. - // Since this field is never read again, we need to allow `dead_code`. - #[allow(dead_code)] uffd: Option, vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. From 6784aaf138645948fcdd240b0f43bdbe82099885 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 30 Jan 2025 15:47:02 +0000 Subject: [PATCH 3/4] fix(example): correctly handle `remove` events in uffd exammple The crux of the issue was that UFFD gets blocked (all ioctls return -EAGAIN) when there's any `remove` events pending in the queue, which means during processing we not only need to look at the "head" of the queue, but also make sure there's no `remove` events in the "tail". Deal with these scenarios correctly by always greedily reading the entire queue, to ensure there's nothing pending, and only then processing things one-by-one. Please see the new code comments for intricacies with this approach. Fixes #4990 Signed-off-by: Patrick Roy --- .../examples/uffd/fault_all_handler.rs | 2 +- src/firecracker/examples/uffd/uffd_utils.rs | 43 ++++++--- .../examples/uffd/valid_handler.rs | 89 +++++++++++++++---- 3 files changed, 106 insertions(+), 28 deletions(-) diff --git a/src/firecracker/examples/uffd/fault_all_handler.rs b/src/firecracker/examples/uffd/fault_all_handler.rs index 5e9f49a3207..cfeaa099236 100644 --- a/src/firecracker/examples/uffd/fault_all_handler.rs +++ b/src/firecracker/examples/uffd/fault_all_handler.rs @@ -36,7 +36,7 @@ fn main() { userfaultfd::Event::Pagefault { .. } => { for region in uffd_handler.mem_regions.clone() { uffd_handler - .serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size) + .serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size); } } _ => panic!("Unexpected event on userfaultfd"), diff --git a/src/firecracker/examples/uffd/uffd_utils.rs b/src/firecracker/examples/uffd/uffd_utils.rs index 8cc70ab7c21..a2f7879f591 100644 --- a/src/firecracker/examples/uffd/uffd_utils.rs +++ b/src/firecracker/examples/uffd/uffd_utils.rs @@ -116,7 +116,7 @@ impl UffdHandler { } } - pub fn serve_pf(&mut self, addr: *mut u8, len: usize) { + pub fn serve_pf(&mut self, addr: *mut u8, len: usize) -> bool { // Find the start of the page that the current faulting address belongs to. let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void; let fault_page_addr = dst as u64; @@ -133,14 +133,18 @@ impl UffdHandler { // event was received. This can be a consequence of guest reclaiming back its // memory from the host (through balloon device) Some(MemPageState::Uninitialized) | Some(MemPageState::FromFile) => { - let (start, end) = self.populate_from_file(region, fault_page_addr, len); - self.update_mem_state_mappings(start, end, MemPageState::FromFile); - return; + match self.populate_from_file(region, fault_page_addr, len) { + Some((start, end)) => { + self.update_mem_state_mappings(start, end, MemPageState::FromFile) + } + None => return false, + } + return true; } Some(MemPageState::Removed) | Some(MemPageState::Anonymous) => { let (start, end) = self.zero_out(fault_page_addr); self.update_mem_state_mappings(start, end, MemPageState::Anonymous); - return; + return true; } None => {} } @@ -152,20 +156,39 @@ impl UffdHandler { ); } - fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> (u64, u64) { + fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> Option<(u64, u64)> { let offset = dst - region.mapping.base_host_virt_addr; let src = self.backing_buffer as u64 + region.mapping.offset + offset; let ret = unsafe { - self.uffd - .copy(src as *const _, dst as *mut _, len, true) - .expect("Uffd copy failed") + match self.uffd.copy(src as *const _, dst as *mut _, len, true) { + Ok(value) => value, + // Catch EAGAIN errors, which occur when a `remove` event lands in the UFFD + // queue while we're processing `pagefault` events. + // The weird cast is because the `bytes_copied` field is based on the + // `uffdio_copy->copy` field, which is a signed 64 bit integer, and if something + // goes wrong, it gets set to a -errno code. However, uffd-rs always casts this + // value to an unsigned `usize`, which scrambled the errno. + Err(Error::PartiallyCopied(bytes_copied)) + if bytes_copied == 0 || bytes_copied == (-libc::EAGAIN) as usize => + { + return None + } + Err(Error::CopyFailed(errno)) + if std::io::Error::from(errno).raw_os_error().unwrap() == libc::EEXIST => + { + len + } + Err(e) => { + panic!("Uffd copy failed: {e:?}"); + } + } }; // Make sure the UFFD copied some bytes. assert!(ret > 0); - (dst, dst + len as u64) + Some((dst, dst + len as u64)) } fn zero_out(&mut self, addr: u64) -> (u64, u64) { diff --git a/src/firecracker/examples/uffd/valid_handler.rs b/src/firecracker/examples/uffd/valid_handler.rs index 6c681d932ac..936b9f517a3 100644 --- a/src/firecracker/examples/uffd/valid_handler.rs +++ b/src/firecracker/examples/uffd/valid_handler.rs @@ -26,24 +26,79 @@ fn main() { let mut runtime = Runtime::new(stream, file); runtime.install_panic_hook(); runtime.run(|uffd_handler: &mut UffdHandler| { - // Read an event from the userfaultfd. - let event = uffd_handler - .read_event() - .expect("Failed to read uffd_msg") - .expect("uffd_msg not ready"); - - // We expect to receive either a Page Fault or Removed - // event (if the balloon device is enabled). - match event { - userfaultfd::Event::Pagefault { addr, .. } => { - uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) + // !DISCLAIMER! + // When using UFFD together with the balloon device, this handler needs to deal with + // `remove` and `pagefault` events. There are multiple things to keep in mind in + // such setups: + // + // As long as any `remove` event is pending in the UFFD queue, all ioctls return EAGAIN + // ----------------------------------------------------------------------------------- + // + // This means we cannot process UFFD events simply one-by-one anymore - if a `remove` event + // arrives, we need to pre-fetch all other events up to the `remove` event, to unblock the + // UFFD, and then go back to the process the pre-fetched events. + // + // UFFD might receive events in not in their causal order + // ----------------------------------------------------- + // + // For example, the guest + // kernel might first respond to a balloon inflation by freeing some memory, and + // telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the + // free memory range, which causes a `remove` event to be sent to UFFD. Then, the + // guest kernel might immediately fault the page in again (for example because + // default_on_oom was set). which causes a `pagefault` event to be sent to UFFD. + // + // However, the pagefault will be triggered from inside KVM on the vCPU thread, while the + // balloon device is handled by Firecracker on its VMM thread. This means that potentially + // this handler can receive the `pagefault` _before_ the `remove` event. + // + // This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events + // to make sure no `remove` event is blocking us can result in the handler acting on + // the `pagefault` event before the `remove` message (despite the `remove` event being + // in the causal past of the `pagefault` event), which means that we will fault in a page + // from the snapshot file, while really we should be faulting in a zero page. + // + // In this example handler, we ignore this problem, to avoid + // complexity (under the assumption that the guest kernel will zero a newly faulted in + // page anyway). A production handler will most likely want to ensure that `remove` + // events for a specific range are always handled before `pagefault` events. + // + // Lastly, we still need to deal with the race condition where a `remove` event arrives + // in the UFFD queue after we got done reading all events, in which case we need to go + // back to reading more events before we can continue processing `pagefault`s. + let mut deferred_events = Vec::new(); + + loop { + // First, try events that we couldn't handle last round + let mut events_to_handle = Vec::from_iter(deferred_events.drain(..)); + + // Read all events from the userfaultfd. + while let Some(event) = uffd_handler.read_event().expect("Failed to read uffd_msg") { + events_to_handle.push(event); + } + + for event in events_to_handle.drain(..) { + // We expect to receive either a Page Fault or `remove` + // event (if the balloon device is enabled). + match event { + userfaultfd::Event::Pagefault { addr, .. } => { + if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) { + deferred_events.push(event); + } + } + userfaultfd::Event::Remove { start, end } => uffd_handler + .update_mem_state_mappings(start as u64, end as u64, MemPageState::Removed), + _ => panic!("Unexpected event on userfaultfd"), + } + } + + // We assume that really only the above removed/pagefault interaction can result in + // deferred events. In that scenario, the loop will always terminate (unless + // newly arriving `remove` events end up indefinitely blocking it, but there's nothing + // we can do about that, and it's a largely theoretical problem). + if deferred_events.is_empty() { + break; } - userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings( - start as u64, - end as u64, - MemPageState::Removed, - ), - _ => panic!("Unexpected event on userfaultfd"), } }); } From 5c2dec5d3d927f2d8c89d2d3a33a56e0135fc2a4 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 30 Jan 2025 16:00:40 +0000 Subject: [PATCH 4/4] doc: Update Changelog Add entry about the UFFD fix. Signed-off-by: Patrick Roy --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d6a8e1b2ded..30f5b70330c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,10 @@ and this project adheres to - [#5007](https://github.com/firecracker-microvm/firecracker/pull/5007): Fixed watchdog softlockup warning on x86_64 guests when a vCPU is paused during GDB debugging. +- [#5021](https://github.com/firecracker-microvm/firecracker/pull/5021) If a + balloon device is inflated post UFFD-backed snapshot restore, Firecracker now + causes `remove` UFFD messages to be sent to the UFFD handler. Previously, no + such message would be sent. ## [1.10.1]