Skip to content

Commit 5707696

Browse files
committed
fix(example): correctly handle remove events in uffd exammple
The crux of the issue was that UFFD gets blocked (all ioctls return -EAGAIN) when there's any `remove` events pending in the queue, which means during processing we not only need to look at the "head" of the queue, but also make sure there's no `remove` events in the "tail". Deal with these scenarios correctly by always greedily reading the entire queue, to ensure there's nothing pending, and only then processing things one-by-one. Please see the new code comments for intricacies with this approach. Fixes #4990 Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
1 parent 6759168 commit 5707696

File tree

3 files changed

+106
-28
lines changed

3 files changed

+106
-28
lines changed

src/firecracker/examples/uffd/fault_all_handler.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ fn main() {
3636
userfaultfd::Event::Pagefault { .. } => {
3737
for region in uffd_handler.mem_regions.clone() {
3838
uffd_handler
39-
.serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size)
39+
.serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size);
4040
}
4141
}
4242
_ => panic!("Unexpected event on userfaultfd"),

src/firecracker/examples/uffd/uffd_utils.rs

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ impl UffdHandler {
116116
}
117117
}
118118

119-
pub fn serve_pf(&mut self, addr: *mut u8, len: usize) {
119+
pub fn serve_pf(&mut self, addr: *mut u8, len: usize) -> bool {
120120
// Find the start of the page that the current faulting address belongs to.
121121
let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void;
122122
let fault_page_addr = dst as u64;
@@ -133,14 +133,18 @@ impl UffdHandler {
133133
// event was received. This can be a consequence of guest reclaiming back its
134134
// memory from the host (through balloon device)
135135
Some(MemPageState::Uninitialized) | Some(MemPageState::FromFile) => {
136-
let (start, end) = self.populate_from_file(region, fault_page_addr, len);
137-
self.update_mem_state_mappings(start, end, MemPageState::FromFile);
138-
return;
136+
match self.populate_from_file(region, fault_page_addr, len) {
137+
Some((start, end)) => {
138+
self.update_mem_state_mappings(start, end, MemPageState::FromFile)
139+
}
140+
None => return false,
141+
}
142+
return true;
139143
}
140144
Some(MemPageState::Removed) | Some(MemPageState::Anonymous) => {
141145
let (start, end) = self.zero_out(fault_page_addr);
142146
self.update_mem_state_mappings(start, end, MemPageState::Anonymous);
143-
return;
147+
return true;
144148
}
145149
None => {}
146150
}
@@ -152,20 +156,39 @@ impl UffdHandler {
152156
);
153157
}
154158

155-
fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> (u64, u64) {
159+
fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> Option<(u64, u64)> {
156160
let offset = dst - region.mapping.base_host_virt_addr;
157161
let src = self.backing_buffer as u64 + region.mapping.offset + offset;
158162

159163
let ret = unsafe {
160-
self.uffd
161-
.copy(src as *const _, dst as *mut _, len, true)
162-
.expect("Uffd copy failed")
164+
match self.uffd.copy(src as *const _, dst as *mut _, len, true) {
165+
Ok(value) => value,
166+
// Catch EAGAIN errors, which occur when a `remove` event lands in the UFFD
167+
// queue while we're processing `pagefault` events.
168+
// The weird cast is because the `bytes_copied` field is based on the
169+
// `uffdio_copy->copy` field, which is a signed 64 bit integer, and if something
170+
// goes wrong, it gets set to a -errno code. However, uffd-rs always casts this
171+
// value to an unsigned `usize`, which scrambled the errno.
172+
Err(Error::PartiallyCopied(bytes_copied))
173+
if bytes_copied == 0 || bytes_copied == (-libc::EAGAIN) as usize =>
174+
{
175+
return None
176+
}
177+
Err(Error::CopyFailed(errno))
178+
if std::io::Error::from(errno).raw_os_error().unwrap() == libc::EEXIST =>
179+
{
180+
len
181+
}
182+
Err(e) => {
183+
panic!("Uffd copy failed: {e:?}");
184+
}
185+
}
163186
};
164187

165188
// Make sure the UFFD copied some bytes.
166189
assert!(ret > 0);
167190

168-
(dst, dst + len as u64)
191+
Some((dst, dst + len as u64))
169192
}
170193

171194
fn zero_out(&mut self, addr: u64) -> (u64, u64) {

src/firecracker/examples/uffd/valid_handler.rs

Lines changed: 72 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,24 +26,79 @@ fn main() {
2626
let mut runtime = Runtime::new(stream, file);
2727
runtime.install_panic_hook();
2828
runtime.run(|uffd_handler: &mut UffdHandler| {
29-
// Read an event from the userfaultfd.
30-
let event = uffd_handler
31-
.read_event()
32-
.expect("Failed to read uffd_msg")
33-
.expect("uffd_msg not ready");
34-
35-
// We expect to receive either a Page Fault or Removed
36-
// event (if the balloon device is enabled).
37-
match event {
38-
userfaultfd::Event::Pagefault { addr, .. } => {
39-
uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size)
29+
// !DISCLAIMER!
30+
// When using UFFD together with the balloon device, this handler needs to deal with
31+
// `remove` and `pagefault` events. There are multiple things to keep in mind in
32+
// such setups:
33+
//
34+
// As long as any `remove`` event is pending in the UFFD queue, all ioctls return EAGAIN
35+
// -----------------------------------------------------------------------------------
36+
//
37+
// This means we cannot process UFFD events simply one-by-one anymore - if a `remove` event
38+
// arrives, we need to pre-fetch all other events up to the `remove` event, to unblock the
39+
// UFFD, and then go back to the process the pre-fetched events.
40+
//
41+
// UFFD might receive events in not in their causal order
42+
// -----------------------------------------------------
43+
//
44+
// For example, the guest
45+
// kernel might first respond to a balloon inflation by freeing some memory, and
46+
// telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the
47+
// free memory range, which causes a `remove` event to be sent to UFFD. Then, the
48+
// guest kernel might immediately fault the page in again (for example because
49+
// default_on_oom was set). which causes a `pagefault` event to be sent to UFFD.
50+
//
51+
// However, the pagefault will be triggered from inside KVM on the vCPU thread, while the
52+
// balloon device is handled by Firecracker on its VMM thread. This means that potentially
53+
// this handler can receive the `pagefault` _before_ the `remove` event.
54+
//
55+
// This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events
56+
// to make sure no `remove` event is blocking us can result in the handler acting on
57+
// the `pagefault` event before the `remove` message (despite the `remove` event being
58+
// in the causal past of the `pagefault` event), which means that we will fault in a page
59+
// from the snapshot file, while really we should be faulting in a zero page.
60+
//
61+
// In this example handler, we ignore this problem, to avoid
62+
// complexity (under the assumption that the guest kernel will zero a newly faulted in
63+
// page anyway). A production handler will most likely want to ensure that `remove`
64+
// events for a specific range are always handled before `pagefault` events.
65+
//
66+
// Lastly, we still need to deal with the race condition where a `remove` event arrives
67+
// in the UFFD queue after we got done reading all events, in which case we need to go
68+
// back to reading more events before we can continue processing `pagefault`s.
69+
let mut deferred_events = Vec::new();
70+
71+
loop {
72+
// First, try events that we couldn't handle last round
73+
let mut events_to_handle = Vec::from_iter(deferred_events.drain(..));
74+
75+
// Read all events from the userfaultfd.
76+
while let Some(event) = uffd_handler.read_event().expect("Failed to read uffd_msg") {
77+
events_to_handle.push(event);
78+
}
79+
80+
for event in events_to_handle.drain(..) {
81+
// We expect to receive either a Page Fault or `remove`
82+
// event (if the balloon device is enabled).
83+
match event {
84+
userfaultfd::Event::Pagefault { addr, .. } => {
85+
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
86+
deferred_events.push(event);
87+
}
88+
}
89+
userfaultfd::Event::Remove { start, end } => uffd_handler
90+
.update_mem_state_mappings(start as u64, end as u64, MemPageState::Removed),
91+
_ => panic!("Unexpected event on userfaultfd"),
92+
}
93+
}
94+
95+
// We assume that really only the above removed/pagefault interaction can result in
96+
// deferred events. In that scenario, the loop will always terminate (unless
97+
// newly arriving `remove` events end up indefinitely blocking it, but there's nothing
98+
// we can do about that, and it's a largely theoretical problem).
99+
if deferred_events.is_empty() {
100+
break;
40101
}
41-
userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings(
42-
start as u64,
43-
end as u64,
44-
MemPageState::Removed,
45-
),
46-
_ => panic!("Unexpected event on userfaultfd"),
47102
}
48103
});
49104
}

0 commit comments

Comments
 (0)