@@ -26,24 +26,79 @@ fn main() {
26
26
let mut runtime = Runtime :: new ( stream, file) ;
27
27
runtime. install_panic_hook ( ) ;
28
28
runtime. run ( |uffd_handler : & mut UffdHandler | {
29
- // Read an event from the userfaultfd.
30
- let event = uffd_handler
31
- . read_event ( )
32
- . expect ( "Failed to read uffd_msg" )
33
- . expect ( "uffd_msg not ready" ) ;
34
-
35
- // We expect to receive either a Page Fault or Removed
36
- // event (if the balloon device is enabled).
37
- match event {
38
- userfaultfd:: Event :: Pagefault { addr, .. } => {
39
- uffd_handler. serve_pf ( addr. cast ( ) , uffd_handler. page_size )
29
+ // !DISCLAIMER!
30
+ // When using UFFD together with the balloon device, this handler needs to deal with
31
+ // `remove` and `pagefault` events. There are multiple things to keep in mind in
32
+ // such setups:
33
+ //
34
+ // As long as any `remove` event is pending in the UFFD queue, all ioctls return EAGAIN
35
+ // -----------------------------------------------------------------------------------
36
+ //
37
+ // This means we cannot process UFFD events simply one-by-one anymore - if a `remove` event
38
+ // arrives, we need to pre-fetch all other events up to the `remove` event, to unblock the
39
+ // UFFD, and then go back to process the pre-fetched events.
40
+ //
41
+ // UFFD might receive events not in their causal order
42
+ // -----------------------------------------------------
43
+ //
44
+ // For example, the guest
45
+ // kernel might first respond to a balloon inflation by freeing some memory, and
46
+ // telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the
47
+ // free memory range, which causes a `remove` event to be sent to UFFD. Then, the
48
+ // guest kernel might immediately fault the page in again (for example because
49
+ // deflate_on_oom was set), which causes a `pagefault` event to be sent to UFFD.
50
+ //
51
+ // However, the pagefault will be triggered from inside KVM on the vCPU thread, while the
52
+ // balloon device is handled by Firecracker on its VMM thread. This means that potentially
53
+ // this handler can receive the `pagefault` _before_ the `remove` event.
54
+ //
55
+ // This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events
56
+ // to make sure no `remove` event is blocking us can result in the handler acting on
57
+ // the `pagefault` event before the `remove` message (despite the `remove` event being
58
+ // in the causal past of the `pagefault` event), which means that we will fault in a page
59
+ // from the snapshot file, while really we should be faulting in a zero page.
60
+ //
61
+ // In this example handler, we ignore this problem, to avoid
62
+ // complexity (under the assumption that the guest kernel will zero a newly faulted in
63
+ // page anyway). A production handler will most likely want to ensure that `remove`
64
+ // events for a specific range are always handled before `pagefault` events.
65
+ //
66
+ // Lastly, we still need to deal with the race condition where a `remove` event arrives
67
+ // in the UFFD queue after we got done reading all events, in which case we need to go
68
+ // back to reading more events before we can continue processing `pagefault`s.
69
+ let mut deferred_events = Vec :: new ( ) ;
70
+
71
+ loop {
72
+ // First, try events that we couldn't handle last round
73
+ let mut events_to_handle = Vec :: from_iter ( deferred_events. drain ( ..) ) ;
74
+
75
+ // Read all events from the userfaultfd.
76
+ while let Some ( event) = uffd_handler. read_event ( ) . expect ( "Failed to read uffd_msg" ) {
77
+ events_to_handle. push ( event) ;
78
+ }
79
+
80
+ for event in events_to_handle. drain ( ..) {
81
+ // We expect to receive either a Page Fault or `remove`
82
+ // event (if the balloon device is enabled).
83
+ match event {
84
+ userfaultfd:: Event :: Pagefault { addr, .. } => {
85
+ if !uffd_handler. serve_pf ( addr. cast ( ) , uffd_handler. page_size ) {
86
+ deferred_events. push ( event) ;
87
+ }
88
+ }
89
+ userfaultfd:: Event :: Remove { start, end } => uffd_handler
90
+ . update_mem_state_mappings ( start as u64 , end as u64 , MemPageState :: Removed ) ,
91
+ _ => panic ! ( "Unexpected event on userfaultfd" ) ,
92
+ }
93
+ }
94
+
95
+ // We assume that really only the above removed/pagefault interaction can result in
96
+ // deferred events. In that scenario, the loop will always terminate (unless
97
+ // newly arriving `remove` events end up indefinitely blocking it, but there's nothing
98
+ // we can do about that, and it's a largely theoretical problem).
99
+ if deferred_events. is_empty ( ) {
100
+ break ;
40
101
}
41
- userfaultfd:: Event :: Remove { start, end } => uffd_handler. update_mem_state_mappings (
42
- start as u64 ,
43
- end as u64 ,
44
- MemPageState :: Removed ,
45
- ) ,
46
- _ => panic ! ( "Unexpected event on userfaultfd" ) ,
47
102
}
48
103
} ) ;
49
104
}
0 commit comments