From fc4528c288ec5cbb552c24c94b5108019992d66b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pablo=20Barb=C3=A1chano?=
Date: Mon, 14 Oct 2024 15:58:49 +0200
Subject: [PATCH 1/3] devtool: fix sandbox_native for ubuntu22.04
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are also some more packages that are required since the devctr upgrade
to Ubuntu 24.04.

Signed-off-by: Pablo Barbáchano
---
 tools/devtool | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/devtool b/tools/devtool
index de49b36d579..2c3c28eeb44 100755
--- a/tools/devtool
+++ b/tools/devtool
@@ -863,8 +863,8 @@ cmd_sandbox_native() {
     cmd_build --release

     source /etc/os-release
-    case $ID-$VERSION_ID in
-    ubuntu-22.04)
+    case $ID$VERSION_ID in
+    ubuntu22.04)
         sudo apt install python3-pip python3.11-dev gcc tmux
         ;;
     al2023)
@@ -873,7 +873,8 @@ esac
     python3.11 -m venv sandbox
     source sandbox/bin/activate
-    pip3.11 install ipython requests requests_unixsocket "urllib3<2" psutil tenacity filelock
+    pip3.11 install ipython requests requests_unixsocket2 psutil tenacity filelock
+    pip3.11 install jsonschema aws_embedded_metrics
     pip3.11 install packaging pytest
     ensure_ci_artifacts
     tmux neww sudo --preserve-env=HOME,PATH,TMUX env PYTHONPATH=tests IPYTHONDIR=\$PWD/.ipython ipython -i ./tools/sandbox.py $@

From 18f3255c4eb67c3ce6401a7e7ef04df4d79ca479 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pablo=20Barb=C3=A1chano?=
Date: Mon, 14 Oct 2024 16:27:42 +0200
Subject: [PATCH 2/3] test: use openapi-spec-validator validate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

validate_spec is deprecated:

  DeprecationWarning: validate_spec shortcut is deprecated. Use validate instead.

Signed-off-by: Pablo Barbáchano
---
 tests/integration_tests/style/test_swagger.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration_tests/style/test_swagger.py b/tests/integration_tests/style/test_swagger.py
index e4eb37b0f5e..74b3db043cd 100644
--- a/tests/integration_tests/style/test_swagger.py
+++ b/tests/integration_tests/style/test_swagger.py
@@ -4,14 +4,14 @@

 from pathlib import Path

-from openapi_spec_validator import validate_spec
+from openapi_spec_validator import validate
 from openapi_spec_validator.readers import read_from_filename


 def validate_swagger(swagger_spec):
     """Fail if OpenAPI spec is not followed."""
     spec_dict, _ = read_from_filename(swagger_spec)
-    validate_spec(spec_dict)
+    validate(spec_dict)


 def test_firecracker_swagger():

From 7a5d938adce1202913529cbe160cec4a7ff2c9cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pablo=20Barb=C3=A1chano?=
Date: Fri, 11 Oct 2024 21:48:22 +0200
Subject: [PATCH 3/3] doc: fix and simplify network-for-clones.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address the discussion in
https://github.com/firecracker-microvm/firecracker/discussions/4720

- Reflect that the doc is just an example.
- Fix the doc so that it works.
- Split the ingress setup into its own section, as it is a specific
  requirement that is not always needed and complicates the implementation.
- Also simplify the test framework helpers, since on revisiting them we found
  that some of the iptables rules were not needed.
Signed-off-by: Pablo Barbáchano --- docs/snapshotting/network-for-clones.md | 144 +++++++++++------------- tests/framework/microvm_helpers.py | 47 +++++--- 2 files changed, 97 insertions(+), 94 deletions(-) diff --git a/docs/snapshotting/network-for-clones.md b/docs/snapshotting/network-for-clones.md index 59bcb4b2db6..d0a44e89413 100644 --- a/docs/snapshotting/network-for-clones.md +++ b/docs/snapshotting/network-for-clones.md @@ -1,8 +1,12 @@ # Network Connectivity for Clones -This document presents the strategy to ensure continued network connectivity for -multiple clones created from a single Firecracker microVM snapshot. This -document also provides an overview of the scalability benchmarks we performed. +This document presents a strategy to ensure continued network connectivity for +multiple clones created from a single Firecracker microVM snapshot. + +> \[!CAUTION\] +> +> This should be considered as just an example to get you started, and we don't +> claim this is a performant or secure setup. ## Setup @@ -13,14 +17,7 @@ names, and each guest will be resumed with the same network configuration, most importantly with the same IP address(es). To work around the former, each clone should be started within a separate network namespace (we can have multiple TAP interfaces with the same name, as long as they reside in distinct network -namespaces). The latter can be mitigated by leveraging `iptables` `SNAT` and -`DNAT` support. We choose a clone address (**CA**) for each clone, which is the -new address that’s going to represent the guest, and make it so all packets -leaving the VM have their source address rewritten to CA, and all incoming -packets with the destination address equal to CA have it rewritten to the IP -address configured inside the guest (which remains the same for all clones). -Each individual clone continues to believe it’s using the original address, but -outside the VM packets are assigned a different one for every clone. +namespaces). The latter can be mitigated by leveraging `iptables` `NAT` support. Let’s have a more detailed look at this approach. We assume each VM has a single network interface attached. If multiple interfaces with full connectivity are @@ -37,8 +34,8 @@ corresponding virtio device (referred to as the guest IP address, for example Attempting to restore multiple clones from the same snapshot faces the problem of every single one of them attempting to use a TAP device with the original name, which is not possible by default. Therefore, we need to start each clone -in a separate network namespace. This is already possible using the netns jailer -parameter, described in the [documentation](../jailer.md). The specified +in a separate network namespace. This is already possible using the `--netns` +jailer parameter, described in the [documentation](../jailer.md). 
The specified
namespace must already exist, so we have to create it first using

```bash
@@ -94,9 +91,7 @@ use the following commands (for namespace `fc0`):

 ```bash
 # create the veth pair inside the namespace
-sudo ip netns exec fc0 ip link add veth1 type veth peer name veth0
-# move veth1 to the global host namespace
-sudo ip netns exec fc0 ip link set veth1 netns 1
+sudo ip link add name veth1 type veth peer name veth0 netns fc0

 sudo ip netns exec fc0 ip addr add 10.0.0.2/24 dev veth0
 sudo ip netns exec fc0 ip link set dev veth0 up
@@ -108,31 +103,32 @@ sudo ip link set dev veth1 up
 sudo ip netns exec fc0 ip route add default via 10.0.0.1
 ```

-### `iptables` rules for end-to-end connectivity
+### `iptables` rules for VM egress connectivity

 The last step involves adding the `iptables` rules which change the
 source/destination IP address of packets on the fly (thus allowing all clones to
-have the same internal IP). We need to choose a clone address, which is unique
-on the host for each VM. In the demo, we use
-`192.168..<(idx % 30) * 8 + 3>` (which is `192.168.0.3` for
-`clone 0`):
+have the same internal IP).
+
+```sh
+# Find the host egress device
+UPSTREAM=$(ip -j route list default |jq -r '.[0].dev')
+# anything coming from the VMs, we NAT the address
+iptables -t nat -A POSTROUTING -s 10.0.0.0/30 -o $UPSTREAM -j MASQUERADE
+# forward packets by default
+iptables -P FORWARD ACCEPT
+ip netns exec fc0 ip route add default via 10.0.0.1
+ip netns exec fc0 iptables -P FORWARD ACCEPT
+# inside the namespace, NAT the guest's address to the veth end (this mirrors
+# the rule applied by the test framework helpers)
+ip netns exec fc0 iptables -t nat -A POSTROUTING -s 192.168.241.2 -o veth0 -j MASQUERADE
+```
+
+You may also want to configure the guest with a default route via the TAP
+address inside the namespace (192.168.241.1 in this example) and a DNS
+nameserver:

 ```bash
-# for packets that leave the namespace and have the source IP address of the
-# original guest, rewrite the source address to clone address 192.168.0.3
-sudo ip netns exec fc0 iptables -t nat -A POSTROUTING -o veth0 \
--s 192.168.241.2 -j SNAT --to 192.168.0.3
-
-# do the reverse operation; rewrites the destination address of packets
-# heading towards the clone address to 192.168.241.2
-sudo ip netns exec fc0 iptables -t nat -A PREROUTING -i veth0 \
--d 192.168.0.3 -j DNAT --to 192.168.241.2
-
-# (adds a route on the host for the clone address)
-sudo ip route add 192.168.0.3 via 10.0.0.2
+ip route add default via 192.168.241.1
+echo nameserver 8.8.8.8 >/etc/resolv.conf
 ```

-**Full connectivity to/from the clone should be present at this point.**
+**Connectivity from the clone should be present at this point.**

 To make sure the guest also adjusts to the new environment, you can explicitly
 clear the ARP/neighbour table in the guest:
@@ -146,47 +142,39 @@ Otherwise, packets originating from the guest might be using old Link Layer
 Address for up to arp cache timeout seconds. After said timeout period,
 connectivity will work both ways even without an explicit flush.

-## Scalability evaluation
-
-We ran synthetic tests to determine the impact of the addtional iptables rules
-and namespaces on network performance. We compare the case where each VM runs as
-regular Firecracker (gets assigned a TAP interface and a unique IP address in
-the global namespace) versus the setup with a separate network namespace for
-each VM (together with the veth pair and additional rules). We refer to the
-former as the basic case, while the latter is the ns case. We measure latency
-with the `ping` command and throughput with `iperf`.
-
-The experiments, ran on an Amazon AWS `m5d.metal` EC2 instace, go as follows:
-
-- Set up 3000 network resource slots (different TAP interfaces for the basic
-  case, and namespaces + everything else for ns).
This is mainly to account for
-  any difference the setup itself might make, even if there are not as many
-  active endpoints at any given time.
-- Start 1000 Firecracker VMs, and pick `N < 1000` as the number of active VMs
-  that are going to generate network traffic. For ping experiments, we ping each
-  active VM from the host every 500ms for 30 seconds. For `iperf` experiments,
-  we measure the average bandwidth of connections from the host to every active
-  VM lasting 40 seconds. There is one separate client process per VM.
-- When `N = 100`, in the basic case we get average latencies of
-  `0.315 ms (0.160/0.430 min/max)` for `ping`, and an average throughput of
-  `2.25 Gbps (1.62/3.21 min/max)` per VM for `iperf`. In the ns case, the ping
-  results **are bumped higher by around 10-20 us**, while the `iperf` results
-  are virtually the same on average, with a higher minimum (1.73 Gbps) and a
-  lower maximum (3 Gbps).
-- When `N = 1000`, we start facing desynchronizations caused by difficulties in
-  starting (and thus finishing) the client processes all at the same time, which
-  creates a wider distribution of results. In the basic case, the average
-  latency for ping experiments has gone down to 0.305 ms, the minimum decreased
-  to 0.155 ms, but the maximum increased to 0.640 ms. The average `iperf` per VM
-  throughput is around `440 Mbps (110/3936 min/max)`. In the ns case, average
-  `ping` latency is now `0.318 ms (0.170/0.650 min/max)`. For `iperf`, the
-  average throughput is very close to basic at `~430 Mbps`, while the minimum
-  and maximum values are lower at `85/3803 Mbps`.
-
-**The above measurements give a significant degree of confidence in the
-scalability of the solution** (subject to repeating for different values of the
-experimental parameters, if necessary). The increase in latency is almost
-negligible considering usual end-to-end delays. The lower minimum throughput
-from the iperf measurements might be significant, but only if that magnitude of
-concurrent, data-intensive transfers is likely. Moreover, the basic measurements
-are close to an absolute upper bound.
+## Ingress connectivity
+
+The above setup only provides egress connectivity. If we also want ingress (in
+other words, to make the guest VM routable from outside the network namespace),
+we need to choose a "clone address" that uniquely represents this VM. In this
+example we use IPs from `172.16.0.0/12`, such as `172.16.0.1`.
+
+Then we rewrite the destination address of packets heading towards the "clone
+address" to the guest IP:
+
+```bash
+ip netns exec fc0 iptables -t nat -A PREROUTING -i veth0 \
+  -d 172.16.0.1 -j DNAT --to 192.168.241.2
+```
+
+And add a route on the host so we can access the guest VM from the host network
+namespace:
+
+```bash
+ip route add 172.16.0.1 via 10.0.0.2
+```
+
+To confirm that ingress connectivity works, try:
+
+```bash
+ping 172.16.0.1
+# or
+ssh root@172.16.0.1
+```
+
+## See also
+
+For an improved setup with full ingress and egress connectivity to the
+individual VMs, see
+[this discussion](https://github.com/firecracker-microvm/firecracker/discussions/4720).
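For anyone trying out the updated document end to end, the per-clone steps from the doc changes above can be collected into a single script. This is only a sketch built from the example values used in the document (namespace `fc0`, veth pair `veth0`/`veth1` with `10.0.0.1`/`10.0.0.2`, guest IP `192.168.241.2`, clone address `172.16.0.1`); the host-side veth address and the guest's gateway are assumptions that follow the same example, not requirements:

```bash
#!/bin/bash
# Sketch of the per-clone setup described in the doc above, using its example
# values. Run as root; requires iproute2, iptables and jq.
set -euo pipefail

NS=fc0                  # network namespace for this clone
CLONE_IP=172.16.0.1     # optional "clone address" for ingress
GUEST_IP=192.168.241.2  # IP configured inside the guest (same for every clone)
UPSTREAM=$(ip -j route list default | jq -r '.[0].dev')

# namespace and veth pair
ip netns add "$NS"
ip link add name veth1 type veth peer name veth0 netns "$NS"
ip addr add 10.0.0.1/24 dev veth1
ip link set dev veth1 up
ip netns exec "$NS" ip addr add 10.0.0.2/24 dev veth0
ip netns exec "$NS" ip link set dev veth0 up
ip netns exec "$NS" ip route add default via 10.0.0.1

# egress: forward by default and NAT towards the host's default device
iptables -P FORWARD ACCEPT
iptables -t nat -A POSTROUTING -s 10.0.0.0/30 -o "$UPSTREAM" -j MASQUERADE
ip netns exec "$NS" iptables -P FORWARD ACCEPT
ip netns exec "$NS" iptables -t nat -A POSTROUTING -s "$GUEST_IP" -o veth0 -j MASQUERADE

# optional ingress: expose the guest under the clone address
ip netns exec "$NS" iptables -t nat -A PREROUTING -i veth0 -d "$CLONE_IP" -j DNAT --to "$GUEST_IP"
ip route add "$CLONE_IP" via 10.0.0.2

# still needed inside the guest (e.g. over the serial console):
#   ip route add default via 192.168.241.1
#   echo nameserver 8.8.8.8 >/etc/resolv.conf
```

A quick check from the host is then `ping 172.16.0.1` or `ssh root@172.16.0.1`, as in the ingress section above.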
diff --git a/tests/framework/microvm_helpers.py b/tests/framework/microvm_helpers.py index 6f1f9aeb11c..4239eb2489f 100644 --- a/tests/framework/microvm_helpers.py +++ b/tests/framework/microvm_helpers.py @@ -58,6 +58,9 @@ class MicrovmHelpers: # private block _supernet = ipaddress.IPv4Network("10.255.0.0/16") _subnets_gen = _supernet.subnets(new_prefix=30) + # Addresses that can be used outside the netns. Could be public IPv4 blocks + _ingress_net = ipaddress.IPv4Network("172.16.0.0/12") + _ingress_gen = _ingress_net.hosts() def __init__(self, vm): self.vm = vm @@ -140,18 +143,17 @@ def how_to_docker(self): """How to get into this container from outside""" return f"docker exec -it {DOCKER.id}" - def enable_ip_forwarding(self, iface="eth0"): + def enable_ip_forwarding(self, iface="eth0", ingress_ipv4=None): """Enables IP forwarding in the guest""" i = MicrovmHelpers.shared_subnet_ctr MicrovmHelpers.shared_subnet_ctr += 1 netns = self.vm.netns.id veth_host = f"vethhost{i}" - veth_vpn = f"vethvpn{i}" + veth_guest = f"vethguest{i}" veth_net = next(self._subnets_gen) - veth_host_ip, veth_vpn_ip = list(veth_net.hosts()) + veth_host_ip, veth_guest_ip = list(veth_net.hosts()) iface = self.vm.iface[iface]["iface"] tap_host_ip = iface.host_ip - tap_dev = iface.tap_name tap_net = iface.network.with_netmask # i.e. 192.168.7.0/255.255.255.0 # get the device associated with the default route upstream_dev = ( @@ -170,30 +172,26 @@ def run_in_netns(cmd): return run(f"ip netns exec {netns} " + cmd) # outside netns - # iptables -L -v -n + # iptables -L -v -n --line-numbers run( - f"ip link add name {veth_host} type veth peer name {veth_vpn} netns {netns}" + f"ip link add name {veth_host} type veth peer name {veth_guest} netns {netns}" ) run(f"ip addr add {veth_host_ip}/{veth_net.prefixlen} dev {veth_host}") - run_in_netns(f"ip addr add {veth_vpn_ip}/{veth_net.prefixlen} dev {veth_vpn}") + run_in_netns( + f"ip addr add {veth_guest_ip}/{veth_net.prefixlen} dev {veth_guest}" + ) run(f"ip link set {veth_host} up") - run_in_netns(f"ip link set {veth_vpn} up") + run_in_netns(f"ip link set {veth_guest} up") - run("iptables -P FORWARD DROP") + run("iptables -P FORWARD ACCEPT") # iptables -L FORWARD # iptables -t nat -L run( f"iptables -t nat -A POSTROUTING -s {veth_net} -o {upstream_dev} -j MASQUERADE" ) - run(f"iptables -A FORWARD -i {upstream_dev} -o {veth_host} -j ACCEPT") - run(f"iptables -A FORWARD -i {veth_host} -o {upstream_dev} -j ACCEPT") - - # in the netns run_in_netns(f"ip route add default via {veth_host_ip}") - run_in_netns(f"iptables -A FORWARD -i {tap_dev} -o {veth_vpn} -j ACCEPT") - run_in_netns(f"iptables -A FORWARD -i {veth_vpn} -o {tap_dev} -j ACCEPT") run_in_netns( - f"iptables -t nat -A POSTROUTING -s {tap_net} -o {veth_vpn} -j MASQUERADE" + f"iptables -t nat -A POSTROUTING -s {tap_net} -o {veth_guest} -j MASQUERADE" ) # Configure the guest @@ -207,3 +205,20 @@ def run_in_netns(cmd): .strip() ) self.vm.ssh.run(f"echo nameserver {nameserver} >/etc/resolv.conf") + + # only configure ingress if we get an IP + if not ingress_ipv4: + return + + if not isinstance(ingress_ipv4, ipaddress.IPv4Address): + ingress_ipv4 = next(self._ingress_gen) + + guest_ip = iface.guest_ip + + # packets heading towards the clone address are rewritten to the guest ip + run_in_netns( + f"iptables -t nat -A PREROUTING -i {veth_guest} -d {ingress_ipv4} -j DNAT --to {guest_ip}" + ) + + # add a route on the host for the clone address + run(f"ip route add {ingress_ipv4} via {veth_guest_ip}")
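
As a usage illustration of the new `ingress_ipv4` parameter, a test could look roughly like the sketch below; the `uvm_plain` fixture and the `vm.help` attribute are assumptions about the surrounding test framework rather than part of this patch:

```python
import ipaddress
import subprocess


def test_guest_ingress(uvm_plain):  # fixture and attribute names are illustrative
    """Sketch: give a microVM egress plus a routable "clone address"."""
    vm = uvm_plain
    vm.spawn()
    vm.basic_config()
    vm.add_net_iface()
    vm.start()

    # Pass an explicit IPv4Address to control the clone address; passing any
    # other truthy value makes the helper pick the next host from 172.16.0.0/12
    # (but the chosen address is not returned to the caller).
    clone_ip = ipaddress.IPv4Address("172.16.0.1")
    vm.help.enable_ip_forwarding(ingress_ipv4=clone_ip)

    # The guest should now be reachable from outside the netns under clone_ip.
    subprocess.run(["ping", "-c", "2", str(clone_ip)], check=True)
```

Since the helper does not return the automatically picked address, passing an explicit `ipaddress.IPv4Address` is the simpler way for a test to know where to connect.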