Skip to content

Commit

Permalink
test: use single SSH connection for lifetime of microvm
Browse files Browse the repository at this point in the history
Instead of creating new SSH connections every time we want to run a
command inside the microvm, open a single daemonized ssh connection in
the constructor of `SSHConnection`, and reuse it until we kill the
microvm.

Realize this using openssh's ControlMaster/ControlPersist functionality,
which allows us to persist the first connection opened to a microvm and
multiplex all subsequent commands over this one connection.

We need some slight changes in test_pause_restore.py and test_net.py. In
the former, since we no longer reconnect to the VM on every ssh command,
we will now observe a timeout instead of a connection failure. For the
latter, it turns out that @lru_cache treats .ssh_iface() and
.ssh_iface(0) as distinct invokations, despire the iface_idx argument of
ssh_iface defaulting to 0 (meaning they are actually the same function
call). This caused a re-intialization of the SSHConnection object, which
then triggered the assertion in _init_connection about the socket file
not existing.

Signed-off-by: Patrick Roy <[email protected]>
  • Loading branch information
roypat committed Dec 11, 2024
1 parent 8c359cd commit 67cb53e
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 50 deletions.
11 changes: 10 additions & 1 deletion tests/framework/microvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,8 @@ def __init__(
self.mem_size_bytes = None
self.cpu_template_name = None

self._connections = []

self._pre_cmd = []
if numa_node:
node_str = str(numa_node)
Expand Down Expand Up @@ -282,6 +284,10 @@ def kill(self):
for monitor in self.monitors:
monitor.stop()

# Kill all background SSH connections
for connection in self._connections:
connection.close()

# We start with vhost-user backends,
# because if we stop Firecracker first, the backend will want
# to exit as well and this will cause a race condition.
Expand Down Expand Up @@ -1007,13 +1013,16 @@ def ssh_iface(self, iface_idx=0):
"""Return a cached SSH connection on a given interface id."""
guest_ip = list(self.iface.values())[iface_idx]["iface"].guest_ip
self.ssh_key = Path(self.ssh_key)
return net_tools.SSHConnection(
connection = net_tools.SSHConnection(
netns=self.netns.id,
ssh_key=self.ssh_key,
user="root",
host=guest_ip,
control_path=Path(self.chroot()) / f"ssh-{iface_idx}.sock",
on_error=self._dump_debug_information,
)
self._connections.append(connection)
return connection

@property
def ssh(self):
Expand Down
123 changes: 81 additions & 42 deletions tests/host_tools/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,35 @@
"""Utilities for test host microVM network setup."""

import ipaddress
import os
import random
import re
import signal
import string
import subprocess
from dataclasses import dataclass, field
from pathlib import Path

from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
from tenacity import retry, stop_after_attempt, wait_fixed

from framework import utils
from framework.utils import Timeout


class SSHConnection:
"""
SSHConnection encapsulates functionality for microVM SSH interaction.
This class should be instantiated as part of the ssh fixture with the
This class should be instantiated as part of the ssh fixture with
the hostname obtained from the MAC address, the username for logging into
the image and the path of the ssh key.
This translates into an SSH connection as follows:
ssh -i ssh_key_path username@hostname
Establishes a ControlMaster upon construction, which is then re-used
for all subsequent SSH interactions.
"""

def __init__(self, netns, ssh_key: Path, host, user, *, on_error=None):
def __init__(
self, netns, ssh_key: Path, control_path: Path, host, user, *, on_error=None
):
"""Instantiate a SSH client and connect to a microVM."""
self.netns = netns
self.ssh_key = ssh_key
Expand All @@ -37,22 +42,13 @@ def __init__(self, netns, ssh_key: Path, host, user, *, on_error=None):
assert (ssh_key.stat().st_mode & 0o777) == 0o400
self.host = host
self.user = user
self._control_path = control_path

self._on_error = None

self.options = [
"-o",
"LogLevel=ERROR",
"-o",
"ConnectTimeout=1",
"-o",
"StrictHostKeyChecking=no",
"-o",
"UserKnownHostsFile=/dev/null",
"-o",
"PreferredAuthentications=publickey",
"-i",
str(self.ssh_key),
f"ControlPath={self._control_path}",
]

# _init_connection loops until it can connect to the guest
Expand Down Expand Up @@ -96,27 +92,91 @@ def scp_get(self, remote_path, local_path, recursive=False):
self._scp(self.remote_path(remote_path), local_path, opts)

@retry(
retry=retry_if_exception_type(ChildProcessError),
wait=wait_fixed(0.5),
wait=wait_fixed(1),
stop=stop_after_attempt(20),
reraise=True,
)
def _init_connection(self):
"""Create an initial SSH client connection (retry until it works).
"""Initialize the persistent background connection which will be used
to execute all commands sent via this `SSHConnection` object.
Since we're connecting to a microVM we just started, we'll probably
have to wait for it to boot up and start the SSH server.
We'll keep trying to execute a remote command that can't fail
(`/bin/true`), until we get a successful (0) exit code.
"""
self.check_output("true", debug=True)
assert not self._control_path.exists()

# Sadly, we cannot get debug output from this command (e.g. `-vvv`),
# because passing -vvv causes the daemonized ssh to hold on to stderr,
# and inside utils.run_cmd we're using subprocess.communicate, which
# only returns once stderr gets closed (which would thus result in an
# indefinite hang).
establish_cmd = [
"ssh",
# Only need to pass the ssh key here, as all multiplexed
# connections won't have to re-authenticate
"-i",
str(self.ssh_key),
"-o",
"StrictHostKeyChecking=no",
"-o",
"ConnectTimeout=2",
# Set up a persistent background connection
"-o",
"ControlMaster=auto",
"-o",
"ControlPersist=yes",
*self.options,
self.user_host,
"/usr/bin/true",
]

# don't set a low timeout here, because otherwise we might get into a race condition
# where ssh already forked off the persisted connection daemon, but gets killed here
# before exiting itself. In that case, self._control_path will exist, and the retry
# will hit the assert at the start of this function.
self._exec(establish_cmd, check=True)

def _check_liveness(self) -> int:
"""Checks whether the ControlPersist connection is still alive"""
check_cmd = ["ssh", "-O", "check", *self.options, self.user_host]

_, _, stderr = self._exec(check_cmd, check=True)

pid_match = re.match(r"Master running \(pid=(\d+)\)", stderr)

assert pid_match, f"SSH ControlMaster connection not alive anymore: {stderr}"

return int(pid_match.group(1))

def close(self):
"""Closes the ControlPersist connection"""
master_pid = self._check_liveness()

stop_cmd = ["ssh", "-O", "stop", *self.options, self.user_host]

_, _, stderr = self._exec(stop_cmd, check=True)

assert "Stop listening request sent" in stderr

try:
with Timeout(5):
utils.wait_process_termination(master_pid)
except TimeoutError:
# for some reason it won't exit, let's force it...
# if this also fails, when during teardown we'll get an error about
# "found a process with supposedly dead Firecracker's jailer ID"
os.kill(master_pid, signal.SIGKILL)

def run(self, cmd_string, timeout=100, *, check=False, debug=False):
"""
Execute the command passed as a string in the ssh context.
If `debug` is set, pass `-vvv` to `ssh`. Note that this will clobber stderr.
"""
self._check_liveness()

command = ["ssh", *self.options, self.user_host, cmd_string]

if debug:
Expand All @@ -141,27 +201,6 @@ def _exec(self, cmd, timeout=100, check=False):

raise

# pylint:disable=invalid-name
def Popen(
self,
cmd: str,
stdin=subprocess.DEVNULL,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
**kwargs,
) -> subprocess.Popen:
"""Execute the command in the guest and return a Popen object.
pop = uvm.ssh.Popen("while true; do echo $(date -Is) $RANDOM; sleep 1; done")
pop.stdout.read(16)
"""
cmd = ["ssh", *self.options, self.user_host, cmd]
if self.netns is not None:
cmd = ["ip", "netns", "exec", self.netns] + cmd
return subprocess.Popen(
cmd, stdin=stdin, stdout=stdout, stderr=stderr, **kwargs
)


def mac_from_ip(ip_address):
"""Create a MAC address based on the provided IP.
Expand Down
7 changes: 2 additions & 5 deletions tests/integration_tests/functional/test_net.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,8 @@ def test_tap_offload(uvm_any):
)

# Try to send a UDP message from host with UDP offload enabled
cmd = f"ip netns exec {vm.ssh_iface().netns} python3 ./host_tools/udp_offload.py {vm.ssh_iface().host} {port}"
ret = utils.run_cmd(cmd)

# Check that the transmission was successful
assert ret.returncode == 0, f"{ret.stdout=} {ret.stderr=}"
cmd = f"ip netns exec {vm.ssh.netns} python3 ./host_tools/udp_offload.py {vm.ssh.host} {port}"
utils.check_output(cmd)

# Check that the server received the message
ret = vm.ssh.run(f"cat {out_filename}")
Expand Down
5 changes: 3 additions & 2 deletions tests/integration_tests/functional/test_pause_resume.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import platform
import time
from subprocess import TimeoutExpired

import pytest

Expand Down Expand Up @@ -52,8 +53,8 @@ def test_pause_resume(uvm_nano):
microvm.flush_metrics()

# Verify guest is no longer active.
with pytest.raises(ChildProcessError):
microvm.ssh.check_output("true")
with pytest.raises(TimeoutExpired):
microvm.ssh.check_output("true", timeout=1)

# Verify emulation was indeed paused and no events from either
# guest or host side were handled.
Expand Down

0 comments on commit 67cb53e

Please sign in to comment.